/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <linux/log2.h>

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_utils.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"

kmem_zone_t *xfs_ifork_zone;
kmem_zone_t *xfs_inode_zone;

/*
 * Used in xfs_itruncate_extents().  This is the maximum number of extents
 * freed from a file in a single transaction.
 */
#define XFS_ITRUNC_MAX_EXTENTS  2

STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);

/*
 * Helper function to extract the extent size hint from an inode.
 */
xfs_extlen_t
xfs_get_extsz_hint(
        struct xfs_inode        *ip)
{
        if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
                return ip->i_d.di_extsize;
        if (XFS_IS_REALTIME_INODE(ip))
                return ip->i_mount->m_sb.sb_rextsize;
        return 0;
}

/*
 * This is a wrapper routine around the xfs_ilock() routine used to centralize
 * some grungy code.  It is used in places that wish to lock the inode solely
 * for reading the extents.  The reason these places can't just call
 * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the
 * extents from disk for a file in b-tree format.  If the inode is in b-tree
 * format, then we need to lock the inode exclusively until the extents are
 * read in.  Locking it exclusively all the time would limit our parallelism
 * unnecessarily, though.  What we do instead is check to see if the extents
 * have been read in yet, and only lock the inode exclusively if they have not.
 *
 * The function returns a value which should be given to the corresponding
 * xfs_iunlock_map_shared().  This value is the mode in which the lock was
 * actually taken.
 */
uint
xfs_ilock_map_shared(
        xfs_inode_t     *ip)
{
        uint    lock_mode;

        if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
            ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
                lock_mode = XFS_ILOCK_EXCL;
        } else {
                lock_mode = XFS_ILOCK_SHARED;
        }

        xfs_ilock(ip, lock_mode);

        return lock_mode;
}

/*
 * This is simply the unlock routine to go with xfs_ilock_map_shared().
 * All it does is call xfs_iunlock() with the given lock_mode.
 */
void
xfs_iunlock_map_shared(
        xfs_inode_t     *ip,
        unsigned int    lock_mode)
{
        xfs_iunlock(ip, lock_mode);
}

/*
 * The xfs inode contains 2 locks: a multi-reader lock called the
 * i_iolock and a multi-reader lock called the i_lock.  This routine
 * allows either or both of the locks to be obtained.
 *
 * The 2 locks should always be ordered so that the IO lock is
 * obtained first in order to prevent deadlock.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks
 *       to be locked.  It can be:
 *              XFS_IOLOCK_SHARED,
 *              XFS_IOLOCK_EXCL,
 *              XFS_ILOCK_SHARED,
 *              XFS_ILOCK_EXCL,
 *              XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
 *              XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
 *              XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
 *              XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
 */
void
xfs_ilock(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        trace_xfs_ilock(ip, lock_flags, _RET_IP_);

        /*
         * You can't set both SHARED and EXCL for the same lock,
         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
         */
        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);

        if (lock_flags & XFS_IOLOCK_EXCL)
                mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
        else if (lock_flags & XFS_IOLOCK_SHARED)
                mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));

        if (lock_flags & XFS_ILOCK_EXCL)
                mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
        else if (lock_flags & XFS_ILOCK_SHARED)
                mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
}
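/*
 * Illustrative usage sketch (not part of the original file): a caller
 * taking both locks in the deadlock-safe order described above and
 * dropping them with the same flags.  Error handling elided.
 *
 *      xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 *      ... modify the inode ...
 *      xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 */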
/*
 * This is just like xfs_ilock(), except that the caller
 * is guaranteed not to sleep.  It returns 1 if it gets
 * the requested locks and 0 otherwise.  If the IO lock is
 * obtained but the inode lock cannot be, then the IO lock
 * is dropped before returning.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks to be
 *       locked.  See the comment for xfs_ilock() for a list
 *       of valid values.
 */
int
xfs_ilock_nowait(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);

        /*
         * You can't set both SHARED and EXCL for the same lock,
         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
         */
        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);

        if (lock_flags & XFS_IOLOCK_EXCL) {
                if (!mrtryupdate(&ip->i_iolock))
                        goto out;
        } else if (lock_flags & XFS_IOLOCK_SHARED) {
                if (!mrtryaccess(&ip->i_iolock))
                        goto out;
        }
        if (lock_flags & XFS_ILOCK_EXCL) {
                if (!mrtryupdate(&ip->i_lock))
                        goto out_undo_iolock;
        } else if (lock_flags & XFS_ILOCK_SHARED) {
                if (!mrtryaccess(&ip->i_lock))
                        goto out_undo_iolock;
        }
        return 1;

 out_undo_iolock:
        if (lock_flags & XFS_IOLOCK_EXCL)
                mrunlock_excl(&ip->i_iolock);
        else if (lock_flags & XFS_IOLOCK_SHARED)
                mrunlock_shared(&ip->i_iolock);
 out:
        return 0;
}

/*
 * xfs_iunlock() is used to drop the inode locks acquired with
 * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 * that we know which locks to drop.
 *
 * ip -- the inode being unlocked
 * lock_flags -- this parameter indicates the inode's locks to be
 *       unlocked.  See the comment for xfs_ilock() for a list
 *       of valid values for this parameter.
 */
void
xfs_iunlock(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        /*
         * You can't set both SHARED and EXCL for the same lock,
         * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
         * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
         */
        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
        ASSERT(lock_flags != 0);

        if (lock_flags & XFS_IOLOCK_EXCL)
                mrunlock_excl(&ip->i_iolock);
        else if (lock_flags & XFS_IOLOCK_SHARED)
                mrunlock_shared(&ip->i_iolock);

        if (lock_flags & XFS_ILOCK_EXCL)
                mrunlock_excl(&ip->i_lock);
        else if (lock_flags & XFS_ILOCK_SHARED)
                mrunlock_shared(&ip->i_lock);

        trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}
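/*
 * Illustrative trylock pattern (hypothetical caller, not from this
 * file): back off instead of sleeping when the lock is contended.
 *
 *      if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
 *              return EAGAIN;          (caller retries later)
 *      ... work on the inode ...
 *      xfs_iunlock(ip, XFS_ILOCK_EXCL);
 */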
/*
 * Give up write locks.  The I/O lock cannot be held nested
 * if it is being demoted.
 */
void
xfs_ilock_demote(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);

        if (lock_flags & XFS_ILOCK_EXCL)
                mrdemote(&ip->i_lock);
        if (lock_flags & XFS_IOLOCK_EXCL)
                mrdemote(&ip->i_iolock);

        trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}
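/*
 * Illustrative demote pattern (hypothetical caller): take the lock
 * exclusively for an update, then demote to shared mode for a longer
 * read-only phase rather than dropping and re-taking the lock.
 *
 *      xfs_ilock(ip, XFS_ILOCK_EXCL);
 *      ... modify the inode ...
 *      xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
 *      ... read under the shared lock ...
 *      xfs_iunlock(ip, XFS_ILOCK_SHARED);
 */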
#ifdef DEBUG
int
xfs_isilocked(
        xfs_inode_t             *ip,
        uint                    lock_flags)
{
        if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
                if (!(lock_flags & XFS_ILOCK_SHARED))
                        return !!ip->i_lock.mr_writer;
                return rwsem_is_locked(&ip->i_lock.mr_lock);
        }

        if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
                if (!(lock_flags & XFS_IOLOCK_SHARED))
                        return !!ip->i_iolock.mr_writer;
                return rwsem_is_locked(&ip->i_iolock.mr_lock);
        }

        ASSERT(0);
        return 0;
}
#endif

void
__xfs_iflock(
        struct xfs_inode        *ip)
{
        wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
        DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);

        do {
                prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
                if (xfs_isiflocked(ip))
                        io_schedule();
        } while (!xfs_iflock_nowait(ip));

        finish_wait(wq, &wait.wait);
}

#ifdef DEBUG
/*
 * Make sure that the extents in the given memory buffer
 * are valid.
 */
STATIC void
xfs_validate_extents(
        xfs_ifork_t             *ifp,
        int                     nrecs,
        xfs_exntfmt_t           fmt)
{
        xfs_bmbt_irec_t         irec;
        xfs_bmbt_rec_host_t     rec;
        int                     i;

        for (i = 0; i < nrecs; i++) {
                xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
                rec.l0 = get_unaligned(&ep->l0);
                rec.l1 = get_unaligned(&ep->l1);
                xfs_bmbt_get_all(&rec, &irec);
                if (fmt == XFS_EXTFMT_NOSTATE)
                        ASSERT(irec.br_state == XFS_EXT_NORM);
        }
}
#else /* DEBUG */
#define xfs_validate_extents(ifp, nrecs, fmt)
#endif /* DEBUG */

/*
 * Check that none of the inodes in the buffer have a next
 * unlinked field of 0.
 */
#if defined(DEBUG)
void
xfs_inobp_check(
        xfs_mount_t     *mp,
        xfs_buf_t       *bp)
{
        int             i;
        int             j;
        xfs_dinode_t    *dip;

        j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;

        for (i = 0; i < j; i++) {
                dip = (xfs_dinode_t *)xfs_buf_offset(bp,
                                        i * mp->m_sb.sb_inodesize);
                if (!dip->di_next_unlinked)  {
                        xfs_alert(mp,
        "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
                                bp);
                        ASSERT(dip->di_next_unlinked);
                }
        }
}
#endif

/*
 * This routine is called to map an inode to the buffer containing the on-disk
 * version of the inode.  It returns a pointer to the buffer containing the
 * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
 * pointer to the on-disk inode within that buffer.
 *
 * If a non-zero error is returned, then the contents of bpp and dipp are
 * undefined.
 */
int
xfs_imap_to_bp(
        struct xfs_mount        *mp,
        struct xfs_trans        *tp,
        struct xfs_imap         *imap,
        struct xfs_dinode       **dipp,
        struct xfs_buf          **bpp,
        uint                    buf_flags,
        uint                    iget_flags)
{
        struct xfs_buf          *bp;
        int                     error;
        int                     i;
        int                     ni;

        buf_flags |= XBF_UNMAPPED;
        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
                                   (int)imap->im_len, buf_flags, &bp);
        if (error) {
                if (error != EAGAIN) {
                        xfs_warn(mp,
                                "%s: xfs_trans_read_buf() returned error %d.",
                                __func__, error);
                } else {
                        ASSERT(buf_flags & XBF_TRYLOCK);
                }
                return error;
        }

        /*
         * Validate the magic number and version of every inode in the buffer
         * (if DEBUG kernel) or the first inode in the buffer, otherwise.
         */
#ifdef DEBUG
        ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
#else   /* usual case */
        ni = 1;
#endif

        for (i = 0; i < ni; i++) {
                int             di_ok;
                xfs_dinode_t    *dip;

                dip = (xfs_dinode_t *)xfs_buf_offset(bp,
                                        (i << mp->m_sb.sb_inodelog));
                di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
                        XFS_DINODE_GOOD_VERSION(dip->di_version);
                if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
                                                XFS_ERRTAG_ITOBP_INOTOBP,
                                                XFS_RANDOM_ITOBP_INOTOBP))) {
                        if (iget_flags & XFS_IGET_UNTRUSTED) {
                                xfs_trans_brelse(tp, bp);
                                return XFS_ERROR(EINVAL);
                        }
                        XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
                                             mp, dip);
#ifdef DEBUG
                        xfs_emerg(mp,
                                "bad inode magic/vsn daddr %lld #%d (magic=%x)",
                                (unsigned long long)imap->im_blkno, i,
                                be16_to_cpu(dip->di_magic));
                        ASSERT(0);
#endif
                        xfs_trans_brelse(tp, bp);
                        return XFS_ERROR(EFSCORRUPTED);
                }
        }

        xfs_inobp_check(mp, bp);

        *bpp = bp;
        *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
        return 0;
}
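/*
 * Illustrative calling sequence (sketch; this mirrors xfs_iread()
 * below, with error handling elided):
 *
 *      error = xfs_imap(mp, tp, ino, &imap, iget_flags);
 *      error = xfs_imap_to_bp(mp, tp, &imap, &dip, &bp, 0, iget_flags);
 *      ... use dip ...
 *      xfs_trans_brelse(tp, bp);
 */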
/*
 * Move inode type and inode format specific information from the
 * on-disk inode to the in-core inode.  For fifos, devs, and sockets
 * this means set if_rdev to the proper value.  For files, directories,
 * and symlinks this means to bring in the in-line data or extent
 * pointers.  For a file in B-tree format, only the root is immediately
 * brought in-core.  The rest will be read into if_extents when it
 * is first referenced (see xfs_iread_extents()).
 */
STATIC int
xfs_iformat(
        xfs_inode_t             *ip,
        xfs_dinode_t            *dip)
{
        xfs_attr_shortform_t    *atp;
        int                     size;
        int                     error = 0;
        xfs_fsize_t             di_size;

        if (unlikely(be32_to_cpu(dip->di_nextents) +
                     be16_to_cpu(dip->di_anextents) >
                     be64_to_cpu(dip->di_nblocks))) {
                xfs_warn(ip->i_mount,
                        "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
                        (unsigned long long)ip->i_ino,
                        (int)(be32_to_cpu(dip->di_nextents) +
                              be16_to_cpu(dip->di_anextents)),
                        (unsigned long long)
                              be64_to_cpu(dip->di_nblocks));
                XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
                                     ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }

        if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
                xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
                        (unsigned long long)ip->i_ino,
                        dip->di_forkoff);
                XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
                                     ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }

        if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
                     !ip->i_mount->m_rtdev_targp)) {
                xfs_warn(ip->i_mount,
                        "corrupt dinode %Lu, has realtime flag set.",
                        ip->i_ino);
                XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
                                     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }

        switch (ip->i_d.di_mode & S_IFMT) {
        case S_IFIFO:
        case S_IFCHR:
        case S_IFBLK:
        case S_IFSOCK:
                if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
                        XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
                                              ip->i_mount, dip);
                        return XFS_ERROR(EFSCORRUPTED);
                }
                ip->i_d.di_size = 0;
                ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
                break;

        case S_IFREG:
        case S_IFLNK:
        case S_IFDIR:
                switch (dip->di_format) {
                case XFS_DINODE_FMT_LOCAL:
                        /*
                         * no local regular files yet
                         */
                        if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
                                xfs_warn(ip->i_mount,
                        "corrupt inode %Lu (local format for regular file).",
                                        (unsigned long long) ip->i_ino);
                                XFS_CORRUPTION_ERROR("xfs_iformat(4)",
                                                     XFS_ERRLEVEL_LOW,
                                                     ip->i_mount, dip);
                                return XFS_ERROR(EFSCORRUPTED);
                        }

                        di_size = be64_to_cpu(dip->di_size);
                        if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
                                xfs_warn(ip->i_mount,
                        "corrupt inode %Lu (bad size %Ld for local inode).",
                                        (unsigned long long) ip->i_ino,
                                        (long long) di_size);
                                XFS_CORRUPTION_ERROR("xfs_iformat(5)",
                                                     XFS_ERRLEVEL_LOW,
                                                     ip->i_mount, dip);
                                return XFS_ERROR(EFSCORRUPTED);
                        }

                        size = (int)di_size;
                        error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
                        break;
                case XFS_DINODE_FMT_EXTENTS:
                        error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
                        break;
                case XFS_DINODE_FMT_BTREE:
                        error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
                        break;
                default:
                        XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
                                         ip->i_mount);
                        return XFS_ERROR(EFSCORRUPTED);
                }
                break;

        default:
                XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
                return XFS_ERROR(EFSCORRUPTED);
        }
        if (error) {
                return error;
        }
        if (!XFS_DFORK_Q(dip))
                return 0;

        ASSERT(ip->i_afp == NULL);
        ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);

        switch (dip->di_aformat) {
        case XFS_DINODE_FMT_LOCAL:
                atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
                size = be16_to_cpu(atp->hdr.totsize);

                if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
                        xfs_warn(ip->i_mount,
                                "corrupt inode %Lu (bad attr fork size %Ld).",
                                (unsigned long long) ip->i_ino,
                                (long long) size);
                        XFS_CORRUPTION_ERROR("xfs_iformat(8)",
                                             XFS_ERRLEVEL_LOW,
                                             ip->i_mount, dip);
                        return XFS_ERROR(EFSCORRUPTED);
                }

                error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
                break;
        case XFS_DINODE_FMT_EXTENTS:
                error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
                break;
        case XFS_DINODE_FMT_BTREE:
                error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
                break;
        default:
                error = XFS_ERROR(EFSCORRUPTED);
                break;
        }
        if (error) {
                kmem_zone_free(xfs_ifork_zone, ip->i_afp);
                ip->i_afp = NULL;
                xfs_idestroy_fork(ip, XFS_DATA_FORK);
        }
        return error;
}

/*
 * The file is in-lined in the on-disk inode.
 * If it fits into if_inline_data, then copy
 * it there, otherwise allocate a buffer for it
 * and copy the data there.  Either way, set
 * if_data to point at the data.
 * If we allocate a buffer for the data, make
 * sure that its size is a multiple of 4 and
 * record the real size in if_real_bytes.
 */
STATIC int
xfs_iformat_local(
        xfs_inode_t     *ip,
        xfs_dinode_t    *dip,
        int             whichfork,
        int             size)
{
        xfs_ifork_t     *ifp;
        int             real_size;

        /*
         * If the size is unreasonable, then something
         * is wrong and we just bail out rather than crash in
         * kmem_alloc() or memcpy() below.
         */
        if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
                xfs_warn(ip->i_mount,
                        "corrupt inode %Lu (bad size %d for local fork, size = %d).",
                        (unsigned long long) ip->i_ino, size,
                        XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
                XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
                                     ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }
        ifp = XFS_IFORK_PTR(ip, whichfork);
        real_size = 0;
        if (size == 0)
                ifp->if_u1.if_data = NULL;
        else if (size <= sizeof(ifp->if_u2.if_inline_data))
                ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
        else {
                real_size = roundup(size, 4);
                ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
        }
        ifp->if_bytes = size;
        ifp->if_real_bytes = real_size;
        if (size)
                memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
        ifp->if_flags &= ~XFS_IFEXTENTS;
        ifp->if_flags |= XFS_IFINLINE;
        return 0;
}
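/*
 * Worked example (illustrative numbers only): a 317-byte local fork
 * that does not fit in if_inline_data is stored in a roundup(317, 4) =
 * 320-byte allocation, so if_bytes ends up 317 and if_real_bytes 320.
 * Data small enough for if_inline_data stays inline with
 * if_real_bytes = 0.
 */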
/*
 * The file consists of a set of extents all
 * of which fit into the on-disk inode.
 * If there are few enough extents to fit into
 * the if_inline_ext, then copy them there.
 * Otherwise allocate a buffer for them and copy
 * them into it.  Either way, set if_extents
 * to point at the extents.
 */
STATIC int
xfs_iformat_extents(
        xfs_inode_t     *ip,
        xfs_dinode_t    *dip,
        int             whichfork)
{
        xfs_bmbt_rec_t  *dp;
        xfs_ifork_t     *ifp;
        int             nex;
        int             size;
        int             i;

        ifp = XFS_IFORK_PTR(ip, whichfork);
        nex = XFS_DFORK_NEXTENTS(dip, whichfork);
        size = nex * (uint)sizeof(xfs_bmbt_rec_t);

        /*
         * If the number of extents is unreasonable, then something
         * is wrong and we just bail out rather than crash in
         * kmem_alloc() or memcpy() below.
         */
        if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
                xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
                        (unsigned long long) ip->i_ino, nex);
                XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
                                     ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }

        ifp->if_real_bytes = 0;
        if (nex == 0)
                ifp->if_u1.if_extents = NULL;
        else if (nex <= XFS_INLINE_EXTS)
                ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
        else
                xfs_iext_add(ifp, 0, nex);

        ifp->if_bytes = size;
        if (size) {
                dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
                xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
                for (i = 0; i < nex; i++, dp++) {
                        xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
                        ep->l0 = get_unaligned_be64(&dp->l0);
                        ep->l1 = get_unaligned_be64(&dp->l1);
                }
                XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
                if (whichfork != XFS_DATA_FORK ||
                    XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
                        if (unlikely(xfs_check_nostate_extents(
                                     ifp, 0, nex))) {
                                XFS_ERROR_REPORT("xfs_iformat_extents(2)",
                                                 XFS_ERRLEVEL_LOW,
                                                 ip->i_mount);
                                return XFS_ERROR(EFSCORRUPTED);
                        }
        }
        ifp->if_flags |= XFS_IFEXTENTS;
        return 0;
}

/*
 * The file has too many extents to fit into
 * the inode, so they are in B-tree format.
 * Allocate a buffer for the root of the B-tree
 * and copy the root into it.  The i_extents
 * field will remain NULL until all of the
 * extents are read in (when they are needed).
 */
STATIC int
xfs_iformat_btree(
        xfs_inode_t             *ip,
        xfs_dinode_t            *dip,
        int                     whichfork)
{
        xfs_bmdr_block_t        *dfp;
        xfs_ifork_t             *ifp;
        /* REFERENCED */
        int                     nrecs;
        int                     size;

        ifp = XFS_IFORK_PTR(ip, whichfork);
        dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
        size = XFS_BMAP_BROOT_SPACE(dfp);
        nrecs = be16_to_cpu(dfp->bb_numrecs);

        /*
         * blow out if -- fork has fewer extents than can fit in
         * fork (fork shouldn't be a btree format), root btree
         * block has more records than can fit into the fork,
         * or the number of extents is greater than the number of
         * blocks.
         */
        if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
                                        XFS_IFORK_MAXEXT(ip, whichfork) ||
                     XFS_BMDR_SPACE_CALC(nrecs) >
                                        XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
                     XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
                xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
                        (unsigned long long) ip->i_ino);
                XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
                                     ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }

        ifp->if_broot_bytes = size;
        ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
        ASSERT(ifp->if_broot != NULL);
        /*
         * Copy and convert from the on-disk structure
         * to the in-memory structure.
         */
        xfs_bmdr_to_bmbt(ip->i_mount, dfp,
                         XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
                         ifp->if_broot, size);
        ifp->if_flags &= ~XFS_IFEXTENTS;
        ifp->if_flags |= XFS_IFBROOT;

        return 0;
}

STATIC void
xfs_dinode_from_disk(
        xfs_icdinode_t          *to,
        xfs_dinode_t            *from)
{
        to->di_magic = be16_to_cpu(from->di_magic);
        to->di_mode = be16_to_cpu(from->di_mode);
        to->di_version = from->di_version;
        to->di_format = from->di_format;
        to->di_onlink = be16_to_cpu(from->di_onlink);
        to->di_uid = be32_to_cpu(from->di_uid);
        to->di_gid = be32_to_cpu(from->di_gid);
        to->di_nlink = be32_to_cpu(from->di_nlink);
        to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
        to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
        to->di_flushiter = be16_to_cpu(from->di_flushiter);
        to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
        to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
        to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
        to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
        to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
        to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
        to->di_size = be64_to_cpu(from->di_size);
        to->di_nblocks = be64_to_cpu(from->di_nblocks);
        to->di_extsize = be32_to_cpu(from->di_extsize);
        to->di_nextents = be32_to_cpu(from->di_nextents);
        to->di_anextents = be16_to_cpu(from->di_anextents);
        to->di_forkoff = from->di_forkoff;
        to->di_aformat = from->di_aformat;
        to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
        to->di_dmstate = be16_to_cpu(from->di_dmstate);
        to->di_flags = be16_to_cpu(from->di_flags);
        to->di_gen = be32_to_cpu(from->di_gen);
}

void
xfs_dinode_to_disk(
        xfs_dinode_t            *to,
        xfs_icdinode_t          *from)
{
        to->di_magic = cpu_to_be16(from->di_magic);
        to->di_mode = cpu_to_be16(from->di_mode);
        to->di_version = from->di_version;
        to->di_format = from->di_format;
        to->di_onlink = cpu_to_be16(from->di_onlink);
        to->di_uid = cpu_to_be32(from->di_uid);
        to->di_gid = cpu_to_be32(from->di_gid);
        to->di_nlink = cpu_to_be32(from->di_nlink);
        to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
        to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
        to->di_flushiter = cpu_to_be16(from->di_flushiter);
        to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
        to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
        to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
        to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
        to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
        to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
        to->di_size = cpu_to_be64(from->di_size);
        to->di_nblocks = cpu_to_be64(from->di_nblocks);
        to->di_extsize = cpu_to_be32(from->di_extsize);
        to->di_nextents = cpu_to_be32(from->di_nextents);
        to->di_anextents = cpu_to_be16(from->di_anextents);
        to->di_forkoff = from->di_forkoff;
        to->di_aformat = from->di_aformat;
        to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
        to->di_dmstate = cpu_to_be16(from->di_dmstate);
        to->di_flags = cpu_to_be16(from->di_flags);
        to->di_gen = cpu_to_be32(from->di_gen);
}
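/*
 * Illustrative round trip between the two converters above (sketch;
 * the read path below uses the first form, the inode flush path the
 * second):
 *
 *      xfs_dinode_from_disk(&ip->i_d, dip);    (on-disk -> in-core)
 *      ... work with ip->i_d ...
 *      xfs_dinode_to_disk(dip, &ip->i_d);      (in-core -> on-disk)
 */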
STATIC uint
_xfs_dic2xflags(
        __uint16_t              di_flags)
{
        uint                    flags = 0;

        if (di_flags & XFS_DIFLAG_ANY) {
                if (di_flags & XFS_DIFLAG_REALTIME)
                        flags |= XFS_XFLAG_REALTIME;
                if (di_flags & XFS_DIFLAG_PREALLOC)
                        flags |= XFS_XFLAG_PREALLOC;
                if (di_flags & XFS_DIFLAG_IMMUTABLE)
                        flags |= XFS_XFLAG_IMMUTABLE;
                if (di_flags & XFS_DIFLAG_APPEND)
                        flags |= XFS_XFLAG_APPEND;
                if (di_flags & XFS_DIFLAG_SYNC)
                        flags |= XFS_XFLAG_SYNC;
                if (di_flags & XFS_DIFLAG_NOATIME)
                        flags |= XFS_XFLAG_NOATIME;
                if (di_flags & XFS_DIFLAG_NODUMP)
                        flags |= XFS_XFLAG_NODUMP;
                if (di_flags & XFS_DIFLAG_RTINHERIT)
                        flags |= XFS_XFLAG_RTINHERIT;
                if (di_flags & XFS_DIFLAG_PROJINHERIT)
                        flags |= XFS_XFLAG_PROJINHERIT;
                if (di_flags & XFS_DIFLAG_NOSYMLINKS)
                        flags |= XFS_XFLAG_NOSYMLINKS;
                if (di_flags & XFS_DIFLAG_EXTSIZE)
                        flags |= XFS_XFLAG_EXTSIZE;
                if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
                        flags |= XFS_XFLAG_EXTSZINHERIT;
                if (di_flags & XFS_DIFLAG_NODEFRAG)
                        flags |= XFS_XFLAG_NODEFRAG;
                if (di_flags & XFS_DIFLAG_FILESTREAM)
                        flags |= XFS_XFLAG_FILESTREAM;
        }

        return flags;
}

uint
xfs_ip2xflags(
        xfs_inode_t             *ip)
{
        xfs_icdinode_t          *dic = &ip->i_d;

        return _xfs_dic2xflags(dic->di_flags) |
                                (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
}

uint
xfs_dic2xflags(
        xfs_dinode_t            *dip)
{
        return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
                                (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
}
/*
 * Read the disk inode attributes into the in-core inode structure.
 */
int
xfs_iread(
        xfs_mount_t     *mp,
        xfs_trans_t     *tp,
        xfs_inode_t     *ip,
        uint            iget_flags)
{
        xfs_buf_t       *bp;
        xfs_dinode_t    *dip;
        int             error;

        /*
         * Fill in the location information in the in-core inode.
         */
        error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
        if (error)
                return error;

        /*
         * Get pointers to the on-disk inode and the buffer containing it.
         */
        error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
        if (error)
                return error;

        /*
         * If we got something that isn't an inode it means someone
         * (nfs or dmi) has a stale handle.
         */
        if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) {
#ifdef DEBUG
                xfs_alert(mp,
                        "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
                        __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
#endif /* DEBUG */
                error = XFS_ERROR(EINVAL);
                goto out_brelse;
        }

        /*
         * If the on-disk inode is already linked to a directory
         * entry, copy all of the inode into the in-core inode.
         * xfs_iformat() handles copying in the inode format
         * specific information.
         * Otherwise, just get the truly permanent information.
         */
        if (dip->di_mode) {
                xfs_dinode_from_disk(&ip->i_d, dip);
                error = xfs_iformat(ip, dip);
                if (error)  {
#ifdef DEBUG
                        xfs_alert(mp, "%s: xfs_iformat() returned error %d",
                                __func__, error);
#endif /* DEBUG */
                        goto out_brelse;
                }
        } else {
                ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
                ip->i_d.di_version = dip->di_version;
                ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
                ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
                /*
                 * Make sure to pull in the mode here as well in
                 * case the inode is released without being used.
                 * This ensures that xfs_inactive() will see that
                 * the inode is already free and not try to mess
                 * with the uninitialized part of it.
                 */
                ip->i_d.di_mode = 0;
        }

        /*
         * The inode format changed when we moved the link count and
         * made it 32 bits long.  If this is an old format inode,
         * convert it in memory to look like a new one.  If it gets
         * flushed to disk we will convert back before flushing or
         * logging it.  We zero out the new projid field and the old link
         * count field.  We'll handle clearing the pad field (the remains
         * of the old uuid field) when we actually convert the inode to
         * the new format.  We don't change the version number so that we
         * can distinguish this from a real new format inode.
         */
        if (ip->i_d.di_version == 1) {
                ip->i_d.di_nlink = ip->i_d.di_onlink;
                ip->i_d.di_onlink = 0;
                xfs_set_projid(ip, 0);
        }

        ip->i_delayed_blks = 0;

        /*
         * Mark the buffer containing the inode as something to keep
         * around for a while.  This helps to keep recently accessed
         * meta-data in-core longer.
         */
        xfs_buf_set_ref(bp, XFS_INO_REF);

        /*
         * Use xfs_trans_brelse() to release the buffer containing the
         * on-disk inode, because it was acquired with xfs_trans_read_buf()
         * in xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
         * brelse().  If we're within a transaction, then xfs_trans_brelse()
         * will only release the buffer if it is not dirty within the
         * transaction.  It will be OK to release the buffer in this case,
         * because inodes on disk are never destroyed and we will be
         * locking the new in-core inode before putting it in the hash
         * table where other processes can find it.  Thus we don't have
         * to worry about the inode being changed just because we released
         * the buffer.
         */
 out_brelse:
        xfs_trans_brelse(tp, bp);
        return error;
}

/*
 * Read in extents from a btree-format inode.
 * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
 */
int
xfs_iread_extents(
        xfs_trans_t     *tp,
        xfs_inode_t     *ip,
        int             whichfork)
{
        int             error;
        xfs_ifork_t     *ifp;
        xfs_extnum_t    nextents;

        if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
                XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
                                 ip->i_mount);
                return XFS_ERROR(EFSCORRUPTED);
        }
        nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
        ifp = XFS_IFORK_PTR(ip, whichfork);

        /*
         * We know that the size is valid (it's checked in iformat_btree)
         */
        ifp->if_bytes = ifp->if_real_bytes = 0;
        ifp->if_flags |= XFS_IFEXTENTS;
        xfs_iext_add(ifp, 0, nextents);
        error = xfs_bmap_read_extents(tp, ip, whichfork);
        if (error) {
                xfs_iext_destroy(ifp);
                ifp->if_flags &= ~XFS_IFEXTENTS;
                return error;
        }
        xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
        return 0;
}
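/*
 * Illustrative caller pattern (sketch of how the bmap code drives
 * xfs_iread_extents(): read the extents on first reference):
 *
 *      if (!(ifp->if_flags & XFS_IFEXTENTS)) {
 *              error = xfs_iread_extents(tp, ip, whichfork);
 *              if (error)
 *                      return error;
 *      }
 */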
/*
 * Allocate an inode on disk and return a copy of its in-core version.
 * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
 * appropriately within the inode.  The uid and gid for the inode are
 * set according to the contents of the given cred structure.
 *
 * Use xfs_dialloc() to allocate the on-disk inode.  If xfs_dialloc()
 * has a free inode available, call xfs_iget()
 * to obtain the in-core version of the allocated inode.  Finally,
 * fill in the inode and log its initial contents.  In this case,
 * ialloc_context would be set to NULL.
 *
 * If xfs_dialloc() does not have an available inode,
 * it will replenish its supply by doing an allocation.  Since we can
 * only do one allocation within a transaction without deadlocks, we
 * must commit the current transaction before returning the inode itself.
 * In this case, therefore, we will return with *ialloc_context set.
 * The caller should then commit the current transaction, start a new
 * transaction, and call xfs_ialloc() again to actually get the inode.
 *
 * To ensure that some other process does not grab the inode that
 * was allocated during the first call to xfs_ialloc(), this routine
 * also returns the [locked] bp pointing to the head of the freelist
 * as ialloc_context.  The caller should hold this buffer across
 * the commit and pass it back into this routine on the second call.
 *
 * If we are allocating quota inodes, we do not have a parent inode
 * to attach to or associate with (i.e. pip == NULL) because they
 * are not linked into the directory structure - they are attached
 * directly to the superblock - and so have no parent.
 */
int
xfs_ialloc(
        xfs_trans_t     *tp,
        xfs_inode_t     *pip,
        umode_t         mode,
        xfs_nlink_t     nlink,
        xfs_dev_t       rdev,
        prid_t          prid,
        int             okalloc,
        xfs_buf_t       **ialloc_context,
        xfs_inode_t     **ipp)
{
        xfs_ino_t       ino;
        xfs_inode_t     *ip;
        uint            flags;
        int             error;
        timespec_t      tv;
        int             filestreams = 0;

        /*
         * Call the space management code to pick
         * the on-disk inode to be allocated.
         */
        error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
                            ialloc_context, &ino);
        if (error)
                return error;
        if (*ialloc_context || ino == NULLFSINO) {
                *ipp = NULL;
                return 0;
        }
        ASSERT(*ialloc_context == NULL);

        /*
         * Get the in-core inode with the lock held exclusively.
         * This is because we're setting fields here we need
         * to prevent others from looking at until we're done.
         */
        error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
                         XFS_ILOCK_EXCL, &ip);
        if (error)
                return error;
        ASSERT(ip != NULL);

        ip->i_d.di_mode = mode;
        ip->i_d.di_onlink = 0;
        ip->i_d.di_nlink = nlink;
        ASSERT(ip->i_d.di_nlink == nlink);
        ip->i_d.di_uid = current_fsuid();
        ip->i_d.di_gid = current_fsgid();
        xfs_set_projid(ip, prid);
        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));

        /*
         * If the superblock version is up to where we support new format
         * inodes and this is currently an old format inode, then change
         * the inode version number now.  This way we only do the conversion
         * here rather than here and in the flush/logging code.
         */
        if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
            ip->i_d.di_version == 1) {
                ip->i_d.di_version = 2;
                /*
                 * We've already zeroed the old link count, the projid field,
                 * and the pad field.
                 */
        }

        /*
         * Project ids won't be stored on disk if we are using a version 1 inode.
         */
        if ((prid != 0) && (ip->i_d.di_version == 1))
                xfs_bump_ino_vers2(tp, ip);

        if (pip && XFS_INHERIT_GID(pip)) {
                ip->i_d.di_gid = pip->i_d.di_gid;
                if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
                        ip->i_d.di_mode |= S_ISGID;
                }
        }

        /*
         * If the group ID of the new file does not match the effective group
         * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
         * (and only if the irix_sgid_inherit compatibility variable is set).
         */
        if ((irix_sgid_inherit) &&
            (ip->i_d.di_mode & S_ISGID) &&
            (!in_group_p((gid_t)ip->i_d.di_gid))) {
                ip->i_d.di_mode &= ~S_ISGID;
        }

        ip->i_d.di_size = 0;
        ip->i_d.di_nextents = 0;
        ASSERT(ip->i_d.di_nblocks == 0);

        nanotime(&tv);
        ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
        ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
        ip->i_d.di_atime = ip->i_d.di_mtime;
        ip->i_d.di_ctime = ip->i_d.di_mtime;

        /*
         * di_gen will have been taken care of in xfs_iread.
         */
        ip->i_d.di_extsize = 0;
        ip->i_d.di_dmevmask = 0;
        ip->i_d.di_dmstate = 0;
        ip->i_d.di_flags = 0;
        flags = XFS_ILOG_CORE;
        switch (mode & S_IFMT) {
        case S_IFIFO:
        case S_IFCHR:
        case S_IFBLK:
        case S_IFSOCK:
                ip->i_d.di_format = XFS_DINODE_FMT_DEV;
                ip->i_df.if_u2.if_rdev = rdev;
                ip->i_df.if_flags = 0;
                flags |= XFS_ILOG_DEV;
                break;
        case S_IFREG:
                /*
                 * we can't set up filestreams until after the VFS inode
                 * is set up properly.
                 */
                if (pip && xfs_inode_is_filestream(pip))
                        filestreams = 1;
                /* fall through */
        case S_IFDIR:
                if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
                        uint    di_flags = 0;

                        if (S_ISDIR(mode)) {
                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
                                        di_flags |= XFS_DIFLAG_RTINHERIT;
                                if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
                                        di_flags |= XFS_DIFLAG_EXTSZINHERIT;
                                        ip->i_d.di_extsize = pip->i_d.di_extsize;
                                }
                        } else if (S_ISREG(mode)) {
                                if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
                                        di_flags |= XFS_DIFLAG_REALTIME;
                                if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
                                        di_flags |= XFS_DIFLAG_EXTSIZE;
                                        ip->i_d.di_extsize = pip->i_d.di_extsize;
                                }
                        }
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
                            xfs_inherit_noatime)
                                di_flags |= XFS_DIFLAG_NOATIME;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
                            xfs_inherit_nodump)
                                di_flags |= XFS_DIFLAG_NODUMP;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
                            xfs_inherit_sync)
                                di_flags |= XFS_DIFLAG_SYNC;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
                            xfs_inherit_nosymlinks)
                                di_flags |= XFS_DIFLAG_NOSYMLINKS;
                        if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
                                di_flags |= XFS_DIFLAG_PROJINHERIT;
                        if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
                            xfs_inherit_nodefrag)
                                di_flags |= XFS_DIFLAG_NODEFRAG;
                        if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
                                di_flags |= XFS_DIFLAG_FILESTREAM;
                        ip->i_d.di_flags |= di_flags;
                }
                /* FALLTHROUGH */
        case S_IFLNK:
                ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
                ip->i_df.if_flags = XFS_IFEXTENTS;
                ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
                ip->i_df.if_u1.if_extents = NULL;
                break;
        default:
                ASSERT(0);
        }
        /*
         * Attribute fork settings for new inode.
         */
        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
        ip->i_d.di_anextents = 0;

        /*
         * Log the new values stuffed into the inode.
         */
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_log_inode(tp, ip, flags);

        /* now that we have an i_mode we can setup inode ops and unlock */
        xfs_setup_inode(ip);

        /* now we have set up the vfs inode we can associate the filestream */
        if (filestreams) {
                error = xfs_filestream_associate(pip, ip);
                if (error < 0)
                        return -error;
                if (!error)
                        xfs_iflags_set(ip, XFS_IFILESTREAM);
        }

        *ipp = ip;
        return 0;
}
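/*
 * Illustrative two-phase usage per the comment above (sketch only;
 * transaction commit/restart details elided):
 *
 *      error = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
 *                         &ialloc_context, &ip);
 *      if (!ip && ialloc_context) {
 *              ... commit tp while holding ialloc_context,
 *                  start a new transaction ...
 *              error = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
 *                                 okalloc, &ialloc_context, &ip);
 *      }
 */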
/*
 * Free up the underlying blocks past new_size.  The new size must be smaller
 * than the current size.  This routine can be used both for the attribute and
 * data fork, and does not modify the inode size, which is left to the caller.
 *
 * The transaction passed to this routine must have made a permanent log
 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
 * given transaction and start new ones, so make sure everything involved in
 * the transaction is tidy before calling here.  A transaction will be
 * returned to the caller to be committed.  The incoming transaction must
 * already include the inode, and both inode locks must be held exclusively.
 * The inode must also be "held" within the transaction.  On return the inode
 * will be "held" within the returned transaction.  This routine does NOT
 * require any disk space to be reserved for it within the transaction.
 *
 * If we get an error, we must return with the inode locked and linked into the
 * current transaction.  This keeps things simple for the higher level code,
 * because it always knows that the inode is locked and held in the transaction
 * that returns to it whether errors occur or not.  We don't mark the inode
 * dirty on error so that transactions can be easily aborted if possible.
 */
int
xfs_itruncate_extents(
        struct xfs_trans        **tpp,
        struct xfs_inode        *ip,
        int                     whichfork,
        xfs_fsize_t             new_size)
{
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp = *tpp;
        struct xfs_trans        *ntp;
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
        xfs_fileoff_t           first_unmap_block;
        xfs_fileoff_t           last_block;
        xfs_filblks_t           unmap_len;
        int                     committed;
        int                     error = 0;
        int                     done = 0;

        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
        ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
               xfs_isilocked(ip, XFS_IOLOCK_EXCL));
        ASSERT(new_size <= XFS_ISIZE(ip));
        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
        ASSERT(ip->i_itemp != NULL);
        ASSERT(ip->i_itemp->ili_lock_flags == 0);
        ASSERT(!XFS_NOT_DQATTACHED(mp, ip));

        trace_xfs_itruncate_extents_start(ip, new_size);

        /*
         * Since it is possible for space to become allocated beyond
         * the end of the file (in a crash where the space is allocated
         * but the inode size is not yet updated), simply remove any
         * blocks which show up between the new EOF and the maximum
         * possible file size.  If the first block to be removed is
         * beyond the maximum file size (ie it is the same as last_block),
         * then there is nothing to do.
         */
        first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
        last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
        if (first_unmap_block == last_block)
                return 0;

        ASSERT(first_unmap_block < last_block);
        unmap_len = last_block - first_unmap_block + 1;
        while (!done) {
                xfs_bmap_init(&free_list, &first_block);
                error = xfs_bunmapi(tp, ip,
                                    first_unmap_block, unmap_len,
                                    xfs_bmapi_aflag(whichfork),
                                    XFS_ITRUNC_MAX_EXTENTS,
                                    &first_block, &free_list,
                                    &done);
                if (error)
                        goto out_bmap_cancel;

                /*
                 * Duplicate the transaction that has the permanent
                 * reservation and commit the old transaction.
                 */
                error = xfs_bmap_finish(&tp, &free_list, &committed);
                if (committed)
                        xfs_trans_ijoin(tp, ip, 0);
                if (error)
                        goto out_bmap_cancel;

                if (committed) {
                        /*
                         * Mark the inode dirty so it will be logged and
                         * moved forward in the log as part of every commit.
                         */
                        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
                }

                ntp = xfs_trans_dup(tp);
                error = xfs_trans_commit(tp, 0);
                tp = ntp;

                xfs_trans_ijoin(tp, ip, 0);

                if (error)
                        goto out;

                /*
                 * Transaction commit worked ok so we can drop the extra ticket
                 * reference that we gained in xfs_trans_dup()
                 */
                xfs_log_ticket_put(tp->t_ticket);
                error = xfs_trans_reserve(tp, 0,
                                          XFS_ITRUNCATE_LOG_RES(mp), 0,
                                          XFS_TRANS_PERM_LOG_RES,
                                          XFS_ITRUNCATE_LOG_COUNT);
                if (error)
                        goto out;
        }

        /*
         * Always re-log the inode so that our permanent transaction can keep
         * on rolling it forward in the log.
         */
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

        trace_xfs_itruncate_extents_end(ip, new_size);

 out:
        *tpp = tp;
        return error;
 out_bmap_cancel:
        /*
         * If the bunmapi call encounters an error, return to the caller where
         * the transaction can be properly aborted.  We just need to make sure
         * we're not holding any resources that we were not when we came in.
         */
        xfs_bmap_cancel(&free_list);
        goto out;
}
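/*
 * Illustrative caller (sketch): inode joined to a permanent
 * transaction with both locks held, and the *returned* transaction is
 * the one that must be committed or cancelled afterwards.
 *
 *      xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 *      xfs_trans_ijoin(tp, ip, 0);
 *      error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, new_size);
 *      ... commit or cancel tp ...
 */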
/*
 * This is called when the inode's link count goes to 0.
 * We place the on-disk inode on a list in the AGI.  It
 * will be pulled from this list when the inode is freed.
 */
int
xfs_iunlink(
        xfs_trans_t     *tp,
        xfs_inode_t     *ip)
{
        xfs_mount_t     *mp;
        xfs_agi_t       *agi;
        xfs_dinode_t    *dip;
        xfs_buf_t       *agibp;
        xfs_buf_t       *ibp;
        xfs_agino_t     agino;
        short           bucket_index;
        int             offset;
        int             error;

        ASSERT(ip->i_d.di_nlink == 0);
        ASSERT(ip->i_d.di_mode != 0);

        mp = tp->t_mountp;

        /*
         * Get the agi buffer first.  It ensures lock ordering
         * on the list.
         */
        error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
        if (error)
                return error;
        agi = XFS_BUF_TO_AGI(agibp);

        /*
         * Get the index into the agi hash table for the
         * list this inode will go on.
         */
        agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
        ASSERT(agino != 0);
        bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
        ASSERT(agi->agi_unlinked[bucket_index]);
        ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);

        if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
                /*
                 * There is already another inode in the bucket we need
                 * to add ourselves to.  Add us at the front of the list.
                 * Here we put the head pointer into our next pointer,
                 * and then we fall through to point the head at us.
                 */
                error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
                                       0, 0);
                if (error)
                        return error;

                ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
                dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
                offset = ip->i_imap.im_boffset +
                        offsetof(xfs_dinode_t, di_next_unlinked);
                xfs_trans_inode_buf(tp, ibp);
                xfs_trans_log_buf(tp, ibp, offset,
                                  (offset + sizeof(xfs_agino_t) - 1));
                xfs_inobp_check(mp, ibp);
        }

        /*
         * Point the bucket head pointer at the inode being inserted.
         */
        ASSERT(agino != 0);
        agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
        offset = offsetof(xfs_agi_t, agi_unlinked) +
                (sizeof(xfs_agino_t) * bucket_index);
        xfs_trans_log_buf(tp, agibp, offset,
                          (offset + sizeof(xfs_agino_t) - 1));
        return 0;
}

/*
 * Pull the on-disk inode from the AGI unlinked list.
 */
STATIC int
xfs_iunlink_remove(
        xfs_trans_t     *tp,
        xfs_inode_t     *ip)
{
        xfs_ino_t       next_ino;
        xfs_mount_t     *mp;
        xfs_agi_t       *agi;
        xfs_dinode_t    *dip;
        xfs_buf_t       *agibp;
        xfs_buf_t       *ibp;
        xfs_agnumber_t  agno;
        xfs_agino_t     agino;
        xfs_agino_t     next_agino;
        xfs_buf_t       *last_ibp;
        xfs_dinode_t    *last_dip = NULL;
        short           bucket_index;
        int             offset, last_offset = 0;
        int             error;

        mp = tp->t_mountp;
        agno = XFS_INO_TO_AGNO(mp, ip->i_ino);

        /*
         * Get the agi buffer first.  It ensures lock ordering
         * on the list.
         */
        error = xfs_read_agi(mp, tp, agno, &agibp);
        if (error)
                return error;

        agi = XFS_BUF_TO_AGI(agibp);

        /*
         * Get the index into the agi hash table for the
         * list this inode will go on.
         */
        agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
        ASSERT(agino != 0);
        bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
        ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
        ASSERT(agi->agi_unlinked[bucket_index]);

        if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
                /*
                 * We're at the head of the list.  Get the inode's on-disk
                 * buffer to see if there is anyone after us on the list.
                 * Only modify our next pointer if it is not already NULLAGINO.
                 * This saves us the overhead of dealing with the buffer when
                 * there is no need to change it.
                 */
                error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
                                       0, 0);
                if (error) {
                        xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
                                __func__, error);
                        return error;
                }
                next_agino = be32_to_cpu(dip->di_next_unlinked);
                ASSERT(next_agino != 0);
                if (next_agino != NULLAGINO) {
                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
                        offset = ip->i_imap.im_boffset +
                                offsetof(xfs_dinode_t, di_next_unlinked);
                        xfs_trans_inode_buf(tp, ibp);
                        xfs_trans_log_buf(tp, ibp, offset,
                                          (offset + sizeof(xfs_agino_t) - 1));
                        xfs_inobp_check(mp, ibp);
                } else {
                        xfs_trans_brelse(tp, ibp);
                }
                /*
                 * Point the bucket head pointer at the next inode.
                 */
                ASSERT(next_agino != 0);
                ASSERT(next_agino != agino);
                agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
                offset = offsetof(xfs_agi_t, agi_unlinked) +
                        (sizeof(xfs_agino_t) * bucket_index);
                xfs_trans_log_buf(tp, agibp, offset,
                                  (offset + sizeof(xfs_agino_t) - 1));
        } else {
                /*
                 * We need to search the list for the inode being freed.
                 */
                next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
                last_ibp = NULL;
                while (next_agino != agino) {
                        struct xfs_imap imap;

                        if (last_ibp)
                                xfs_trans_brelse(tp, last_ibp);

                        imap.im_blkno = 0;
                        next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);

                        error = xfs_imap(mp, tp, next_ino, &imap, 0);
                        if (error) {
                                xfs_warn(mp,
                                        "%s: xfs_imap returned error %d.",
                                        __func__, error);
                                return error;
                        }

                        error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
                                               &last_ibp, 0, 0);
                        if (error) {
                                xfs_warn(mp,
                                        "%s: xfs_imap_to_bp returned error %d.",
                                        __func__, error);
                                return error;
                        }

                        last_offset = imap.im_boffset;
                        next_agino = be32_to_cpu(last_dip->di_next_unlinked);
                        ASSERT(next_agino != NULLAGINO);
                        ASSERT(next_agino != 0);
                }

                /*
                 * Now last_ibp points to the buffer previous to us on the
                 * unlinked list.  Pull us from the list.
                 */
                error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
                                       0, 0);
                if (error) {
                        xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
                                __func__, error);
                        return error;
                }
                next_agino = be32_to_cpu(dip->di_next_unlinked);
                ASSERT(next_agino != 0);
                ASSERT(next_agino != agino);
                if (next_agino != NULLAGINO) {
                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
                        offset = ip->i_imap.im_boffset +
                                offsetof(xfs_dinode_t, di_next_unlinked);
                        xfs_trans_inode_buf(tp, ibp);
                        xfs_trans_log_buf(tp, ibp, offset,
                                          (offset + sizeof(xfs_agino_t) - 1));
                        xfs_inobp_check(mp, ibp);
                } else {
                        xfs_trans_brelse(tp, ibp);
                }
                /*
                 * Point the previous inode on the list to the next inode.
                 */
                last_dip->di_next_unlinked = cpu_to_be32(next_agino);
                ASSERT(next_agino != 0);
                offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
                xfs_trans_inode_buf(tp, last_ibp);
                xfs_trans_log_buf(tp, last_ibp, offset,
                                  (offset + sizeof(xfs_agino_t) - 1));
                xfs_inobp_check(mp, last_ibp);
        }
        return 0;
}

/*
 * A big issue when freeing the inode cluster is that we _cannot_ skip any
 * inodes that are in memory - they all must be marked stale and attached to
 * the cluster buffer.
 */
STATIC int
xfs_ifree_cluster(
        xfs_inode_t     *free_ip,
        xfs_trans_t     *tp,
        xfs_ino_t       inum)
{
        xfs_mount_t             *mp = free_ip->i_mount;
        int                     blks_per_cluster;
        int                     nbufs;
        int                     ninodes;
        int                     i, j;
        xfs_daddr_t             blkno;
        xfs_buf_t               *bp;
        xfs_inode_t             *ip;
        xfs_inode_log_item_t    *iip;
        xfs_log_item_t          *lip;
        struct xfs_perag        *pag;

        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
        if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
                blks_per_cluster = 1;
                ninodes = mp->m_sb.sb_inopblock;
                nbufs = XFS_IALLOC_BLOCKS(mp);
        } else {
                blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
                                        mp->m_sb.sb_blocksize;
                ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
                nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
        }

        for (j = 0; j < nbufs; j++, inum += ninodes) {
                blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
                                         XFS_INO_TO_AGBNO(mp, inum));

                /*
                 * We obtain and lock the backing buffer first in the process
                 * here, as we have to ensure that any dirty inode that we
                 * can't get the flush lock on is attached to the buffer.
                 * If we scan the in-memory inodes first, then buffer IO can
                 * complete before we get a lock on it, and hence we may fail
                 * to mark all the active inodes on the buffer stale.
                 */
                bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
                                        mp->m_bsize * blks_per_cluster, 0);

                if (!bp)
                        return ENOMEM;
                /*
                 * Walk the inodes already attached to the buffer and mark them
                 * stale.  These will all have the flush locks held, so an
                 * in-memory inode walk can't lock them.  By marking them all
                 * stale first, we will not attempt to lock them in the loop
                 * below as the XFS_ISTALE flag will be set.
                 */
                lip = bp->b_fspriv;
                while (lip) {
                        if (lip->li_type == XFS_LI_INODE) {
                                iip = (xfs_inode_log_item_t *)lip;
                                ASSERT(iip->ili_logged == 1);
                                lip->li_cb = xfs_istale_done;
                                xfs_trans_ail_copy_lsn(mp->m_ail,
                                                        &iip->ili_flush_lsn,
                                                        &iip->ili_item.li_lsn);
                                xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
                        }
                        lip = lip->li_bio_list;
                }


                /*
                 * For each inode in memory attempt to add it to the inode
                 * buffer and set it up for being staled on buffer IO
                 * completion.  This is safe as we've locked out tail pushing
                 * and flushing by locking the buffer.
                 *
                 * We have already marked every inode that was part of a
                 * transaction stale above, which means there is no point in
                 * even trying to lock them.
                 */
                for (i = 0; i < ninodes; i++) {
retry:
                        rcu_read_lock();
                        ip = radix_tree_lookup(&pag->pag_ici_root,
                                        XFS_INO_TO_AGINO(mp, (inum + i)));

                        /* Inode not in memory, nothing to do */
                        if (!ip) {
                                rcu_read_unlock();
                                continue;
                        }

                        /*
                         * because this is an RCU protected lookup, we could
                         * find a recently freed or even reallocated inode
                         * during the lookup.  We need to check under the
                         * i_flags_lock for a valid inode here.  Skip it if it
                         * is not valid, the wrong inode or stale.
                         */
1816 */ 1817 spin_lock(&ip->i_flags_lock); 1818 if (ip->i_ino != inum + i || 1819 __xfs_iflags_test(ip, XFS_ISTALE)) { 1820 spin_unlock(&ip->i_flags_lock); 1821 rcu_read_unlock(); 1822 continue; 1823 } 1824 spin_unlock(&ip->i_flags_lock); 1825 1826 /* 1827 * Don't try to lock/unlock the current inode, but we 1828 * _cannot_ skip the other inodes that we did not find 1829 * in the list attached to the buffer and are not 1830 * already marked stale. If we can't lock it, back off 1831 * and retry. 1832 */ 1833 if (ip != free_ip && 1834 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 1835 rcu_read_unlock(); 1836 delay(1); 1837 goto retry; 1838 } 1839 rcu_read_unlock(); 1840 1841 xfs_iflock(ip); 1842 xfs_iflags_set(ip, XFS_ISTALE); 1843 1844 /* 1845 * we don't need to attach clean inodes or those only 1846 * with unlogged changes (which we throw away, anyway). 1847 */ 1848 iip = ip->i_itemp; 1849 if (!iip || xfs_inode_clean(ip)) { 1850 ASSERT(ip != free_ip); 1851 xfs_ifunlock(ip); 1852 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1853 continue; 1854 } 1855 1856 iip->ili_last_fields = iip->ili_fields; 1857 iip->ili_fields = 0; 1858 iip->ili_logged = 1; 1859 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 1860 &iip->ili_item.li_lsn); 1861 1862 xfs_buf_attach_iodone(bp, xfs_istale_done, 1863 &iip->ili_item); 1864 1865 if (ip != free_ip) 1866 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1867 } 1868 1869 xfs_trans_stale_inode_buf(tp, bp); 1870 xfs_trans_binval(tp, bp); 1871 } 1872 1873 xfs_perag_put(pag); 1874 return 0; 1875 } 1876 1877 /* 1878 * This is called to return an inode to the inode free list. 1879 * The inode should already be truncated to 0 length and have 1880 * no pages associated with it. This routine also assumes that 1881 * the inode is already a part of the transaction. 1882 * 1883 * The on-disk copy of the inode will have been added to the list 1884 * of unlinked inodes in the AGI. We need to remove the inode from 1885 * that list atomically with respect to freeing it here. 1886 */ 1887 int 1888 xfs_ifree( 1889 xfs_trans_t *tp, 1890 xfs_inode_t *ip, 1891 xfs_bmap_free_t *flist) 1892 { 1893 int error; 1894 int delete; 1895 xfs_ino_t first_ino; 1896 xfs_dinode_t *dip; 1897 xfs_buf_t *ibp; 1898 1899 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1900 ASSERT(ip->i_d.di_nlink == 0); 1901 ASSERT(ip->i_d.di_nextents == 0); 1902 ASSERT(ip->i_d.di_anextents == 0); 1903 ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); 1904 ASSERT(ip->i_d.di_nblocks == 0); 1905 1906 /* 1907 * Pull the on-disk inode from the AGI unlinked list. 1908 */ 1909 error = xfs_iunlink_remove(tp, ip); 1910 if (error != 0) { 1911 return error; 1912 } 1913 1914 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); 1915 if (error != 0) { 1916 return error; 1917 } 1918 ip->i_d.di_mode = 0; /* mark incore inode as free */ 1919 ip->i_d.di_flags = 0; 1920 ip->i_d.di_dmevmask = 0; 1921 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ 1922 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; 1923 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; 1924 /* 1925 * Bump the generation count so no one will be confused 1926 * by reincarnations of this inode. 1927 */ 1928 ip->i_d.di_gen++; 1929 1930 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1931 1932 error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp, 1933 0, 0); 1934 if (error) 1935 return error; 1936 1937 /* 1938 * Clear the on-disk di_mode. 
 * This is to prevent xfs_bulkstat
 * from picking up this inode when it is reclaimed (its incore state
 * initialized but not flushed to disk yet). The in-core di_mode is
 * already cleared and a corresponding transaction logged.
 * The hack here just synchronizes the in-core to on-disk
 * di_mode value in advance before the actual inode sync to disk.
 * This is OK because the inode is already unlinked and would never
 * change its di_mode again for this inode generation.
 * This is a temporary hack that will need a proper fix
 * in the future.
 */
    dip->di_mode = 0;

    if (delete) {
        error = xfs_ifree_cluster(ip, tp, first_ino);
    }

    return error;
}

/*
 * Reallocate the space for if_broot based on the number of records
 * being added or deleted as indicated in rec_diff. Move the records
 * and pointers in if_broot to fit the new size. When shrinking this
 * will eliminate holes between the records and pointers created by
 * the caller. When growing this will create holes to be filled in
 * by the caller.
 *
 * The caller must not request to add more records than would fit in
 * the on-disk inode root. If if_broot is currently NULL, then if we are
 * adding records, one will be allocated. The caller must also not
 * request that the number of records go below zero, although
 * it can go to zero.
 *
 * ip -- the inode whose if_broot area is changing
 * rec_diff -- the change in the number of records, positive or negative,
 *     requested for the if_broot array.
 */
void
xfs_iroot_realloc(
    xfs_inode_t             *ip,
    int                     rec_diff,
    int                     whichfork)
{
    struct xfs_mount        *mp = ip->i_mount;
    int                     cur_max;
    xfs_ifork_t             *ifp;
    struct xfs_btree_block  *new_broot;
    int                     new_max;
    size_t                  new_size;
    char                    *np;
    char                    *op;

    /*
     * Handle the degenerate case quietly.
     */
    if (rec_diff == 0) {
        return;
    }

    ifp = XFS_IFORK_PTR(ip, whichfork);
    if (rec_diff > 0) {
        /*
         * If there wasn't any memory allocated before, just
         * allocate it now and get out.
         */
        if (ifp->if_broot_bytes == 0) {
            new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
            ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
            ifp->if_broot_bytes = (int)new_size;
            return;
        }

        /*
         * If there is already an existing if_broot, then we need
         * to realloc() it and shift the pointers to their new
         * location. The records don't change location because
         * they are kept butted up against the btree block header.
         */
        cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
        new_max = cur_max + rec_diff;
        new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
        ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
                (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
                KM_SLEEP | KM_NOFS);
        op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
                             ifp->if_broot_bytes);
        np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
                             (int)new_size);
        ifp->if_broot_bytes = (int)new_size;
        ASSERT(ifp->if_broot_bytes <=
            XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
        memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
        return;
    }

    /*
     * rec_diff is less than 0. In this case, we are shrinking the
     * if_broot buffer. It must already exist.
If we go to zero 2037 * records, just get rid of the root and clear the status bit. 2038 */ 2039 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); 2040 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); 2041 new_max = cur_max + rec_diff; 2042 ASSERT(new_max >= 0); 2043 if (new_max > 0) 2044 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 2045 else 2046 new_size = 0; 2047 if (new_size > 0) { 2048 new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); 2049 /* 2050 * First copy over the btree block header. 2051 */ 2052 memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN); 2053 } else { 2054 new_broot = NULL; 2055 ifp->if_flags &= ~XFS_IFBROOT; 2056 } 2057 2058 /* 2059 * Only copy the records and pointers if there are any. 2060 */ 2061 if (new_max > 0) { 2062 /* 2063 * First copy the records. 2064 */ 2065 op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); 2066 np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); 2067 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); 2068 2069 /* 2070 * Then copy the pointers. 2071 */ 2072 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 2073 ifp->if_broot_bytes); 2074 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, 2075 (int)new_size); 2076 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); 2077 } 2078 kmem_free(ifp->if_broot); 2079 ifp->if_broot = new_broot; 2080 ifp->if_broot_bytes = (int)new_size; 2081 ASSERT(ifp->if_broot_bytes <= 2082 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 2083 return; 2084 } 2085 2086 2087 /* 2088 * This is called when the amount of space needed for if_data 2089 * is increased or decreased. The change in size is indicated by 2090 * the number of bytes that need to be added or deleted in the 2091 * byte_diff parameter. 2092 * 2093 * If the amount of space needed has decreased below the size of the 2094 * inline buffer, then switch to using the inline buffer. Otherwise, 2095 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer 2096 * to what is needed. 2097 * 2098 * ip -- the inode whose if_data area is changing 2099 * byte_diff -- the change in the number of bytes, positive or negative, 2100 * requested for the if_data array. 2101 */ 2102 void 2103 xfs_idata_realloc( 2104 xfs_inode_t *ip, 2105 int byte_diff, 2106 int whichfork) 2107 { 2108 xfs_ifork_t *ifp; 2109 int new_size; 2110 int real_size; 2111 2112 if (byte_diff == 0) { 2113 return; 2114 } 2115 2116 ifp = XFS_IFORK_PTR(ip, whichfork); 2117 new_size = (int)ifp->if_bytes + byte_diff; 2118 ASSERT(new_size >= 0); 2119 2120 if (new_size == 0) { 2121 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2122 kmem_free(ifp->if_u1.if_data); 2123 } 2124 ifp->if_u1.if_data = NULL; 2125 real_size = 0; 2126 } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { 2127 /* 2128 * If the valid extents/data can fit in if_inline_ext/data, 2129 * copy them from the malloc'd vector and free it. 2130 */ 2131 if (ifp->if_u1.if_data == NULL) { 2132 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 2133 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2134 ASSERT(ifp->if_real_bytes != 0); 2135 memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, 2136 new_size); 2137 kmem_free(ifp->if_u1.if_data); 2138 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 2139 } 2140 real_size = 0; 2141 } else { 2142 /* 2143 * Stuck with malloc/realloc. 2144 * For inline data, the underlying buffer must be 2145 * a multiple of 4 bytes in size so that it can be 2146 * logged and stay on word boundaries. We enforce 2147 * that here. 
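 * For example (hypothetical sizes): a 7-byte local fork gets a real_size
 * of roundup(7, 4) = 8 bytes, and growing it to 9 bytes reallocates to
 * roundup(9, 4) = 12 bytes, while if_bytes still records the exact 7 or
 * 9 bytes in use.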
2148 */ 2149 real_size = roundup(new_size, 4); 2150 if (ifp->if_u1.if_data == NULL) { 2151 ASSERT(ifp->if_real_bytes == 0); 2152 ifp->if_u1.if_data = kmem_alloc(real_size, 2153 KM_SLEEP | KM_NOFS); 2154 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2155 /* 2156 * Only do the realloc if the underlying size 2157 * is really changing. 2158 */ 2159 if (ifp->if_real_bytes != real_size) { 2160 ifp->if_u1.if_data = 2161 kmem_realloc(ifp->if_u1.if_data, 2162 real_size, 2163 ifp->if_real_bytes, 2164 KM_SLEEP | KM_NOFS); 2165 } 2166 } else { 2167 ASSERT(ifp->if_real_bytes == 0); 2168 ifp->if_u1.if_data = kmem_alloc(real_size, 2169 KM_SLEEP | KM_NOFS); 2170 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, 2171 ifp->if_bytes); 2172 } 2173 } 2174 ifp->if_real_bytes = real_size; 2175 ifp->if_bytes = new_size; 2176 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2177 } 2178 2179 void 2180 xfs_idestroy_fork( 2181 xfs_inode_t *ip, 2182 int whichfork) 2183 { 2184 xfs_ifork_t *ifp; 2185 2186 ifp = XFS_IFORK_PTR(ip, whichfork); 2187 if (ifp->if_broot != NULL) { 2188 kmem_free(ifp->if_broot); 2189 ifp->if_broot = NULL; 2190 } 2191 2192 /* 2193 * If the format is local, then we can't have an extents 2194 * array so just look for an inline data array. If we're 2195 * not local then we may or may not have an extents list, 2196 * so check and free it up if we do. 2197 */ 2198 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { 2199 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && 2200 (ifp->if_u1.if_data != NULL)) { 2201 ASSERT(ifp->if_real_bytes != 0); 2202 kmem_free(ifp->if_u1.if_data); 2203 ifp->if_u1.if_data = NULL; 2204 ifp->if_real_bytes = 0; 2205 } 2206 } else if ((ifp->if_flags & XFS_IFEXTENTS) && 2207 ((ifp->if_flags & XFS_IFEXTIREC) || 2208 ((ifp->if_u1.if_extents != NULL) && 2209 (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { 2210 ASSERT(ifp->if_real_bytes != 0); 2211 xfs_iext_destroy(ifp); 2212 } 2213 ASSERT(ifp->if_u1.if_extents == NULL || 2214 ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); 2215 ASSERT(ifp->if_real_bytes == 0); 2216 if (whichfork == XFS_ATTR_FORK) { 2217 kmem_zone_free(xfs_ifork_zone, ip->i_afp); 2218 ip->i_afp = NULL; 2219 } 2220 } 2221 2222 /* 2223 * This is called to unpin an inode. The caller must have the inode locked 2224 * in at least shared mode so that the buffer cannot be subsequently pinned 2225 * once someone is waiting for it to be unpinned. 
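 *
 * (Re-pinning requires logging the inode again, which takes the inode
 * lock in exclusive mode, so holding it at least shared here is what
 * keeps a new pin from racing with the waiter.)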
2226 */ 2227 static void 2228 xfs_iunpin( 2229 struct xfs_inode *ip) 2230 { 2231 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2232 2233 trace_xfs_inode_unpin_nowait(ip, _RET_IP_); 2234 2235 /* Give the log a push to start the unpinning I/O */ 2236 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); 2237 2238 } 2239 2240 static void 2241 __xfs_iunpin_wait( 2242 struct xfs_inode *ip) 2243 { 2244 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); 2245 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); 2246 2247 xfs_iunpin(ip); 2248 2249 do { 2250 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 2251 if (xfs_ipincount(ip)) 2252 io_schedule(); 2253 } while (xfs_ipincount(ip)); 2254 finish_wait(wq, &wait.wait); 2255 } 2256 2257 void 2258 xfs_iunpin_wait( 2259 struct xfs_inode *ip) 2260 { 2261 if (xfs_ipincount(ip)) 2262 __xfs_iunpin_wait(ip); 2263 } 2264 2265 /* 2266 * xfs_iextents_copy() 2267 * 2268 * This is called to copy the REAL extents (as opposed to the delayed 2269 * allocation extents) from the inode into the given buffer. It 2270 * returns the number of bytes copied into the buffer. 2271 * 2272 * If there are no delayed allocation extents, then we can just 2273 * memcpy() the extents into the buffer. Otherwise, we need to 2274 * examine each extent in turn and skip those which are delayed. 2275 */ 2276 int 2277 xfs_iextents_copy( 2278 xfs_inode_t *ip, 2279 xfs_bmbt_rec_t *dp, 2280 int whichfork) 2281 { 2282 int copied; 2283 int i; 2284 xfs_ifork_t *ifp; 2285 int nrecs; 2286 xfs_fsblock_t start_block; 2287 2288 ifp = XFS_IFORK_PTR(ip, whichfork); 2289 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2290 ASSERT(ifp->if_bytes > 0); 2291 2292 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 2293 XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); 2294 ASSERT(nrecs > 0); 2295 2296 /* 2297 * There are some delayed allocation extents in the 2298 * inode, so copy the extents one at a time and skip 2299 * the delayed ones. There must be at least one 2300 * non-delayed extent. 2301 */ 2302 copied = 0; 2303 for (i = 0; i < nrecs; i++) { 2304 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); 2305 start_block = xfs_bmbt_get_startblock(ep); 2306 if (isnullstartblock(start_block)) { 2307 /* 2308 * It's a delayed allocation extent, so skip it. 2309 */ 2310 continue; 2311 } 2312 2313 /* Translate to on disk format */ 2314 put_unaligned(cpu_to_be64(ep->l0), &dp->l0); 2315 put_unaligned(cpu_to_be64(ep->l1), &dp->l1); 2316 dp++; 2317 copied++; 2318 } 2319 ASSERT(copied != 0); 2320 xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); 2321 2322 return (copied * (uint)sizeof(xfs_bmbt_rec_t)); 2323 } 2324 2325 /* 2326 * Each of the following cases stores data into the same region 2327 * of the on-disk inode, so only one of them can be valid at 2328 * any given time. While it is possible to have conflicting formats 2329 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is 2330 * in EXTENTS format, this can only happen when the fork has 2331 * changed formats after being modified but before being flushed. 2332 * In these cases, the format always takes precedence, because the 2333 * format indicates the current state of the fork. 
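 *
 * A standalone sketch of that format/flag pairing, using hypothetical
 * flag values and an illustrative enum in place of the XFS_ILOG_* and
 * XFS_DINODE_FMT_* constants:
 */

/*
 * Illustrative only - mirrors how xfs_iflush_fork() indexes its per-fork
 * flag tables with whichfork (XFS_DATA_FORK == 0, XFS_ATTR_FORK == 1).
 */
enum example_fork_fmt { EXAMPLE_FMT_LOCAL, EXAMPLE_FMT_EXTENTS, EXAMPLE_FMT_BTREE };

static inline short
example_fork_log_flag(enum example_fork_fmt format, int whichfork)
{
    static const short dataflag[2]  = { 0x01, 0x02 };  /* hypothetical */
    static const short extflag[2]   = { 0x04, 0x08 };  /* hypothetical */
    static const short brootflag[2] = { 0x10, 0x20 };  /* hypothetical */

    switch (format) {
    case EXAMPLE_FMT_LOCAL:
        return dataflag[whichfork];   /* inline data is what gets written */
    case EXAMPLE_FMT_EXTENTS:
        return extflag[whichfork];    /* extent list is what gets written */
    case EXAMPLE_FMT_BTREE:
        return brootflag[whichfork];  /* btree root is what gets written */
    }
    return 0;
}

/*
 * The flush routine itself: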
2334 */ 2335 /*ARGSUSED*/ 2336 STATIC void 2337 xfs_iflush_fork( 2338 xfs_inode_t *ip, 2339 xfs_dinode_t *dip, 2340 xfs_inode_log_item_t *iip, 2341 int whichfork, 2342 xfs_buf_t *bp) 2343 { 2344 char *cp; 2345 xfs_ifork_t *ifp; 2346 xfs_mount_t *mp; 2347 #ifdef XFS_TRANS_DEBUG 2348 int first; 2349 #endif 2350 static const short brootflag[2] = 2351 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; 2352 static const short dataflag[2] = 2353 { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; 2354 static const short extflag[2] = 2355 { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; 2356 2357 if (!iip) 2358 return; 2359 ifp = XFS_IFORK_PTR(ip, whichfork); 2360 /* 2361 * This can happen if we gave up in iformat in an error path, 2362 * for the attribute fork. 2363 */ 2364 if (!ifp) { 2365 ASSERT(whichfork == XFS_ATTR_FORK); 2366 return; 2367 } 2368 cp = XFS_DFORK_PTR(dip, whichfork); 2369 mp = ip->i_mount; 2370 switch (XFS_IFORK_FORMAT(ip, whichfork)) { 2371 case XFS_DINODE_FMT_LOCAL: 2372 if ((iip->ili_fields & dataflag[whichfork]) && 2373 (ifp->if_bytes > 0)) { 2374 ASSERT(ifp->if_u1.if_data != NULL); 2375 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2376 memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); 2377 } 2378 break; 2379 2380 case XFS_DINODE_FMT_EXTENTS: 2381 ASSERT((ifp->if_flags & XFS_IFEXTENTS) || 2382 !(iip->ili_fields & extflag[whichfork])); 2383 if ((iip->ili_fields & extflag[whichfork]) && 2384 (ifp->if_bytes > 0)) { 2385 ASSERT(xfs_iext_get_ext(ifp, 0)); 2386 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); 2387 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, 2388 whichfork); 2389 } 2390 break; 2391 2392 case XFS_DINODE_FMT_BTREE: 2393 if ((iip->ili_fields & brootflag[whichfork]) && 2394 (ifp->if_broot_bytes > 0)) { 2395 ASSERT(ifp->if_broot != NULL); 2396 ASSERT(ifp->if_broot_bytes <= 2397 (XFS_IFORK_SIZE(ip, whichfork) + 2398 XFS_BROOT_SIZE_ADJ)); 2399 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, 2400 (xfs_bmdr_block_t *)cp, 2401 XFS_DFORK_SIZE(dip, mp, whichfork)); 2402 } 2403 break; 2404 2405 case XFS_DINODE_FMT_DEV: 2406 if (iip->ili_fields & XFS_ILOG_DEV) { 2407 ASSERT(whichfork == XFS_DATA_FORK); 2408 xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); 2409 } 2410 break; 2411 2412 case XFS_DINODE_FMT_UUID: 2413 if (iip->ili_fields & XFS_ILOG_UUID) { 2414 ASSERT(whichfork == XFS_DATA_FORK); 2415 memcpy(XFS_DFORK_DPTR(dip), 2416 &ip->i_df.if_u2.if_uuid, 2417 sizeof(uuid_t)); 2418 } 2419 break; 2420 2421 default: 2422 ASSERT(0); 2423 break; 2424 } 2425 } 2426 2427 STATIC int 2428 xfs_iflush_cluster( 2429 xfs_inode_t *ip, 2430 xfs_buf_t *bp) 2431 { 2432 xfs_mount_t *mp = ip->i_mount; 2433 struct xfs_perag *pag; 2434 unsigned long first_index, mask; 2435 unsigned long inodes_per_cluster; 2436 int ilist_size; 2437 xfs_inode_t **ilist; 2438 xfs_inode_t *iq; 2439 int nr_found; 2440 int clcount = 0; 2441 int bufwasdelwri; 2442 int i; 2443 2444 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2445 2446 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2447 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2448 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); 2449 if (!ilist) 2450 goto out_put; 2451 2452 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2453 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2454 rcu_read_lock(); 2455 /* really need a gang lookup range call here */ 2456 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2457 first_index, inodes_per_cluster); 2458 if (nr_found == 0) 2459 goto out_free; 2460 
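    /*
     * The gang lookup only starts at first_index; it does not stop at the
     * cluster boundary, so it can hand back inodes outside this cluster.
     * Each candidate below is therefore re-checked against the cluster
     * mask before being considered for flushing.
     */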
    for (i = 0; i < nr_found; i++) {
        iq = ilist[i];
        if (iq == ip)
            continue;

        /*
         * because this is an RCU protected lookup, we could find a
         * recently freed or even reallocated inode during the lookup.
         * We need to check under the i_flags_lock for a valid inode
         * here. Skip it if it is not valid or the wrong inode.
         */
        spin_lock(&iq->i_flags_lock);
        if (!iq->i_ino ||
            (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
            spin_unlock(&iq->i_flags_lock);
            continue;
        }
        spin_unlock(&iq->i_flags_lock);

        /*
         * Do an un-protected check to see if the inode is dirty and
         * is a candidate for flushing. These checks will be repeated
         * later after the appropriate locks are acquired.
         */
        if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
            continue;

        /*
         * Try to get locks. If any are unavailable or it is pinned,
         * then this inode cannot be flushed and is skipped.
         */
        if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
            continue;
        if (!xfs_iflock_nowait(iq)) {
            xfs_iunlock(iq, XFS_ILOCK_SHARED);
            continue;
        }
        if (xfs_ipincount(iq)) {
            xfs_ifunlock(iq);
            xfs_iunlock(iq, XFS_ILOCK_SHARED);
            continue;
        }

        /*
         * arriving here means that this inode can be flushed. First
         * re-check that it's dirty before flushing.
         */
        if (!xfs_inode_clean(iq)) {
            int error;

            error = xfs_iflush_int(iq, bp);
            if (error) {
                xfs_iunlock(iq, XFS_ILOCK_SHARED);
                goto cluster_corrupt_out;
            }
            clcount++;
        } else {
            xfs_ifunlock(iq);
        }
        xfs_iunlock(iq, XFS_ILOCK_SHARED);
    }

    if (clcount) {
        XFS_STATS_INC(xs_icluster_flushcnt);
        XFS_STATS_ADD(xs_icluster_flushinode, clcount);
    }

out_free:
    rcu_read_unlock();
    kmem_free(ilist);
out_put:
    xfs_perag_put(pag);
    return 0;

cluster_corrupt_out:
    /*
     * Corruption detected in the clustering loop. Invalidate the
     * inode buffer and shut down the filesystem.
     */
    rcu_read_unlock();
    /*
     * Clean up the buffer. If it was delwri, just release it --
     * brelse can handle it with no problems. If not, shut down the
     * filesystem before releasing the buffer.
     */
    bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
    if (bufwasdelwri)
        xfs_buf_relse(bp);

    xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);

    if (!bufwasdelwri) {
        /*
         * Just like incore_relse: if we have b_iodone functions,
         * mark the buffer as an error and call them. Otherwise
         * mark it as stale and brelse.
         */
        if (bp->b_iodone) {
            XFS_BUF_UNDONE(bp);
            xfs_buf_stale(bp);
            xfs_buf_ioerror(bp, EIO);
            xfs_buf_ioend(bp, 0);
        } else {
            xfs_buf_stale(bp);
            xfs_buf_relse(bp);
        }
    }

    /*
     * Unlocks the flush lock
     */
    xfs_iflush_abort(iq, false);
    kmem_free(ilist);
    xfs_perag_put(pag);
    return XFS_ERROR(EFSCORRUPTED);
}

/*
 * Flush dirty inode metadata into the backing buffer.
 *
 * The caller must have the inode lock and the inode flush lock held. The
 * inode lock will still be held upon return to the caller, and the inode
 * flush lock will be released after the inode has reached the disk.
 *
 * The caller must write out the buffer returned in *bpp and release it.
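 *
 * A hedged sketch of that calling convention (hypothetical caller, error
 * handling elided; xfs_bwrite()/xfs_buf_relse() stand in for however the
 * caller actually issues the write):
 */

/*
 * Illustrative only - shows the lock, flush-lock, write, release ordering
 * a caller of xfs_iflush() is expected to follow.
 */
static inline int
example_flush_one_inode(struct xfs_inode *ip)
{
    struct xfs_buf  *bp = NULL;
    int             error;

    xfs_ilock(ip, XFS_ILOCK_SHARED);
    xfs_iflock(ip);                 /* take the flush lock */

    error = xfs_iflush(ip, &bp);    /* flush lock ownership passes here */
    if (!error && bp) {
        error = xfs_bwrite(bp);     /* the caller writes the buffer out */
        xfs_buf_relse(bp);          /* ...and releases it */
    }

    xfs_iunlock(ip, XFS_ILOCK_SHARED);
    return error;
}

/*
 * The implementation: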
2587 */ 2588 int 2589 xfs_iflush( 2590 struct xfs_inode *ip, 2591 struct xfs_buf **bpp) 2592 { 2593 struct xfs_mount *mp = ip->i_mount; 2594 struct xfs_buf *bp; 2595 struct xfs_dinode *dip; 2596 int error; 2597 2598 XFS_STATS_INC(xs_iflush_count); 2599 2600 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2601 ASSERT(xfs_isiflocked(ip)); 2602 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2603 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); 2604 2605 *bpp = NULL; 2606 2607 xfs_iunpin_wait(ip); 2608 2609 /* 2610 * For stale inodes we cannot rely on the backing buffer remaining 2611 * stale in cache for the remaining life of the stale inode and so 2612 * xfs_imap_to_bp() below may give us a buffer that no longer contains 2613 * inodes below. We have to check this after ensuring the inode is 2614 * unpinned so that it is safe to reclaim the stale inode after the 2615 * flush call. 2616 */ 2617 if (xfs_iflags_test(ip, XFS_ISTALE)) { 2618 xfs_ifunlock(ip); 2619 return 0; 2620 } 2621 2622 /* 2623 * This may have been unpinned because the filesystem is shutting 2624 * down forcibly. If that's the case we must not write this inode 2625 * to disk, because the log record didn't make it to disk. 2626 * 2627 * We also have to remove the log item from the AIL in this case, 2628 * as we wait for an empty AIL as part of the unmount process. 2629 */ 2630 if (XFS_FORCED_SHUTDOWN(mp)) { 2631 error = XFS_ERROR(EIO); 2632 goto abort_out; 2633 } 2634 2635 /* 2636 * Get the buffer containing the on-disk inode. 2637 */ 2638 error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, 2639 0); 2640 if (error || !bp) { 2641 xfs_ifunlock(ip); 2642 return error; 2643 } 2644 2645 /* 2646 * First flush out the inode that xfs_iflush was called with. 2647 */ 2648 error = xfs_iflush_int(ip, bp); 2649 if (error) 2650 goto corrupt_out; 2651 2652 /* 2653 * If the buffer is pinned then push on the log now so we won't 2654 * get stuck waiting in the write for too long. 
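 * (A pinned buffer cannot be written back until the log records that
 * pinned it are on disk; forcing the log now starts that log I/O instead
 * of leaving the subsequent buffer write to block on it.)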
2655 */ 2656 if (xfs_buf_ispinned(bp)) 2657 xfs_log_force(mp, 0); 2658 2659 /* 2660 * inode clustering: 2661 * see if other inodes can be gathered into this write 2662 */ 2663 error = xfs_iflush_cluster(ip, bp); 2664 if (error) 2665 goto cluster_corrupt_out; 2666 2667 *bpp = bp; 2668 return 0; 2669 2670 corrupt_out: 2671 xfs_buf_relse(bp); 2672 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 2673 cluster_corrupt_out: 2674 error = XFS_ERROR(EFSCORRUPTED); 2675 abort_out: 2676 /* 2677 * Unlocks the flush lock 2678 */ 2679 xfs_iflush_abort(ip, false); 2680 return error; 2681 } 2682 2683 2684 STATIC int 2685 xfs_iflush_int( 2686 xfs_inode_t *ip, 2687 xfs_buf_t *bp) 2688 { 2689 xfs_inode_log_item_t *iip; 2690 xfs_dinode_t *dip; 2691 xfs_mount_t *mp; 2692 #ifdef XFS_TRANS_DEBUG 2693 int first; 2694 #endif 2695 2696 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2697 ASSERT(xfs_isiflocked(ip)); 2698 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2699 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); 2700 2701 iip = ip->i_itemp; 2702 mp = ip->i_mount; 2703 2704 /* set *dip = inode's place in the buffer */ 2705 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 2706 2707 if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), 2708 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 2709 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2710 "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", 2711 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); 2712 goto corrupt_out; 2713 } 2714 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 2715 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { 2716 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2717 "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x", 2718 __func__, ip->i_ino, ip, ip->i_d.di_magic); 2719 goto corrupt_out; 2720 } 2721 if (S_ISREG(ip->i_d.di_mode)) { 2722 if (XFS_TEST_ERROR( 2723 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2724 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), 2725 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { 2726 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2727 "%s: Bad regular inode %Lu, ptr 0x%p", 2728 __func__, ip->i_ino, ip); 2729 goto corrupt_out; 2730 } 2731 } else if (S_ISDIR(ip->i_d.di_mode)) { 2732 if (XFS_TEST_ERROR( 2733 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2734 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 2735 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), 2736 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { 2737 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2738 "%s: Bad directory inode %Lu, ptr 0x%p", 2739 __func__, ip->i_ino, ip); 2740 goto corrupt_out; 2741 } 2742 } 2743 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > 2744 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, 2745 XFS_RANDOM_IFLUSH_5)) { 2746 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2747 "%s: detected corrupt incore inode %Lu, " 2748 "total extents = %d, nblocks = %Ld, ptr 0x%p", 2749 __func__, ip->i_ino, 2750 ip->i_d.di_nextents + ip->i_d.di_anextents, 2751 ip->i_d.di_nblocks, ip); 2752 goto corrupt_out; 2753 } 2754 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, 2755 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { 2756 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2757 "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p", 2758 __func__, ip->i_ino, ip->i_d.di_forkoff, ip); 2759 goto corrupt_out; 2760 } 2761 /* 2762 * bump the flush iteration count, used to detect flushes which 2763 * postdate a log record during recovery. 
2764 */ 2765 2766 ip->i_d.di_flushiter++; 2767 2768 /* 2769 * Copy the dirty parts of the inode into the on-disk 2770 * inode. We always copy out the core of the inode, 2771 * because if the inode is dirty at all the core must 2772 * be. 2773 */ 2774 xfs_dinode_to_disk(dip, &ip->i_d); 2775 2776 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 2777 if (ip->i_d.di_flushiter == DI_MAX_FLUSH) 2778 ip->i_d.di_flushiter = 0; 2779 2780 /* 2781 * If this is really an old format inode and the superblock version 2782 * has not been updated to support only new format inodes, then 2783 * convert back to the old inode format. If the superblock version 2784 * has been updated, then make the conversion permanent. 2785 */ 2786 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); 2787 if (ip->i_d.di_version == 1) { 2788 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 2789 /* 2790 * Convert it back. 2791 */ 2792 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 2793 dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink); 2794 } else { 2795 /* 2796 * The superblock version has already been bumped, 2797 * so just make the conversion to the new inode 2798 * format permanent. 2799 */ 2800 ip->i_d.di_version = 2; 2801 dip->di_version = 2; 2802 ip->i_d.di_onlink = 0; 2803 dip->di_onlink = 0; 2804 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 2805 memset(&(dip->di_pad[0]), 0, 2806 sizeof(dip->di_pad)); 2807 ASSERT(xfs_get_projid(ip) == 0); 2808 } 2809 } 2810 2811 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); 2812 if (XFS_IFORK_Q(ip)) 2813 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); 2814 xfs_inobp_check(mp, bp); 2815 2816 /* 2817 * We've recorded everything logged in the inode, so we'd like to clear 2818 * the ili_fields bits so we don't log and flush things unnecessarily. 2819 * However, we can't stop logging all this information until the data 2820 * we've copied into the disk buffer is written to disk. If we did we 2821 * might overwrite the copy of the inode in the log with all the data 2822 * after re-logging only part of it, and in the face of a crash we 2823 * wouldn't have all the data we need to recover. 2824 * 2825 * What we do is move the bits to the ili_last_fields field. When 2826 * logging the inode, these bits are moved back to the ili_fields field. 2827 * In the xfs_iflush_done() routine we clear ili_last_fields, since we 2828 * know that the information those bits represent is permanently on 2829 * disk. As long as the flush completes before the inode is logged 2830 * again, then both ili_fields and ili_last_fields will be cleared. 2831 * 2832 * We can play with the ili_fields bits here, because the inode lock 2833 * must be held exclusively in order to set bits there and the flush 2834 * lock protects the ili_last_fields bits. Set ili_logged so the flush 2835 * done routine can tell whether or not to look in the AIL. Also, store 2836 * the current LSN of the inode so that we can tell whether the item has 2837 * moved in the AIL from xfs_iflush_done(). In order to read the lsn we 2838 * need the AIL lock, because it is a 64 bit value that cannot be read 2839 * atomically. 2840 */ 2841 if (iip != NULL && iip->ili_fields != 0) { 2842 iip->ili_last_fields = iip->ili_fields; 2843 iip->ili_fields = 0; 2844 iip->ili_logged = 1; 2845 2846 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 2847 &iip->ili_item.li_lsn); 2848 2849 /* 2850 * Attach the function xfs_iflush_done to the inode's 2851 * buffer. 
This will remove the inode from the AIL 2852 * and unlock the inode's flush lock when the inode is 2853 * completely written to disk. 2854 */ 2855 xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); 2856 2857 ASSERT(bp->b_fspriv != NULL); 2858 ASSERT(bp->b_iodone != NULL); 2859 } else { 2860 /* 2861 * We're flushing an inode which is not in the AIL and has 2862 * not been logged. For this case we can immediately drop 2863 * the inode flush lock because we can avoid the whole 2864 * AIL state thing. It's OK to drop the flush lock now, 2865 * because we've already locked the buffer and to do anything 2866 * you really need both. 2867 */ 2868 if (iip != NULL) { 2869 ASSERT(iip->ili_logged == 0); 2870 ASSERT(iip->ili_last_fields == 0); 2871 ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); 2872 } 2873 xfs_ifunlock(ip); 2874 } 2875 2876 return 0; 2877 2878 corrupt_out: 2879 return XFS_ERROR(EFSCORRUPTED); 2880 } 2881 2882 /* 2883 * Return a pointer to the extent record at file index idx. 2884 */ 2885 xfs_bmbt_rec_host_t * 2886 xfs_iext_get_ext( 2887 xfs_ifork_t *ifp, /* inode fork pointer */ 2888 xfs_extnum_t idx) /* index of target extent */ 2889 { 2890 ASSERT(idx >= 0); 2891 ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); 2892 2893 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { 2894 return ifp->if_u1.if_ext_irec->er_extbuf; 2895 } else if (ifp->if_flags & XFS_IFEXTIREC) { 2896 xfs_ext_irec_t *erp; /* irec pointer */ 2897 int erp_idx = 0; /* irec index */ 2898 xfs_extnum_t page_idx = idx; /* ext index in target list */ 2899 2900 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); 2901 return &erp->er_extbuf[page_idx]; 2902 } else if (ifp->if_bytes) { 2903 return &ifp->if_u1.if_extents[idx]; 2904 } else { 2905 return NULL; 2906 } 2907 } 2908 2909 /* 2910 * Insert new item(s) into the extent records for incore inode 2911 * fork 'ifp'. 'count' new items are inserted at index 'idx'. 2912 */ 2913 void 2914 xfs_iext_insert( 2915 xfs_inode_t *ip, /* incore inode pointer */ 2916 xfs_extnum_t idx, /* starting index of new items */ 2917 xfs_extnum_t count, /* number of inserted items */ 2918 xfs_bmbt_irec_t *new, /* items to insert */ 2919 int state) /* type of extent conversion */ 2920 { 2921 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; 2922 xfs_extnum_t i; /* extent record index */ 2923 2924 trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); 2925 2926 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 2927 xfs_iext_add(ifp, idx, count); 2928 for (i = idx; i < idx + count; i++, new++) 2929 xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); 2930 } 2931 2932 /* 2933 * This is called when the amount of space required for incore file 2934 * extents needs to be increased. The ext_diff parameter stores the 2935 * number of new extents being added and the idx parameter contains 2936 * the extent index where the new extents will be added. If the new 2937 * extents are being appended, then we just need to (re)allocate and 2938 * initialize the space. Otherwise, if the new extents are being 2939 * inserted into the middle of the existing entries, a bit more work 2940 * is required to make room for the new extents to be inserted. The 2941 * caller is responsible for filling in the new extent entries upon 2942 * return. 
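 *
 * The incore extent list lives in one of three representations chosen by
 * size. A minimal sketch of the tier selection (real macro names, none of
 * the surrounding bookkeeping; the helper itself is hypothetical):
 */

/*
 * Illustrative only - which storage tier a fork with nextents extents
 * will use after xfs_iext_add() has grown it.
 */
static inline const char *
example_extent_tier(xfs_extnum_t nextents)
{
    if (nextents <= XFS_INLINE_EXTS)
        return "inline";    /* if_u2.if_inline_ext inside the fork */
    if (nextents <= XFS_LINEAR_EXTS)
        return "direct";    /* one contiguous kmem_alloc'd buffer */
    return "indirect";      /* if_u1.if_ext_irec indirection array */
}

/*
 * The implementation: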
2943 */ 2944 void 2945 xfs_iext_add( 2946 xfs_ifork_t *ifp, /* inode fork pointer */ 2947 xfs_extnum_t idx, /* index to begin adding exts */ 2948 int ext_diff) /* number of extents to add */ 2949 { 2950 int byte_diff; /* new bytes being added */ 2951 int new_size; /* size of extents after adding */ 2952 xfs_extnum_t nextents; /* number of extents in file */ 2953 2954 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 2955 ASSERT((idx >= 0) && (idx <= nextents)); 2956 byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); 2957 new_size = ifp->if_bytes + byte_diff; 2958 /* 2959 * If the new number of extents (nextents + ext_diff) 2960 * fits inside the inode, then continue to use the inline 2961 * extent buffer. 2962 */ 2963 if (nextents + ext_diff <= XFS_INLINE_EXTS) { 2964 if (idx < nextents) { 2965 memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], 2966 &ifp->if_u2.if_inline_ext[idx], 2967 (nextents - idx) * sizeof(xfs_bmbt_rec_t)); 2968 memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); 2969 } 2970 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 2971 ifp->if_real_bytes = 0; 2972 } 2973 /* 2974 * Otherwise use a linear (direct) extent list. 2975 * If the extents are currently inside the inode, 2976 * xfs_iext_realloc_direct will switch us from 2977 * inline to direct extent allocation mode. 2978 */ 2979 else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { 2980 xfs_iext_realloc_direct(ifp, new_size); 2981 if (idx < nextents) { 2982 memmove(&ifp->if_u1.if_extents[idx + ext_diff], 2983 &ifp->if_u1.if_extents[idx], 2984 (nextents - idx) * sizeof(xfs_bmbt_rec_t)); 2985 memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); 2986 } 2987 } 2988 /* Indirection array */ 2989 else { 2990 xfs_ext_irec_t *erp; 2991 int erp_idx = 0; 2992 int page_idx = idx; 2993 2994 ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); 2995 if (ifp->if_flags & XFS_IFEXTIREC) { 2996 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); 2997 } else { 2998 xfs_iext_irec_init(ifp); 2999 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3000 erp = ifp->if_u1.if_ext_irec; 3001 } 3002 /* Extents fit in target extent page */ 3003 if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { 3004 if (page_idx < erp->er_extcount) { 3005 memmove(&erp->er_extbuf[page_idx + ext_diff], 3006 &erp->er_extbuf[page_idx], 3007 (erp->er_extcount - page_idx) * 3008 sizeof(xfs_bmbt_rec_t)); 3009 memset(&erp->er_extbuf[page_idx], 0, byte_diff); 3010 } 3011 erp->er_extcount += ext_diff; 3012 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 3013 } 3014 /* Insert a new extent page */ 3015 else if (erp) { 3016 xfs_iext_add_indirect_multi(ifp, 3017 erp_idx, page_idx, ext_diff); 3018 } 3019 /* 3020 * If extent(s) are being appended to the last page in 3021 * the indirection array and the new extent(s) don't fit 3022 * in the page, then erp is NULL and erp_idx is set to 3023 * the next index needed in the indirection array. 3024 */ 3025 else { 3026 int count = ext_diff; 3027 3028 while (count) { 3029 erp = xfs_iext_irec_new(ifp, erp_idx); 3030 erp->er_extcount = count; 3031 count -= MIN(count, (int)XFS_LINEAR_EXTS); 3032 if (count) { 3033 erp_idx++; 3034 } 3035 } 3036 } 3037 } 3038 ifp->if_bytes = new_size; 3039 } 3040 3041 /* 3042 * This is called when incore extents are being added to the indirection 3043 * array and the new extents do not fit in the target extent list. The 3044 * erp_idx parameter contains the irec index for the target extent list 3045 * in the indirection array, and the idx parameter contains the extent 3046 * index within the list. 
The number of extents being added is stored 3047 * in the count parameter. 3048 * 3049 * |-------| |-------| 3050 * | | | | idx - number of extents before idx 3051 * | idx | | count | 3052 * | | | | count - number of extents being inserted at idx 3053 * |-------| |-------| 3054 * | count | | nex2 | nex2 - number of extents after idx + count 3055 * |-------| |-------| 3056 */ 3057 void 3058 xfs_iext_add_indirect_multi( 3059 xfs_ifork_t *ifp, /* inode fork pointer */ 3060 int erp_idx, /* target extent irec index */ 3061 xfs_extnum_t idx, /* index within target list */ 3062 int count) /* new extents being added */ 3063 { 3064 int byte_diff; /* new bytes being added */ 3065 xfs_ext_irec_t *erp; /* pointer to irec entry */ 3066 xfs_extnum_t ext_diff; /* number of extents to add */ 3067 xfs_extnum_t ext_cnt; /* new extents still needed */ 3068 xfs_extnum_t nex2; /* extents after idx + count */ 3069 xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ 3070 int nlists; /* number of irec's (lists) */ 3071 3072 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3073 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3074 nex2 = erp->er_extcount - idx; 3075 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3076 3077 /* 3078 * Save second part of target extent list 3079 * (all extents past */ 3080 if (nex2) { 3081 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); 3082 nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); 3083 memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); 3084 erp->er_extcount -= nex2; 3085 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); 3086 memset(&erp->er_extbuf[idx], 0, byte_diff); 3087 } 3088 3089 /* 3090 * Add the new extents to the end of the target 3091 * list, then allocate new irec record(s) and 3092 * extent buffer(s) as needed to store the rest 3093 * of the new extents. 3094 */ 3095 ext_cnt = count; 3096 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); 3097 if (ext_diff) { 3098 erp->er_extcount += ext_diff; 3099 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 3100 ext_cnt -= ext_diff; 3101 } 3102 while (ext_cnt) { 3103 erp_idx++; 3104 erp = xfs_iext_irec_new(ifp, erp_idx); 3105 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); 3106 erp->er_extcount = ext_diff; 3107 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 3108 ext_cnt -= ext_diff; 3109 } 3110 3111 /* Add nex2 extents back to indirection array */ 3112 if (nex2) { 3113 xfs_extnum_t ext_avail; 3114 int i; 3115 3116 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); 3117 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; 3118 i = 0; 3119 /* 3120 * If nex2 extents fit in the current page, append 3121 * nex2_ep after the new extents. 3122 */ 3123 if (nex2 <= ext_avail) { 3124 i = erp->er_extcount; 3125 } 3126 /* 3127 * Otherwise, check if space is available in the 3128 * next page. 3129 */ 3130 else if ((erp_idx < nlists - 1) && 3131 (nex2 <= (ext_avail = XFS_LINEAR_EXTS - 3132 ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { 3133 erp_idx++; 3134 erp++; 3135 /* Create a hole for nex2 extents */ 3136 memmove(&erp->er_extbuf[nex2], erp->er_extbuf, 3137 erp->er_extcount * sizeof(xfs_bmbt_rec_t)); 3138 } 3139 /* 3140 * Final choice, create a new extent page for 3141 * nex2 extents. 
3142 */ 3143 else { 3144 erp_idx++; 3145 erp = xfs_iext_irec_new(ifp, erp_idx); 3146 } 3147 memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); 3148 kmem_free(nex2_ep); 3149 erp->er_extcount += nex2; 3150 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); 3151 } 3152 } 3153 3154 /* 3155 * This is called when the amount of space required for incore file 3156 * extents needs to be decreased. The ext_diff parameter stores the 3157 * number of extents to be removed and the idx parameter contains 3158 * the extent index where the extents will be removed from. 3159 * 3160 * If the amount of space needed has decreased below the linear 3161 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous 3162 * extent array. Otherwise, use kmem_realloc() to adjust the 3163 * size to what is needed. 3164 */ 3165 void 3166 xfs_iext_remove( 3167 xfs_inode_t *ip, /* incore inode pointer */ 3168 xfs_extnum_t idx, /* index to begin removing exts */ 3169 int ext_diff, /* number of extents to remove */ 3170 int state) /* type of extent conversion */ 3171 { 3172 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; 3173 xfs_extnum_t nextents; /* number of extents in file */ 3174 int new_size; /* size of extents after removal */ 3175 3176 trace_xfs_iext_remove(ip, idx, state, _RET_IP_); 3177 3178 ASSERT(ext_diff > 0); 3179 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3180 new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); 3181 3182 if (new_size == 0) { 3183 xfs_iext_destroy(ifp); 3184 } else if (ifp->if_flags & XFS_IFEXTIREC) { 3185 xfs_iext_remove_indirect(ifp, idx, ext_diff); 3186 } else if (ifp->if_real_bytes) { 3187 xfs_iext_remove_direct(ifp, idx, ext_diff); 3188 } else { 3189 xfs_iext_remove_inline(ifp, idx, ext_diff); 3190 } 3191 ifp->if_bytes = new_size; 3192 } 3193 3194 /* 3195 * This removes ext_diff extents from the inline buffer, beginning 3196 * at extent index idx. 3197 */ 3198 void 3199 xfs_iext_remove_inline( 3200 xfs_ifork_t *ifp, /* inode fork pointer */ 3201 xfs_extnum_t idx, /* index to begin removing exts */ 3202 int ext_diff) /* number of extents to remove */ 3203 { 3204 int nextents; /* number of extents in file */ 3205 3206 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 3207 ASSERT(idx < XFS_INLINE_EXTS); 3208 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3209 ASSERT(((nextents - ext_diff) > 0) && 3210 (nextents - ext_diff) < XFS_INLINE_EXTS); 3211 3212 if (idx + ext_diff < nextents) { 3213 memmove(&ifp->if_u2.if_inline_ext[idx], 3214 &ifp->if_u2.if_inline_ext[idx + ext_diff], 3215 (nextents - (idx + ext_diff)) * 3216 sizeof(xfs_bmbt_rec_t)); 3217 memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], 3218 0, ext_diff * sizeof(xfs_bmbt_rec_t)); 3219 } else { 3220 memset(&ifp->if_u2.if_inline_ext[idx], 0, 3221 ext_diff * sizeof(xfs_bmbt_rec_t)); 3222 } 3223 } 3224 3225 /* 3226 * This removes ext_diff extents from a linear (direct) extent list, 3227 * beginning at extent index idx. If the extents are being removed 3228 * from the end of the list (ie. truncate) then we just need to re- 3229 * allocate the list to remove the extra space. Otherwise, if the 3230 * extents are being removed from the middle of the existing extent 3231 * entries, then we first need to move the extent records beginning 3232 * at idx + ext_diff up in the list to overwrite the records being 3233 * removed, then remove the extra space via kmem_realloc. 
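 *
 * A minimal sketch of that move-then-shrink pattern on a plain integer
 * array (hypothetical helper, not part of this file):
 */

/*
 * Illustrative only - removes count records starting at idx from an
 * array holding nelem records, the way the extent code below does.
 */
static inline void
example_remove_range(int *arr, int nelem, int idx, int count)
{
    /* slide the tail records down over the ones being removed */
    memmove(&arr[idx], &arr[idx + count],
        (nelem - (idx + count)) * sizeof(int));
    /* zero the now-unused slots at the end */
    memset(&arr[nelem - count], 0, count * sizeof(int));
}

/*
 * The extent-list version: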
3234 */ 3235 void 3236 xfs_iext_remove_direct( 3237 xfs_ifork_t *ifp, /* inode fork pointer */ 3238 xfs_extnum_t idx, /* index to begin removing exts */ 3239 int ext_diff) /* number of extents to remove */ 3240 { 3241 xfs_extnum_t nextents; /* number of extents in file */ 3242 int new_size; /* size of extents after removal */ 3243 3244 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 3245 new_size = ifp->if_bytes - 3246 (ext_diff * sizeof(xfs_bmbt_rec_t)); 3247 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3248 3249 if (new_size == 0) { 3250 xfs_iext_destroy(ifp); 3251 return; 3252 } 3253 /* Move extents up in the list (if needed) */ 3254 if (idx + ext_diff < nextents) { 3255 memmove(&ifp->if_u1.if_extents[idx], 3256 &ifp->if_u1.if_extents[idx + ext_diff], 3257 (nextents - (idx + ext_diff)) * 3258 sizeof(xfs_bmbt_rec_t)); 3259 } 3260 memset(&ifp->if_u1.if_extents[nextents - ext_diff], 3261 0, ext_diff * sizeof(xfs_bmbt_rec_t)); 3262 /* 3263 * Reallocate the direct extent list. If the extents 3264 * will fit inside the inode then xfs_iext_realloc_direct 3265 * will switch from direct to inline extent allocation 3266 * mode for us. 3267 */ 3268 xfs_iext_realloc_direct(ifp, new_size); 3269 ifp->if_bytes = new_size; 3270 } 3271 3272 /* 3273 * This is called when incore extents are being removed from the 3274 * indirection array and the extents being removed span multiple extent 3275 * buffers. The idx parameter contains the file extent index where we 3276 * want to begin removing extents, and the count parameter contains 3277 * how many extents need to be removed. 3278 * 3279 * |-------| |-------| 3280 * | nex1 | | | nex1 - number of extents before idx 3281 * |-------| | count | 3282 * | | | | count - number of extents being removed at idx 3283 * | count | |-------| 3284 * | | | nex2 | nex2 - number of extents after idx + count 3285 * |-------| |-------| 3286 */ 3287 void 3288 xfs_iext_remove_indirect( 3289 xfs_ifork_t *ifp, /* inode fork pointer */ 3290 xfs_extnum_t idx, /* index to begin removing extents */ 3291 int count) /* number of extents to remove */ 3292 { 3293 xfs_ext_irec_t *erp; /* indirection array pointer */ 3294 int erp_idx = 0; /* indirection array index */ 3295 xfs_extnum_t ext_cnt; /* extents left to remove */ 3296 xfs_extnum_t ext_diff; /* extents to remove in current list */ 3297 xfs_extnum_t nex1; /* number of extents before idx */ 3298 xfs_extnum_t nex2; /* extents after idx + count */ 3299 int page_idx = idx; /* index in target extent list */ 3300 3301 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3302 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); 3303 ASSERT(erp != NULL); 3304 nex1 = page_idx; 3305 ext_cnt = count; 3306 while (ext_cnt) { 3307 nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); 3308 ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); 3309 /* 3310 * Check for deletion of entire list; 3311 * xfs_iext_irec_remove() updates extent offsets. 
3312 */ 3313 if (ext_diff == erp->er_extcount) { 3314 xfs_iext_irec_remove(ifp, erp_idx); 3315 ext_cnt -= ext_diff; 3316 nex1 = 0; 3317 if (ext_cnt) { 3318 ASSERT(erp_idx < ifp->if_real_bytes / 3319 XFS_IEXT_BUFSZ); 3320 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3321 nex1 = 0; 3322 continue; 3323 } else { 3324 break; 3325 } 3326 } 3327 /* Move extents up (if needed) */ 3328 if (nex2) { 3329 memmove(&erp->er_extbuf[nex1], 3330 &erp->er_extbuf[nex1 + ext_diff], 3331 nex2 * sizeof(xfs_bmbt_rec_t)); 3332 } 3333 /* Zero out rest of page */ 3334 memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - 3335 ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); 3336 /* Update remaining counters */ 3337 erp->er_extcount -= ext_diff; 3338 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); 3339 ext_cnt -= ext_diff; 3340 nex1 = 0; 3341 erp_idx++; 3342 erp++; 3343 } 3344 ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); 3345 xfs_iext_irec_compact(ifp); 3346 } 3347 3348 /* 3349 * Create, destroy, or resize a linear (direct) block of extents. 3350 */ 3351 void 3352 xfs_iext_realloc_direct( 3353 xfs_ifork_t *ifp, /* inode fork pointer */ 3354 int new_size) /* new size of extents */ 3355 { 3356 int rnew_size; /* real new size of extents */ 3357 3358 rnew_size = new_size; 3359 3360 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || 3361 ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && 3362 (new_size != ifp->if_real_bytes))); 3363 3364 /* Free extent records */ 3365 if (new_size == 0) { 3366 xfs_iext_destroy(ifp); 3367 } 3368 /* Resize direct extent list and zero any new bytes */ 3369 else if (ifp->if_real_bytes) { 3370 /* Check if extents will fit inside the inode */ 3371 if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { 3372 xfs_iext_direct_to_inline(ifp, new_size / 3373 (uint)sizeof(xfs_bmbt_rec_t)); 3374 ifp->if_bytes = new_size; 3375 return; 3376 } 3377 if (!is_power_of_2(new_size)){ 3378 rnew_size = roundup_pow_of_two(new_size); 3379 } 3380 if (rnew_size != ifp->if_real_bytes) { 3381 ifp->if_u1.if_extents = 3382 kmem_realloc(ifp->if_u1.if_extents, 3383 rnew_size, 3384 ifp->if_real_bytes, KM_NOFS); 3385 } 3386 if (rnew_size > ifp->if_real_bytes) { 3387 memset(&ifp->if_u1.if_extents[ifp->if_bytes / 3388 (uint)sizeof(xfs_bmbt_rec_t)], 0, 3389 rnew_size - ifp->if_real_bytes); 3390 } 3391 } 3392 /* 3393 * Switch from the inline extent buffer to a direct 3394 * extent list. Be sure to include the inline extent 3395 * bytes in new_size. 3396 */ 3397 else { 3398 new_size += ifp->if_bytes; 3399 if (!is_power_of_2(new_size)) { 3400 rnew_size = roundup_pow_of_two(new_size); 3401 } 3402 xfs_iext_inline_to_direct(ifp, rnew_size); 3403 } 3404 ifp->if_real_bytes = rnew_size; 3405 ifp->if_bytes = new_size; 3406 } 3407 3408 /* 3409 * Switch from linear (direct) extent records to inline buffer. 3410 */ 3411 void 3412 xfs_iext_direct_to_inline( 3413 xfs_ifork_t *ifp, /* inode fork pointer */ 3414 xfs_extnum_t nextents) /* number of extents in file */ 3415 { 3416 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 3417 ASSERT(nextents <= XFS_INLINE_EXTS); 3418 /* 3419 * The inline buffer was zeroed when we switched 3420 * from inline to direct extent allocation mode, 3421 * so we don't need to clear it here. 3422 */ 3423 memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, 3424 nextents * sizeof(xfs_bmbt_rec_t)); 3425 kmem_free(ifp->if_u1.if_extents); 3426 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 3427 ifp->if_real_bytes = 0; 3428 } 3429 3430 /* 3431 * Switch from inline buffer to linear (direct) extent records. 
3432 * new_size should already be rounded up to the next power of 2 3433 * by the caller (when appropriate), so use new_size as it is. 3434 * However, since new_size may be rounded up, we can't update 3435 * if_bytes here. It is the caller's responsibility to update 3436 * if_bytes upon return. 3437 */ 3438 void 3439 xfs_iext_inline_to_direct( 3440 xfs_ifork_t *ifp, /* inode fork pointer */ 3441 int new_size) /* number of extents in file */ 3442 { 3443 ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); 3444 memset(ifp->if_u1.if_extents, 0, new_size); 3445 if (ifp->if_bytes) { 3446 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, 3447 ifp->if_bytes); 3448 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * 3449 sizeof(xfs_bmbt_rec_t)); 3450 } 3451 ifp->if_real_bytes = new_size; 3452 } 3453 3454 /* 3455 * Resize an extent indirection array to new_size bytes. 3456 */ 3457 STATIC void 3458 xfs_iext_realloc_indirect( 3459 xfs_ifork_t *ifp, /* inode fork pointer */ 3460 int new_size) /* new indirection array size */ 3461 { 3462 int nlists; /* number of irec's (ex lists) */ 3463 int size; /* current indirection array size */ 3464 3465 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3466 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3467 size = nlists * sizeof(xfs_ext_irec_t); 3468 ASSERT(ifp->if_real_bytes); 3469 ASSERT((new_size >= 0) && (new_size != size)); 3470 if (new_size == 0) { 3471 xfs_iext_destroy(ifp); 3472 } else { 3473 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) 3474 kmem_realloc(ifp->if_u1.if_ext_irec, 3475 new_size, size, KM_NOFS); 3476 } 3477 } 3478 3479 /* 3480 * Switch from indirection array to linear (direct) extent allocations. 3481 */ 3482 STATIC void 3483 xfs_iext_indirect_to_direct( 3484 xfs_ifork_t *ifp) /* inode fork pointer */ 3485 { 3486 xfs_bmbt_rec_host_t *ep; /* extent record pointer */ 3487 xfs_extnum_t nextents; /* number of extents in file */ 3488 int size; /* size of file extents */ 3489 3490 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3491 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3492 ASSERT(nextents <= XFS_LINEAR_EXTS); 3493 size = nextents * sizeof(xfs_bmbt_rec_t); 3494 3495 xfs_iext_irec_compact_pages(ifp); 3496 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); 3497 3498 ep = ifp->if_u1.if_ext_irec->er_extbuf; 3499 kmem_free(ifp->if_u1.if_ext_irec); 3500 ifp->if_flags &= ~XFS_IFEXTIREC; 3501 ifp->if_u1.if_extents = ep; 3502 ifp->if_bytes = size; 3503 if (nextents < XFS_LINEAR_EXTS) { 3504 xfs_iext_realloc_direct(ifp, size); 3505 } 3506 } 3507 3508 /* 3509 * Free incore file extents. 3510 */ 3511 void 3512 xfs_iext_destroy( 3513 xfs_ifork_t *ifp) /* inode fork pointer */ 3514 { 3515 if (ifp->if_flags & XFS_IFEXTIREC) { 3516 int erp_idx; 3517 int nlists; 3518 3519 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3520 for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { 3521 xfs_iext_irec_remove(ifp, erp_idx); 3522 } 3523 ifp->if_flags &= ~XFS_IFEXTIREC; 3524 } else if (ifp->if_real_bytes) { 3525 kmem_free(ifp->if_u1.if_extents); 3526 } else if (ifp->if_bytes) { 3527 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * 3528 sizeof(xfs_bmbt_rec_t)); 3529 } 3530 ifp->if_u1.if_extents = NULL; 3531 ifp->if_real_bytes = 0; 3532 ifp->if_bytes = 0; 3533 } 3534 3535 /* 3536 * Return a pointer to the extent record for file system block bno. 
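 *
 * The lookup is a binary search over [startoff, startoff + blockcount)
 * ranges. A standalone sketch on plain integers (hypothetical helper,
 * not part of this file):
 */

/*
 * Illustrative only - extent i covers [start[i], start[i] + len[i]).
 * Returns the index of the extent containing bno, or the index where
 * such an extent would sit, roughly the convention used for *idxp below.
 */
static inline int
example_bno_search(const long *start, const long *len, int nextents,
           long bno)
{
    int low = 0, high = nextents - 1;

    while (low <= high) {
        int mid = (low + high) >> 1;

        if (bno < start[mid])
            high = mid - 1;
        else if (bno >= start[mid] + len[mid])
            low = mid + 1;
        else
            return mid;     /* bno falls inside extent mid */
    }
    return low;
}

/*
 * The fork-aware version, which also handles the indirection array: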
3537 */ 3538 xfs_bmbt_rec_host_t * /* pointer to found extent record */ 3539 xfs_iext_bno_to_ext( 3540 xfs_ifork_t *ifp, /* inode fork pointer */ 3541 xfs_fileoff_t bno, /* block number to search for */ 3542 xfs_extnum_t *idxp) /* index of target extent */ 3543 { 3544 xfs_bmbt_rec_host_t *base; /* pointer to first extent */ 3545 xfs_filblks_t blockcount = 0; /* number of blocks in extent */ 3546 xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ 3547 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ 3548 int high; /* upper boundary in search */ 3549 xfs_extnum_t idx = 0; /* index of target extent */ 3550 int low; /* lower boundary in search */ 3551 xfs_extnum_t nextents; /* number of file extents */ 3552 xfs_fileoff_t startoff = 0; /* start offset of extent */ 3553 3554 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3555 if (nextents == 0) { 3556 *idxp = 0; 3557 return NULL; 3558 } 3559 low = 0; 3560 if (ifp->if_flags & XFS_IFEXTIREC) { 3561 /* Find target extent list */ 3562 int erp_idx = 0; 3563 erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); 3564 base = erp->er_extbuf; 3565 high = erp->er_extcount - 1; 3566 } else { 3567 base = ifp->if_u1.if_extents; 3568 high = nextents - 1; 3569 } 3570 /* Binary search extent records */ 3571 while (low <= high) { 3572 idx = (low + high) >> 1; 3573 ep = base + idx; 3574 startoff = xfs_bmbt_get_startoff(ep); 3575 blockcount = xfs_bmbt_get_blockcount(ep); 3576 if (bno < startoff) { 3577 high = idx - 1; 3578 } else if (bno >= startoff + blockcount) { 3579 low = idx + 1; 3580 } else { 3581 /* Convert back to file-based extent index */ 3582 if (ifp->if_flags & XFS_IFEXTIREC) { 3583 idx += erp->er_extoff; 3584 } 3585 *idxp = idx; 3586 return ep; 3587 } 3588 } 3589 /* Convert back to file-based extent index */ 3590 if (ifp->if_flags & XFS_IFEXTIREC) { 3591 idx += erp->er_extoff; 3592 } 3593 if (bno >= startoff + blockcount) { 3594 if (++idx == nextents) { 3595 ep = NULL; 3596 } else { 3597 ep = xfs_iext_get_ext(ifp, idx); 3598 } 3599 } 3600 *idxp = idx; 3601 return ep; 3602 } 3603 3604 /* 3605 * Return a pointer to the indirection array entry containing the 3606 * extent record for filesystem block bno. Store the index of the 3607 * target irec in *erp_idxp. 3608 */ 3609 xfs_ext_irec_t * /* pointer to found extent record */ 3610 xfs_iext_bno_to_irec( 3611 xfs_ifork_t *ifp, /* inode fork pointer */ 3612 xfs_fileoff_t bno, /* block number to search for */ 3613 int *erp_idxp) /* irec index of target ext list */ 3614 { 3615 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ 3616 xfs_ext_irec_t *erp_next; /* next indirection array entry */ 3617 int erp_idx; /* indirection array index */ 3618 int nlists; /* number of extent irec's (lists) */ 3619 int high; /* binary search upper limit */ 3620 int low; /* binary search lower limit */ 3621 3622 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3623 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3624 erp_idx = 0; 3625 low = 0; 3626 high = nlists - 1; 3627 while (low <= high) { 3628 erp_idx = (low + high) >> 1; 3629 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3630 erp_next = erp_idx < nlists - 1 ? 
/*
 * Return a pointer to the indirection array entry containing the
 * extent record for filesystem block bno.  Store the index of the
 * target irec in *erp_idxp.
 */
xfs_ext_irec_t *			/* pointer to found extent record */
xfs_iext_bno_to_irec(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	xfs_fileoff_t	bno,		/* block number to search for */
	int		*erp_idxp)	/* irec index of target ext list */
{
	xfs_ext_irec_t	*erp = NULL;	/* indirection array pointer */
	xfs_ext_irec_t	*erp_next;	/* next indirection array entry */
	int		erp_idx;	/* indirection array index */
	int		nlists;		/* number of extent irec's (lists) */
	int		high;		/* binary search upper limit */
	int		low;		/* binary search lower limit */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	erp_idx = 0;
	low = 0;
	high = nlists - 1;
	while (low <= high) {
		erp_idx = (low + high) >> 1;
		erp = &ifp->if_u1.if_ext_irec[erp_idx];
		erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
		if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
			high = erp_idx - 1;
		} else if (erp_next && bno >=
			   xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
			low = erp_idx + 1;
		} else {
			break;
		}
	}
	*erp_idxp = erp_idx;
	return erp;
}

/*
 * Return a pointer to the indirection array entry containing the
 * extent record at file extent index *idxp.  Store the index of the
 * target irec in *erp_idxp and store the page index of the target
 * extent record in *idxp.
 */
xfs_ext_irec_t *
xfs_iext_idx_to_irec(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	xfs_extnum_t	*idxp,		/* extent index (file -> page) */
	int		*erp_idxp,	/* pointer to target irec */
	int		realloc)	/* new bytes were just added */
{
	xfs_ext_irec_t	*prev;		/* pointer to previous irec */
	xfs_ext_irec_t	*erp = NULL;	/* pointer to current irec */
	int		erp_idx;	/* indirection array index */
	int		nlists;		/* number of irec's (ex lists) */
	int		high;		/* binary search upper limit */
	int		low;		/* binary search lower limit */
	xfs_extnum_t	page_idx = *idxp; /* extent index in target list */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	ASSERT(page_idx >= 0);
	ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
	ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);

	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	erp_idx = 0;
	low = 0;
	high = nlists - 1;

	/* Binary search extent irec's */
	while (low <= high) {
		erp_idx = (low + high) >> 1;
		erp = &ifp->if_u1.if_ext_irec[erp_idx];
		prev = erp_idx > 0 ? erp - 1 : NULL;
		if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
		    realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
			high = erp_idx - 1;
		} else if (page_idx > erp->er_extoff + erp->er_extcount ||
			   (page_idx == erp->er_extoff + erp->er_extcount &&
			    !realloc)) {
			low = erp_idx + 1;
		} else if (page_idx == erp->er_extoff + erp->er_extcount &&
			   erp->er_extcount == XFS_LINEAR_EXTS) {
			ASSERT(realloc);
			page_idx = 0;
			erp_idx++;
			erp = erp_idx < nlists ? erp + 1 : NULL;
			break;
		} else {
			page_idx -= erp->er_extoff;
			break;
		}
	}
	*idxp = page_idx;
	*erp_idxp = erp_idx;
	return erp;
}
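/*
 * Worked example for xfs_iext_idx_to_irec() (illustrative values,
 * assuming XFS_LINEAR_EXTS == 256).  With two irec entries
 *
 *	erp[0]: er_extoff =   0, er_extcount = 256	(full)
 *	erp[1]: er_extoff = 256, er_extcount = 100
 *
 * a call with *idxp == 300 and realloc == 0 binary-searches to
 * erp[1] and rebases the file index into that list:
 *
 *	*erp_idxp == 1, *idxp == 300 - 256 == 44
 *
 * The realloc special cases exist so that an index landing exactly
 * at the end of a full list is redirected to the neighboring list
 * that can actually hold the new record.
 */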
/*
 * Allocate and initialize an indirection array once the space needed
 * for incore extents increases above XFS_IEXT_BUFSZ.
 */
void
xfs_iext_irec_init(
	xfs_ifork_t	*ifp)		/* inode fork pointer */
{
	xfs_ext_irec_t	*erp;		/* indirection array pointer */
	xfs_extnum_t	nextents;	/* number of extents in file */

	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	ASSERT(nextents <= XFS_LINEAR_EXTS);

	erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);

	if (nextents == 0) {
		ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
	} else if (!ifp->if_real_bytes) {
		xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
	} else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
		xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
	}
	erp->er_extbuf = ifp->if_u1.if_extents;
	erp->er_extcount = nextents;
	erp->er_extoff = 0;

	ifp->if_flags |= XFS_IFEXTIREC;
	ifp->if_real_bytes = XFS_IEXT_BUFSZ;
	ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
	ifp->if_u1.if_ext_irec = erp;
}

/*
 * Allocate and initialize a new entry in the indirection array.
 */
xfs_ext_irec_t *
xfs_iext_irec_new(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	int		erp_idx)	/* index for new irec */
{
	xfs_ext_irec_t	*erp;		/* indirection array pointer */
	int		i;		/* loop counter */
	int		nlists;		/* number of irec's (ex lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;

	/* Resize indirection array */
	xfs_iext_realloc_indirect(ifp, ++nlists *
				  sizeof(xfs_ext_irec_t));
	/*
	 * Move records down in the array so the
	 * new page can use erp_idx.
	 */
	erp = ifp->if_u1.if_ext_irec;
	for (i = nlists - 1; i > erp_idx; i--) {
		memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
	}
	ASSERT(i == erp_idx);

	/* Initialize new extent record */
	erp = ifp->if_u1.if_ext_irec;
	erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
	memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
	erp[erp_idx].er_extcount = 0;
	erp[erp_idx].er_extoff = erp_idx > 0 ?
		erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
	return &erp[erp_idx];
}
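/*
 * Worked example for xfs_iext_irec_new() (illustrative values).
 * Starting from two irec entries
 *
 *	erp[0]: er_extoff =   0, er_extcount = 256
 *	erp[1]: er_extoff = 256, er_extcount = 100
 *
 * xfs_iext_irec_new(ifp, 1) grows the array to three entries,
 * shifts the old erp[1] down to erp[2], and initializes the new
 * erp[1] with er_extcount == 0 and er_extoff == 0 + 256 == 256.
 * The shifted erp[2] keeps er_extoff == 256, which is still
 * consistent because the new list is empty; offsets only need
 * adjusting (see xfs_iext_irec_update_extoffs() below) once records
 * are actually added to the new list.
 */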
/*
 * Remove a record from the indirection array.
 */
void
xfs_iext_irec_remove(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	int		erp_idx)	/* irec index to remove */
{
	xfs_ext_irec_t	*erp;		/* indirection array pointer */
	int		i;		/* loop counter */
	int		nlists;		/* number of irec's (ex lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	erp = &ifp->if_u1.if_ext_irec[erp_idx];
	if (erp->er_extbuf) {
		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
			-erp->er_extcount);
		kmem_free(erp->er_extbuf);
	}
	/* Compact extent records */
	erp = ifp->if_u1.if_ext_irec;
	for (i = erp_idx; i < nlists - 1; i++) {
		memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
	}
	/*
	 * Manually free the last extent record from the indirection
	 * array.  A call to xfs_iext_realloc_indirect() with a size
	 * of zero would result in a call to xfs_iext_destroy() which
	 * would in turn call this function again, creating a nasty
	 * infinite loop.
	 */
	if (--nlists) {
		xfs_iext_realloc_indirect(ifp,
			nlists * sizeof(xfs_ext_irec_t));
	} else {
		kmem_free(ifp->if_u1.if_ext_irec);
	}
	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
}

/*
 * This is called to clean up large amounts of unused memory allocated
 * by the indirection array.  Before compacting anything though, verify
 * that the indirection array is still needed and switch back to the
 * linear extent list (or even the inline buffer) if possible.  The
 * compaction policy is as follows:
 *
 * Full Compaction:    Extents fit into a single page (or inline buffer)
 * Partial Compaction: Extents occupy less than 50% of allocated space
 * No Compaction:      Extents occupy at least 50% of allocated space
 */
void
xfs_iext_irec_compact(
	xfs_ifork_t	*ifp)		/* inode fork pointer */
{
	xfs_extnum_t	nextents;	/* number of extents in file */
	int		nlists;		/* number of irec's (ex lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);

	if (nextents == 0) {
		xfs_iext_destroy(ifp);
	} else if (nextents <= XFS_INLINE_EXTS) {
		xfs_iext_indirect_to_direct(ifp);
		xfs_iext_direct_to_inline(ifp, nextents);
	} else if (nextents <= XFS_LINEAR_EXTS) {
		xfs_iext_indirect_to_direct(ifp);
	} else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
		xfs_iext_irec_compact_pages(ifp);
	}
}

/*
 * Combine extents from neighboring extent pages.
 */
void
xfs_iext_irec_compact_pages(
	xfs_ifork_t	*ifp)		/* inode fork pointer */
{
	xfs_ext_irec_t	*erp, *erp_next;/* pointers to irec entries */
	int		erp_idx = 0;	/* indirection array index */
	int		nlists;		/* number of irec's (ex lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	while (erp_idx < nlists - 1) {
		erp = &ifp->if_u1.if_ext_irec[erp_idx];
		erp_next = erp + 1;
		if (erp_next->er_extcount <=
		    (XFS_LINEAR_EXTS - erp->er_extcount)) {
			memcpy(&erp->er_extbuf[erp->er_extcount],
				erp_next->er_extbuf, erp_next->er_extcount *
				sizeof(xfs_bmbt_rec_t));
			erp->er_extcount += erp_next->er_extcount;
			/*
			 * Free page before removing extent record
			 * so er_extoffs don't get modified in
			 * xfs_iext_irec_remove.
			 */
			kmem_free(erp_next->er_extbuf);
			erp_next->er_extbuf = NULL;
			xfs_iext_irec_remove(ifp, erp_idx + 1);
			nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
		} else {
			erp_idx++;
		}
	}
}

/*
 * This is called to update the er_extoff field in the indirection
 * array when extents have been added or removed from one of the
 * extent lists.  erp_idx contains the irec index to begin updating
 * at and ext_diff contains the number of extents that were added
 * or removed.
 */
void
xfs_iext_irec_update_extoffs(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	int		erp_idx,	/* irec index to update */
	int		ext_diff)	/* number of new extents */
{
	int		i;		/* loop counter */
	int		nlists;		/* number of irec's (ex lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	for (i = erp_idx; i < nlists; i++) {
		ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
	}
}
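/*
 * Worked example for xfs_iext_irec_update_extoffs() (illustrative
 * values).  A caller that has just inserted three extents into the
 * list at erp[1] would issue
 *
 *	xfs_iext_irec_update_extoffs(ifp, 2, 3);
 *
 * adding 3 to er_extoff of erp[2] .. erp[nlists - 1], since every
 * later list now starts three records further into the file.
 * xfs_iext_irec_remove() above uses the same helper with a negative
 * ext_diff to pull the offsets back in when a list goes away.
 */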