/*
 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_imap.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_dir_sf.h"
#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_rw.h"
#include "xfs_error.h"
#include "xfs_utils.h"
#include "xfs_dir2_trace.h"
#include "xfs_quota.h"
#include "xfs_mac.h"
#include "xfs_acl.h"


kmem_zone_t *xfs_ifork_zone;
kmem_zone_t *xfs_inode_zone;
kmem_zone_t *xfs_chashlist_zone;

/*
 * Used in xfs_itruncate(). This is the maximum number of extents
 * freed from a file in a single transaction.
 */
#define	XFS_ITRUNC_MAX_EXTENTS	2

STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);


#ifdef DEBUG
/*
 * Make sure that the extents in the given memory buffer
 * are valid.
 */
STATIC void
xfs_validate_extents(
	xfs_bmbt_rec_t		*ep,
	int			nrecs,
	int			disk,
	xfs_exntfmt_t		fmt)
{
	xfs_bmbt_irec_t		irec;
	xfs_bmbt_rec_t		rec;
	int			i;

	for (i = 0; i < nrecs; i++) {
		rec.l0 = get_unaligned((__uint64_t*)&ep->l0);
		rec.l1 = get_unaligned((__uint64_t*)&ep->l1);
		if (disk)
			xfs_bmbt_disk_get_all(&rec, &irec);
		else
			xfs_bmbt_get_all(&rec, &irec);
		if (fmt == XFS_EXTFMT_NOSTATE)
			ASSERT(irec.br_state == XFS_EXT_NORM);
		ep++;
	}
}
#else /* DEBUG */
#define xfs_validate_extents(ep, nrecs, disk, fmt)
#endif /* DEBUG */

/*
 * Check that none of the inodes in the buffer have a next
 * unlinked field of 0.
 */
#if defined(DEBUG)
void
xfs_inobp_check(
	xfs_mount_t	*mp,
	xfs_buf_t	*bp)
{
	int		i;
	int		j;
	xfs_dinode_t	*dip;

	j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;

	for (i = 0; i < j; i++) {
		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
					i * mp->m_sb.sb_inodesize);
		if (!dip->di_next_unlinked) {
			xfs_fs_cmn_err(CE_ALERT, mp,
				"Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.",
				bp);
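			/*
			 * On-disk unlinked lists are terminated with
			 * NULLAGINO, never 0, so a zero next_unlinked
			 * here can only mean corruption.
			 */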
			ASSERT(dip->di_next_unlinked);
		}
	}
}
#endif

/*
 * This routine is called to map an inode number within a file
 * system to the buffer containing the on-disk version of the
 * inode. It returns a pointer to the buffer containing the
 * on-disk inode in the bpp parameter, and in the dip parameter
 * it returns a pointer to the on-disk inode within that buffer.
 *
 * If a non-zero error is returned, then the contents of bpp and
 * dipp are undefined.
 *
 * Use xfs_imap() to determine the size and location of the
 * buffer to read from disk.
 */
STATIC int
xfs_inotobp(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_ino_t	ino,
	xfs_dinode_t	**dipp,
	xfs_buf_t	**bpp,
	int		*offset)
{
	int		di_ok;
	xfs_imap_t	imap;
	xfs_buf_t	*bp;
	int		error;
	xfs_dinode_t	*dip;

	/*
	 * Call the space management code to find the location of the
	 * inode on disk.
	 */
	imap.im_blkno = 0;
	error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
	if (error != 0) {
		cmn_err(CE_WARN,
	"xfs_inotobp: xfs_imap() returned an "
	"error %d on %s. Returning error.", error, mp->m_fsname);
		return error;
	}

	/*
	 * If the inode number maps to a block outside the bounds of the
	 * file system then return NULL rather than calling read_buf
	 * and panicking when we get an error from the driver.
	 */
	if ((imap.im_blkno + imap.im_len) >
	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
		cmn_err(CE_WARN,
	"xfs_inotobp: inode number (%llu + %d) maps to a block outside the bounds "
	"of the file system %s. Returning EINVAL.",
			(unsigned long long)imap.im_blkno,
			imap.im_len, mp->m_fsname);
		return XFS_ERROR(EINVAL);
	}

	/*
	 * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will
	 * default to just a read_buf() call.
	 */
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
				   (int)imap.im_len, XFS_BUF_LOCK, &bp);

	if (error) {
		cmn_err(CE_WARN,
	"xfs_inotobp: xfs_trans_read_buf() returned an "
	"error %d on %s. Returning error.", error, mp->m_fsname);
		return error;
	}
	dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0);
	di_ok =
		INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
		XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
	if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
			XFS_RANDOM_ITOBP_INOTOBP))) {
		XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip);
		xfs_trans_brelse(tp, bp);
		cmn_err(CE_WARN,
	"xfs_inotobp: XFS_TEST_ERROR() returned an "
	"error on %s. Returning EFSCORRUPTED.", mp->m_fsname);
		return XFS_ERROR(EFSCORRUPTED);
	}

	xfs_inobp_check(mp, bp);

	/*
	 * Set *dipp to point to the on-disk inode in the buffer.
	 */
	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
	*bpp = bp;
	*offset = imap.im_boffset;
	return 0;
}


/*
 * This routine is called to map an inode to the buffer containing
 * the on-disk version of the inode. It returns a pointer to the
 * buffer containing the on-disk inode in the bpp parameter, and in
 * the dip parameter it returns a pointer to the on-disk inode within
 * that buffer.
 *
 * If a non-zero error is returned, then the contents of bpp and
 * dipp are undefined.
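 *
 * A typical call looks roughly like this (sketch; error handling
 * elided, and bno is 0 unless the caller already knows the inode's
 * disk address):
 *
 *	xfs_dinode_t	*dip;
 *	xfs_buf_t	*bp;
 *
 *	error = xfs_itobp(mp, tp, ip, &dip, &bp, (xfs_daddr_t)0);
 *	...
 *	xfs_trans_brelse(tp, bp);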
 *
 * If the inode is new and has not yet been initialized, use xfs_imap()
 * to determine the size and location of the buffer to read from disk.
 * If the inode has already been mapped to its buffer and read in once,
 * then use the mapping information stored in the inode rather than
 * calling xfs_imap(). This allows us to avoid the overhead of looking
 * at the inode btree for small block file systems (see xfs_dilocate()).
 * We can tell whether the inode has been mapped in before by comparing
 * its disk block address to 0. Only uninitialized inodes will have
 * 0 for the disk block address.
 */
int
xfs_itobp(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_inode_t	*ip,
	xfs_dinode_t	**dipp,
	xfs_buf_t	**bpp,
	xfs_daddr_t	bno)
{
	xfs_buf_t	*bp;
	int		error;
	xfs_imap_t	imap;
#ifdef __KERNEL__
	int		i;
	int		ni;
#endif

	if (ip->i_blkno == (xfs_daddr_t)0) {
		/*
		 * Call the space management code to find the location of the
		 * inode on disk.
		 */
		imap.im_blkno = bno;
		error = xfs_imap(mp, tp, ip->i_ino, &imap, XFS_IMAP_LOOKUP);
		if (error != 0) {
			return error;
		}

		/*
		 * If the inode number maps to a block outside the bounds
		 * of the file system then return NULL rather than calling
		 * read_buf and panicking when we get an error from the
		 * driver.
		 */
		if ((imap.im_blkno + imap.im_len) >
		    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
#ifdef DEBUG
			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
					"(imap.im_blkno (0x%llx) "
					"+ imap.im_len (0x%llx)) > "
					" XFS_FSB_TO_BB(mp, "
					"mp->m_sb.sb_dblocks) (0x%llx)",
					(unsigned long long) imap.im_blkno,
					(unsigned long long) imap.im_len,
					XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
#endif /* DEBUG */
			return XFS_ERROR(EINVAL);
		}

		/*
		 * Fill in the fields in the inode that will be used to
		 * map the inode to its buffer from now on.
		 */
		ip->i_blkno = imap.im_blkno;
		ip->i_len = imap.im_len;
		ip->i_boffset = imap.im_boffset;
	} else {
		/*
		 * We've already mapped the inode once, so just use the
		 * mapping that we saved the first time.
		 */
		imap.im_blkno = ip->i_blkno;
		imap.im_len = ip->i_len;
		imap.im_boffset = ip->i_boffset;
	}
	ASSERT(bno == 0 || bno == imap.im_blkno);

	/*
	 * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will
	 * default to just a read_buf() call.
	 */
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
				   (int)imap.im_len, XFS_BUF_LOCK, &bp);

	if (error) {
#ifdef DEBUG
		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
				"xfs_trans_read_buf() returned error %d, "
				"imap.im_blkno 0x%llx, imap.im_len 0x%llx",
				error, (unsigned long long) imap.im_blkno,
				(unsigned long long) imap.im_len);
#endif /* DEBUG */
		return error;
	}
#ifdef __KERNEL__
	/*
	 * Validate the magic number and version of every inode in the buffer
	 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
	 */
#ifdef DEBUG
	ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog;
#else
	ni = 1;
#endif
	for (i = 0; i < ni; i++) {
		int		di_ok;
		xfs_dinode_t	*dip;

		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
					(i << mp->m_sb.sb_inodelog));
		di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
			XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
		if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
				XFS_RANDOM_ITOBP_INOTOBP))) {
#ifdef DEBUG
			prdev("bad inode magic/vsn daddr %lld #%d (magic=%x)",
				mp->m_ddev_targp,
				(unsigned long long)imap.im_blkno, i,
				INT_GET(dip->di_core.di_magic, ARCH_CONVERT));
#endif
			XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH,
					     mp, dip);
			xfs_trans_brelse(tp, bp);
			return XFS_ERROR(EFSCORRUPTED);
		}
	}
#endif /* __KERNEL__ */

	xfs_inobp_check(mp, bp);

	/*
	 * Mark the buffer as an inode buffer now that it looks good
	 */
	XFS_BUF_SET_VTYPE(bp, B_FS_INO);

	/*
	 * Set *dipp to point to the on-disk inode in the buffer.
	 */
	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
	*bpp = bp;
	return 0;
}

/*
 * Move inode type and inode format specific information from the
 * on-disk inode to the in-core inode. For fifos, devs, and sockets
 * this means set if_rdev to the proper value. For files, directories,
 * and symlinks this means to bring in the in-line data or extent
 * pointers. For a file in B-tree format, only the root is immediately
 * brought in-core. The rest will be in-lined in if_extents when it
 * is first referenced (see xfs_iread_extents()).
 */
STATIC int
xfs_iformat(
	xfs_inode_t		*ip,
	xfs_dinode_t		*dip)
{
	xfs_attr_shortform_t	*atp;
	int			size;
	int			error;
	xfs_fsize_t		di_size;

	ip->i_df.if_ext_max =
		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
	error = 0;

	if (unlikely(
	    INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) +
		INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) >
	    INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT))) {
		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
			"corrupt dinode %Lu, extent total = %d, nblocks = %Lu."
			" Unmount and run xfs_repair.",
			(unsigned long long)ip->i_ino,
			(int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT)
			    + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)),
			(unsigned long long)
			INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT));
		XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	if (unlikely(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize)) {
		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
			"corrupt dinode %Lu, forkoff = 0x%x."
			" Unmount and run xfs_repair.",
			(unsigned long long)ip->i_ino,
			(int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT)));
		XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	switch (ip->i_d.di_mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		if (unlikely(INT_GET(dip->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_DEV)) {
			XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
					     ip->i_mount, dip);
			return XFS_ERROR(EFSCORRUPTED);
		}
		ip->i_d.di_size = 0;
		ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT);
		break;

	case S_IFREG:
	case S_IFLNK:
	case S_IFDIR:
		switch (INT_GET(dip->di_core.di_format, ARCH_CONVERT)) {
		case XFS_DINODE_FMT_LOCAL:
			/*
			 * no local regular files yet
			 */
			if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & S_IFMT) == S_IFREG)) {
				xfs_fs_cmn_err(CE_WARN, ip->i_mount,
					"corrupt inode (local format for regular file) %Lu. Unmount and run xfs_repair.",
					(unsigned long long) ip->i_ino);
				XFS_CORRUPTION_ERROR("xfs_iformat(4)",
						     XFS_ERRLEVEL_LOW,
						     ip->i_mount, dip);
				return XFS_ERROR(EFSCORRUPTED);
			}

			di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT);
			if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
				xfs_fs_cmn_err(CE_WARN, ip->i_mount,
					"corrupt inode %Lu (bad size %Ld for local inode). Unmount and run xfs_repair.",
					(unsigned long long) ip->i_ino,
					(long long) di_size);
				XFS_CORRUPTION_ERROR("xfs_iformat(5)",
						     XFS_ERRLEVEL_LOW,
						     ip->i_mount, dip);
				return XFS_ERROR(EFSCORRUPTED);
			}

			size = (int)di_size;
			error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
			break;
		case XFS_DINODE_FMT_EXTENTS:
			error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
			break;
		case XFS_DINODE_FMT_BTREE:
			error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
			break;
		default:
			XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
					 ip->i_mount);
			return XFS_ERROR(EFSCORRUPTED);
		}
		break;

	default:
		XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
		return XFS_ERROR(EFSCORRUPTED);
	}
	if (error) {
		return error;
	}
	if (!XFS_DFORK_Q(dip))
		return 0;
	ASSERT(ip->i_afp == NULL);
	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
	ip->i_afp->if_ext_max =
		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
	switch (INT_GET(dip->di_core.di_aformat, ARCH_CONVERT)) {
	case XFS_DINODE_FMT_LOCAL:
		atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
		size = (int)INT_GET(atp->hdr.totsize, ARCH_CONVERT);
		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
		break;
	case XFS_DINODE_FMT_EXTENTS:
		error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
		break;
	case XFS_DINODE_FMT_BTREE:
		error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
		break;
	default:
		error = XFS_ERROR(EFSCORRUPTED);
		break;
	}
	if (error) {
		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
		ip->i_afp = NULL;
		xfs_idestroy_fork(ip, XFS_DATA_FORK);
	}
	return error;
}

/*
 * The file is in-lined in the on-disk inode.
 * If it fits into if_inline_data, then copy
 * it there, otherwise allocate a buffer for it
 * and copy the data there. Either way, set
 * if_data to point at the data.
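 * (For example, a shortform fork small enough for if_inline_data
 * is stored inline and if_real_bytes stays 0; anything larger
 * gets its own kmem_alloc()'d buffer.)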
 * If we allocate a buffer for the data, make
 * sure that its size is a multiple of 4 and
 * record the real size in i_real_bytes.
 */
STATIC int
xfs_iformat_local(
	xfs_inode_t	*ip,
	xfs_dinode_t	*dip,
	int		whichfork,
	int		size)
{
	xfs_ifork_t	*ifp;
	int		real_size;

	/*
	 * If the size is unreasonable, then something
	 * is wrong and we just bail out rather than crash in
	 * kmem_alloc() or memcpy() below.
	 */
	if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
			"corrupt inode %Lu (bad size %d for local fork, size = %d). Unmount and run xfs_repair.",
			(unsigned long long) ip->i_ino, size,
			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
		XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}
	ifp = XFS_IFORK_PTR(ip, whichfork);
	real_size = 0;
	if (size == 0)
		ifp->if_u1.if_data = NULL;
	else if (size <= sizeof(ifp->if_u2.if_inline_data))
		ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
	else {
		real_size = roundup(size, 4);
		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
	}
	ifp->if_bytes = size;
	ifp->if_real_bytes = real_size;
	if (size)
		memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
	ifp->if_flags &= ~XFS_IFEXTENTS;
	ifp->if_flags |= XFS_IFINLINE;
	return 0;
}

/*
 * The file consists of a set of extents all
 * of which fit into the on-disk inode.
 * If there are few enough extents to fit into
 * the if_inline_ext, then copy them there.
 * Otherwise allocate a buffer for them and copy
 * them into it. Either way, set if_extents
 * to point at the extents.
 */
STATIC int
xfs_iformat_extents(
	xfs_inode_t	*ip,
	xfs_dinode_t	*dip,
	int		whichfork)
{
	xfs_bmbt_rec_t	*ep, *dp;
	xfs_ifork_t	*ifp;
	int		nex;
	int		real_size;
	int		size;
	int		i;

	ifp = XFS_IFORK_PTR(ip, whichfork);
	nex = XFS_DFORK_NEXTENTS(dip, whichfork);
	size = nex * (uint)sizeof(xfs_bmbt_rec_t);

	/*
	 * If the number of extents is unreasonable, then something
	 * is wrong and we just bail out rather than crash in
	 * kmem_alloc() or memcpy() below.
	 */
	if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
			"corrupt inode %Lu ((a)extents = %d)."
			" Unmount and run xfs_repair.",
			(unsigned long long) ip->i_ino, nex);
		XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	real_size = 0;
	if (nex == 0)
		ifp->if_u1.if_extents = NULL;
	else if (nex <= XFS_INLINE_EXTS)
		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
	else {
		ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP);
		ASSERT(ifp->if_u1.if_extents != NULL);
		real_size = size;
	}
	ifp->if_bytes = size;
	ifp->if_real_bytes = real_size;
	if (size) {
		dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
		xfs_validate_extents(dp, nex, 1, XFS_EXTFMT_INODE(ip));
		ep = ifp->if_u1.if_extents;
		for (i = 0; i < nex; i++, ep++, dp++) {
			ep->l0 = INT_GET(get_unaligned((__uint64_t*)&dp->l0),
								ARCH_CONVERT);
			ep->l1 = INT_GET(get_unaligned((__uint64_t*)&dp->l1),
								ARCH_CONVERT);
		}
		xfs_bmap_trace_exlist("xfs_iformat_extents", ip, nex,
			whichfork);
		if (whichfork != XFS_DATA_FORK ||
			XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
				if (unlikely(xfs_check_nostate_extents(
				    ifp->if_u1.if_extents, nex))) {
					XFS_ERROR_REPORT("xfs_iformat_extents(2)",
							 XFS_ERRLEVEL_LOW,
							 ip->i_mount);
					return XFS_ERROR(EFSCORRUPTED);
				}
	}
	ifp->if_flags |= XFS_IFEXTENTS;
	return 0;
}

/*
 * The file has too many extents to fit into
 * the inode, so they are in B-tree format.
 * Allocate a buffer for the root of the B-tree
 * and copy the root into it. The i_extents
 * field will remain NULL until all of the
 * extents are read in (when they are needed).
 */
STATIC int
xfs_iformat_btree(
	xfs_inode_t		*ip,
	xfs_dinode_t		*dip,
	int			whichfork)
{
	xfs_bmdr_block_t	*dfp;
	xfs_ifork_t		*ifp;
	/* REFERENCED */
	int			nrecs;
	int			size;

	ifp = XFS_IFORK_PTR(ip, whichfork);
	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
	size = XFS_BMAP_BROOT_SPACE(dfp);
	nrecs = XFS_BMAP_BROOT_NUMRECS(dfp);

	/*
	 * blow out if -- fork has fewer extents than can fit in
	 * fork (fork shouldn't be a btree format), root btree
	 * block has more records than can fit into the fork,
	 * or the number of extents is greater than the number of
	 * blocks.
	 */
	if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
	    || XFS_BMDR_SPACE_CALC(nrecs) >
			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
	    || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
			"corrupt inode %Lu (btree). Unmount and run xfs_repair.",
			(unsigned long long) ip->i_ino);
		XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
				 ip->i_mount);
		return XFS_ERROR(EFSCORRUPTED);
	}

	ifp->if_broot_bytes = size;
	ifp->if_broot = kmem_alloc(size, KM_SLEEP);
	ASSERT(ifp->if_broot != NULL);
	/*
	 * Copy and convert from the on-disk structure
	 * to the in-memory structure.
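	 * (The on-disk root, an xfs_bmdr_block_t, is a packed form
	 * that xfs_bmdr_to_bmbt() expands into the regular in-core
	 * btree block layout.)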
	 */
	xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
		ifp->if_broot, size);
	ifp->if_flags &= ~XFS_IFEXTENTS;
	ifp->if_flags |= XFS_IFBROOT;

	return 0;
}

/*
 * xfs_xlate_dinode_core - translate an xfs_dinode_core_t between ondisk
 * and native format
 *
 * buf  = on-disk representation
 * dip  = native representation
 * dir  = direction - +ve -> disk to native
 *                    -ve -> native to disk
 */
void
xfs_xlate_dinode_core(
	xfs_caddr_t		buf,
	xfs_dinode_core_t	*dip,
	int			dir)
{
	xfs_dinode_core_t	*buf_core = (xfs_dinode_core_t *)buf;
	xfs_dinode_core_t	*mem_core = (xfs_dinode_core_t *)dip;
	xfs_arch_t		arch = ARCH_CONVERT;

	ASSERT(dir);

	INT_XLATE(buf_core->di_magic, mem_core->di_magic, dir, arch);
	INT_XLATE(buf_core->di_mode, mem_core->di_mode, dir, arch);
	INT_XLATE(buf_core->di_version, mem_core->di_version, dir, arch);
	INT_XLATE(buf_core->di_format, mem_core->di_format, dir, arch);
	INT_XLATE(buf_core->di_onlink, mem_core->di_onlink, dir, arch);
	INT_XLATE(buf_core->di_uid, mem_core->di_uid, dir, arch);
	INT_XLATE(buf_core->di_gid, mem_core->di_gid, dir, arch);
	INT_XLATE(buf_core->di_nlink, mem_core->di_nlink, dir, arch);
	INT_XLATE(buf_core->di_projid, mem_core->di_projid, dir, arch);

	if (dir > 0) {
		memcpy(mem_core->di_pad, buf_core->di_pad,
			sizeof(buf_core->di_pad));
	} else {
		memcpy(buf_core->di_pad, mem_core->di_pad,
			sizeof(buf_core->di_pad));
	}

	INT_XLATE(buf_core->di_flushiter, mem_core->di_flushiter, dir, arch);

	INT_XLATE(buf_core->di_atime.t_sec, mem_core->di_atime.t_sec,
			dir, arch);
	INT_XLATE(buf_core->di_atime.t_nsec, mem_core->di_atime.t_nsec,
			dir, arch);
	INT_XLATE(buf_core->di_mtime.t_sec, mem_core->di_mtime.t_sec,
			dir, arch);
	INT_XLATE(buf_core->di_mtime.t_nsec, mem_core->di_mtime.t_nsec,
			dir, arch);
	INT_XLATE(buf_core->di_ctime.t_sec, mem_core->di_ctime.t_sec,
			dir, arch);
	INT_XLATE(buf_core->di_ctime.t_nsec, mem_core->di_ctime.t_nsec,
			dir, arch);
	INT_XLATE(buf_core->di_size, mem_core->di_size, dir, arch);
	INT_XLATE(buf_core->di_nblocks, mem_core->di_nblocks, dir, arch);
	INT_XLATE(buf_core->di_extsize, mem_core->di_extsize, dir, arch);
	INT_XLATE(buf_core->di_nextents, mem_core->di_nextents, dir, arch);
	INT_XLATE(buf_core->di_anextents, mem_core->di_anextents, dir, arch);
	INT_XLATE(buf_core->di_forkoff, mem_core->di_forkoff, dir, arch);
	INT_XLATE(buf_core->di_aformat, mem_core->di_aformat, dir, arch);
	INT_XLATE(buf_core->di_dmevmask, mem_core->di_dmevmask, dir, arch);
	INT_XLATE(buf_core->di_dmstate, mem_core->di_dmstate, dir, arch);
	INT_XLATE(buf_core->di_flags, mem_core->di_flags, dir, arch);
	INT_XLATE(buf_core->di_gen, mem_core->di_gen, dir, arch);
}

STATIC uint
_xfs_dic2xflags(
	xfs_dinode_core_t	*dic,
	__uint16_t		di_flags)
{
	uint			flags = 0;

	if (di_flags & XFS_DIFLAG_ANY) {
		if (di_flags & XFS_DIFLAG_REALTIME)
			flags |= XFS_XFLAG_REALTIME;
		if (di_flags & XFS_DIFLAG_PREALLOC)
			flags |= XFS_XFLAG_PREALLOC;
		if (di_flags & XFS_DIFLAG_IMMUTABLE)
			flags |= XFS_XFLAG_IMMUTABLE;
		if (di_flags & XFS_DIFLAG_APPEND)
			flags |= XFS_XFLAG_APPEND;
		if (di_flags & XFS_DIFLAG_SYNC)
			flags |= XFS_XFLAG_SYNC;
		if (di_flags & XFS_DIFLAG_NOATIME)
			flags |= XFS_XFLAG_NOATIME;
		if (di_flags & XFS_DIFLAG_NODUMP)
			flags |= XFS_XFLAG_NODUMP;
		if (di_flags & XFS_DIFLAG_RTINHERIT)
			flags |= XFS_XFLAG_RTINHERIT;
		if (di_flags & XFS_DIFLAG_PROJINHERIT)
			flags |= XFS_XFLAG_PROJINHERIT;
		if (di_flags & XFS_DIFLAG_NOSYMLINKS)
			flags |= XFS_XFLAG_NOSYMLINKS;
	}

	return flags;
}

uint
xfs_ip2xflags(
	xfs_inode_t		*ip)
{
	xfs_dinode_core_t	*dic = &ip->i_d;

	return _xfs_dic2xflags(dic, dic->di_flags) |
		(XFS_CFORK_Q(dic) ? XFS_XFLAG_HASATTR : 0);
}

uint
xfs_dic2xflags(
	xfs_dinode_core_t	*dic)
{
	return _xfs_dic2xflags(dic, INT_GET(dic->di_flags, ARCH_CONVERT)) |
		(XFS_CFORK_Q_DISK(dic) ? XFS_XFLAG_HASATTR : 0);
}

/*
 * Given a mount structure and an inode number, return a pointer
 * to a newly allocated in-core inode corresponding to the given
 * inode number.
 *
 * Initialize the inode's attributes and extent pointers if it
 * already has them (it will not if the inode has no links).
 */
int
xfs_iread(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_ino_t	ino,
	xfs_inode_t	**ipp,
	xfs_daddr_t	bno)
{
	xfs_buf_t	*bp;
	xfs_dinode_t	*dip;
	xfs_inode_t	*ip;
	int		error;

	ASSERT(xfs_inode_zone != NULL);

	ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
	ip->i_ino = ino;
	ip->i_mount = mp;

	/*
	 * Get pointers to the on-disk inode and the buffer containing it.
	 * If the inode number refers to a block outside the file system
	 * then xfs_itobp() will return NULL. In this case we should
	 * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will
	 * know that this is a new incore inode.
	 */
	error = xfs_itobp(mp, tp, ip, &dip, &bp, bno);

	if (error != 0) {
		kmem_zone_free(xfs_inode_zone, ip);
		return error;
	}

	/*
	 * Initialize inode's trace buffers.
	 * Do this before xfs_iformat in case it adds entries.
	 */
#ifdef XFS_BMAP_TRACE
	ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP);
#endif
#ifdef XFS_BMBT_TRACE
	ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP);
#endif
#ifdef XFS_RW_TRACE
	ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP);
#endif
#ifdef XFS_ILOCK_TRACE
	ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP);
#endif
#ifdef XFS_DIR2_TRACE
	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP);
#endif

	/*
	 * If we got something that isn't an inode it means someone
	 * (nfs or dmi) has a stale handle.
	 */
	if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) {
		kmem_zone_free(xfs_inode_zone, ip);
		xfs_trans_brelse(tp, bp);
#ifdef DEBUG
		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
				"dip->di_core.di_magic (0x%x) != "
				"XFS_DINODE_MAGIC (0x%x)",
				INT_GET(dip->di_core.di_magic, ARCH_CONVERT),
				XFS_DINODE_MAGIC);
#endif /* DEBUG */
		return XFS_ERROR(EINVAL);
	}

	/*
	 * If the on-disk inode is already linked to a directory
	 * entry, copy all of the inode into the in-core inode.
	 * xfs_iformat() handles copying in the inode format
	 * specific information.
	 * Otherwise, just get the truly permanent information.
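	 * (Here "truly permanent" means di_magic, di_version, di_gen
	 * and di_flushiter, copied by hand below.)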
	 */
	if (dip->di_core.di_mode) {
		xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core,
		     &(ip->i_d), 1);
		error = xfs_iformat(ip, dip);
		if (error) {
			kmem_zone_free(xfs_inode_zone, ip);
			xfs_trans_brelse(tp, bp);
#ifdef DEBUG
			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
					"xfs_iformat() returned error %d",
					error);
#endif /* DEBUG */
			return error;
		}
	} else {
		ip->i_d.di_magic = INT_GET(dip->di_core.di_magic, ARCH_CONVERT);
		ip->i_d.di_version = INT_GET(dip->di_core.di_version, ARCH_CONVERT);
		ip->i_d.di_gen = INT_GET(dip->di_core.di_gen, ARCH_CONVERT);
		ip->i_d.di_flushiter = INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT);
		/*
		 * Make sure to pull in the mode here as well in
		 * case the inode is released without being used.
		 * This ensures that xfs_inactive() will see that
		 * the inode is already free and not try to mess
		 * with the uninitialized part of it.
		 */
		ip->i_d.di_mode = 0;
		/*
		 * Initialize the per-fork minima and maxima for a new
		 * inode here. xfs_iformat will do it for old inodes.
		 */
		ip->i_df.if_ext_max =
			XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
	}

	INIT_LIST_HEAD(&ip->i_reclaim);

	/*
	 * The inode format changed when we moved the link count and
	 * made it 32 bits long. If this is an old format inode,
	 * convert it in memory to look like a new one. If it gets
	 * flushed to disk we will convert back before flushing or
	 * logging it. We zero out the new projid field and the old link
	 * count field. We'll handle clearing the pad field (the remains
	 * of the old uuid field) when we actually convert the inode to
	 * the new format. We don't change the version number so that we
	 * can distinguish this from a real new format inode.
	 */
	if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
		ip->i_d.di_nlink = ip->i_d.di_onlink;
		ip->i_d.di_onlink = 0;
		ip->i_d.di_projid = 0;
	}

	ip->i_delayed_blks = 0;

	/*
	 * Mark the buffer containing the inode as something to keep
	 * around for a while. This helps to keep recently accessed
	 * meta-data in-core longer.
	 */
	XFS_BUF_SET_REF(bp, XFS_INO_REF);

	/*
	 * Use xfs_trans_brelse() to release the buffer containing the
	 * on-disk inode, because it was acquired with xfs_trans_read_buf()
	 * in xfs_itobp() above. If tp is NULL, this is just a normal
	 * brelse(). If we're within a transaction, then xfs_trans_brelse()
	 * will only release the buffer if it is not dirty within the
	 * transaction. It will be OK to release the buffer in this case,
	 * because inodes on disk are never destroyed and we will be
	 * locking the new in-core inode before putting it in the hash
	 * table where other processes can find it. Thus we don't have
	 * to worry about the inode being changed just because we released
	 * the buffer.
	 */
	xfs_trans_brelse(tp, bp);
	*ipp = ip;
	return 0;
}

/*
 * Read in extents from a btree-format inode.
 * Allocate and fill in if_extents. Real work is done in xfs_bmap.c.
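 *
 * Callers typically use the lazy-read idiom (sketch, assuming ifp
 * points at the fork in question):
 *
 *	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
 *	    (error = xfs_iread_extents(tp, ip, whichfork)))
 *		return error;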
 */
int
xfs_iread_extents(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip,
	int		whichfork)
{
	int		error;
	xfs_ifork_t	*ifp;
	size_t		size;

	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
				 ip->i_mount);
		return XFS_ERROR(EFSCORRUPTED);
	}
	size = XFS_IFORK_NEXTENTS(ip, whichfork) * (uint)sizeof(xfs_bmbt_rec_t);
	ifp = XFS_IFORK_PTR(ip, whichfork);
	/*
	 * We know that the size is valid (it's checked in iformat_btree)
	 */
	ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP);
	ASSERT(ifp->if_u1.if_extents != NULL);
	ifp->if_lastex = NULLEXTNUM;
	ifp->if_bytes = ifp->if_real_bytes = (int)size;
	ifp->if_flags |= XFS_IFEXTENTS;
	error = xfs_bmap_read_extents(tp, ip, whichfork);
	if (error) {
		kmem_free(ifp->if_u1.if_extents, size);
		ifp->if_u1.if_extents = NULL;
		ifp->if_bytes = ifp->if_real_bytes = 0;
		ifp->if_flags &= ~XFS_IFEXTENTS;
		return error;
	}
	xfs_validate_extents((xfs_bmbt_rec_t *)ifp->if_u1.if_extents,
		XFS_IFORK_NEXTENTS(ip, whichfork), 0, XFS_EXTFMT_INODE(ip));
	return 0;
}

/*
 * Allocate an inode on disk and return a copy of its in-core version.
 * The in-core inode is locked exclusively. Set mode, nlink, and rdev
 * appropriately within the inode. The uid and gid for the inode are
 * set according to the contents of the given cred structure.
 *
 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
 * has a free inode available, call xfs_iget()
 * to obtain the in-core version of the allocated inode. Finally,
 * fill in the inode and log its initial contents. In this case,
 * ialloc_context would be set to NULL and call_again set to false.
 *
 * If xfs_dialloc() does not have an available inode,
 * it will replenish its supply by doing an allocation. Since we can
 * only do one allocation within a transaction without deadlocks, we
 * must commit the current transaction before returning the inode itself.
 * In this case, therefore, we will set call_again to true and return.
 * The caller should then commit the current transaction, start a new
 * transaction, and call xfs_ialloc() again to actually get the inode.
 *
 * To ensure that some other process does not grab the inode that
 * was allocated during the first call to xfs_ialloc(), this routine
 * also returns the [locked] bp pointing to the head of the freelist
 * as ialloc_context. The caller should hold this buffer across
 * the commit and pass it back into this routine on the second call.
 */
int
xfs_ialloc(
	xfs_trans_t	*tp,
	xfs_inode_t	*pip,
	mode_t		mode,
	xfs_nlink_t	nlink,
	xfs_dev_t	rdev,
	cred_t		*cr,
	xfs_prid_t	prid,
	int		okalloc,
	xfs_buf_t	**ialloc_context,
	boolean_t	*call_again,
	xfs_inode_t	**ipp)
{
	xfs_ino_t	ino;
	xfs_inode_t	*ip;
	vnode_t		*vp;
	uint		flags;
	int		error;

	/*
	 * Call the space management code to pick
	 * the on-disk inode to be allocated.
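	 * (xfs_dir_ialloc() is the caller-side wrapper that handles
	 * the commit-and-retry dance when call_again comes back true.)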
	 */
	error = xfs_dialloc(tp, pip->i_ino, mode, okalloc,
			    ialloc_context, call_again, &ino);
	if (error != 0) {
		return error;
	}
	if (*call_again || ino == NULLFSINO) {
		*ipp = NULL;
		return 0;
	}
	ASSERT(*ialloc_context == NULL);

	/*
	 * Get the in-core inode with the lock held exclusively.
	 * This is because we're setting fields here we need
	 * to prevent others from looking at until we're done.
	 */
	error = xfs_trans_iget(tp->t_mountp, tp, ino,
			IGET_CREATE, XFS_ILOCK_EXCL, &ip);
	if (error != 0) {
		return error;
	}
	ASSERT(ip != NULL);

	vp = XFS_ITOV(ip);
	ip->i_d.di_mode = (__uint16_t)mode;
	ip->i_d.di_onlink = 0;
	ip->i_d.di_nlink = nlink;
	ASSERT(ip->i_d.di_nlink == nlink);
	ip->i_d.di_uid = current_fsuid(cr);
	ip->i_d.di_gid = current_fsgid(cr);
	ip->i_d.di_projid = prid;
	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));

	/*
	 * If the superblock version is up to where we support new format
	 * inodes and this is currently an old format inode, then change
	 * the inode version number now. This way we only do the conversion
	 * here rather than here and in the flush/logging code.
	 */
	if (XFS_SB_VERSION_HASNLINK(&tp->t_mountp->m_sb) &&
	    ip->i_d.di_version == XFS_DINODE_VERSION_1) {
		ip->i_d.di_version = XFS_DINODE_VERSION_2;
		/*
		 * We've already zeroed the old link count, the projid field,
		 * and the pad field.
		 */
	}

	/*
	 * Project ids won't be stored on disk if we are using a version 1 inode.
	 */
	if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1))
		xfs_bump_ino_vers2(tp, ip);

	if (XFS_INHERIT_GID(pip, vp->v_vfsp)) {
		ip->i_d.di_gid = pip->i_d.di_gid;
		if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) {
			ip->i_d.di_mode |= S_ISGID;
		}
	}

	/*
	 * If the group ID of the new file does not match the effective group
	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
	 * (and only if the irix_sgid_inherit compatibility variable is set).
	 */
	if ((irix_sgid_inherit) &&
	    (ip->i_d.di_mode & S_ISGID) &&
	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
		ip->i_d.di_mode &= ~S_ISGID;
	}

	ip->i_d.di_size = 0;
	ip->i_d.di_nextents = 0;
	ASSERT(ip->i_d.di_nblocks == 0);
	xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD);
	/*
	 * di_gen will have been taken care of in xfs_iread.
	 */
	ip->i_d.di_extsize = 0;
	ip->i_d.di_dmevmask = 0;
	ip->i_d.di_dmstate = 0;
	ip->i_d.di_flags = 0;
	flags = XFS_ILOG_CORE;
	switch (mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		ip->i_d.di_format = XFS_DINODE_FMT_DEV;
		ip->i_df.if_u2.if_rdev = rdev;
		ip->i_df.if_flags = 0;
		flags |= XFS_ILOG_DEV;
		break;
	case S_IFREG:
	case S_IFDIR:
		if (unlikely(pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
			uint	di_flags = 0;

			if ((mode & S_IFMT) == S_IFDIR) {
				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_RTINHERIT;
			} else {
				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) {
					di_flags |= XFS_DIFLAG_REALTIME;
					ip->i_iocore.io_flags |= XFS_IOCORE_RT;
				}
			}
			if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
			    xfs_inherit_noatime)
				di_flags |= XFS_DIFLAG_NOATIME;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
			    xfs_inherit_nodump)
				di_flags |= XFS_DIFLAG_NODUMP;
			if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
			    xfs_inherit_sync)
				di_flags |= XFS_DIFLAG_SYNC;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
			    xfs_inherit_nosymlinks)
				di_flags |= XFS_DIFLAG_NOSYMLINKS;
			if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
				di_flags |= XFS_DIFLAG_PROJINHERIT;
			ip->i_d.di_flags |= di_flags;
		}
		/* FALLTHROUGH */
	case S_IFLNK:
		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
		ip->i_df.if_flags = XFS_IFEXTENTS;
		ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
		ip->i_df.if_u1.if_extents = NULL;
		break;
	default:
		ASSERT(0);
	}
	/*
	 * Attribute fork settings for new inode.
	 */
	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
	ip->i_d.di_anextents = 0;

	/*
	 * Log the new values stuffed into the inode.
	 */
	xfs_trans_log_inode(tp, ip, flags);

	/* now that we have an i_mode we can set Linux inode ops (& unlock) */
	VFS_INIT_VNODE(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1);

	*ipp = ip;
	return 0;
}

/*
 * Check to make sure that there are no blocks allocated to the
 * file beyond the size of the file. We don't check this for
 * files with fixed size extents or real time extents, but we
 * at least do it for regular files.
 */
#ifdef DEBUG
void
xfs_isize_check(
	xfs_mount_t	*mp,
	xfs_inode_t	*ip,
	xfs_fsize_t	isize)
{
	xfs_fileoff_t	map_first;
	int		nimaps;
	xfs_bmbt_irec_t	imaps[2];

	if ((ip->i_d.di_mode & S_IFMT) != S_IFREG)
		return;

	if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
		return;

	nimaps = 2;
	map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
	/*
	 * The filesystem could be shutting down, so bmapi may return
	 * an error.
	 */
	if (xfs_bmapi(NULL, ip, map_first,
			 (XFS_B_TO_FSB(mp,
				       (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
			  map_first),
			 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
			 NULL))
		return;
	ASSERT(nimaps == 1);
	ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
}
#endif /* DEBUG */

/*
 * Calculate the last possible buffered byte in a file. This must
 * include data that was buffered beyond the EOF by the write code.
 * This also needs to deal with overflowing the xfs_fsize_t type
 * which can happen for sizes near the limit.
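 * (Overflow shows up as a negative value, either after the
 * FSB-to-byte conversion or after adding in the write I/O size;
 * both cases are clamped to XFS_MAXIOFFSET(mp) below.)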
 *
 * We also need to take into account any blocks beyond the EOF. It
 * may be the case that they were buffered by a write which failed.
 * In that case the pages will still be in memory, but the inode size
 * will never have been updated.
 */
xfs_fsize_t
xfs_file_last_byte(
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp;
	xfs_fsize_t	last_byte;
	xfs_fileoff_t	last_block;
	xfs_fileoff_t	size_last_block;
	int		error;

	ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE | MR_ACCESS));

	mp = ip->i_mount;
	/*
	 * Only check for blocks beyond the EOF if the extents have
	 * been read in. This eliminates the need for the inode lock,
	 * and it also saves us from looking when it really isn't
	 * necessary.
	 */
	if (ip->i_df.if_flags & XFS_IFEXTENTS) {
		error = xfs_bmap_last_offset(NULL, ip, &last_block,
			XFS_DATA_FORK);
		if (error) {
			last_block = 0;
		}
	} else {
		last_block = 0;
	}
	size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_d.di_size);
	last_block = XFS_FILEOFF_MAX(last_block, size_last_block);

	last_byte = XFS_FSB_TO_B(mp, last_block);
	if (last_byte < 0) {
		return XFS_MAXIOFFSET(mp);
	}
	last_byte += (1 << mp->m_writeio_log);
	if (last_byte < 0) {
		return XFS_MAXIOFFSET(mp);
	}
	return last_byte;
}

#if defined(XFS_RW_TRACE)
STATIC void
xfs_itrunc_trace(
	int		tag,
	xfs_inode_t	*ip,
	int		flag,
	xfs_fsize_t	new_size,
	xfs_off_t	toss_start,
	xfs_off_t	toss_finish)
{
	if (ip->i_rwtrace == NULL) {
		return;
	}

	ktrace_enter(ip->i_rwtrace,
		     (void*)((long)tag),
		     (void*)ip,
		     (void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff),
		     (void*)(unsigned long)(ip->i_d.di_size & 0xffffffff),
		     (void*)((long)flag),
		     (void*)(unsigned long)((new_size >> 32) & 0xffffffff),
		     (void*)(unsigned long)(new_size & 0xffffffff),
		     (void*)(unsigned long)((toss_start >> 32) & 0xffffffff),
		     (void*)(unsigned long)(toss_start & 0xffffffff),
		     (void*)(unsigned long)((toss_finish >> 32) & 0xffffffff),
		     (void*)(unsigned long)(toss_finish & 0xffffffff),
		     (void*)(unsigned long)current_cpu(),
		     (void*)0,
		     (void*)0,
		     (void*)0,
		     (void*)0);
}
#else
#define	xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish)
#endif

/*
 * Start the truncation of the file to new_size. The new size
 * must be smaller than the current size. This routine will
 * clear the buffer and page caches of file data in the removed
 * range, and xfs_itruncate_finish() will remove the underlying
 * disk blocks.
 *
 * The inode must have its I/O lock locked EXCLUSIVELY, and it
 * must NOT have the inode lock held at all. This is because we're
 * calling into the buffer/page cache code and we can't hold the
 * inode lock when we do so.
 *
 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
 * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used
 * in the case that the caller is locking things out of order and
 * may not be able to call xfs_itruncate_finish() with the inode lock
 * held without dropping the I/O lock. If the caller must drop the
 * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start()
 * must be called again with all the same restrictions as the initial
 * call.
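 *
 * A data fork truncate therefore runs roughly like this (sketch;
 * transaction setup, locking and error handling elided):
 *
 *	xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, new_size);
 *	<allocate a transaction, reserve ITRUNCATE log space>
 *	<ilock the inode, join and hold it in the transaction>
 *	error = xfs_itruncate_finish(&tp, ip, new_size,
 *				     XFS_DATA_FORK, sync);
 *	<commit or cancel the returned transaction>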
 */
void
xfs_itruncate_start(
	xfs_inode_t	*ip,
	uint		flags,
	xfs_fsize_t	new_size)
{
	xfs_fsize_t	last_byte;
	xfs_off_t	toss_start;
	xfs_mount_t	*mp;
	vnode_t		*vp;

	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
	ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
	ASSERT((flags == XFS_ITRUNC_DEFINITE) ||
	       (flags == XFS_ITRUNC_MAYBE));

	mp = ip->i_mount;
	vp = XFS_ITOV(ip);
	/*
	 * Call VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES() to get rid
	 * of pages and buffers overlapping the region being removed.
	 * We have to use the less efficient VOP_FLUSHINVAL_PAGES()
	 * in the case that the caller may not be able to finish the
	 * truncate without dropping the inode's I/O lock. Make sure
	 * to catch any pages brought in by buffers overlapping
	 * the EOF by searching out beyond the isize by our
	 * block size. We round new_size up to a block boundary
	 * so that we don't toss things on the same block as
	 * new_size but before it.
	 *
	 * Before calling VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES(),
	 * make sure to call remapf() over the same region if the file
	 * is mapped. This frees up mapped file references to the pages
	 * in the given range and for the VOP_FLUSHINVAL_PAGES() case
	 * it ensures that we get the latest mapped changes flushed out.
	 */
	toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
	toss_start = XFS_FSB_TO_B(mp, toss_start);
	if (toss_start < 0) {
		/*
		 * The place to start tossing is beyond our maximum
		 * file size, so there is no way that the data extended
		 * out there.
		 */
		return;
	}
	last_byte = xfs_file_last_byte(ip);
	xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start,
			 last_byte);
	if (last_byte > toss_start) {
		if (flags & XFS_ITRUNC_DEFINITE) {
			VOP_TOSS_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
		} else {
			VOP_FLUSHINVAL_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
		}
	}

#ifdef DEBUG
	if (new_size == 0) {
		ASSERT(VN_CACHED(vp) == 0);
	}
#endif
}

/*
 * Shrink the file to the given new_size. The new
 * size must be smaller than the current size.
 * This will free up the underlying blocks
 * in the removed range after a call to xfs_itruncate_start()
 * or xfs_atruncate_start().
 *
 * The transaction passed to this routine must have made
 * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES.
 * This routine may commit the given transaction and
 * start new ones, so make sure everything involved in
 * the transaction is tidy before calling here.
 * Some transaction will be returned to the caller to be
 * committed. The incoming transaction must already include
 * the inode, and both inode locks must be held exclusively.
 * The inode must also be "held" within the transaction. On
 * return the inode will be "held" within the returned transaction.
 * This routine does NOT require any disk space to be reserved
 * for it within the transaction.
 *
 * The fork parameter must be either xfs_attr_fork or xfs_data_fork,
 * and it indicates the fork which is to be truncated. For the
 * attribute fork we only support truncation to size 0.
 *
 * We use the sync parameter to indicate whether or not the first
 * transaction we perform might have to be synchronous.
 * For the attr fork,
 * it needs to be so if the unlink of the inode is not yet known to be
 * permanent in the log. This keeps us from freeing and reusing the
 * blocks of the attribute fork before the unlink of the inode becomes
 * permanent.
 *
 * For the data fork, we normally have to run synchronously if we're
 * being called out of the inactive path or we're being called
 * out of the create path where we're truncating an existing file.
 * Either way, the truncate needs to be sync so blocks don't reappear
 * in the file with altered data in case of a crash. wsync filesystems
 * can run the first case async because anything that shrinks the inode
 * has to run sync so by the time we're called here from inactive, the
 * inode size is permanently set to 0.
 *
 * Calls from the truncate path always need to be sync unless we're
 * in a wsync filesystem and the file has already been unlinked.
 *
 * The caller is responsible for correctly setting the sync parameter.
 * It gets too hard for us to guess here which path we're being called
 * out of just based on inode state.
 */
int
xfs_itruncate_finish(
	xfs_trans_t	**tp,
	xfs_inode_t	*ip,
	xfs_fsize_t	new_size,
	int		fork,
	int		sync)
{
	xfs_fsblock_t	first_block;
	xfs_fileoff_t	first_unmap_block;
	xfs_fileoff_t	last_block;
	xfs_filblks_t	unmap_len=0;
	xfs_mount_t	*mp;
	xfs_trans_t	*ntp;
	int		done;
	int		committed;
	xfs_bmap_free_t	free_list;
	int		error;

	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
	ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
	ASSERT(*tp != NULL);
	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
	ASSERT(ip->i_transp == *tp);
	ASSERT(ip->i_itemp != NULL);
	ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);


	ntp = *tp;
	mp = (ntp)->t_mountp;
	ASSERT(! XFS_NOT_DQATTACHED(mp, ip));

	/*
	 * We only support truncating the entire attribute fork.
	 */
	if (fork == XFS_ATTR_FORK) {
		new_size = 0LL;
	}
	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
	xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0);
	/*
	 * The first thing we do is set the size to new_size permanently
	 * on disk. This way we don't have to worry about anyone ever
	 * being able to look at the data being freed even in the face
	 * of a crash. What we're getting around here is the case where
	 * we free a block, it is allocated to another file, it is written
	 * to, and then we crash. If the new data gets written to the
	 * file but the log buffers containing the free and reallocation
	 * don't, then we'd end up with garbage in the blocks being freed.
	 * As long as we make the new_size permanent before actually
	 * freeing any blocks it doesn't matter if they get written to.
	 *
	 * The callers must signal into us whether or not the size
	 * setting here must be synchronous. There are a few cases
	 * where it doesn't have to be synchronous. Those cases
	 * occur if the file is unlinked and we know the unlink is
	 * permanent or if the blocks being truncated are guaranteed
	 * to be beyond the inode eof (regardless of the link count)
	 * and the eof value is permanent. Both of these cases occur
	 * only on wsync-mounted filesystems.
	 * In those cases, we're
	 * guaranteed that no user will ever see the data in the blocks
	 * that are being truncated so the truncate can run async.
	 * In the free beyond eof case, the file may wind up with
	 * more blocks allocated to it than it needs if we crash
	 * and that won't get fixed until the next time the file
	 * is re-opened and closed but that's ok as that shouldn't
	 * be too many blocks.
	 *
	 * However, we can't just make all wsync xactions run async
	 * because there's one call out of the create path that needs
	 * to run sync where it's truncating an existing file to size
	 * 0 whose size is > 0.
	 *
	 * It's probably possible to come up with a test in this
	 * routine that would correctly distinguish all the above
	 * cases from the values of the function parameters and the
	 * inode state but for sanity's sake, I've decided to let the
	 * layers above just tell us. It's simpler to correctly figure
	 * out in the layer above exactly under what conditions we
	 * can run async and I think it's easier for others to read and
	 * follow the logic in case something has to be changed.
	 * cscope is your friend -- rcc.
	 *
	 * The attribute fork is much simpler.
	 *
	 * For the attribute fork we allow the caller to tell us whether
	 * the unlink of the inode that led to this call is yet permanent
	 * in the on disk log. If it is not and we will be freeing extents
	 * in this inode then we make the first transaction synchronous
	 * to make sure that the unlink is permanent by the time we free
	 * the blocks.
	 */
	if (fork == XFS_DATA_FORK) {
		if (ip->i_d.di_nextents > 0) {
			ip->i_d.di_size = new_size;
			xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
		}
	} else if (sync) {
		ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC));
		if (ip->i_d.di_anextents > 0)
			xfs_trans_set_sync(ntp);
	}
	ASSERT(fork == XFS_DATA_FORK ||
		(fork == XFS_ATTR_FORK &&
			((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) ||
			 (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC)))));

	/*
	 * Since it is possible for space to become allocated beyond
	 * the end of the file (in a crash where the space is allocated
	 * but the inode size is not yet updated), simply remove any
	 * blocks which show up between the new EOF and the maximum
	 * possible file size. If the first block to be removed is
	 * beyond the maximum file size (ie it is the same as last_block),
	 * then there is nothing to do.
	 */
	last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
	ASSERT(first_unmap_block <= last_block);
	done = 0;
	if (last_block == first_unmap_block) {
		done = 1;
	} else {
		unmap_len = last_block - first_unmap_block + 1;
	}
	while (!done) {
		/*
		 * Free up to XFS_ITRUNC_MAX_EXTENTS. xfs_bunmapi()
		 * will tell us whether it freed the entire range or
		 * not. If this is a synchronous mount (wsync),
		 * then we can tell bunmapi to keep all the
		 * transactions asynchronous since the unlink
		 * transaction that made this inode inactive has
		 * already hit the disk. There's no danger of
		 * the freed blocks being reused, there being a
		 * crash, and the reused blocks suddenly reappearing
		 * in this file with garbage in them once recovery
		 * runs.
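		 *
		 * Each pass through this loop unmaps at most
		 * XFS_ITRUNC_MAX_EXTENTS extents and then rolls to a
		 * fresh transaction (xfs_bmap_finish()/xfs_trans_dup()/
		 * xfs_trans_commit() below), so the permanent log
		 * reservation is never overrun.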
		 */
		XFS_BMAP_INIT(&free_list, &first_block);
		error = xfs_bunmapi(ntp, ip, first_unmap_block,
				    unmap_len,
				    XFS_BMAPI_AFLAG(fork) |
				      (sync ? 0 : XFS_BMAPI_ASYNC),
				    XFS_ITRUNC_MAX_EXTENTS,
				    &first_block, &free_list, &done);
		if (error) {
			/*
			 * If the bunmapi call encounters an error,
			 * return to the caller where the transaction
			 * can be properly aborted. We just need to
			 * make sure we're not holding any resources
			 * that we were not when we came in.
			 */
			xfs_bmap_cancel(&free_list);
			return error;
		}

		/*
		 * Duplicate the transaction that has the permanent
		 * reservation and commit the old transaction.
		 */
		error = xfs_bmap_finish(tp, &free_list, first_block,
					&committed);
		ntp = *tp;
		if (error) {
			/*
			 * If the bmap finish call encounters an error,
			 * return to the caller where the transaction
			 * can be properly aborted. We just need to
			 * make sure we're not holding any resources
			 * that we were not when we came in.
			 *
			 * Aborting from this point might lose some
			 * blocks in the file system, but oh well.
			 */
			xfs_bmap_cancel(&free_list);
			if (committed) {
				/*
				 * If the passed in transaction committed
				 * in xfs_bmap_finish(), then we want to
				 * add the inode to this one before returning.
				 * This keeps things simple for the higher
				 * level code, because it always knows that
				 * the inode is locked and held in the
				 * transaction that returns to it whether
				 * errors occur or not. We don't mark the
				 * inode dirty so that this transaction can
				 * be easily aborted if possible.
				 */
				xfs_trans_ijoin(ntp, ip,
					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
				xfs_trans_ihold(ntp, ip);
			}
			return error;
		}

		if (committed) {
			/*
			 * The first xact was committed,
			 * so add the inode to the new one.
			 * Mark it dirty so it will be logged
			 * and moved forward in the log as
			 * part of every commit.
			 */
			xfs_trans_ijoin(ntp, ip,
					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
			xfs_trans_ihold(ntp, ip);
			xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
		}
		ntp = xfs_trans_dup(ntp);
		(void) xfs_trans_commit(*tp, 0, NULL);
		*tp = ntp;
		error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
					  XFS_TRANS_PERM_LOG_RES,
					  XFS_ITRUNCATE_LOG_COUNT);
		/*
		 * Add the inode being truncated to the next chained
		 * transaction.
		 */
		xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
		xfs_trans_ihold(ntp, ip);
		if (error)
			return (error);
	}
	/*
	 * Only update the size in the case of the data fork, but
	 * always re-log the inode so that our permanent transaction
	 * can keep on rolling it forward in the log.
	 */
	if (fork == XFS_DATA_FORK) {
		xfs_isize_check(mp, ip, new_size);
		ip->i_d.di_size = new_size;
	}
	xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
	ASSERT((new_size != 0) ||
	       (fork == XFS_ATTR_FORK) ||
	       (ip->i_delayed_blks == 0));
	ASSERT((new_size != 0) ||
	       (fork == XFS_ATTR_FORK) ||
	       (ip->i_d.di_nextents == 0));
	xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
	return 0;
}


/*
 * xfs_igrow_start
 *
 * Do the first part of growing a file: zero any data in the last
 * block that is beyond the old EOF.
We need to do this before 1758 * the inode is joined to the transaction to modify the i_size. 1759 * That way we can drop the inode lock and call into the buffer 1760 * cache to get the buffer mapping the EOF. 1761 */ 1762 int 1763 xfs_igrow_start( 1764 xfs_inode_t *ip, 1765 xfs_fsize_t new_size, 1766 cred_t *credp) 1767 { 1768 xfs_fsize_t isize; 1769 int error; 1770 1771 ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); 1772 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); 1773 ASSERT(new_size > ip->i_d.di_size); 1774 1775 error = 0; 1776 isize = ip->i_d.di_size; 1777 /* 1778 * Zero any pages that may have been created by 1779 * xfs_write_file() beyond the end of the file 1780 * and any blocks between the old and new file sizes. 1781 */ 1782 error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, isize, 1783 new_size); 1784 return error; 1785 } 1786 1787 /* 1788 * xfs_igrow_finish 1789 * 1790 * This routine is called to extend the size of a file. 1791 * The inode must have both the iolock and the ilock locked 1792 * for update and it must be a part of the current transaction. 1793 * The xfs_igrow_start() function must have been called previously. 1794 * If the change_flag is not zero, the inode change timestamp will 1795 * be updated. 1796 */ 1797 void 1798 xfs_igrow_finish( 1799 xfs_trans_t *tp, 1800 xfs_inode_t *ip, 1801 xfs_fsize_t new_size, 1802 int change_flag) 1803 { 1804 ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); 1805 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); 1806 ASSERT(ip->i_transp == tp); 1807 ASSERT(new_size > ip->i_d.di_size); 1808 1809 /* 1810 * Update the file size. Update the inode change timestamp 1811 * if change_flag set. 1812 */ 1813 ip->i_d.di_size = new_size; 1814 if (change_flag) 1815 xfs_ichgtime(ip, XFS_ICHGTIME_CHG); 1816 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1817 1818 } 1819 1820 1821 /* 1822 * This is called when the inode's link count goes to 0. 1823 * We place the on-disk inode on a list in the AGI. It 1824 * will be pulled from this list when the inode is freed. 1825 */ 1826 int 1827 xfs_iunlink( 1828 xfs_trans_t *tp, 1829 xfs_inode_t *ip) 1830 { 1831 xfs_mount_t *mp; 1832 xfs_agi_t *agi; 1833 xfs_dinode_t *dip; 1834 xfs_buf_t *agibp; 1835 xfs_buf_t *ibp; 1836 xfs_agnumber_t agno; 1837 xfs_daddr_t agdaddr; 1838 xfs_agino_t agino; 1839 short bucket_index; 1840 int offset; 1841 int error; 1842 int agi_ok; 1843 1844 ASSERT(ip->i_d.di_nlink == 0); 1845 ASSERT(ip->i_d.di_mode != 0); 1846 ASSERT(ip->i_transp == tp); 1847 1848 mp = tp->t_mountp; 1849 1850 agno = XFS_INO_TO_AGNO(mp, ip->i_ino); 1851 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)); 1852 1853 /* 1854 * Get the agi buffer first. It ensures lock ordering 1855 * on the list. 1856 */ 1857 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, 1858 XFS_FSS_TO_BB(mp, 1), 0, &agibp); 1859 if (error) { 1860 return error; 1861 } 1862 /* 1863 * Validate the magic number of the agi block. 1864 */ 1865 agi = XFS_BUF_TO_AGI(agibp); 1866 agi_ok = 1867 be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && 1868 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); 1869 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK, 1870 XFS_RANDOM_IUNLINK))) { 1871 XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi); 1872 xfs_trans_brelse(tp, agibp); 1873 return XFS_ERROR(EFSCORRUPTED); 1874 } 1875 /* 1876 * Get the index into the agi hash table for the 1877 * list this inode will go on. 
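	 * The bucket is just the AG-relative inode number modulo the
	 * number of buckets; e.g. with XFS_AGI_UNLINKED_BUCKETS buckets
	 * (64 in this implementation), an inode with agino 131 lands
	 * in bucket 131 % 64 == 3.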
1878 */ 1879 agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 1880 ASSERT(agino != 0); 1881 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 1882 ASSERT(agi->agi_unlinked[bucket_index]); 1883 ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); 1884 1885 if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) { 1886 /* 1887 * There is already another inode in the bucket we need 1888 * to add ourselves to. Add us at the front of the list. 1889 * Here we put the head pointer into our next pointer, 1890 * and then we fall through to point the head at us. 1891 */ 1892 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); 1893 if (error) { 1894 return error; 1895 } 1896 ASSERT(INT_GET(dip->di_next_unlinked, ARCH_CONVERT) == NULLAGINO); 1897 ASSERT(dip->di_next_unlinked); 1898 /* both on-disk, don't endian flip twice */ 1899 dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; 1900 offset = ip->i_boffset + 1901 offsetof(xfs_dinode_t, di_next_unlinked); 1902 xfs_trans_inode_buf(tp, ibp); 1903 xfs_trans_log_buf(tp, ibp, offset, 1904 (offset + sizeof(xfs_agino_t) - 1)); 1905 xfs_inobp_check(mp, ibp); 1906 } 1907 1908 /* 1909 * Point the bucket head pointer at the inode being inserted. 1910 */ 1911 ASSERT(agino != 0); 1912 agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); 1913 offset = offsetof(xfs_agi_t, agi_unlinked) + 1914 (sizeof(xfs_agino_t) * bucket_index); 1915 xfs_trans_log_buf(tp, agibp, offset, 1916 (offset + sizeof(xfs_agino_t) - 1)); 1917 return 0; 1918 } 1919 1920 /* 1921 * Pull the on-disk inode from the AGI unlinked list. 1922 */ 1923 STATIC int 1924 xfs_iunlink_remove( 1925 xfs_trans_t *tp, 1926 xfs_inode_t *ip) 1927 { 1928 xfs_ino_t next_ino; 1929 xfs_mount_t *mp; 1930 xfs_agi_t *agi; 1931 xfs_dinode_t *dip; 1932 xfs_buf_t *agibp; 1933 xfs_buf_t *ibp; 1934 xfs_agnumber_t agno; 1935 xfs_daddr_t agdaddr; 1936 xfs_agino_t agino; 1937 xfs_agino_t next_agino; 1938 xfs_buf_t *last_ibp; 1939 xfs_dinode_t *last_dip; 1940 short bucket_index; 1941 int offset, last_offset; 1942 int error; 1943 int agi_ok; 1944 1945 /* 1946 * First pull the on-disk inode from the AGI unlinked list. 1947 */ 1948 mp = tp->t_mountp; 1949 1950 agno = XFS_INO_TO_AGNO(mp, ip->i_ino); 1951 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)); 1952 1953 /* 1954 * Get the agi buffer first. It ensures lock ordering 1955 * on the list. 1956 */ 1957 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, 1958 XFS_FSS_TO_BB(mp, 1), 0, &agibp); 1959 if (error) { 1960 cmn_err(CE_WARN, 1961 "xfs_iunlink_remove: xfs_trans_read_buf() returned an error %d on %s. Returning error.", 1962 error, mp->m_fsname); 1963 return error; 1964 } 1965 /* 1966 * Validate the magic number of the agi block. 1967 */ 1968 agi = XFS_BUF_TO_AGI(agibp); 1969 agi_ok = 1970 be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && 1971 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); 1972 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE, 1973 XFS_RANDOM_IUNLINK_REMOVE))) { 1974 XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW, 1975 mp, agi); 1976 xfs_trans_brelse(tp, agibp); 1977 cmn_err(CE_WARN, 1978 "xfs_iunlink_remove: XFS_TEST_ERROR() returned an error on %s. Returning EFSCORRUPTED.", 1979 mp->m_fsname); 1980 return XFS_ERROR(EFSCORRUPTED); 1981 } 1982 /* 1983 * Get the index into the agi hash table for the 1984 * list this inode will go on. 
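	 * (Here, of course, the inode is already on that list and is
	 * being taken off it.)  Removal takes one of the two shapes
	 * handled below (sketched; endian conversion elided):
	 *
	 *	head of bucket:	agi_unlinked[bucket] = dip->di_next_unlinked;
	 *	mid-list:	last_dip->di_next_unlinked =
	 *				dip->di_next_unlinked;
	 *
	 * with the departing inode's own di_next_unlinked reset to
	 * NULLAGINO in both cases.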
1985 */ 1986 agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 1987 ASSERT(agino != 0); 1988 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 1989 ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO); 1990 ASSERT(agi->agi_unlinked[bucket_index]); 1991 1992 if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { 1993 /* 1994 * We're at the head of the list. Get the inode's 1995 * on-disk buffer to see if there is anyone after us 1996 * on the list. Only modify our next pointer if it 1997 * is not already NULLAGINO. This saves us the overhead 1998 * of dealing with the buffer when there is no need to 1999 * change it. 2000 */ 2001 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); 2002 if (error) { 2003 cmn_err(CE_WARN, 2004 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 2005 error, mp->m_fsname); 2006 return error; 2007 } 2008 next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT); 2009 ASSERT(next_agino != 0); 2010 if (next_agino != NULLAGINO) { 2011 INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO); 2012 offset = ip->i_boffset + 2013 offsetof(xfs_dinode_t, di_next_unlinked); 2014 xfs_trans_inode_buf(tp, ibp); 2015 xfs_trans_log_buf(tp, ibp, offset, 2016 (offset + sizeof(xfs_agino_t) - 1)); 2017 xfs_inobp_check(mp, ibp); 2018 } else { 2019 xfs_trans_brelse(tp, ibp); 2020 } 2021 /* 2022 * Point the bucket head pointer at the next inode. 2023 */ 2024 ASSERT(next_agino != 0); 2025 ASSERT(next_agino != agino); 2026 agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); 2027 offset = offsetof(xfs_agi_t, agi_unlinked) + 2028 (sizeof(xfs_agino_t) * bucket_index); 2029 xfs_trans_log_buf(tp, agibp, offset, 2030 (offset + sizeof(xfs_agino_t) - 1)); 2031 } else { 2032 /* 2033 * We need to search the list for the inode being freed. 2034 */ 2035 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2036 last_ibp = NULL; 2037 while (next_agino != agino) { 2038 /* 2039 * If the last inode wasn't the one pointing to 2040 * us, then release its buffer since we're not 2041 * going to do anything with it. 2042 */ 2043 if (last_ibp != NULL) { 2044 xfs_trans_brelse(tp, last_ibp); 2045 } 2046 next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); 2047 error = xfs_inotobp(mp, tp, next_ino, &last_dip, 2048 &last_ibp, &last_offset); 2049 if (error) { 2050 cmn_err(CE_WARN, 2051 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", 2052 error, mp->m_fsname); 2053 return error; 2054 } 2055 next_agino = INT_GET(last_dip->di_next_unlinked, ARCH_CONVERT); 2056 ASSERT(next_agino != NULLAGINO); 2057 ASSERT(next_agino != 0); 2058 } 2059 /* 2060 * Now last_ibp points to the buffer previous to us on 2061 * the unlinked list. Pull us from the list. 2062 */ 2063 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0); 2064 if (error) { 2065 cmn_err(CE_WARN, 2066 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 2067 error, mp->m_fsname); 2068 return error; 2069 } 2070 next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT); 2071 ASSERT(next_agino != 0); 2072 ASSERT(next_agino != agino); 2073 if (next_agino != NULLAGINO) { 2074 INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO); 2075 offset = ip->i_boffset + 2076 offsetof(xfs_dinode_t, di_next_unlinked); 2077 xfs_trans_inode_buf(tp, ibp); 2078 xfs_trans_log_buf(tp, ibp, offset, 2079 (offset + sizeof(xfs_agino_t) - 1)); 2080 xfs_inobp_check(mp, ibp); 2081 } else { 2082 xfs_trans_brelse(tp, ibp); 2083 } 2084 /* 2085 * Point the previous inode on the list to the next inode. 
2086	 */
2087		INT_SET(last_dip->di_next_unlinked, ARCH_CONVERT, next_agino);
2088		ASSERT(next_agino != 0);
2089		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
2090		xfs_trans_inode_buf(tp, last_ibp);
2091		xfs_trans_log_buf(tp, last_ibp, offset,
2092				  (offset + sizeof(xfs_agino_t) - 1));
2093		xfs_inobp_check(mp, last_ibp);
2094	}
2095	return 0;
2096 }
2097
2098 static __inline__ int xfs_inode_clean(xfs_inode_t *ip)
2099 {
2100	return (((ip->i_itemp == NULL) ||
2101		!(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
2102		(ip->i_update_core == 0));
2103 }
2104
2105 STATIC void
2106 xfs_ifree_cluster(
2107	xfs_inode_t	*free_ip,
2108	xfs_trans_t	*tp,
2109	xfs_ino_t	inum)
2110 {
2111	xfs_mount_t		*mp = free_ip->i_mount;
2112	int			blks_per_cluster;
2113	int			nbufs;
2114	int			ninodes;
2115	int			i, j, found, pre_flushed;
2116	xfs_daddr_t		blkno;
2117	xfs_buf_t		*bp;
2118	xfs_ihash_t		*ih;
2119	xfs_inode_t		*ip, **ip_found;
2120	xfs_inode_log_item_t	*iip;
2121	xfs_log_item_t		*lip;
2122	SPLDECL(s);
2123
2124	if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
2125		blks_per_cluster = 1;
2126		ninodes = mp->m_sb.sb_inopblock;
2127		nbufs = XFS_IALLOC_BLOCKS(mp);
2128	} else {
2129		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
2130					mp->m_sb.sb_blocksize;
2131		ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
2132		nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
2133	}
2134
2135	ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
2136
2137	for (j = 0; j < nbufs; j++, inum += ninodes) {
2138		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2139					 XFS_INO_TO_AGBNO(mp, inum));
2140
2141
2142		/*
2143		 * Look for each inode in memory and attempt to lock it;
2144		 * we can be racing with flush and tail pushing here.
2145		 * Any inode we get the locks on is added to an array of
2146		 * inode items to process later.
2147		 *
2148		 * When we get the buffer lock, we could beat a flush
2149		 * or tail pushing thread to the lock here, in which
2150		 * case it will go looking for the inode buffer
2151		 * and fail, so we need some other form of interlock
2152		 * here.
2153		 */
2154		found = 0;
2155		for (i = 0; i < ninodes; i++) {
2156			ih = XFS_IHASH(mp, inum + i);
2157			read_lock(&ih->ih_lock);
2158			for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
2159				if (ip->i_ino == inum + i)
2160					break;
2161			}
2162
2163			/* Inode not in memory or we found it already,
2164			 * nothing to do
2165			 */
2166			if (!ip || (ip->i_flags & XFS_ISTALE)) {
2167				read_unlock(&ih->ih_lock);
2168				continue;
2169			}
2170
2171			if (xfs_inode_clean(ip)) {
2172				read_unlock(&ih->ih_lock);
2173				continue;
2174			}
2175
2176			/* If we can get the locks then add it to the
2177			 * list, otherwise by the time we get the bp lock
2178			 * below it will already be attached to the
2179			 * inode buffer.
2180			 */
2181
2182			/* This inode will already be locked - by us, let's
2183			 * keep it that way.
2184 */ 2185 2186 if (ip == free_ip) { 2187 if (xfs_iflock_nowait(ip)) { 2188 ip->i_flags |= XFS_ISTALE; 2189 2190 if (xfs_inode_clean(ip)) { 2191 xfs_ifunlock(ip); 2192 } else { 2193 ip_found[found++] = ip; 2194 } 2195 } 2196 read_unlock(&ih->ih_lock); 2197 continue; 2198 } 2199 2200 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2201 if (xfs_iflock_nowait(ip)) { 2202 ip->i_flags |= XFS_ISTALE; 2203 2204 if (xfs_inode_clean(ip)) { 2205 xfs_ifunlock(ip); 2206 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2207 } else { 2208 ip_found[found++] = ip; 2209 } 2210 } else { 2211 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2212 } 2213 } 2214 2215 read_unlock(&ih->ih_lock); 2216 } 2217 2218 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2219 mp->m_bsize * blks_per_cluster, 2220 XFS_BUF_LOCK); 2221 2222 pre_flushed = 0; 2223 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 2224 while (lip) { 2225 if (lip->li_type == XFS_LI_INODE) { 2226 iip = (xfs_inode_log_item_t *)lip; 2227 ASSERT(iip->ili_logged == 1); 2228 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; 2229 AIL_LOCK(mp,s); 2230 iip->ili_flush_lsn = iip->ili_item.li_lsn; 2231 AIL_UNLOCK(mp, s); 2232 iip->ili_inode->i_flags |= XFS_ISTALE; 2233 pre_flushed++; 2234 } 2235 lip = lip->li_bio_list; 2236 } 2237 2238 for (i = 0; i < found; i++) { 2239 ip = ip_found[i]; 2240 iip = ip->i_itemp; 2241 2242 if (!iip) { 2243 ip->i_update_core = 0; 2244 xfs_ifunlock(ip); 2245 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2246 continue; 2247 } 2248 2249 iip->ili_last_fields = iip->ili_format.ilf_fields; 2250 iip->ili_format.ilf_fields = 0; 2251 iip->ili_logged = 1; 2252 AIL_LOCK(mp,s); 2253 iip->ili_flush_lsn = iip->ili_item.li_lsn; 2254 AIL_UNLOCK(mp, s); 2255 2256 xfs_buf_attach_iodone(bp, 2257 (void(*)(xfs_buf_t*,xfs_log_item_t*)) 2258 xfs_istale_done, (xfs_log_item_t *)iip); 2259 if (ip != free_ip) { 2260 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2261 } 2262 } 2263 2264 if (found || pre_flushed) 2265 xfs_trans_stale_inode_buf(tp, bp); 2266 xfs_trans_binval(tp, bp); 2267 } 2268 2269 kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *)); 2270 } 2271 2272 /* 2273 * This is called to return an inode to the inode free list. 2274 * The inode should already be truncated to 0 length and have 2275 * no pages associated with it. This routine also assumes that 2276 * the inode is already a part of the transaction. 2277 * 2278 * The on-disk copy of the inode will have been added to the list 2279 * of unlinked inodes in the AGI. We need to remove the inode from 2280 * that list atomically with respect to freeing it here. 2281 */ 2282 int 2283 xfs_ifree( 2284 xfs_trans_t *tp, 2285 xfs_inode_t *ip, 2286 xfs_bmap_free_t *flist) 2287 { 2288 int error; 2289 int delete; 2290 xfs_ino_t first_ino; 2291 2292 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); 2293 ASSERT(ip->i_transp == tp); 2294 ASSERT(ip->i_d.di_nlink == 0); 2295 ASSERT(ip->i_d.di_nextents == 0); 2296 ASSERT(ip->i_d.di_anextents == 0); 2297 ASSERT((ip->i_d.di_size == 0) || 2298 ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); 2299 ASSERT(ip->i_d.di_nblocks == 0); 2300 2301 /* 2302 * Pull the on-disk inode from the AGI unlinked list. 
2303	 */
2304	error = xfs_iunlink_remove(tp, ip);
2305	if (error != 0) {
2306		return error;
2307	}
2308
2309	error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
2310	if (error != 0) {
2311		return error;
2312	}
2313	ip->i_d.di_mode = 0;		/* mark incore inode as free */
2314	ip->i_d.di_flags = 0;
2315	ip->i_d.di_dmevmask = 0;
2316	ip->i_d.di_forkoff = 0;		/* mark the attr fork not in use */
2317	ip->i_df.if_ext_max =
2318		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
2319	ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
2320	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
2321	/*
2322	 * Bump the generation count so no one will be confused
2323	 * by reincarnations of this inode.
2324	 */
2325	ip->i_d.di_gen++;
2326	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2327
2328	if (delete) {
2329		xfs_ifree_cluster(ip, tp, first_ino);
2330	}
2331
2332	return 0;
2333 }
2334
2335 /*
2336 * Reallocate the space for if_broot based on the number of records
2337 * being added or deleted as indicated in rec_diff. Move the records
2338 * and pointers in if_broot to fit the new size. When shrinking, this
2339 * will eliminate holes between the records and pointers created by
2340 * the caller. When growing, this will create holes to be filled in
2341 * by the caller.
2342 *
2343 * The caller must not request to add more records than would fit in
2344 * the on-disk inode root. If the if_broot is currently NULL, then
2345 * if we are adding records, one will be allocated. The caller must
2346 * also not request that the number of records go below zero, although
2347 * it can go to zero.
2348 *
2349 * ip -- the inode whose if_broot area is changing
2350 * rec_diff -- the change in the number of records, positive or negative,
2351 * requested for the if_broot array.
2352 */
2353 void
2354 xfs_iroot_realloc(
2355	xfs_inode_t	*ip,
2356	int		rec_diff,
2357	int		whichfork)
2358 {
2359	int		cur_max;
2360	xfs_ifork_t	*ifp;
2361	xfs_bmbt_block_t *new_broot;
2362	int		new_max;
2363	size_t		new_size;
2364	char		*np;
2365	char		*op;
2366
2367	/*
2368	 * Handle the degenerate case quietly.
2369	 */
2370	if (rec_diff == 0) {
2371		return;
2372	}
2373
2374	ifp = XFS_IFORK_PTR(ip, whichfork);
2375	if (rec_diff > 0) {
2376		/*
2377		 * If there wasn't any memory allocated before, just
2378		 * allocate it now and get out.
2379		 */
2380		if (ifp->if_broot_bytes == 0) {
2381			new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
2382			ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size,
2383								     KM_SLEEP);
2384			ifp->if_broot_bytes = (int)new_size;
2385			return;
2386		}
2387
2388		/*
2389		 * If there is already an existing if_broot, then we need
2390		 * to realloc() it and shift the pointers to their new
2391		 * location. The records don't change location because
2392		 * they are kept butted up against the btree block header.
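		 * The incore root block is laid out as
		 *
		 *	+-----------------+-------------+-------------+
		 *	| btree blk header | records     | pointers    |
		 *	+-----------------+-------------+-------------+
		 *
		 * so only the pointer array has to be memmove()d to
		 * its new offset when the buffer grows.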
2393 */ 2394 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); 2395 new_max = cur_max + rec_diff; 2396 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 2397 ifp->if_broot = (xfs_bmbt_block_t *) 2398 kmem_realloc(ifp->if_broot, 2399 new_size, 2400 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ 2401 KM_SLEEP); 2402 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2403 ifp->if_broot_bytes); 2404 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2405 (int)new_size); 2406 ifp->if_broot_bytes = (int)new_size; 2407 ASSERT(ifp->if_broot_bytes <= 2408 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 2409 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); 2410 return; 2411 } 2412 2413 /* 2414 * rec_diff is less than 0. In this case, we are shrinking the 2415 * if_broot buffer. It must already exist. If we go to zero 2416 * records, just get rid of the root and clear the status bit. 2417 */ 2418 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); 2419 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); 2420 new_max = cur_max + rec_diff; 2421 ASSERT(new_max >= 0); 2422 if (new_max > 0) 2423 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 2424 else 2425 new_size = 0; 2426 if (new_size > 0) { 2427 new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP); 2428 /* 2429 * First copy over the btree block header. 2430 */ 2431 memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t)); 2432 } else { 2433 new_broot = NULL; 2434 ifp->if_flags &= ~XFS_IFBROOT; 2435 } 2436 2437 /* 2438 * Only copy the records and pointers if there are any. 2439 */ 2440 if (new_max > 0) { 2441 /* 2442 * First copy the records. 2443 */ 2444 op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1, 2445 ifp->if_broot_bytes); 2446 np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1, 2447 (int)new_size); 2448 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); 2449 2450 /* 2451 * Then copy the pointers. 2452 */ 2453 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2454 ifp->if_broot_bytes); 2455 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1, 2456 (int)new_size); 2457 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); 2458 } 2459 kmem_free(ifp->if_broot, ifp->if_broot_bytes); 2460 ifp->if_broot = new_broot; 2461 ifp->if_broot_bytes = (int)new_size; 2462 ASSERT(ifp->if_broot_bytes <= 2463 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 2464 return; 2465 } 2466 2467 2468 /* 2469 * This is called when the amount of space needed for if_extents 2470 * is increased or decreased. The change in size is indicated by 2471 * the number of extents that need to be added or deleted in the 2472 * ext_diff parameter. 2473 * 2474 * If the amount of space needed has decreased below the size of the 2475 * inline buffer, then switch to using the inline buffer. Otherwise, 2476 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer 2477 * to what is needed. 2478 * 2479 * ip -- the inode whose if_extents area is changing 2480 * ext_diff -- the change in the number of extents, positive or negative, 2481 * requested for the if_extents array. 
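 *
 * When a heap buffer is needed, its size is rounded up to a power
 * of two with xfs_iroundup() so that a file gaining extents one at
 * a time doesn't realloc on every addition; e.g. a request for
 * 1500 bytes gets a 2048 byte buffer.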
2482 */ 2483 void 2484 xfs_iext_realloc( 2485 xfs_inode_t *ip, 2486 int ext_diff, 2487 int whichfork) 2488 { 2489 int byte_diff; 2490 xfs_ifork_t *ifp; 2491 int new_size; 2492 uint rnew_size; 2493 2494 if (ext_diff == 0) { 2495 return; 2496 } 2497 2498 ifp = XFS_IFORK_PTR(ip, whichfork); 2499 byte_diff = ext_diff * (uint)sizeof(xfs_bmbt_rec_t); 2500 new_size = (int)ifp->if_bytes + byte_diff; 2501 ASSERT(new_size >= 0); 2502 2503 if (new_size == 0) { 2504 if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) { 2505 ASSERT(ifp->if_real_bytes != 0); 2506 kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes); 2507 } 2508 ifp->if_u1.if_extents = NULL; 2509 rnew_size = 0; 2510 } else if (new_size <= sizeof(ifp->if_u2.if_inline_ext)) { 2511 /* 2512 * If the valid extents can fit in if_inline_ext, 2513 * copy them from the malloc'd vector and free it. 2514 */ 2515 if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) { 2516 /* 2517 * For now, empty files are format EXTENTS, 2518 * so the if_extents pointer is null. 2519 */ 2520 if (ifp->if_u1.if_extents) { 2521 memcpy(ifp->if_u2.if_inline_ext, 2522 ifp->if_u1.if_extents, new_size); 2523 kmem_free(ifp->if_u1.if_extents, 2524 ifp->if_real_bytes); 2525 } 2526 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 2527 } 2528 rnew_size = 0; 2529 } else { 2530 rnew_size = new_size; 2531 if ((rnew_size & (rnew_size - 1)) != 0) 2532 rnew_size = xfs_iroundup(rnew_size); 2533 /* 2534 * Stuck with malloc/realloc. 2535 */ 2536 if (ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext) { 2537 ifp->if_u1.if_extents = (xfs_bmbt_rec_t *) 2538 kmem_alloc(rnew_size, KM_SLEEP); 2539 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, 2540 sizeof(ifp->if_u2.if_inline_ext)); 2541 } else if (rnew_size != ifp->if_real_bytes) { 2542 ifp->if_u1.if_extents = (xfs_bmbt_rec_t *) 2543 kmem_realloc(ifp->if_u1.if_extents, 2544 rnew_size, 2545 ifp->if_real_bytes, 2546 KM_NOFS); 2547 } 2548 } 2549 ifp->if_real_bytes = rnew_size; 2550 ifp->if_bytes = new_size; 2551 } 2552 2553 2554 /* 2555 * This is called when the amount of space needed for if_data 2556 * is increased or decreased. The change in size is indicated by 2557 * the number of bytes that need to be added or deleted in the 2558 * byte_diff parameter. 2559 * 2560 * If the amount of space needed has decreased below the size of the 2561 * inline buffer, then switch to using the inline buffer. Otherwise, 2562 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer 2563 * to what is needed. 2564 * 2565 * ip -- the inode whose if_data area is changing 2566 * byte_diff -- the change in the number of bytes, positive or negative, 2567 * requested for the if_data array. 2568 */ 2569 void 2570 xfs_idata_realloc( 2571 xfs_inode_t *ip, 2572 int byte_diff, 2573 int whichfork) 2574 { 2575 xfs_ifork_t *ifp; 2576 int new_size; 2577 int real_size; 2578 2579 if (byte_diff == 0) { 2580 return; 2581 } 2582 2583 ifp = XFS_IFORK_PTR(ip, whichfork); 2584 new_size = (int)ifp->if_bytes + byte_diff; 2585 ASSERT(new_size >= 0); 2586 2587 if (new_size == 0) { 2588 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2589 kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); 2590 } 2591 ifp->if_u1.if_data = NULL; 2592 real_size = 0; 2593 } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { 2594 /* 2595 * If the valid extents/data can fit in if_inline_ext/data, 2596 * copy them from the malloc'd vector and free it. 
2597	 */
2598		if (ifp->if_u1.if_data == NULL) {
2599			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2600		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2601			ASSERT(ifp->if_real_bytes != 0);
2602			memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
2603			      new_size);
2604			kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
2605			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2606		}
2607		real_size = 0;
2608	} else {
2609		/*
2610		 * Stuck with malloc/realloc.
2611		 * For inline data, the underlying buffer must be
2612		 * a multiple of 4 bytes in size so that it can be
2613		 * logged and stay on word boundaries. We enforce
2614		 * that here.
2615		 */
2616		real_size = roundup(new_size, 4);
2617		if (ifp->if_u1.if_data == NULL) {
2618			ASSERT(ifp->if_real_bytes == 0);
2619			ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
2620		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2621			/*
2622			 * Only do the realloc if the underlying size
2623			 * is really changing.
2624			 */
2625			if (ifp->if_real_bytes != real_size) {
2626				ifp->if_u1.if_data =
2627					kmem_realloc(ifp->if_u1.if_data,
2628							real_size,
2629							ifp->if_real_bytes,
2630							KM_SLEEP);
2631			}
2632		} else {
2633			ASSERT(ifp->if_real_bytes == 0);
2634			ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
2635			memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
2636				ifp->if_bytes);
2637		}
2638	}
2639	ifp->if_real_bytes = real_size;
2640	ifp->if_bytes = new_size;
2641	ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2642 }
2643
2644
2645
2646
2647 /*
2648 * Map inode to disk block and offset.
2649 *
2650 * mp -- the mount point structure for the current file system
2651 * tp -- the current transaction
2652 * ino -- the inode number of the inode to be located
2653 * imap -- this structure is filled in with the information necessary
2654 *	 to retrieve the given inode from disk
2655 * flags -- flags to pass to xfs_dilocate indicating whether
2656 *	 lookups in the inode btree are OK
2657 */
2658 int
2659 xfs_imap(
2660	xfs_mount_t	*mp,
2661	xfs_trans_t	*tp,
2662	xfs_ino_t	ino,
2663	xfs_imap_t	*imap,
2664	uint		flags)
2665 {
2666	xfs_fsblock_t	fsbno;
2667	int		len;
2668	int		off;
2669	int		error;
2670
2671	fsbno = imap->im_blkno ?
2672		XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
2673	error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
2674	if (error != 0) {
2675		return error;
2676	}
2677	imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
2678	imap->im_len = XFS_FSB_TO_BB(mp, len);
2679	imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
2680	imap->im_ioffset = (ushort)off;
2681	imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
2682	return 0;
2683 }
2684
2685 void
2686 xfs_idestroy_fork(
2687	xfs_inode_t	*ip,
2688	int		whichfork)
2689 {
2690	xfs_ifork_t	*ifp;
2691
2692	ifp = XFS_IFORK_PTR(ip, whichfork);
2693	if (ifp->if_broot != NULL) {
2694		kmem_free(ifp->if_broot, ifp->if_broot_bytes);
2695		ifp->if_broot = NULL;
2696	}
2697
2698	/*
2699	 * If the format is local, then we can't have an extents
2700	 * array, so just look for an inline data array. If we're
2701	 * not local then we may or may not have an extents list,
2702	 * so check and free it up if we do.
2703	 */
2704	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
2705		if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
2706		    (ifp->if_u1.if_data != NULL)) {
2707			ASSERT(ifp->if_real_bytes != 0);
2708			kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
2709			ifp->if_u1.if_data = NULL;
2710			ifp->if_real_bytes = 0;
2711		}
2712	} else if ((ifp->if_flags & XFS_IFEXTENTS) &&
2713		   (ifp->if_u1.if_extents != NULL) &&
2714		   (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)) {
2715		ASSERT(ifp->if_real_bytes != 0);
2716		kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
2717		ifp->if_u1.if_extents = NULL;
2718		ifp->if_real_bytes = 0;
2719	}
2720	ASSERT(ifp->if_u1.if_extents == NULL ||
2721	       ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
2722	ASSERT(ifp->if_real_bytes == 0);
2723	if (whichfork == XFS_ATTR_FORK) {
2724		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
2725		ip->i_afp = NULL;
2726	}
2727 }
2728
2729 /*
2730 * This is called to free all the memory associated with an inode.
2731 * It must free the inode itself and any buffers allocated for
2732 * if_extents/if_data and if_broot. It must also free the lock
2733 * associated with the inode.
2734 */
2735 void
2736 xfs_idestroy(
2737	xfs_inode_t	*ip)
2738 {
2739
2740	switch (ip->i_d.di_mode & S_IFMT) {
2741	case S_IFREG:
2742	case S_IFDIR:
2743	case S_IFLNK:
2744		xfs_idestroy_fork(ip, XFS_DATA_FORK);
2745		break;
2746	}
2747	if (ip->i_afp)
2748		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
2749	mrfree(&ip->i_lock);
2750	mrfree(&ip->i_iolock);
2751	freesema(&ip->i_flock);
2752 #ifdef XFS_BMAP_TRACE
2753	ktrace_free(ip->i_xtrace);
2754 #endif
2755 #ifdef XFS_BMBT_TRACE
2756	ktrace_free(ip->i_btrace);
2757 #endif
2758 #ifdef XFS_RW_TRACE
2759	ktrace_free(ip->i_rwtrace);
2760 #endif
2761 #ifdef XFS_ILOCK_TRACE
2762	ktrace_free(ip->i_lock_trace);
2763 #endif
2764 #ifdef XFS_DIR2_TRACE
2765	ktrace_free(ip->i_dir_trace);
2766 #endif
2767	if (ip->i_itemp) {
2768		/* XXXdpd should be able to assert this but shutdown
2769		 * is leaving the AIL behind. */
2770		ASSERT(((ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL) == 0) ||
2771		       XFS_FORCED_SHUTDOWN(ip->i_mount));
2772		xfs_inode_item_destroy(ip);
2773	}
2774	kmem_zone_free(xfs_inode_zone, ip);
2775 }
2776
2777
2778 /*
2779 * Increment the pin count of the given inode. The count is
2780 * kept in the atomic i_pincount field of the inode itself.
2781 */
2782 void
2783 xfs_ipin(
2784	xfs_inode_t	*ip)
2785 {
2786	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
2787
2788	atomic_inc(&ip->i_pincount);
2789 }
2790
2791 /*
2792 * Decrement the pin count of the given inode, and wake up
2793 * anyone in xfs_iunpin_wait() if the count goes to 0. The
2794 * inode must have been previously pinned with a call to xfs_ipin().
2795 */
2796 void
2797 xfs_iunpin(
2798	xfs_inode_t	*ip)
2799 {
2800	ASSERT(atomic_read(&ip->i_pincount) > 0);
2801
2802	if (atomic_dec_and_test(&ip->i_pincount)) {
2803		vnode_t	*vp = XFS_ITOV_NULL(ip);
2804
2805		/* make sync come back and flush this inode */
2806		if (vp) {
2807			struct inode	*inode = LINVFS_GET_IP(vp);
2808
2809			if (!(inode->i_state & I_NEW))
2810				mark_inode_dirty_sync(inode);
2811		}
2812
2813		wake_up(&ip->i_ipin_wait);
2814	}
2815 }
2816
2817 /*
2818 * This is called to wait for the given inode to be unpinned.
2819 * It will sleep until this happens. The caller must have the
2820 * inode locked in at least shared mode so that the inode cannot
2821 * be subsequently pinned once someone is waiting for it to be
2822 * unpinned.
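 *
 * The wait pairs with the wake_up() in xfs_iunpin() above. We first
 * push the log to the inode's last LSN, since the pin count only
 * drops once the log records pinning the inode have made it to
 * disk, and then sleep on i_ipin_wait until i_pincount reaches zero.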
2823 */ 2824 STATIC void 2825 xfs_iunpin_wait( 2826 xfs_inode_t *ip) 2827 { 2828 xfs_inode_log_item_t *iip; 2829 xfs_lsn_t lsn; 2830 2831 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS)); 2832 2833 if (atomic_read(&ip->i_pincount) == 0) { 2834 return; 2835 } 2836 2837 iip = ip->i_itemp; 2838 if (iip && iip->ili_last_lsn) { 2839 lsn = iip->ili_last_lsn; 2840 } else { 2841 lsn = (xfs_lsn_t)0; 2842 } 2843 2844 /* 2845 * Give the log a push so we don't wait here too long. 2846 */ 2847 xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE); 2848 2849 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); 2850 } 2851 2852 2853 /* 2854 * xfs_iextents_copy() 2855 * 2856 * This is called to copy the REAL extents (as opposed to the delayed 2857 * allocation extents) from the inode into the given buffer. It 2858 * returns the number of bytes copied into the buffer. 2859 * 2860 * If there are no delayed allocation extents, then we can just 2861 * memcpy() the extents into the buffer. Otherwise, we need to 2862 * examine each extent in turn and skip those which are delayed. 2863 */ 2864 int 2865 xfs_iextents_copy( 2866 xfs_inode_t *ip, 2867 xfs_bmbt_rec_t *buffer, 2868 int whichfork) 2869 { 2870 int copied; 2871 xfs_bmbt_rec_t *dest_ep; 2872 xfs_bmbt_rec_t *ep; 2873 #ifdef XFS_BMAP_TRACE 2874 static char fname[] = "xfs_iextents_copy"; 2875 #endif 2876 int i; 2877 xfs_ifork_t *ifp; 2878 int nrecs; 2879 xfs_fsblock_t start_block; 2880 2881 ifp = XFS_IFORK_PTR(ip, whichfork); 2882 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); 2883 ASSERT(ifp->if_bytes > 0); 2884 2885 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 2886 xfs_bmap_trace_exlist(fname, ip, nrecs, whichfork); 2887 ASSERT(nrecs > 0); 2888 2889 /* 2890 * There are some delayed allocation extents in the 2891 * inode, so copy the extents one at a time and skip 2892 * the delayed ones. There must be at least one 2893 * non-delayed extent. 2894 */ 2895 ep = ifp->if_u1.if_extents; 2896 dest_ep = buffer; 2897 copied = 0; 2898 for (i = 0; i < nrecs; i++) { 2899 start_block = xfs_bmbt_get_startblock(ep); 2900 if (ISNULLSTARTBLOCK(start_block)) { 2901 /* 2902 * It's a delayed allocation extent, so skip it. 2903 */ 2904 ep++; 2905 continue; 2906 } 2907 2908 /* Translate to on disk format */ 2909 put_unaligned(INT_GET(ep->l0, ARCH_CONVERT), 2910 (__uint64_t*)&dest_ep->l0); 2911 put_unaligned(INT_GET(ep->l1, ARCH_CONVERT), 2912 (__uint64_t*)&dest_ep->l1); 2913 dest_ep++; 2914 ep++; 2915 copied++; 2916 } 2917 ASSERT(copied != 0); 2918 xfs_validate_extents(buffer, copied, 1, XFS_EXTFMT_INODE(ip)); 2919 2920 return (copied * (uint)sizeof(xfs_bmbt_rec_t)); 2921 } 2922 2923 /* 2924 * Each of the following cases stores data into the same region 2925 * of the on-disk inode, so only one of them can be valid at 2926 * any given time. While it is possible to have conflicting formats 2927 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is 2928 * in EXTENTS format, this can only happen when the fork has 2929 * changed formats after being modified but before being flushed. 2930 * In these cases, the format always takes precedence, because the 2931 * format indicates the current state of the fork. 
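 *
 * For example, a directory fork that was logged with XFS_ILOG_DDATA
 * while in LOCAL (shortform) format and then converted to EXTENTS
 * format before this flush takes the EXTENTS case below; the
 * now-stale XFS_ILOG_DDATA bit is simply ignored.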
2932 */ 2933 /*ARGSUSED*/ 2934 STATIC int 2935 xfs_iflush_fork( 2936 xfs_inode_t *ip, 2937 xfs_dinode_t *dip, 2938 xfs_inode_log_item_t *iip, 2939 int whichfork, 2940 xfs_buf_t *bp) 2941 { 2942 char *cp; 2943 xfs_ifork_t *ifp; 2944 xfs_mount_t *mp; 2945 #ifdef XFS_TRANS_DEBUG 2946 int first; 2947 #endif 2948 static const short brootflag[2] = 2949 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; 2950 static const short dataflag[2] = 2951 { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; 2952 static const short extflag[2] = 2953 { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; 2954 2955 if (iip == NULL) 2956 return 0; 2957 ifp = XFS_IFORK_PTR(ip, whichfork); 2958 /* 2959 * This can happen if we gave up in iformat in an error path, 2960 * for the attribute fork. 2961 */ 2962 if (ifp == NULL) { 2963 ASSERT(whichfork == XFS_ATTR_FORK); 2964 return 0; 2965 } 2966 cp = XFS_DFORK_PTR(dip, whichfork); 2967 mp = ip->i_mount; 2968 switch (XFS_IFORK_FORMAT(ip, whichfork)) { 2969 case XFS_DINODE_FMT_LOCAL: 2970 if ((iip->ili_format.ilf_fields & dataflag[whichfork]) && 2971 (ifp->if_bytes > 0)) { 2972 ASSERT(ifp->if_u1.if_data != NULL); 2973 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2974 memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); 2975 } 2976 if (whichfork == XFS_DATA_FORK) { 2977 if (unlikely(XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip))) { 2978 XFS_ERROR_REPORT("xfs_iflush_fork", 2979 XFS_ERRLEVEL_LOW, mp); 2980 return XFS_ERROR(EFSCORRUPTED); 2981 } 2982 } 2983 break; 2984 2985 case XFS_DINODE_FMT_EXTENTS: 2986 ASSERT((ifp->if_flags & XFS_IFEXTENTS) || 2987 !(iip->ili_format.ilf_fields & extflag[whichfork])); 2988 ASSERT((ifp->if_u1.if_extents != NULL) || (ifp->if_bytes == 0)); 2989 ASSERT((ifp->if_u1.if_extents == NULL) || (ifp->if_bytes > 0)); 2990 if ((iip->ili_format.ilf_fields & extflag[whichfork]) && 2991 (ifp->if_bytes > 0)) { 2992 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); 2993 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, 2994 whichfork); 2995 } 2996 break; 2997 2998 case XFS_DINODE_FMT_BTREE: 2999 if ((iip->ili_format.ilf_fields & brootflag[whichfork]) && 3000 (ifp->if_broot_bytes > 0)) { 3001 ASSERT(ifp->if_broot != NULL); 3002 ASSERT(ifp->if_broot_bytes <= 3003 (XFS_IFORK_SIZE(ip, whichfork) + 3004 XFS_BROOT_SIZE_ADJ)); 3005 xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes, 3006 (xfs_bmdr_block_t *)cp, 3007 XFS_DFORK_SIZE(dip, mp, whichfork)); 3008 } 3009 break; 3010 3011 case XFS_DINODE_FMT_DEV: 3012 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { 3013 ASSERT(whichfork == XFS_DATA_FORK); 3014 INT_SET(dip->di_u.di_dev, ARCH_CONVERT, ip->i_df.if_u2.if_rdev); 3015 } 3016 break; 3017 3018 case XFS_DINODE_FMT_UUID: 3019 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { 3020 ASSERT(whichfork == XFS_DATA_FORK); 3021 memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid, 3022 sizeof(uuid_t)); 3023 } 3024 break; 3025 3026 default: 3027 ASSERT(0); 3028 break; 3029 } 3030 3031 return 0; 3032 } 3033 3034 /* 3035 * xfs_iflush() will write a modified inode's changes out to the 3036 * inode's on disk home. The caller must have the inode lock held 3037 * in at least shared mode and the inode flush semaphore must be 3038 * held as well. The inode lock will still be held upon return from 3039 * the call and the caller is free to unlock it. 3040 * The inode flush lock will be unlocked when the inode reaches the disk. 3041 * The flags indicate how the inode's buffer should be written out. 
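 *
 * A typical call sequence looks something like this (illustrative
 * sketch only):
 *
 *	xfs_ilock(ip, XFS_ILOCK_SHARED);
 *	xfs_iflock(ip);
 *	error = xfs_iflush(ip, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
 *	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 *
 * with the flush lock released by the I/O completion handler (or by
 * xfs_iflush() itself if the inode turns out to be clean or the
 * flush fails).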
3042	 */
3043 int
3044 xfs_iflush(
3045	xfs_inode_t	*ip,
3046	uint		flags)
3047 {
3048	xfs_inode_log_item_t	*iip;
3049	xfs_buf_t		*bp;
3050	xfs_dinode_t		*dip;
3051	xfs_mount_t		*mp;
3052	int			error;
3053	/* REFERENCED */
3054	xfs_chash_t		*ch;
3055	xfs_inode_t		*iq;
3056	int			clcount;	/* count of inodes clustered */
3057	int			bufwasdelwri;
3058	enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
3059	SPLDECL(s);
3060
3061	XFS_STATS_INC(xs_iflush_count);
3062
3063	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
3064	ASSERT(valusema(&ip->i_flock) <= 0);
3065	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3066	       ip->i_d.di_nextents > ip->i_df.if_ext_max);
3067
3068	iip = ip->i_itemp;
3069	mp = ip->i_mount;
3070
3071	/*
3072	 * If the inode isn't dirty, then just release the inode
3073	 * flush lock and do nothing.
3074	 */
3075	if ((ip->i_update_core == 0) &&
3076	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
3077		ASSERT((iip != NULL) ?
3078			!(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1);
3079		xfs_ifunlock(ip);
3080		return 0;
3081	}
3082
3083	/*
3084	 * We can't flush the inode until it is unpinned, so
3085	 * wait for it. We know no one new can pin it, because
3086	 * we are holding the inode lock shared and you need
3087	 * to hold it exclusively to pin the inode.
3088	 */
3089	xfs_iunpin_wait(ip);
3090
3091	/*
3092	 * This may have been unpinned because the filesystem is shutting
3093	 * down forcibly. If that's the case we must not write this inode
3094	 * to disk, because the log record didn't make it to disk!
3095	 */
3096	if (XFS_FORCED_SHUTDOWN(mp)) {
3097		ip->i_update_core = 0;
3098		if (iip)
3099			iip->ili_format.ilf_fields = 0;
3100		xfs_ifunlock(ip);
3101		return XFS_ERROR(EIO);
3102	}
3103
3104	/*
3105	 * Get the buffer containing the on-disk inode.
3106	 */
3107	error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0);
3108	if (error != 0) {
3109		xfs_ifunlock(ip);
3110		return error;
3111	}
3112
3113	/*
3114	 * Decide how the buffer will be flushed out. This is done before
3115	 * the call to xfs_iflush_int because this field is zeroed by it.
3116	 */
3117	if (iip != NULL && iip->ili_format.ilf_fields != 0) {
3118		/*
3119		 * Flush out the inode buffer according to the directions
3120		 * of the caller. In the cases where the caller has given
3121		 * us a choice, choose the non-delwri case. This is because
3122		 * the inode is in the AIL and we need to get it out soon.
3123		 */
3124		switch (flags) {
3125		case XFS_IFLUSH_SYNC:
3126		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
3127			flags = 0;
3128			break;
3129		case XFS_IFLUSH_ASYNC:
3130		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
3131			flags = INT_ASYNC;
3132			break;
3133		case XFS_IFLUSH_DELWRI:
3134			flags = INT_DELWRI;
3135			break;
3136		default:
3137			ASSERT(0);
3138			flags = 0;
3139			break;
3140		}
3141	} else {
3142		switch (flags) {
3143		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
3144		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
3145		case XFS_IFLUSH_DELWRI:
3146			flags = INT_DELWRI;
3147			break;
3148		case XFS_IFLUSH_ASYNC:
3149			flags = INT_ASYNC;
3150			break;
3151		case XFS_IFLUSH_SYNC:
3152			flags = 0;
3153			break;
3154		default:
3155			ASSERT(0);
3156			flags = 0;
3157			break;
3158		}
3159	}
3160
3161	/*
3162	 * First flush out the inode that xfs_iflush was called with.
3163 */ 3164 error = xfs_iflush_int(ip, bp); 3165 if (error) { 3166 goto corrupt_out; 3167 } 3168 3169 /* 3170 * inode clustering: 3171 * see if other inodes can be gathered into this write 3172 */ 3173 3174 ip->i_chash->chl_buf = bp; 3175 3176 ch = XFS_CHASH(mp, ip->i_blkno); 3177 s = mutex_spinlock(&ch->ch_lock); 3178 3179 clcount = 0; 3180 for (iq = ip->i_cnext; iq != ip; iq = iq->i_cnext) { 3181 /* 3182 * Do an un-protected check to see if the inode is dirty and 3183 * is a candidate for flushing. These checks will be repeated 3184 * later after the appropriate locks are acquired. 3185 */ 3186 iip = iq->i_itemp; 3187 if ((iq->i_update_core == 0) && 3188 ((iip == NULL) || 3189 !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) && 3190 xfs_ipincount(iq) == 0) { 3191 continue; 3192 } 3193 3194 /* 3195 * Try to get locks. If any are unavailable, 3196 * then this inode cannot be flushed and is skipped. 3197 */ 3198 3199 /* get inode locks (just i_lock) */ 3200 if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) { 3201 /* get inode flush lock */ 3202 if (xfs_iflock_nowait(iq)) { 3203 /* check if pinned */ 3204 if (xfs_ipincount(iq) == 0) { 3205 /* arriving here means that 3206 * this inode can be flushed. 3207 * first re-check that it's 3208 * dirty 3209 */ 3210 iip = iq->i_itemp; 3211 if ((iq->i_update_core != 0)|| 3212 ((iip != NULL) && 3213 (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { 3214 clcount++; 3215 error = xfs_iflush_int(iq, bp); 3216 if (error) { 3217 xfs_iunlock(iq, 3218 XFS_ILOCK_SHARED); 3219 goto cluster_corrupt_out; 3220 } 3221 } else { 3222 xfs_ifunlock(iq); 3223 } 3224 } else { 3225 xfs_ifunlock(iq); 3226 } 3227 } 3228 xfs_iunlock(iq, XFS_ILOCK_SHARED); 3229 } 3230 } 3231 mutex_spinunlock(&ch->ch_lock, s); 3232 3233 if (clcount) { 3234 XFS_STATS_INC(xs_icluster_flushcnt); 3235 XFS_STATS_ADD(xs_icluster_flushinode, clcount); 3236 } 3237 3238 /* 3239 * If the buffer is pinned then push on the log so we won't 3240 * get stuck waiting in the write for too long. 3241 */ 3242 if (XFS_BUF_ISPINNED(bp)){ 3243 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 3244 } 3245 3246 if (flags & INT_DELWRI) { 3247 xfs_bdwrite(mp, bp); 3248 } else if (flags & INT_ASYNC) { 3249 xfs_bawrite(mp, bp); 3250 } else { 3251 error = xfs_bwrite(mp, bp); 3252 } 3253 return error; 3254 3255 corrupt_out: 3256 xfs_buf_relse(bp); 3257 xfs_force_shutdown(mp, XFS_CORRUPT_INCORE); 3258 xfs_iflush_abort(ip); 3259 /* 3260 * Unlocks the flush lock 3261 */ 3262 return XFS_ERROR(EFSCORRUPTED); 3263 3264 cluster_corrupt_out: 3265 /* Corruption detected in the clustering loop. Invalidate the 3266 * inode buffer and shut down the filesystem. 3267 */ 3268 mutex_spinunlock(&ch->ch_lock, s); 3269 3270 /* 3271 * Clean up the buffer. If it was B_DELWRI, just release it -- 3272 * brelse can handle it with no problems. If not, shut down the 3273 * filesystem before releasing the buffer. 3274 */ 3275 if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) { 3276 xfs_buf_relse(bp); 3277 } 3278 3279 xfs_force_shutdown(mp, XFS_CORRUPT_INCORE); 3280 3281 if(!bufwasdelwri) { 3282 /* 3283 * Just like incore_relse: if we have b_iodone functions, 3284 * mark the buffer as an error and call them. Otherwise 3285 * mark it as stale and brelse. 
3286 */ 3287 if (XFS_BUF_IODONE_FUNC(bp)) { 3288 XFS_BUF_CLR_BDSTRAT_FUNC(bp); 3289 XFS_BUF_UNDONE(bp); 3290 XFS_BUF_STALE(bp); 3291 XFS_BUF_SHUT(bp); 3292 XFS_BUF_ERROR(bp,EIO); 3293 xfs_biodone(bp); 3294 } else { 3295 XFS_BUF_STALE(bp); 3296 xfs_buf_relse(bp); 3297 } 3298 } 3299 3300 xfs_iflush_abort(iq); 3301 /* 3302 * Unlocks the flush lock 3303 */ 3304 return XFS_ERROR(EFSCORRUPTED); 3305 } 3306 3307 3308 STATIC int 3309 xfs_iflush_int( 3310 xfs_inode_t *ip, 3311 xfs_buf_t *bp) 3312 { 3313 xfs_inode_log_item_t *iip; 3314 xfs_dinode_t *dip; 3315 xfs_mount_t *mp; 3316 #ifdef XFS_TRANS_DEBUG 3317 int first; 3318 #endif 3319 SPLDECL(s); 3320 3321 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); 3322 ASSERT(valusema(&ip->i_flock) <= 0); 3323 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 3324 ip->i_d.di_nextents > ip->i_df.if_ext_max); 3325 3326 iip = ip->i_itemp; 3327 mp = ip->i_mount; 3328 3329 3330 /* 3331 * If the inode isn't dirty, then just release the inode 3332 * flush lock and do nothing. 3333 */ 3334 if ((ip->i_update_core == 0) && 3335 ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { 3336 xfs_ifunlock(ip); 3337 return 0; 3338 } 3339 3340 /* set *dip = inode's place in the buffer */ 3341 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset); 3342 3343 /* 3344 * Clear i_update_core before copying out the data. 3345 * This is for coordination with our timestamp updates 3346 * that don't hold the inode lock. They will always 3347 * update the timestamps BEFORE setting i_update_core, 3348 * so if we clear i_update_core after they set it we 3349 * are guaranteed to see their updates to the timestamps. 3350 * I believe that this depends on strongly ordered memory 3351 * semantics, but we have that. We use the SYNCHRONIZE 3352 * macro to make sure that the compiler does not reorder 3353 * the i_update_core access below the data copy below. 
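	 *
	 * The interleaving we care about, schematically:
	 *
	 *	timestamp updater:		flush (below):
	 *	update the timestamps		ip->i_update_core = 0;
	 *	ip->i_update_core = 1;		SYNCHRONIZE();
	 *					copy ip->i_d out to dip
	 *
	 * If the updater's store of i_update_core is ordered after our
	 * clear, the inode is simply left marked dirty again and the
	 * new timestamps go out with a later flush; nothing is lost.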
3354 */ 3355 ip->i_update_core = 0; 3356 SYNCHRONIZE(); 3357 3358 if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC, 3359 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 3360 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3361 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", 3362 ip->i_ino, (int) INT_GET(dip->di_core.di_magic, ARCH_CONVERT), dip); 3363 goto corrupt_out; 3364 } 3365 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 3366 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { 3367 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3368 "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x", 3369 ip->i_ino, ip, ip->i_d.di_magic); 3370 goto corrupt_out; 3371 } 3372 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { 3373 if (XFS_TEST_ERROR( 3374 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 3375 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), 3376 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { 3377 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3378 "xfs_iflush: Bad regular inode %Lu, ptr 0x%p", 3379 ip->i_ino, ip); 3380 goto corrupt_out; 3381 } 3382 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 3383 if (XFS_TEST_ERROR( 3384 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 3385 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 3386 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), 3387 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { 3388 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3389 "xfs_iflush: Bad directory inode %Lu, ptr 0x%p", 3390 ip->i_ino, ip); 3391 goto corrupt_out; 3392 } 3393 } 3394 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > 3395 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, 3396 XFS_RANDOM_IFLUSH_5)) { 3397 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3398 "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p", 3399 ip->i_ino, 3400 ip->i_d.di_nextents + ip->i_d.di_anextents, 3401 ip->i_d.di_nblocks, 3402 ip); 3403 goto corrupt_out; 3404 } 3405 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, 3406 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { 3407 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3408 "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p", 3409 ip->i_ino, ip->i_d.di_forkoff, ip); 3410 goto corrupt_out; 3411 } 3412 /* 3413 * bump the flush iteration count, used to detect flushes which 3414 * postdate a log record during recovery. 3415 */ 3416 3417 ip->i_d.di_flushiter++; 3418 3419 /* 3420 * Copy the dirty parts of the inode into the on-disk 3421 * inode. We always copy out the core of the inode, 3422 * because if the inode is dirty at all the core must 3423 * be. 3424 */ 3425 xfs_xlate_dinode_core((xfs_caddr_t)&(dip->di_core), &(ip->i_d), -1); 3426 3427 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 3428 if (ip->i_d.di_flushiter == DI_MAX_FLUSH) 3429 ip->i_d.di_flushiter = 0; 3430 3431 /* 3432 * If this is really an old format inode and the superblock version 3433 * has not been updated to support only new format inodes, then 3434 * convert back to the old inode format. If the superblock version 3435 * has been updated, then make the conversion permanent. 3436 */ 3437 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || 3438 XFS_SB_VERSION_HASNLINK(&mp->m_sb)); 3439 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { 3440 if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) { 3441 /* 3442 * Convert it back. 
3443 */ 3444 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 3445 INT_SET(dip->di_core.di_onlink, ARCH_CONVERT, ip->i_d.di_nlink); 3446 } else { 3447 /* 3448 * The superblock version has already been bumped, 3449 * so just make the conversion to the new inode 3450 * format permanent. 3451 */ 3452 ip->i_d.di_version = XFS_DINODE_VERSION_2; 3453 INT_SET(dip->di_core.di_version, ARCH_CONVERT, XFS_DINODE_VERSION_2); 3454 ip->i_d.di_onlink = 0; 3455 dip->di_core.di_onlink = 0; 3456 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 3457 memset(&(dip->di_core.di_pad[0]), 0, 3458 sizeof(dip->di_core.di_pad)); 3459 ASSERT(ip->i_d.di_projid == 0); 3460 } 3461 } 3462 3463 if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) { 3464 goto corrupt_out; 3465 } 3466 3467 if (XFS_IFORK_Q(ip)) { 3468 /* 3469 * The only error from xfs_iflush_fork is on the data fork. 3470 */ 3471 (void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); 3472 } 3473 xfs_inobp_check(mp, bp); 3474 3475 /* 3476 * We've recorded everything logged in the inode, so we'd 3477 * like to clear the ilf_fields bits so we don't log and 3478 * flush things unnecessarily. However, we can't stop 3479 * logging all this information until the data we've copied 3480 * into the disk buffer is written to disk. If we did we might 3481 * overwrite the copy of the inode in the log with all the 3482 * data after re-logging only part of it, and in the face of 3483 * a crash we wouldn't have all the data we need to recover. 3484 * 3485 * What we do is move the bits to the ili_last_fields field. 3486 * When logging the inode, these bits are moved back to the 3487 * ilf_fields field. In the xfs_iflush_done() routine we 3488 * clear ili_last_fields, since we know that the information 3489 * those bits represent is permanently on disk. As long as 3490 * the flush completes before the inode is logged again, then 3491 * both ilf_fields and ili_last_fields will be cleared. 3492 * 3493 * We can play with the ilf_fields bits here, because the inode 3494 * lock must be held exclusively in order to set bits there 3495 * and the flush lock protects the ili_last_fields bits. 3496 * Set ili_logged so the flush done 3497 * routine can tell whether or not to look in the AIL. 3498 * Also, store the current LSN of the inode so that we can tell 3499 * whether the item has moved in the AIL from xfs_iflush_done(). 3500 * In order to read the lsn we need the AIL lock, because 3501 * it is a 64 bit value that cannot be read atomically. 3502 */ 3503 if (iip != NULL && iip->ili_format.ilf_fields != 0) { 3504 iip->ili_last_fields = iip->ili_format.ilf_fields; 3505 iip->ili_format.ilf_fields = 0; 3506 iip->ili_logged = 1; 3507 3508 ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ 3509 AIL_LOCK(mp,s); 3510 iip->ili_flush_lsn = iip->ili_item.li_lsn; 3511 AIL_UNLOCK(mp, s); 3512 3513 /* 3514 * Attach the function xfs_iflush_done to the inode's 3515 * buffer. This will remove the inode from the AIL 3516 * and unlock the inode's flush lock when the inode is 3517 * completely written to disk. 3518 */ 3519 xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*)) 3520 xfs_iflush_done, (xfs_log_item_t *)iip); 3521 3522 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 3523 ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); 3524 } else { 3525 /* 3526 * We're flushing an inode which is not in the AIL and has 3527 * not been logged but has i_update_core set. 
For this 3528 * case we can use a B_DELWRI flush and immediately drop 3529 * the inode flush lock because we can avoid the whole 3530 * AIL state thing. It's OK to drop the flush lock now, 3531 * because we've already locked the buffer and to do anything 3532 * you really need both. 3533 */ 3534 if (iip != NULL) { 3535 ASSERT(iip->ili_logged == 0); 3536 ASSERT(iip->ili_last_fields == 0); 3537 ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); 3538 } 3539 xfs_ifunlock(ip); 3540 } 3541 3542 return 0; 3543 3544 corrupt_out: 3545 return XFS_ERROR(EFSCORRUPTED); 3546 } 3547 3548 3549 /* 3550 * Flush all inactive inodes in mp. 3551 */ 3552 void 3553 xfs_iflush_all( 3554 xfs_mount_t *mp) 3555 { 3556 xfs_inode_t *ip; 3557 vnode_t *vp; 3558 3559 again: 3560 XFS_MOUNT_ILOCK(mp); 3561 ip = mp->m_inodes; 3562 if (ip == NULL) 3563 goto out; 3564 3565 do { 3566 /* Make sure we skip markers inserted by sync */ 3567 if (ip->i_mount == NULL) { 3568 ip = ip->i_mnext; 3569 continue; 3570 } 3571 3572 vp = XFS_ITOV_NULL(ip); 3573 if (!vp) { 3574 XFS_MOUNT_IUNLOCK(mp); 3575 xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC); 3576 goto again; 3577 } 3578 3579 ASSERT(vn_count(vp) == 0); 3580 3581 ip = ip->i_mnext; 3582 } while (ip != mp->m_inodes); 3583 out: 3584 XFS_MOUNT_IUNLOCK(mp); 3585 } 3586 3587 /* 3588 * xfs_iaccess: check accessibility of inode for mode. 3589 */ 3590 int 3591 xfs_iaccess( 3592 xfs_inode_t *ip, 3593 mode_t mode, 3594 cred_t *cr) 3595 { 3596 int error; 3597 mode_t orgmode = mode; 3598 struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip)); 3599 3600 if (mode & S_IWUSR) { 3601 umode_t imode = inode->i_mode; 3602 3603 if (IS_RDONLY(inode) && 3604 (S_ISREG(imode) || S_ISDIR(imode) || S_ISLNK(imode))) 3605 return XFS_ERROR(EROFS); 3606 3607 if (IS_IMMUTABLE(inode)) 3608 return XFS_ERROR(EACCES); 3609 } 3610 3611 /* 3612 * If there's an Access Control List it's used instead of 3613 * the mode bits. 3614 */ 3615 if ((error = _ACL_XFS_IACCESS(ip, mode, cr)) != -1) 3616 return error ? XFS_ERROR(error) : 0; 3617 3618 if (current_fsuid(cr) != ip->i_d.di_uid) { 3619 mode >>= 3; 3620 if (!in_group_p((gid_t)ip->i_d.di_gid)) 3621 mode >>= 3; 3622 } 3623 3624 /* 3625 * If the DACs are ok we don't need any capability check. 3626 */ 3627 if ((ip->i_d.di_mode & mode) == mode) 3628 return 0; 3629 /* 3630 * Read/write DACs are always overridable. 3631 * Executable DACs are overridable if at least one exec bit is set. 
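	 * For example, CAP_DAC_OVERRIDE is sufficient to read or write
	 * a mode 0000 regular file, but it only overrides an execute
	 * check if some x bit is set in the mode or the object is a
	 * directory.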
3632 */ 3633 if (!(orgmode & S_IXUSR) || 3634 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode)) 3635 if (capable_cred(cr, CAP_DAC_OVERRIDE)) 3636 return 0; 3637 3638 if ((orgmode == S_IRUSR) || 3639 (S_ISDIR(inode->i_mode) && (!(orgmode & S_IWUSR)))) { 3640 if (capable_cred(cr, CAP_DAC_READ_SEARCH)) 3641 return 0; 3642 #ifdef NOISE 3643 cmn_err(CE_NOTE, "Ick: mode=%o, orgmode=%o", mode, orgmode); 3644 #endif /* NOISE */ 3645 return XFS_ERROR(EACCES); 3646 } 3647 return XFS_ERROR(EACCES); 3648 } 3649 3650 /* 3651 * xfs_iroundup: round up argument to next power of two 3652 */ 3653 uint 3654 xfs_iroundup( 3655 uint v) 3656 { 3657 int i; 3658 uint m; 3659 3660 if ((v & (v - 1)) == 0) 3661 return v; 3662 ASSERT((v & 0x80000000) == 0); 3663 if ((v & (v + 1)) == 0) 3664 return v + 1; 3665 for (i = 0, m = 1; i < 31; i++, m <<= 1) { 3666 if (v & m) 3667 continue; 3668 v |= m; 3669 if ((v & (v + 1)) == 0) 3670 return v + 1; 3671 } 3672 ASSERT(0); 3673 return( 0 ); 3674 } 3675 3676 #ifdef XFS_ILOCK_TRACE 3677 ktrace_t *xfs_ilock_trace_buf; 3678 3679 void 3680 xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra) 3681 { 3682 ktrace_enter(ip->i_lock_trace, 3683 (void *)ip, 3684 (void *)(unsigned long)lock, /* 1 = LOCK, 3=UNLOCK, etc */ 3685 (void *)(unsigned long)lockflags, /* XFS_ILOCK_EXCL etc */ 3686 (void *)ra, /* caller of ilock */ 3687 (void *)(unsigned long)current_cpu(), 3688 (void *)(unsigned long)current_pid(), 3689 NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL); 3690 } 3691 #endif 3692