1 /* 2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 #include "xfs.h" 19 #include "xfs_fs.h" 20 #include "xfs_types.h" 21 #include "xfs_bit.h" 22 #include "xfs_log.h" 23 #include "xfs_inum.h" 24 #include "xfs_trans.h" 25 #include "xfs_sb.h" 26 #include "xfs_ag.h" 27 #include "xfs_mount.h" 28 #include "xfs_trans_priv.h" 29 #include "xfs_bmap_btree.h" 30 #include "xfs_dinode.h" 31 #include "xfs_inode.h" 32 #include "xfs_inode_item.h" 33 #include "xfs_error.h" 34 #include "xfs_trace.h" 35 36 37 kmem_zone_t *xfs_ili_zone; /* inode log item zone */ 38 39 static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) 40 { 41 return container_of(lip, struct xfs_inode_log_item, ili_item); 42 } 43 44 45 /* 46 * This returns the number of iovecs needed to log the given inode item. 47 * 48 * We need one iovec for the inode log format structure, one for the 49 * inode core, and possibly one for the inode data/extents/b-tree root 50 * and one for the inode attribute data/extents/b-tree root. 51 */ 52 STATIC uint 53 xfs_inode_item_size( 54 struct xfs_log_item *lip) 55 { 56 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 57 struct xfs_inode *ip = iip->ili_inode; 58 uint nvecs = 2; 59 60 switch (ip->i_d.di_format) { 61 case XFS_DINODE_FMT_EXTENTS: 62 if ((iip->ili_fields & XFS_ILOG_DEXT) && 63 ip->i_d.di_nextents > 0 && 64 ip->i_df.if_bytes > 0) 65 nvecs++; 66 break; 67 68 case XFS_DINODE_FMT_BTREE: 69 if ((iip->ili_fields & XFS_ILOG_DBROOT) && 70 ip->i_df.if_broot_bytes > 0) 71 nvecs++; 72 break; 73 74 case XFS_DINODE_FMT_LOCAL: 75 if ((iip->ili_fields & XFS_ILOG_DDATA) && 76 ip->i_df.if_bytes > 0) 77 nvecs++; 78 break; 79 80 case XFS_DINODE_FMT_DEV: 81 case XFS_DINODE_FMT_UUID: 82 break; 83 84 default: 85 ASSERT(0); 86 break; 87 } 88 89 if (!XFS_IFORK_Q(ip)) 90 return nvecs; 91 92 93 /* 94 * Log any necessary attribute data. 95 */ 96 switch (ip->i_d.di_aformat) { 97 case XFS_DINODE_FMT_EXTENTS: 98 if ((iip->ili_fields & XFS_ILOG_AEXT) && 99 ip->i_d.di_anextents > 0 && 100 ip->i_afp->if_bytes > 0) 101 nvecs++; 102 break; 103 104 case XFS_DINODE_FMT_BTREE: 105 if ((iip->ili_fields & XFS_ILOG_ABROOT) && 106 ip->i_afp->if_broot_bytes > 0) 107 nvecs++; 108 break; 109 110 case XFS_DINODE_FMT_LOCAL: 111 if ((iip->ili_fields & XFS_ILOG_ADATA) && 112 ip->i_afp->if_bytes > 0) 113 nvecs++; 114 break; 115 116 default: 117 ASSERT(0); 118 break; 119 } 120 121 return nvecs; 122 } 123 124 /* 125 * xfs_inode_item_format_extents - convert in-core extents to on-disk form 126 * 127 * For either the data or attr fork in extent format, we need to endian convert 128 * the in-core extent as we place them into the on-disk inode. In this case, we 129 * need to do this conversion before we write the extents into the log. Because 130 * we don't have the disk inode to write into here, we allocate a buffer and 131 * format the extents into it via xfs_iextents_copy(). We free the buffer in 132 * the unlock routine after the copy for the log has been made. 133 * 134 * In the case of the data fork, the in-core and on-disk fork sizes can be 135 * different due to delayed allocation extents. We only log on-disk extents 136 * here, so always use the physical fork size to determine the size of the 137 * buffer we need to allocate. 138 */ 139 STATIC void 140 xfs_inode_item_format_extents( 141 struct xfs_inode *ip, 142 struct xfs_log_iovec *vecp, 143 int whichfork, 144 int type) 145 { 146 xfs_bmbt_rec_t *ext_buffer; 147 148 ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP); 149 if (whichfork == XFS_DATA_FORK) 150 ip->i_itemp->ili_extents_buf = ext_buffer; 151 else 152 ip->i_itemp->ili_aextents_buf = ext_buffer; 153 154 vecp->i_addr = ext_buffer; 155 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork); 156 vecp->i_type = type; 157 } 158 159 /* 160 * This is called to fill in the vector of log iovecs for the 161 * given inode log item. It fills the first item with an inode 162 * log format structure, the second with the on-disk inode structure, 163 * and a possible third and/or fourth with the inode data/extents/b-tree 164 * root and inode attributes data/extents/b-tree root. 165 */ 166 STATIC void 167 xfs_inode_item_format( 168 struct xfs_log_item *lip, 169 struct xfs_log_iovec *vecp) 170 { 171 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 172 struct xfs_inode *ip = iip->ili_inode; 173 uint nvecs; 174 size_t data_bytes; 175 xfs_mount_t *mp; 176 177 vecp->i_addr = &iip->ili_format; 178 vecp->i_len = sizeof(xfs_inode_log_format_t); 179 vecp->i_type = XLOG_REG_TYPE_IFORMAT; 180 vecp++; 181 nvecs = 1; 182 183 vecp->i_addr = &ip->i_d; 184 vecp->i_len = sizeof(struct xfs_icdinode); 185 vecp->i_type = XLOG_REG_TYPE_ICORE; 186 vecp++; 187 nvecs++; 188 189 /* 190 * If this is really an old format inode, then we need to 191 * log it as such. This means that we have to copy the link 192 * count from the new field to the old. We don't have to worry 193 * about the new fields, because nothing trusts them as long as 194 * the old inode version number is there. If the superblock already 195 * has a new version number, then we don't bother converting back. 196 */ 197 mp = ip->i_mount; 198 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); 199 if (ip->i_d.di_version == 1) { 200 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 201 /* 202 * Convert it back. 203 */ 204 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 205 ip->i_d.di_onlink = ip->i_d.di_nlink; 206 } else { 207 /* 208 * The superblock version has already been bumped, 209 * so just make the conversion to the new inode 210 * format permanent. 211 */ 212 ip->i_d.di_version = 2; 213 ip->i_d.di_onlink = 0; 214 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 215 } 216 } 217 218 switch (ip->i_d.di_format) { 219 case XFS_DINODE_FMT_EXTENTS: 220 iip->ili_fields &= 221 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 222 XFS_ILOG_DEV | XFS_ILOG_UUID); 223 224 if ((iip->ili_fields & XFS_ILOG_DEXT) && 225 ip->i_d.di_nextents > 0 && 226 ip->i_df.if_bytes > 0) { 227 ASSERT(ip->i_df.if_u1.if_extents != NULL); 228 ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); 229 ASSERT(iip->ili_extents_buf == NULL); 230 231 #ifdef XFS_NATIVE_HOST 232 if (ip->i_d.di_nextents == ip->i_df.if_bytes / 233 (uint)sizeof(xfs_bmbt_rec_t)) { 234 /* 235 * There are no delayed allocation 236 * extents, so just point to the 237 * real extents array. 238 */ 239 vecp->i_addr = ip->i_df.if_u1.if_extents; 240 vecp->i_len = ip->i_df.if_bytes; 241 vecp->i_type = XLOG_REG_TYPE_IEXT; 242 } else 243 #endif 244 { 245 xfs_inode_item_format_extents(ip, vecp, 246 XFS_DATA_FORK, XLOG_REG_TYPE_IEXT); 247 } 248 ASSERT(vecp->i_len <= ip->i_df.if_bytes); 249 iip->ili_format.ilf_dsize = vecp->i_len; 250 vecp++; 251 nvecs++; 252 } else { 253 iip->ili_fields &= ~XFS_ILOG_DEXT; 254 } 255 break; 256 257 case XFS_DINODE_FMT_BTREE: 258 iip->ili_fields &= 259 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | 260 XFS_ILOG_DEV | XFS_ILOG_UUID); 261 262 if ((iip->ili_fields & XFS_ILOG_DBROOT) && 263 ip->i_df.if_broot_bytes > 0) { 264 ASSERT(ip->i_df.if_broot != NULL); 265 vecp->i_addr = ip->i_df.if_broot; 266 vecp->i_len = ip->i_df.if_broot_bytes; 267 vecp->i_type = XLOG_REG_TYPE_IBROOT; 268 vecp++; 269 nvecs++; 270 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; 271 } else { 272 ASSERT(!(iip->ili_fields & 273 XFS_ILOG_DBROOT)); 274 #ifdef XFS_TRANS_DEBUG 275 if (iip->ili_root_size > 0) { 276 ASSERT(iip->ili_root_size == 277 ip->i_df.if_broot_bytes); 278 ASSERT(memcmp(iip->ili_orig_root, 279 ip->i_df.if_broot, 280 iip->ili_root_size) == 0); 281 } else { 282 ASSERT(ip->i_df.if_broot_bytes == 0); 283 } 284 #endif 285 iip->ili_fields &= ~XFS_ILOG_DBROOT; 286 } 287 break; 288 289 case XFS_DINODE_FMT_LOCAL: 290 iip->ili_fields &= 291 ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | 292 XFS_ILOG_DEV | XFS_ILOG_UUID); 293 if ((iip->ili_fields & XFS_ILOG_DDATA) && 294 ip->i_df.if_bytes > 0) { 295 ASSERT(ip->i_df.if_u1.if_data != NULL); 296 ASSERT(ip->i_d.di_size > 0); 297 298 vecp->i_addr = ip->i_df.if_u1.if_data; 299 /* 300 * Round i_bytes up to a word boundary. 301 * The underlying memory is guaranteed to 302 * to be there by xfs_idata_realloc(). 303 */ 304 data_bytes = roundup(ip->i_df.if_bytes, 4); 305 ASSERT((ip->i_df.if_real_bytes == 0) || 306 (ip->i_df.if_real_bytes == data_bytes)); 307 vecp->i_len = (int)data_bytes; 308 vecp->i_type = XLOG_REG_TYPE_ILOCAL; 309 vecp++; 310 nvecs++; 311 iip->ili_format.ilf_dsize = (unsigned)data_bytes; 312 } else { 313 iip->ili_fields &= ~XFS_ILOG_DDATA; 314 } 315 break; 316 317 case XFS_DINODE_FMT_DEV: 318 iip->ili_fields &= 319 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 320 XFS_ILOG_DEXT | XFS_ILOG_UUID); 321 if (iip->ili_fields & XFS_ILOG_DEV) { 322 iip->ili_format.ilf_u.ilfu_rdev = 323 ip->i_df.if_u2.if_rdev; 324 } 325 break; 326 327 case XFS_DINODE_FMT_UUID: 328 iip->ili_fields &= 329 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 330 XFS_ILOG_DEXT | XFS_ILOG_DEV); 331 if (iip->ili_fields & XFS_ILOG_UUID) { 332 iip->ili_format.ilf_u.ilfu_uuid = 333 ip->i_df.if_u2.if_uuid; 334 } 335 break; 336 337 default: 338 ASSERT(0); 339 break; 340 } 341 342 /* 343 * If there are no attributes associated with the file, then we're done. 344 */ 345 if (!XFS_IFORK_Q(ip)) { 346 iip->ili_fields &= 347 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); 348 goto out; 349 } 350 351 switch (ip->i_d.di_aformat) { 352 case XFS_DINODE_FMT_EXTENTS: 353 iip->ili_fields &= 354 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); 355 356 if ((iip->ili_fields & XFS_ILOG_AEXT) && 357 ip->i_d.di_anextents > 0 && 358 ip->i_afp->if_bytes > 0) { 359 ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == 360 ip->i_d.di_anextents); 361 ASSERT(ip->i_afp->if_u1.if_extents != NULL); 362 #ifdef XFS_NATIVE_HOST 363 /* 364 * There are not delayed allocation extents 365 * for attributes, so just point at the array. 366 */ 367 vecp->i_addr = ip->i_afp->if_u1.if_extents; 368 vecp->i_len = ip->i_afp->if_bytes; 369 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT; 370 #else 371 ASSERT(iip->ili_aextents_buf == NULL); 372 xfs_inode_item_format_extents(ip, vecp, 373 XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT); 374 #endif 375 iip->ili_format.ilf_asize = vecp->i_len; 376 vecp++; 377 nvecs++; 378 } else { 379 iip->ili_fields &= ~XFS_ILOG_AEXT; 380 } 381 break; 382 383 case XFS_DINODE_FMT_BTREE: 384 iip->ili_fields &= 385 ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); 386 387 if ((iip->ili_fields & XFS_ILOG_ABROOT) && 388 ip->i_afp->if_broot_bytes > 0) { 389 ASSERT(ip->i_afp->if_broot != NULL); 390 391 vecp->i_addr = ip->i_afp->if_broot; 392 vecp->i_len = ip->i_afp->if_broot_bytes; 393 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; 394 vecp++; 395 nvecs++; 396 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; 397 } else { 398 iip->ili_fields &= ~XFS_ILOG_ABROOT; 399 } 400 break; 401 402 case XFS_DINODE_FMT_LOCAL: 403 iip->ili_fields &= 404 ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); 405 406 if ((iip->ili_fields & XFS_ILOG_ADATA) && 407 ip->i_afp->if_bytes > 0) { 408 ASSERT(ip->i_afp->if_u1.if_data != NULL); 409 410 vecp->i_addr = ip->i_afp->if_u1.if_data; 411 /* 412 * Round i_bytes up to a word boundary. 413 * The underlying memory is guaranteed to 414 * to be there by xfs_idata_realloc(). 415 */ 416 data_bytes = roundup(ip->i_afp->if_bytes, 4); 417 ASSERT((ip->i_afp->if_real_bytes == 0) || 418 (ip->i_afp->if_real_bytes == data_bytes)); 419 vecp->i_len = (int)data_bytes; 420 vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL; 421 vecp++; 422 nvecs++; 423 iip->ili_format.ilf_asize = (unsigned)data_bytes; 424 } else { 425 iip->ili_fields &= ~XFS_ILOG_ADATA; 426 } 427 break; 428 429 default: 430 ASSERT(0); 431 break; 432 } 433 434 out: 435 /* 436 * Now update the log format that goes out to disk from the in-core 437 * values. We always write the inode core to make the arithmetic 438 * games in recovery easier, which isn't a big deal as just about any 439 * transaction would dirty it anyway. 440 */ 441 iip->ili_format.ilf_fields = XFS_ILOG_CORE | 442 (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); 443 iip->ili_format.ilf_size = nvecs; 444 } 445 446 447 /* 448 * This is called to pin the inode associated with the inode log 449 * item in memory so it cannot be written out. 450 */ 451 STATIC void 452 xfs_inode_item_pin( 453 struct xfs_log_item *lip) 454 { 455 struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; 456 457 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 458 459 trace_xfs_inode_pin(ip, _RET_IP_); 460 atomic_inc(&ip->i_pincount); 461 } 462 463 464 /* 465 * This is called to unpin the inode associated with the inode log 466 * item which was previously pinned with a call to xfs_inode_item_pin(). 467 * 468 * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. 469 */ 470 STATIC void 471 xfs_inode_item_unpin( 472 struct xfs_log_item *lip, 473 int remove) 474 { 475 struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; 476 477 trace_xfs_inode_unpin(ip, _RET_IP_); 478 ASSERT(atomic_read(&ip->i_pincount) > 0); 479 if (atomic_dec_and_test(&ip->i_pincount)) 480 wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); 481 } 482 483 /* 484 * This is called to attempt to lock the inode associated with this 485 * inode log item, in preparation for the push routine which does the actual 486 * iflush. Don't sleep on the inode lock or the flush lock. 487 * 488 * If the flush lock is already held, indicating that the inode has 489 * been or is in the process of being flushed, then (ideally) we'd like to 490 * see if the inode's buffer is still incore, and if so give it a nudge. 491 * We delay doing so until the pushbuf routine, though, to avoid holding 492 * the AIL lock across a call to the blackhole which is the buffer cache. 493 * Also we don't want to sleep in any device strategy routines, which can happen 494 * if we do the subsequent bawrite in here. 495 */ 496 STATIC uint 497 xfs_inode_item_trylock( 498 struct xfs_log_item *lip) 499 { 500 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 501 struct xfs_inode *ip = iip->ili_inode; 502 503 if (xfs_ipincount(ip) > 0) 504 return XFS_ITEM_PINNED; 505 506 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) 507 return XFS_ITEM_LOCKED; 508 509 if (!xfs_iflock_nowait(ip)) { 510 /* 511 * inode has already been flushed to the backing buffer, 512 * leave it locked in shared mode, pushbuf routine will 513 * unlock it. 514 */ 515 return XFS_ITEM_PUSHBUF; 516 } 517 518 /* Stale items should force out the iclog */ 519 if (ip->i_flags & XFS_ISTALE) { 520 xfs_ifunlock(ip); 521 xfs_iunlock(ip, XFS_ILOCK_SHARED); 522 return XFS_ITEM_PINNED; 523 } 524 525 #ifdef DEBUG 526 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 527 ASSERT(iip->ili_fields != 0); 528 ASSERT(iip->ili_logged == 0); 529 ASSERT(lip->li_flags & XFS_LI_IN_AIL); 530 } 531 #endif 532 return XFS_ITEM_SUCCESS; 533 } 534 535 /* 536 * Unlock the inode associated with the inode log item. 537 * Clear the fields of the inode and inode log item that 538 * are specific to the current transaction. If the 539 * hold flags is set, do not unlock the inode. 540 */ 541 STATIC void 542 xfs_inode_item_unlock( 543 struct xfs_log_item *lip) 544 { 545 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 546 struct xfs_inode *ip = iip->ili_inode; 547 unsigned short lock_flags; 548 549 ASSERT(ip->i_itemp != NULL); 550 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 551 552 /* 553 * If the inode needed a separate buffer with which to log 554 * its extents, then free it now. 555 */ 556 if (iip->ili_extents_buf != NULL) { 557 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS); 558 ASSERT(ip->i_d.di_nextents > 0); 559 ASSERT(iip->ili_fields & XFS_ILOG_DEXT); 560 ASSERT(ip->i_df.if_bytes > 0); 561 kmem_free(iip->ili_extents_buf); 562 iip->ili_extents_buf = NULL; 563 } 564 if (iip->ili_aextents_buf != NULL) { 565 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS); 566 ASSERT(ip->i_d.di_anextents > 0); 567 ASSERT(iip->ili_fields & XFS_ILOG_AEXT); 568 ASSERT(ip->i_afp->if_bytes > 0); 569 kmem_free(iip->ili_aextents_buf); 570 iip->ili_aextents_buf = NULL; 571 } 572 573 lock_flags = iip->ili_lock_flags; 574 iip->ili_lock_flags = 0; 575 if (lock_flags) 576 xfs_iunlock(ip, lock_flags); 577 } 578 579 /* 580 * This is called to find out where the oldest active copy of the inode log 581 * item in the on disk log resides now that the last log write of it completed 582 * at the given lsn. Since we always re-log all dirty data in an inode, the 583 * latest copy in the on disk log is the only one that matters. Therefore, 584 * simply return the given lsn. 585 * 586 * If the inode has been marked stale because the cluster is being freed, we 587 * don't want to (re-)insert this inode into the AIL. There is a race condition 588 * where the cluster buffer may be unpinned before the inode is inserted into 589 * the AIL during transaction committed processing. If the buffer is unpinned 590 * before the inode item has been committed and inserted, then it is possible 591 * for the buffer to be written and IO completes before the inode is inserted 592 * into the AIL. In that case, we'd be inserting a clean, stale inode into the 593 * AIL which will never get removed. It will, however, get reclaimed which 594 * triggers an assert in xfs_inode_free() complaining about freein an inode 595 * still in the AIL. 596 * 597 * To avoid this, just unpin the inode directly and return a LSN of -1 so the 598 * transaction committed code knows that it does not need to do any further 599 * processing on the item. 600 */ 601 STATIC xfs_lsn_t 602 xfs_inode_item_committed( 603 struct xfs_log_item *lip, 604 xfs_lsn_t lsn) 605 { 606 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 607 struct xfs_inode *ip = iip->ili_inode; 608 609 if (xfs_iflags_test(ip, XFS_ISTALE)) { 610 xfs_inode_item_unpin(lip, 0); 611 return -1; 612 } 613 return lsn; 614 } 615 616 /* 617 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK 618 * failed to get the inode flush lock but did get the inode locked SHARED. 619 * Here we're trying to see if the inode buffer is incore, and if so whether it's 620 * marked delayed write. If that's the case, we'll promote it and that will 621 * allow the caller to write the buffer by triggering the xfsbufd to run. 622 */ 623 STATIC bool 624 xfs_inode_item_pushbuf( 625 struct xfs_log_item *lip) 626 { 627 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 628 struct xfs_inode *ip = iip->ili_inode; 629 struct xfs_buf *bp; 630 bool ret = true; 631 632 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 633 634 /* 635 * If a flush is not in progress anymore, chances are that the 636 * inode was taken off the AIL. So, just get out. 637 */ 638 if (!xfs_isiflocked(ip) || 639 !(lip->li_flags & XFS_LI_IN_AIL)) { 640 xfs_iunlock(ip, XFS_ILOCK_SHARED); 641 return true; 642 } 643 644 bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, 645 iip->ili_format.ilf_len, XBF_TRYLOCK); 646 647 xfs_iunlock(ip, XFS_ILOCK_SHARED); 648 if (!bp) 649 return true; 650 if (XFS_BUF_ISDELAYWRITE(bp)) 651 xfs_buf_delwri_promote(bp); 652 if (xfs_buf_ispinned(bp)) 653 ret = false; 654 xfs_buf_relse(bp); 655 return ret; 656 } 657 658 /* 659 * This is called to asynchronously write the inode associated with this 660 * inode log item out to disk. The inode will already have been locked by 661 * a successful call to xfs_inode_item_trylock(). 662 */ 663 STATIC void 664 xfs_inode_item_push( 665 struct xfs_log_item *lip) 666 { 667 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 668 struct xfs_inode *ip = iip->ili_inode; 669 670 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 671 ASSERT(xfs_isiflocked(ip)); 672 673 /* 674 * Since we were able to lock the inode's flush lock and 675 * we found it on the AIL, the inode must be dirty. This 676 * is because the inode is removed from the AIL while still 677 * holding the flush lock in xfs_iflush_done(). Thus, if 678 * we found it in the AIL and were able to obtain the flush 679 * lock without sleeping, then there must not have been 680 * anyone in the process of flushing the inode. 681 */ 682 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || iip->ili_fields != 0); 683 684 /* 685 * Push the inode to it's backing buffer. This will not remove the 686 * inode from the AIL - a further push will be required to trigger a 687 * buffer push. However, this allows all the dirty inodes to be pushed 688 * to the buffer before it is pushed to disk. The buffer IO completion 689 * will pull the inode from the AIL, mark it clean and unlock the flush 690 * lock. 691 */ 692 (void) xfs_iflush(ip, SYNC_TRYLOCK); 693 xfs_iunlock(ip, XFS_ILOCK_SHARED); 694 } 695 696 /* 697 * XXX rcc - this one really has to do something. Probably needs 698 * to stamp in a new field in the incore inode. 699 */ 700 STATIC void 701 xfs_inode_item_committing( 702 struct xfs_log_item *lip, 703 xfs_lsn_t lsn) 704 { 705 INODE_ITEM(lip)->ili_last_lsn = lsn; 706 } 707 708 /* 709 * This is the ops vector shared by all buf log items. 710 */ 711 static const struct xfs_item_ops xfs_inode_item_ops = { 712 .iop_size = xfs_inode_item_size, 713 .iop_format = xfs_inode_item_format, 714 .iop_pin = xfs_inode_item_pin, 715 .iop_unpin = xfs_inode_item_unpin, 716 .iop_trylock = xfs_inode_item_trylock, 717 .iop_unlock = xfs_inode_item_unlock, 718 .iop_committed = xfs_inode_item_committed, 719 .iop_push = xfs_inode_item_push, 720 .iop_pushbuf = xfs_inode_item_pushbuf, 721 .iop_committing = xfs_inode_item_committing 722 }; 723 724 725 /* 726 * Initialize the inode log item for a newly allocated (in-core) inode. 727 */ 728 void 729 xfs_inode_item_init( 730 struct xfs_inode *ip, 731 struct xfs_mount *mp) 732 { 733 struct xfs_inode_log_item *iip; 734 735 ASSERT(ip->i_itemp == NULL); 736 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); 737 738 iip->ili_inode = ip; 739 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, 740 &xfs_inode_item_ops); 741 iip->ili_format.ilf_type = XFS_LI_INODE; 742 iip->ili_format.ilf_ino = ip->i_ino; 743 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; 744 iip->ili_format.ilf_len = ip->i_imap.im_len; 745 iip->ili_format.ilf_boffset = ip->i_imap.im_boffset; 746 } 747 748 /* 749 * Free the inode log item and any memory hanging off of it. 750 */ 751 void 752 xfs_inode_item_destroy( 753 xfs_inode_t *ip) 754 { 755 #ifdef XFS_TRANS_DEBUG 756 if (ip->i_itemp->ili_root_size != 0) { 757 kmem_free(ip->i_itemp->ili_orig_root); 758 } 759 #endif 760 kmem_zone_free(xfs_ili_zone, ip->i_itemp); 761 } 762 763 764 /* 765 * This is the inode flushing I/O completion routine. It is called 766 * from interrupt level when the buffer containing the inode is 767 * flushed to disk. It is responsible for removing the inode item 768 * from the AIL if it has not been re-logged, and unlocking the inode's 769 * flush lock. 770 * 771 * To reduce AIL lock traffic as much as possible, we scan the buffer log item 772 * list for other inodes that will run this function. We remove them from the 773 * buffer list so we can process all the inode IO completions in one AIL lock 774 * traversal. 775 */ 776 void 777 xfs_iflush_done( 778 struct xfs_buf *bp, 779 struct xfs_log_item *lip) 780 { 781 struct xfs_inode_log_item *iip; 782 struct xfs_log_item *blip; 783 struct xfs_log_item *next; 784 struct xfs_log_item *prev; 785 struct xfs_ail *ailp = lip->li_ailp; 786 int need_ail = 0; 787 788 /* 789 * Scan the buffer IO completions for other inodes being completed and 790 * attach them to the current inode log item. 791 */ 792 blip = bp->b_fspriv; 793 prev = NULL; 794 while (blip != NULL) { 795 if (lip->li_cb != xfs_iflush_done) { 796 prev = blip; 797 blip = blip->li_bio_list; 798 continue; 799 } 800 801 /* remove from list */ 802 next = blip->li_bio_list; 803 if (!prev) { 804 bp->b_fspriv = next; 805 } else { 806 prev->li_bio_list = next; 807 } 808 809 /* add to current list */ 810 blip->li_bio_list = lip->li_bio_list; 811 lip->li_bio_list = blip; 812 813 /* 814 * while we have the item, do the unlocked check for needing 815 * the AIL lock. 816 */ 817 iip = INODE_ITEM(blip); 818 if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) 819 need_ail++; 820 821 blip = next; 822 } 823 824 /* make sure we capture the state of the initial inode. */ 825 iip = INODE_ITEM(lip); 826 if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) 827 need_ail++; 828 829 /* 830 * We only want to pull the item from the AIL if it is 831 * actually there and its location in the log has not 832 * changed since we started the flush. Thus, we only bother 833 * if the ili_logged flag is set and the inode's lsn has not 834 * changed. First we check the lsn outside 835 * the lock since it's cheaper, and then we recheck while 836 * holding the lock before removing the inode from the AIL. 837 */ 838 if (need_ail) { 839 struct xfs_log_item *log_items[need_ail]; 840 int i = 0; 841 spin_lock(&ailp->xa_lock); 842 for (blip = lip; blip; blip = blip->li_bio_list) { 843 iip = INODE_ITEM(blip); 844 if (iip->ili_logged && 845 blip->li_lsn == iip->ili_flush_lsn) { 846 log_items[i++] = blip; 847 } 848 ASSERT(i <= need_ail); 849 } 850 /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ 851 xfs_trans_ail_delete_bulk(ailp, log_items, i); 852 } 853 854 855 /* 856 * clean up and unlock the flush lock now we are done. We can clear the 857 * ili_last_fields bits now that we know that the data corresponding to 858 * them is safely on disk. 859 */ 860 for (blip = lip; blip; blip = next) { 861 next = blip->li_bio_list; 862 blip->li_bio_list = NULL; 863 864 iip = INODE_ITEM(blip); 865 iip->ili_logged = 0; 866 iip->ili_last_fields = 0; 867 xfs_ifunlock(iip->ili_inode); 868 } 869 } 870 871 /* 872 * This is the inode flushing abort routine. It is called 873 * from xfs_iflush when the filesystem is shutting down to clean 874 * up the inode state. 875 * It is responsible for removing the inode item 876 * from the AIL if it has not been re-logged, and unlocking the inode's 877 * flush lock. 878 */ 879 void 880 xfs_iflush_abort( 881 xfs_inode_t *ip) 882 { 883 xfs_inode_log_item_t *iip = ip->i_itemp; 884 885 if (iip) { 886 struct xfs_ail *ailp = iip->ili_item.li_ailp; 887 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 888 spin_lock(&ailp->xa_lock); 889 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 890 /* xfs_trans_ail_delete() drops the AIL lock. */ 891 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip); 892 } else 893 spin_unlock(&ailp->xa_lock); 894 } 895 iip->ili_logged = 0; 896 /* 897 * Clear the ili_last_fields bits now that we know that the 898 * data corresponding to them is safely on disk. 899 */ 900 iip->ili_last_fields = 0; 901 /* 902 * Clear the inode logging fields so no more flushes are 903 * attempted. 904 */ 905 iip->ili_fields = 0; 906 } 907 /* 908 * Release the inode's flush lock since we're done with it. 909 */ 910 xfs_ifunlock(ip); 911 } 912 913 void 914 xfs_istale_done( 915 struct xfs_buf *bp, 916 struct xfs_log_item *lip) 917 { 918 xfs_iflush_abort(INODE_ITEM(lip)->ili_inode); 919 } 920 921 /* 922 * convert an xfs_inode_log_format struct from either 32 or 64 bit versions 923 * (which can have different field alignments) to the native version 924 */ 925 int 926 xfs_inode_item_format_convert( 927 xfs_log_iovec_t *buf, 928 xfs_inode_log_format_t *in_f) 929 { 930 if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) { 931 xfs_inode_log_format_32_t *in_f32 = buf->i_addr; 932 933 in_f->ilf_type = in_f32->ilf_type; 934 in_f->ilf_size = in_f32->ilf_size; 935 in_f->ilf_fields = in_f32->ilf_fields; 936 in_f->ilf_asize = in_f32->ilf_asize; 937 in_f->ilf_dsize = in_f32->ilf_dsize; 938 in_f->ilf_ino = in_f32->ilf_ino; 939 /* copy biggest field of ilf_u */ 940 memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, 941 in_f32->ilf_u.ilfu_uuid.__u_bits, 942 sizeof(uuid_t)); 943 in_f->ilf_blkno = in_f32->ilf_blkno; 944 in_f->ilf_len = in_f32->ilf_len; 945 in_f->ilf_boffset = in_f32->ilf_boffset; 946 return 0; 947 } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){ 948 xfs_inode_log_format_64_t *in_f64 = buf->i_addr; 949 950 in_f->ilf_type = in_f64->ilf_type; 951 in_f->ilf_size = in_f64->ilf_size; 952 in_f->ilf_fields = in_f64->ilf_fields; 953 in_f->ilf_asize = in_f64->ilf_asize; 954 in_f->ilf_dsize = in_f64->ilf_dsize; 955 in_f->ilf_ino = in_f64->ilf_ino; 956 /* copy biggest field of ilf_u */ 957 memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, 958 in_f64->ilf_u.ilfu_uuid.__u_bits, 959 sizeof(uuid_t)); 960 in_f->ilf_blkno = in_f64->ilf_blkno; 961 in_f->ilf_len = in_f64->ilf_len; 962 in_f->ilf_boffset = in_f64->ilf_boffset; 963 return 0; 964 } 965 return EFSCORRUPTED; 966 } 967