// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
#include "xfs_trace.h"
#include "xfs_log.h"

kmem_zone_t *xfs_buf_item_zone;

static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
{
        return container_of(lip, struct xfs_buf_log_item, bli_item);
}

STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);

/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
        struct xfs_log_iovec *iovec)
{
        struct xfs_buf_log_format *blfp = iovec->i_addr;
        char *bmp_end;
        char *item_end;

        if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len)
                return false;

        item_end = (char *)iovec->i_addr + iovec->i_len;
        bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
        return bmp_end <= item_end;
}

static inline int
xfs_buf_log_format_size(
        struct xfs_buf_log_format *blfp)
{
        return offsetof(struct xfs_buf_log_format, blf_data_map) +
                        (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}

/*
 * This returns the number of log iovecs needed to log the
 * given buf log item.
 *
 * It calculates this as 1 iovec for the buf log format structure
 * and 1 for each stretch of non-contiguous chunks to be logged.
 * Contiguous chunks are logged in a single iovec.
 *
 * If the XFS_BLI_STALE flag has been set, then log nothing.
 */
STATIC void
xfs_buf_item_size_segment(
        struct xfs_buf_log_item *bip,
        struct xfs_buf_log_format *blfp,
        int *nvecs,
        int *nbytes)
{
        struct xfs_buf *bp = bip->bli_buf;
        int next_bit;
        int last_bit;

        last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
        if (last_bit == -1)
                return;

        /*
         * initial count for a dirty buffer is 2 vectors - the format structure
         * and the first dirty region.
         */
        *nvecs += 2;
        *nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK;

        while (last_bit != -1) {
                /*
                 * This takes the bit number to start looking from and
                 * returns the next set bit from there.  It returns -1
                 * if there are no more bits set or the start bit is
                 * beyond the end of the bitmap.
                 */
                next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
                                        last_bit + 1);
                /*
                 * If we run out of bits, leave the loop,
                 * else if we find a new set of bits bump the number of vecs,
                 * else keep scanning the current set of bits.
                 */
                if (next_bit == -1) {
                        break;
                } else if (next_bit != last_bit + 1) {
                        last_bit = next_bit;
                        (*nvecs)++;
                } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
                           (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
                            XFS_BLF_CHUNK)) {
                        last_bit = next_bit;
                        (*nvecs)++;
                } else {
                        last_bit++;
                }
                *nbytes += XFS_BLF_CHUNK;
        }
}
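
/*
 * Worked example for the scan above (illustrative only, assuming the usual
 * 128 byte XFS_BLF_CHUNK and chunks that are contiguous in memory): a segment
 * with dirty bits {1, 2, 4, 5} contains two runs separated by the clear bit 3,
 * so it contributes one iovec for the format structure plus one per run
 * (*nvecs += 3) and four chunks of data (*nbytes += format size + 4 * 128).
 */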

/*
 * This returns the number of log iovecs needed to log the given buf log item.
 *
 * It calculates this as 1 iovec for the buf log format structure and 1 for each
 * stretch of non-contiguous chunks to be logged.  Contiguous chunks are logged
 * in a single iovec.
 *
 * Discontiguous buffers need a format structure per region that is being
 * logged. This makes the changes in the buffer appear to log recovery as though
 * they came from separate buffers, just like would occur if multiple buffers
 * were used instead of a single discontiguous buffer. This enables
 * discontiguous buffers to be in-memory constructs, completely transparent to
 * what ends up on disk.
 *
 * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
 * format structures.
 */
STATIC void
xfs_buf_item_size(
        struct xfs_log_item *lip,
        int *nvecs,
        int *nbytes)
{
        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
        int i;

        ASSERT(atomic_read(&bip->bli_refcount) > 0);
        if (bip->bli_flags & XFS_BLI_STALE) {
                /*
                 * The buffer is stale, so all we need to log
                 * is the buf log format structure with the
                 * cancel flag in it.
                 */
                trace_xfs_buf_item_size_stale(bip);
                ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
                *nvecs += bip->bli_format_count;
                for (i = 0; i < bip->bli_format_count; i++) {
                        *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
                }
                return;
        }

        ASSERT(bip->bli_flags & XFS_BLI_LOGGED);

        if (bip->bli_flags & XFS_BLI_ORDERED) {
                /*
                 * The buffer has been logged just to order it.
                 * It is not being included in the transaction
                 * commit, so no vectors are used at all.
                 */
                trace_xfs_buf_item_size_ordered(bip);
                *nvecs = XFS_LOG_VEC_ORDERED;
                return;
        }

        /*
         * the vector count is based on the number of buffer vectors we have
         * dirty bits in. This will only be greater than one when we have a
         * compound buffer with more than one segment dirty. Hence for compound
         * buffers we need to track which segment the dirty bits correspond to,
         * and when we move from one segment to the next increment the vector
         * count for the extra buf log format structure that will need to be
         * written.
         */
        for (i = 0; i < bip->bli_format_count; i++) {
                xfs_buf_item_size_segment(bip, &bip->bli_formats[i],
                                          nvecs, nbytes);
        }
        trace_xfs_buf_item_size(bip);
}

static inline void
xfs_buf_item_copy_iovec(
        struct xfs_log_vec *lv,
        struct xfs_log_iovec **vecp,
        struct xfs_buf *bp,
        uint offset,
        int first_bit,
        uint nbits)
{
        offset += first_bit * XFS_BLF_CHUNK;
        xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
                        xfs_buf_offset(bp, offset),
                        nbits * XFS_BLF_CHUNK);
}
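
/*
 * Adjacent bits in the dirty bitmap only describe adjacent memory if the
 * underlying chunks actually sit next to each other.  For discontiguous
 * (multi-segment) buffers that is not guaranteed, so check whether the chunk
 * for next_bit really starts where the chunk for last_bit ends before merging
 * the two bits into a single copy region.
 */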

static inline bool
xfs_buf_item_straddle(
        struct xfs_buf *bp,
        uint offset,
        int next_bit,
        int last_bit)
{
        return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) !=
                (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) +
                 XFS_BLF_CHUNK);
}

static void
xfs_buf_item_format_segment(
        struct xfs_buf_log_item *bip,
        struct xfs_log_vec *lv,
        struct xfs_log_iovec **vecp,
        uint offset,
        struct xfs_buf_log_format *blfp)
{
        struct xfs_buf *bp = bip->bli_buf;
        uint base_size;
        int first_bit;
        int last_bit;
        int next_bit;
        uint nbits;

        /* copy the flags across from the base format item */
        blfp->blf_flags = bip->__bli_format.blf_flags;

        /*
         * Base size is the actual size of the ondisk structure - it reflects
         * the actual size of the dirty bitmap rather than the size of the in
         * memory structure.
         */
        base_size = xfs_buf_log_format_size(blfp);

        first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
        if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
                /*
                 * If the map is not dirty in the transaction, mark
                 * the size as zero and do not advance the vector pointer.
                 */
                return;
        }

        blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
        blfp->blf_size = 1;

        if (bip->bli_flags & XFS_BLI_STALE) {
                /*
                 * The buffer is stale, so all we need to log
                 * is the buf log format structure with the
                 * cancel flag in it.
                 */
                trace_xfs_buf_item_format_stale(bip);
                ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
                return;
        }

        /*
         * Fill in an iovec for each set of contiguous chunks.
         */
        last_bit = first_bit;
        nbits = 1;
        for (;;) {
                /*
                 * This takes the bit number to start looking from and
                 * returns the next set bit from there.  It returns -1
                 * if there are no more bits set or the start bit is
                 * beyond the end of the bitmap.
                 */
                next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
                                        (uint)last_bit + 1);
                /*
                 * If we run out of bits fill in the last iovec and get out of
                 * the loop.  Else if we start a new set of bits then fill in
                 * the iovec for the series we were looking at and start
                 * counting the bits in the new one.  Else we're still in the
                 * same set of bits so just keep counting and scanning.
                 */
                if (next_bit == -1) {
                        xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
                                                first_bit, nbits);
                        blfp->blf_size++;
                        break;
                } else if (next_bit != last_bit + 1 ||
                           xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) {
                        xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
                                                first_bit, nbits);
                        blfp->blf_size++;
                        first_bit = next_bit;
                        last_bit = next_bit;
                        nbits = 1;
                } else {
                        last_bit++;
                        nbits++;
                }
        }
}
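
/*
 * Note that blfp->blf_size ends up counting the iovecs written for the
 * segment above: the format structure itself plus one for each contiguous
 * dirty region.  Log recovery uses this count to collect the region iovecs
 * that follow the format header.
 */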

/*
 * This is called to fill in the vector of log iovecs for the
 * given log buf item.  It fills the first entry with a buf log
 * format structure, and the rest point to contiguous chunks
 * within the buffer.
 */
STATIC void
xfs_buf_item_format(
        struct xfs_log_item *lip,
        struct xfs_log_vec *lv)
{
        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
        struct xfs_buf *bp = bip->bli_buf;
        struct xfs_log_iovec *vecp = NULL;
        uint offset = 0;
        int i;

        ASSERT(atomic_read(&bip->bli_refcount) > 0);
        ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
               (bip->bli_flags & XFS_BLI_STALE));
        ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
               (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
                && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
        ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
               (bip->bli_flags & XFS_BLI_STALE));

        /*
         * If it is an inode buffer, transfer the in-memory state to the
         * format flags and clear the in-memory state.
         *
         * For buffer based inode allocation, we do not transfer
         * this state if the inode buffer allocation has not yet been committed
         * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
         * correct replay of the inode allocation.
         *
         * For icreate item based inode allocation, the buffers aren't written
         * to the journal during allocation, and hence we should always tag the
         * buffer as an inode buffer so that the correct unlinked list replay
         * occurs during recovery.
         */
        if (bip->bli_flags & XFS_BLI_INODE_BUF) {
                if (xfs_sb_version_has_v3inode(&lip->li_mountp->m_sb) ||
                    !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
                      xfs_log_item_in_current_chkpt(lip)))
                        bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
                bip->bli_flags &= ~XFS_BLI_INODE_BUF;
        }

        for (i = 0; i < bip->bli_format_count; i++) {
                xfs_buf_item_format_segment(bip, lv, &vecp, offset,
                                            &bip->bli_formats[i]);
                offset += BBTOB(bp->b_maps[i].bm_len);
        }

        /*
         * Check to make sure everything is consistent.
         */
        trace_xfs_buf_item_format(bip);
}

/*
 * This is called to pin the buffer associated with the buf log item in memory
 * so it cannot be written out.
 *
 * We also always take a reference to the buffer log item here so that the bli
 * is held while the item is pinned in memory. This means that we can
 * unconditionally drop the reference count a transaction holds when the
 * transaction is completed.
 */
STATIC void
xfs_buf_item_pin(
        struct xfs_log_item *lip)
{
        struct xfs_buf_log_item *bip = BUF_ITEM(lip);

        ASSERT(atomic_read(&bip->bli_refcount) > 0);
        ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
               (bip->bli_flags & XFS_BLI_ORDERED) ||
               (bip->bli_flags & XFS_BLI_STALE));

        trace_xfs_buf_item_pin(bip);

        atomic_inc(&bip->bli_refcount);
        atomic_inc(&bip->bli_buf->b_pin_count);
}

/*
 * This is called to unpin the buffer associated with the buf log
 * item which was previously pinned with a call to xfs_buf_item_pin().
 *
 * Also drop the reference to the buf item for the current transaction.
 * If the XFS_BLI_STALE flag is set and we are the last reference,
 * then free up the buf log item and unlock the buffer.
 *
 * If the remove flag is set we are called from uncommit in the
 * forced-shutdown path.  If that is true and the reference count on
 * the log item is going to drop to zero we need to free the item's
 * descriptor in the transaction.
 */
STATIC void
xfs_buf_item_unpin(
        struct xfs_log_item *lip,
        int remove)
{
        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
        xfs_buf_t *bp = bip->bli_buf;
        int stale = bip->bli_flags & XFS_BLI_STALE;
        int freed;

        ASSERT(bp->b_log_item == bip);
        ASSERT(atomic_read(&bip->bli_refcount) > 0);

        trace_xfs_buf_item_unpin(bip);

        freed = atomic_dec_and_test(&bip->bli_refcount);

        if (atomic_dec_and_test(&bp->b_pin_count))
                wake_up_all(&bp->b_waiters);

        if (freed && stale) {
                ASSERT(bip->bli_flags & XFS_BLI_STALE);
                ASSERT(xfs_buf_islocked(bp));
                ASSERT(bp->b_flags & XBF_STALE);
                ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);

                trace_xfs_buf_item_unpin_stale(bip);

                if (remove) {
                        /*
                         * If we are in a transaction context, we have to
                         * remove the log item from the transaction as we are
                         * about to release our reference to the buffer.  If we
                         * don't, the unlock that occurs later in
                         * xfs_trans_uncommit() will try to reference the
                         * buffer which we no longer have a hold on.
                         */
                        if (!list_empty(&lip->li_trans))
                                xfs_trans_del_item(lip);

                        /*
                         * Since the transaction no longer refers to the buffer,
                         * the buffer should no longer refer to the transaction.
                         */
                        bp->b_transp = NULL;
                }

                /*
                 * If we get called here because of an IO error, we may or may
                 * not have the item on the AIL.  xfs_trans_ail_delete() will
                 * take care of that situation.  xfs_trans_ail_delete() drops
                 * the AIL lock.
                 */
                if (bip->bli_flags & XFS_BLI_STALE_INODE) {
                        xfs_buf_do_callbacks(bp);
                        bp->b_log_item = NULL;
                        list_del_init(&bp->b_li_list);
                        bp->b_iodone = NULL;
                } else {
                        xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
                        xfs_buf_item_relse(bp);
                        ASSERT(bp->b_log_item == NULL);
                }
                xfs_buf_relse(bp);
        } else if (freed && remove) {
                /*
                 * The buffer must be locked and held by the caller to simulate
                 * an async I/O failure.
                 */
                xfs_buf_lock(bp);
                xfs_buf_hold(bp);
                bp->b_flags |= XBF_ASYNC;
                xfs_buf_ioend_fail(bp);
        }
}
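
/*
 * Called by xfsaild to try to write the buffer back: XFS_ITEM_PINNED asks the
 * AIL to issue a log force so the buffer can be unpinned, XFS_ITEM_LOCKED
 * skips the item for this pass, and XFS_ITEM_FLUSHING means the buffer is
 * already queued for write and we are waiting for that IO to complete.
 */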

STATIC uint
xfs_buf_item_push(
        struct xfs_log_item *lip,
        struct list_head *buffer_list)
{
        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
        struct xfs_buf *bp = bip->bli_buf;
        uint rval = XFS_ITEM_SUCCESS;

        if (xfs_buf_ispinned(bp))
                return XFS_ITEM_PINNED;
        if (!xfs_buf_trylock(bp)) {
                /*
                 * If we have just raced with a buffer being pinned and it has
                 * been marked stale, we could end up stalling until someone else
                 * issues a log force to unpin the stale buffer. Check for the
                 * race condition here so xfsaild recognizes the buffer is pinned
                 * and queues a log force to move it along.
                 */
                if (xfs_buf_ispinned(bp))
                        return XFS_ITEM_PINNED;
                return XFS_ITEM_LOCKED;
        }

        ASSERT(!(bip->bli_flags & XFS_BLI_STALE));

        trace_xfs_buf_item_push(bip);

        /* has a previous flush failed due to IO errors? */
        if (bp->b_flags & XBF_WRITE_FAIL) {
                xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
        "Failing async write on buffer block 0x%llx. Retrying async write.",
                                          (long long)bp->b_bn);
        }

        if (!xfs_buf_delwri_queue(bp, buffer_list))
                rval = XFS_ITEM_FLUSHING;
        xfs_buf_unlock(bp);
        return rval;
}

/*
 * Drop the buffer log item refcount and take appropriate action. This helper
 * determines whether the bli must be freed or not, since a decrement to zero
 * does not necessarily mean the bli is unused.
 *
 * Return true if the bli is freed, false otherwise.
 */
bool
xfs_buf_item_put(
        struct xfs_buf_log_item *bip)
{
        struct xfs_log_item *lip = &bip->bli_item;
        bool aborted;
        bool dirty;

        /* drop the bli ref and return if it wasn't the last one */
        if (!atomic_dec_and_test(&bip->bli_refcount))
                return false;

        /*
         * We dropped the last ref and must free the item if clean or aborted.
         * If the bli is dirty and non-aborted, the buffer was clean in the
         * transaction but still awaiting writeback from previous changes. In
         * that case, the bli is freed on buffer writeback completion.
         */
        aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
                  XFS_FORCED_SHUTDOWN(lip->li_mountp);
        dirty = bip->bli_flags & XFS_BLI_DIRTY;
        if (dirty && !aborted)
                return false;

        /*
         * The bli is aborted or clean. An aborted item may be in the AIL
         * regardless of dirty state.  For example, consider an aborted
         * transaction that invalidated a dirty bli and cleared the dirty
         * state.
         */
        if (aborted)
                xfs_trans_ail_delete(lip, 0);
        xfs_buf_item_relse(bip->bli_buf);
        return true;
}

/*
 * Release the buffer associated with the buf log item.  If there is no dirty
 * logged data associated with the buffer recorded in the buf log item, then
 * free the buf log item and remove the reference to it in the buffer.
 *
 * This call ignores the recursion count.  It is only called when the buffer
 * should REALLY be unlocked, regardless of the recursion count.
 *
 * We unconditionally drop the transaction's reference to the log item. If the
 * item was logged, then another reference was taken when it was pinned, so we
 * can safely drop the transaction reference now.  This also allows us to avoid
 * potential races with the unpin code freeing the bli by not referencing the
 * bli after we've dropped the reference count.
 *
 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
 * if necessary but do not unlock the buffer.  This is for support of
 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
 * free the item.
 */
STATIC void
xfs_buf_item_release(
        struct xfs_log_item *lip)
{
        struct xfs_buf_log_item *bip = BUF_ITEM(lip);
        struct xfs_buf *bp = bip->bli_buf;
        bool released;
        bool hold = bip->bli_flags & XFS_BLI_HOLD;
        bool stale = bip->bli_flags & XFS_BLI_STALE;
#if defined(DEBUG) || defined(XFS_WARN)
        bool ordered = bip->bli_flags & XFS_BLI_ORDERED;
        bool dirty = bip->bli_flags & XFS_BLI_DIRTY;
        bool aborted = test_bit(XFS_LI_ABORTED,
                                &lip->li_flags);
#endif

        trace_xfs_buf_item_release(bip);

        /*
         * The bli dirty state should match whether the blf has logged segments
         * except for ordered buffers, where only the bli should be dirty.
         */
        ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
               (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
        ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));

        /*
         * Clear the buffer's association with this transaction and
         * per-transaction state from the bli, which has been copied above.
         */
        bp->b_transp = NULL;
        bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);

        /*
         * Unref the item and unlock the buffer unless held or stale. Stale
         * buffers remain locked until final unpin unless the bli is freed by
         * the unref call. The latter implies shutdown because buffer
         * invalidation dirties the bli and transaction.
         */
        released = xfs_buf_item_put(bip);
        if (hold || (stale && !released))
                return;
        ASSERT(!stale || aborted);
        xfs_buf_relse(bp);
}
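
/*
 * Called at transaction commit time, once the item has been formatted into
 * the CIL.  There is nothing buffer specific left to do at that point, so
 * this simply releases the buffer from the committing transaction.
 */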

STATIC void
xfs_buf_item_committing(
        struct xfs_log_item *lip,
        xfs_lsn_t commit_lsn)
{
        return xfs_buf_item_release(lip);
}

/*
 * This is called to find out where the oldest active copy of the
 * buf log item in the on disk log resides now that the last log
 * write of it completed at the given lsn.
 * We always re-log all the dirty data in a buffer, so usually the
 * latest copy in the on disk log is the only one that matters.  For
 * those cases we simply return the given lsn.
 *
 * The one exception to this is for buffers full of newly allocated
 * inodes.  These buffers are only relogged with the XFS_BLI_INODE_BUF
 * flag set, indicating that only the di_next_unlinked fields from the
 * inodes in the buffers will be replayed during recovery.  If the
 * original newly allocated inode images have not yet been flushed
 * when the buffer is so relogged, then we need to make sure that we
 * keep the old images in the 'active' portion of the log.  We do this
 * by returning the original lsn of that transaction here rather than
 * the current one.
 */
STATIC xfs_lsn_t
xfs_buf_item_committed(
        struct xfs_log_item *lip,
        xfs_lsn_t lsn)
{
        struct xfs_buf_log_item *bip = BUF_ITEM(lip);

        trace_xfs_buf_item_committed(bip);

        if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
                return lip->li_lsn;
        return lsn;
}

static const struct xfs_item_ops xfs_buf_item_ops = {
        .iop_size       = xfs_buf_item_size,
        .iop_format     = xfs_buf_item_format,
        .iop_pin        = xfs_buf_item_pin,
        .iop_unpin      = xfs_buf_item_unpin,
        .iop_release    = xfs_buf_item_release,
        .iop_committing = xfs_buf_item_committing,
        .iop_committed  = xfs_buf_item_committed,
        .iop_push       = xfs_buf_item_push,
};

STATIC void
xfs_buf_item_get_format(
        struct xfs_buf_log_item *bip,
        int count)
{
        ASSERT(bip->bli_formats == NULL);
        bip->bli_format_count = count;

        if (count == 1) {
                bip->bli_formats = &bip->__bli_format;
                return;
        }

        bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
                                       0);
}

STATIC void
xfs_buf_item_free_format(
        struct xfs_buf_log_item *bip)
{
        if (bip->bli_formats != &bip->__bli_format) {
                kmem_free(bip->bli_formats);
                bip->bli_formats = NULL;
        }
}

/*
 * Allocate a new buf log item to go with the given buffer.
 * Set the buffer's b_log_item field to point to the new
 * buf log item.
 */
int
xfs_buf_item_init(
        struct xfs_buf *bp,
        struct xfs_mount *mp)
{
        struct xfs_buf_log_item *bip = bp->b_log_item;
        int chunks;
        int map_size;
        int i;

        /*
         * Check to see if there is already a buf log item for
         * this buffer. If we do already have one, there is
         * nothing to do here so return.
         */
        ASSERT(bp->b_mount == mp);
        if (bip) {
                ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
                ASSERT(!bp->b_transp);
                ASSERT(bip->bli_buf == bp);
                return 0;
        }

        bip = kmem_zone_zalloc(xfs_buf_item_zone, 0);
        xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
        bip->bli_buf = bp;

        /*
         * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
         * can be divided into. Make sure not to truncate any pieces.
         * map_size is the size of the bitmap needed to describe the
         * chunks of the buffer.
         *
         * Discontiguous buffer support follows the layout of the underlying
         * buffer. This makes the implementation as simple as possible.
         */
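        /*
         * For example, a single 4096 byte segment with the usual 128 byte
         * XFS_BLF_CHUNK works out below to chunks = 32 and, with 32 bit map
         * words, map_size = 1.
         */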
        xfs_buf_item_get_format(bip, bp->b_map_count);

        for (i = 0; i < bip->bli_format_count; i++) {
                chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
                                      XFS_BLF_CHUNK);
                map_size = DIV_ROUND_UP(chunks, NBWORD);

                if (map_size > XFS_BLF_DATAMAP_SIZE) {
                        kmem_cache_free(xfs_buf_item_zone, bip);
                        xfs_err(mp,
        "buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
                                map_size,
                                BBTOB(bp->b_maps[i].bm_len));
                        return -EFSCORRUPTED;
                }

                bip->bli_formats[i].blf_type = XFS_LI_BUF;
                bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
                bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
                bip->bli_formats[i].blf_map_size = map_size;
        }

        bp->b_log_item = bip;
        xfs_buf_hold(bp);
        return 0;
}

/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
static void
xfs_buf_item_log_segment(
        uint first,
        uint last,
        uint *map)
{
        uint first_bit;
        uint last_bit;
        uint bits_to_set;
        uint bits_set;
        uint word_num;
        uint *wordp;
        uint bit;
        uint end_bit;
        uint mask;

        ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
        ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);

        /*
         * Convert byte offsets to bit numbers.
         */
        first_bit = first >> XFS_BLF_SHIFT;
        last_bit = last >> XFS_BLF_SHIFT;

        /*
         * Calculate the total number of bits to be set.
         */
        bits_to_set = last_bit - first_bit + 1;

        /*
         * Get a pointer to the first word in the bitmap
         * to set a bit in.
         */
        word_num = first_bit >> BIT_TO_WORD_SHIFT;
        wordp = &map[word_num];

        /*
         * Calculate the starting bit in the first word.
         */
        bit = first_bit & (uint)(NBWORD - 1);

        /*
         * First set any bits in the first word of our range.
         * If it starts at bit 0 of the word, it will be
         * set below rather than here.  That is what the variable
         * bit tells us. The variable bits_set tracks the number
         * of bits that have been set so far.  End_bit is the number
         * of the last bit to be set in this word plus one.
         */
        if (bit) {
                end_bit = min(bit + bits_to_set, (uint)NBWORD);
                mask = ((1U << (end_bit - bit)) - 1) << bit;
                *wordp |= mask;
                wordp++;
                bits_set = end_bit - bit;
        } else {
                bits_set = 0;
        }

        /*
         * Now set bits a whole word at a time that are between
         * first_bit and last_bit.
         */
        while ((bits_to_set - bits_set) >= NBWORD) {
                *wordp = 0xffffffff;
                bits_set += NBWORD;
                wordp++;
        }

        /*
         * Finally, set any bits left to be set in one last partial word.
         */
        end_bit = bits_to_set - bits_set;
        if (end_bit) {
                mask = (1U << end_bit) - 1;
                *wordp |= mask;
        }
}
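
/*
 * Worked example for xfs_buf_item_log_segment() above, assuming 128 byte
 * chunks (XFS_BLF_SHIFT == 7) and 32 bit map words: logging bytes 100-300
 * gives first_bit = 0 and last_bit = 2, so bits_to_set = 3.  The starting
 * word and bit are both zero, the whole-word loop does nothing, and the
 * final partial-word step ORs the mask 0x7 into map[0].
 */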

/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
void
xfs_buf_item_log(
        struct xfs_buf_log_item *bip,
        uint first,
        uint last)
{
        int i;
        uint start;
        uint end;
        struct xfs_buf *bp = bip->bli_buf;

        /*
         * walk each buffer segment and mark them dirty appropriately.
         */
        start = 0;
        for (i = 0; i < bip->bli_format_count; i++) {
                if (start > last)
                        break;
                end = start + BBTOB(bp->b_maps[i].bm_len) - 1;

                /* skip to the map that includes the first byte to log */
                if (first > end) {
                        start += BBTOB(bp->b_maps[i].bm_len);
                        continue;
                }

                /*
                 * Trim the range to this segment and mark it in the bitmap.
                 * Note that we must convert buffer offsets to segment relative
                 * offsets (e.g., the first byte of each segment is byte 0 of
                 * that segment).
                 */
                if (first < start)
                        first = start;
                if (end > last)
                        end = last;
                xfs_buf_item_log_segment(first - start, end - start,
                                         &bip->bli_formats[i].blf_data_map[0]);

                start += BBTOB(bp->b_maps[i].bm_len);
        }
}

/*
 * Return true if the buffer has any ranges logged/dirtied by a transaction,
 * false otherwise.
 */
bool
xfs_buf_item_dirty_format(
        struct xfs_buf_log_item *bip)
{
        int i;

        for (i = 0; i < bip->bli_format_count; i++) {
                if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
                                      bip->bli_formats[i].blf_map_size))
                        return true;
        }

        return false;
}

STATIC void
xfs_buf_item_free(
        struct xfs_buf_log_item *bip)
{
        xfs_buf_item_free_format(bip);
        kmem_free(bip->bli_item.li_lv_shadow);
        kmem_cache_free(xfs_buf_item_zone, bip);
}

/*
 * This is called when the buf log item is no longer needed.  It should
 * free the buf log item associated with the given buffer and clear
 * the buffer's pointer to the buf log item.  If there are no more
 * items in the list, clear the b_iodone field of the buffer (see
 * xfs_buf_attach_iodone() below).
 */
void
xfs_buf_item_relse(
        xfs_buf_t *bp)
{
        struct xfs_buf_log_item *bip = bp->b_log_item;

        trace_xfs_buf_item_relse(bp, _RET_IP_);
        ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));

        bp->b_log_item = NULL;
        if (list_empty(&bp->b_li_list))
                bp->b_iodone = NULL;

        xfs_buf_rele(bp);
        xfs_buf_item_free(bip);
}

/*
 * Add the given log item with its callback to the list of callbacks
 * to be called when the buffer's I/O completes.  If it is not set
 * already, set the buffer's b_iodone() routine to be
 * xfs_buf_iodone_callbacks() and link the log item into the list of
 * items rooted at b_li_list.
 */
void
xfs_buf_attach_iodone(
        struct xfs_buf *bp,
        void (*cb)(struct xfs_buf *, struct xfs_log_item *),
        struct xfs_log_item *lip)
{
        ASSERT(xfs_buf_islocked(bp));

        lip->li_cb = cb;
        list_add_tail(&lip->li_bio_list, &bp->b_li_list);

        ASSERT(bp->b_iodone == NULL ||
               bp->b_iodone == xfs_buf_iodone_callbacks);
        bp->b_iodone = xfs_buf_iodone_callbacks;
}
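
/*
 * The main users of xfs_buf_attach_iodone() are inode and dquot flushing,
 * which attach xfs_iflush_done() and xfs_qm_dqflush_done() respectively so
 * that their log items can be removed from the AIL once the backing buffer
 * has been written.
 */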

/*
 * We can have many callbacks on a buffer. Running the callbacks individually
 * can cause a lot of contention on the AIL lock, so we allow for a single
 * callback to be able to scan the remaining items in bp->b_li_list for other
 * items of the same type and callback to be processed in the first call.
 *
 * As a result, the loop walking the callback list below will also modify the
 * list. It removes the first item from the list and then runs the callback.
 * The loop then restarts from the new first item in the list. This allows the
 * callback to scan and modify the list attached to the buffer and we don't
 * have to care about maintaining a next item pointer.
 */
STATIC void
xfs_buf_do_callbacks(
        struct xfs_buf *bp)
{
        struct xfs_buf_log_item *blip = bp->b_log_item;
        struct xfs_log_item *lip;

        /* If there is a buf_log_item attached, run its callback */
        if (blip) {
                lip = &blip->bli_item;
                lip->li_cb(bp, lip);
        }

        while (!list_empty(&bp->b_li_list)) {
                lip = list_first_entry(&bp->b_li_list, struct xfs_log_item,
                                       li_bio_list);

                /*
                 * Remove the item from the list, so we don't have any
                 * confusion if the item is added to another buf.
                 * Don't touch the log item after calling its
                 * callback, because it could have freed itself.
                 */
                list_del_init(&lip->li_bio_list);
                lip->li_cb(bp, lip);
        }
}

/*
 * Invoke the error state callback for each log item affected by the failed I/O.
 *
 * If a metadata buffer write fails with a non-permanent error, the buffer is
 * eventually resubmitted and so the completion callbacks are not run. The error
 * state may need to be propagated to the log items attached to the buffer,
 * however, so the next AIL push of the item knows how to handle it correctly.
 */
STATIC void
xfs_buf_do_callbacks_fail(
        struct xfs_buf *bp)
{
        struct xfs_log_item *lip;
        struct xfs_ail *ailp;

        /*
         * Buffer log item errors are handled directly by xfs_buf_item_push()
         * and xfs_buf_iodone_callback_error, and they have no IO error
         * callbacks. Check only for items in b_li_list.
         */
        if (list_empty(&bp->b_li_list))
                return;

        lip = list_first_entry(&bp->b_li_list, struct xfs_log_item,
                               li_bio_list);
        ailp = lip->li_ailp;
        spin_lock(&ailp->ail_lock);
        list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
                if (lip->li_ops->iop_error)
                        lip->li_ops->iop_error(lip, bp);
        }
        spin_unlock(&ailp->ail_lock);
}
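
/*
 * Decide what to do about a failed async buffer write.  The first failure of
 * a given error is retried immediately; repeated failures are retried until
 * the error configuration limits (maximum retries or retry timeout) are
 * exceeded, or until the filesystem is unmounting and is configured to fail
 * at unmount, at which point the error is treated as permanent and the
 * filesystem is shut down.
 *
 * Returns true if this function has disposed of the buffer (resubmitted or
 * released it), false if the caller should carry on and run the completion
 * callbacks.
 */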

static bool
xfs_buf_iodone_callback_error(
        struct xfs_buf *bp)
{
        struct xfs_buf_log_item *bip = bp->b_log_item;
        struct xfs_log_item *lip;
        struct xfs_mount *mp;
        static ulong lasttime;
        static xfs_buftarg_t *lasttarg;
        struct xfs_error_cfg *cfg;

        /*
         * The failed buffer might not have a buf_log_item attached or the
         * log_item list might be empty. Get the mp from the available
         * xfs_log_item.
         */
        lip = list_first_entry_or_null(&bp->b_li_list, struct xfs_log_item,
                                       li_bio_list);
        mp = lip ? lip->li_mountp : bip->bli_item.li_mountp;

        /*
         * If we've already decided to shut down the filesystem because of
         * I/O errors, there's no point in giving this a retry.
         */
        if (XFS_FORCED_SHUTDOWN(mp))
                goto out_stale;

        if (bp->b_target != lasttarg ||
            time_after(jiffies, (lasttime + 5*HZ))) {
                lasttime = jiffies;
                xfs_buf_ioerror_alert(bp, __this_address);
        }
        lasttarg = bp->b_target;

        /* synchronous writes will have callers process the error */
        if (!(bp->b_flags & XBF_ASYNC))
                goto out_stale;

        trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
        ASSERT(bp->b_iodone != NULL);

        cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);

        /*
         * If the write was asynchronous then no one will be looking for the
         * error.  If this is the first failure of this type, clear the error
         * state and write the buffer out again. This means we always retry an
         * async write failure at least once, but we also need to set the buffer
         * up to behave correctly now for repeated failures.
         */
        if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) ||
             bp->b_last_error != bp->b_error) {
                bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
                bp->b_last_error = bp->b_error;
                if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
                    !bp->b_first_retry_time)
                        bp->b_first_retry_time = jiffies;

                xfs_buf_ioerror(bp, 0);
                xfs_buf_submit(bp);
                return true;
        }

        /*
         * Repeated failure on an async write. Take action according to the
         * error configuration we have been set up to use.
         */

        if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
            ++bp->b_retries > cfg->max_retries)
                goto permanent_error;
        if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
            time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
                goto permanent_error;

        /* At unmount we may treat errors differently */
        if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
                goto permanent_error;

        /*
         * Still a transient error, run IO completion failure callbacks and let
         * the higher layers retry the buffer.
         */
        xfs_buf_do_callbacks_fail(bp);
        xfs_buf_ioerror(bp, 0);
        xfs_buf_relse(bp);
        return true;

        /*
         * Permanent error - we need to trigger a shutdown if we haven't already
         * to indicate that inconsistency will result from this action.
         */
permanent_error:
        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
out_stale:
        xfs_buf_stale(bp);
        bp->b_flags |= XBF_DONE;
        trace_xfs_buf_error_relse(bp, _RET_IP_);
        return false;
}

/*
 * This is the iodone() function for buffers which have had callbacks attached
 * to them by xfs_buf_attach_iodone(). We need to iterate the items on the
 * callback list, mark the buffer as having no more callbacks and then push the
 * buffer through IO completion processing.
 */
void
xfs_buf_iodone_callbacks(
        struct xfs_buf *bp)
{
        /*
         * If there is an error, process it. Some errors require us
         * to run callbacks after failure processing is done so we
         * detect that and take appropriate action.
         */
        if (bp->b_error && xfs_buf_iodone_callback_error(bp))
                return;

        /*
         * Successful IO or permanent error. Either way, we can clear the
         * retry state here in preparation for the next error that may occur.
         */
        bp->b_last_error = 0;
        bp->b_retries = 0;
        bp->b_first_retry_time = 0;

        xfs_buf_do_callbacks(bp);
        bp->b_log_item = NULL;
        list_del_init(&bp->b_li_list);
        bp->b_iodone = NULL;
        xfs_buf_ioend(bp);
}

/*
 * This is the iodone() function for buffers which have been
 * logged.  It is called when they are eventually flushed out.
 * It should remove the buf item from the AIL, and free the buf item.
 * It is called by xfs_buf_iodone_callbacks() above which will take
 * care of cleaning up the buffer itself.
 */
void
xfs_buf_iodone(
        struct xfs_buf *bp,
        struct xfs_log_item *lip)
{
        ASSERT(BUF_ITEM(lip)->bli_buf == bp);

        xfs_buf_rele(bp);

        /*
         * If we are forcibly shutting down, this may well be off the AIL
         * already. That's because we simulate the log-committed callbacks to
         * unpin these buffers. Or we may never have put this item on the AIL
         * because the transaction was aborted forcibly.
         * xfs_trans_ail_delete() takes care of these.
         *
         * Either way, AIL is useless if we're forcing a shutdown.
         */
        xfs_trans_ail_delete(lip, SHUTDOWN_CORRUPT_INCORE);
        xfs_buf_item_free(BUF_ITEM(lip));
}