// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_trace.h"
#include "xfs_log.h"


kmem_zone_t	*xfs_buf_item_zone;

static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
{
	return container_of(lip, struct xfs_buf_log_item, bli_item);
}

/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
	struct xfs_log_iovec		*iovec)
{
	struct xfs_buf_log_format	*blfp = iovec->i_addr;
	char				*bmp_end;
	char				*item_end;

	if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len)
		return false;

	item_end = (char *)iovec->i_addr + iovec->i_len;
	bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
	return bmp_end <= item_end;
}

static inline int
xfs_buf_log_format_size(
	struct xfs_buf_log_format *blfp)
{
	return offsetof(struct xfs_buf_log_format, blf_data_map) +
			(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}

static inline bool
xfs_buf_item_straddle(
	struct xfs_buf		*bp,
	uint			offset,
	int			first_bit,
	int			nbits)
{
	void			*first, *last;

	first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
	last = xfs_buf_offset(bp,
			offset + ((first_bit + nbits) << XFS_BLF_SHIFT));

	if (last - first != nbits * XFS_BLF_CHUNK)
		return true;
	return false;
}
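
/*
 * Worked example for the helpers above, assuming XFS_BLF_CHUNK is 128 bytes
 * (XFS_BLF_SHIFT == 7): dirty-bitmap bit N of a segment describes bytes
 * N * 128 .. N * 128 + 127, so bits 2..3 cover segment bytes 256..511.
 * xfs_buf_item_straddle() flags a run of nbits chunks whose first and last
 * chunk do not sit exactly nbits * 128 bytes apart in the mapped buffer,
 * which means the run crosses a mapping discontiguity and cannot be copied
 * as a single iovec.
 */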

/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item segment.
 *
 * It calculates this as 1 iovec for the buf log format structure and 1 for each
 * stretch of non-contiguous chunks to be logged.  Contiguous chunks are logged
 * in a single iovec.
 */
STATIC void
xfs_buf_item_size_segment(
	struct xfs_buf_log_item		*bip,
	struct xfs_buf_log_format	*blfp,
	uint				offset,
	int				*nvecs,
	int				*nbytes)
{
	struct xfs_buf			*bp = bip->bli_buf;
	int				first_bit;
	int				nbits;
	int				next_bit;
	int				last_bit;

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (first_bit == -1)
		return;

	(*nvecs)++;
	*nbytes += xfs_buf_log_format_size(blfp);

	do {
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);

		/*
		 * Straddling a page is rare because we don't log contiguous
		 * chunks of unmapped buffers anywhere.
		 */
		if (nbits > 1 &&
		    xfs_buf_item_straddle(bp, offset, first_bit, nbits))
			goto slow_scan;

		(*nvecs)++;
		*nbytes += nbits * XFS_BLF_CHUNK;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					 (uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;

slow_scan:
	/* Count the first bit we jumped out of the above loop from */
	(*nvecs)++;
	*nbytes += XFS_BLF_CHUNK;
	last_bit = first_bit;
	while (last_bit != -1) {
		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					last_bit + 1);
		/*
		 * If we run out of bits, leave the loop,
		 * else if we find a new set of bits bump the number of vecs,
		 * else keep scanning the current set of bits.
		 */
		if (next_bit == -1) {
			break;
		} else if (next_bit != last_bit + 1 ||
			   xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
			last_bit = next_bit;
			first_bit = next_bit;
			(*nvecs)++;
			nbits = 1;
		} else {
			last_bit++;
			nbits++;
		}
		*nbytes += XFS_BLF_CHUNK;
	}
}
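
/*
 * Worked example (hypothetical bitmap, assuming XFS_BLF_CHUNK is 128 bytes):
 * a segment whose dirty bitmap has bits {0, 1, 2, 10, 11} set contains two
 * contiguous runs of chunks.  xfs_buf_item_size_segment() therefore counts
 * three iovecs for it - one for the buf log format structure and one per
 * run - and 5 * 128 bytes of chunk data on top of the format size.
 */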

/*
 * Return the number of log iovecs and space needed to log the given buf log
 * item.
 *
 * Discontiguous buffers need a format structure per region that is being
 * logged. This makes the changes in the buffer appear to log recovery as though
 * they came from separate buffers, just like would occur if multiple buffers
 * were used instead of a single discontiguous buffer. This enables
 * discontiguous buffers to be in-memory constructs, completely transparent to
 * what ends up on disk.
 *
 * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
 * format structures. If the item has previously been logged and has dirty
 * regions, we do not relog them in stale buffers. This has the effect of
 * reducing the size of the relogged item by the amount of dirty data tracked
 * by the log item. This can result in the committing transaction reducing the
 * amount of space being consumed by the CIL.
 */
STATIC void
xfs_buf_item_size(
	struct xfs_log_item	*lip,
	int			*nvecs,
	int			*nbytes)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	int			i;
	int			bytes;
	uint			offset = 0;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log is the buf log
		 * format structure with the cancel flag in it as we are never
		 * going to replay the changes tracked in the log item.
		 */
		trace_xfs_buf_item_size_stale(bip);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		*nvecs += bip->bli_format_count;
		for (i = 0; i < bip->bli_format_count; i++) {
			*nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
		}
		return;
	}

	ASSERT(bip->bli_flags & XFS_BLI_LOGGED);

	if (bip->bli_flags & XFS_BLI_ORDERED) {
		/*
		 * The buffer has been logged just to order it. It is not being
		 * included in the transaction commit, so no vectors are used at
		 * all.
		 */
		trace_xfs_buf_item_size_ordered(bip);
		*nvecs = XFS_LOG_VEC_ORDERED;
		return;
	}

	/*
	 * The vector count is based on the number of buffer vectors we have
	 * dirty bits in. This will only be greater than one when we have a
	 * compound buffer with more than one segment dirty. Hence for compound
	 * buffers we need to track which segment the dirty bits correspond to,
	 * and when we move from one segment to the next increment the vector
	 * count for the extra buf log format structure that will need to be
	 * written.
	 */
	bytes = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_size_segment(bip, &bip->bli_formats[i], offset,
					  nvecs, &bytes);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Round up the buffer size required to minimise the number of memory
	 * allocations that need to be done as this item grows when relogged by
	 * repeated modifications.
	 */
	*nbytes = round_up(bytes, 512);
	trace_xfs_buf_item_size(bip);
}

static inline void
xfs_buf_item_copy_iovec(
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	struct xfs_buf		*bp,
	uint			offset,
	int			first_bit,
	uint			nbits)
{
	offset += first_bit * XFS_BLF_CHUNK;
	xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
			xfs_buf_offset(bp, offset),
			nbits * XFS_BLF_CHUNK);
}

static void
xfs_buf_item_format_segment(
	struct xfs_buf_log_item	*bip,
	struct xfs_log_vec	*lv,
	struct xfs_log_iovec	**vecp,
	uint			offset,
	struct xfs_buf_log_format *blfp)
{
	struct xfs_buf		*bp = bip->bli_buf;
	uint			base_size;
	int			first_bit;
	int			last_bit;
	int			next_bit;
	uint			nbits;

	/* copy the flags across from the base format item */
	blfp->blf_flags = bip->__bli_format.blf_flags;

	/*
	 * Base size is the actual size of the ondisk structure - it reflects
	 * the actual size of the dirty bitmap rather than the size of the in
	 * memory structure.
	 */
	base_size = xfs_buf_log_format_size(blfp);

	first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
	if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
		/*
		 * If the map is not dirty in the transaction, mark
		 * the size as zero and do not advance the vector pointer.
		 */
		return;
	}

	blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
	blfp->blf_size = 1;

	if (bip->bli_flags & XFS_BLI_STALE) {
		/*
		 * The buffer is stale, so all we need to log
		 * is the buf log format structure with the
		 * cancel flag in it.
		 */
		trace_xfs_buf_item_format_stale(bip);
		ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
		return;
	}


	/*
	 * Fill in an iovec for each set of contiguous chunks.
	 */
	do {
		ASSERT(first_bit >= 0);
		nbits = xfs_contig_bits(blfp->blf_data_map,
					blfp->blf_map_size, first_bit);
		ASSERT(nbits > 0);

		/*
		 * Straddling a page is rare because we don't log contiguous
		 * chunks of unmapped buffers anywhere.
		 */
		if (nbits > 1 &&
		    xfs_buf_item_straddle(bp, offset, first_bit, nbits))
			goto slow_scan;

		xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
					first_bit, nbits);
		blfp->blf_size++;

		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					 (uint)first_bit + nbits + 1);
	} while (first_bit != -1);

	return;

slow_scan:
	ASSERT(bp->b_addr == NULL);
	last_bit = first_bit;
	nbits = 1;
	for (;;) {
		/*
		 * This takes the bit number to start looking from and
		 * returns the next set bit from there.  It returns -1
		 * if there are no more bits set or the start bit is
		 * beyond the end of the bitmap.
		 */
		next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
					(uint)last_bit + 1);
		/*
		 * If we run out of bits fill in the last iovec and get out of
		 * the loop.  Else if we start a new set of bits then fill in
		 * the iovec for the series we were looking at and start
		 * counting the bits in the new one.  Else we're still in the
		 * same set of bits so just keep counting and scanning.
		 */
		if (next_bit == -1) {
			xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
						first_bit, nbits);
			blfp->blf_size++;
			break;
		} else if (next_bit != last_bit + 1 ||
			   xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
			xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
						first_bit, nbits);
			blfp->blf_size++;
			first_bit = next_bit;
			last_bit = next_bit;
			nbits = 1;
		} else {
			last_bit++;
			nbits++;
		}
	}
}

/*
 * This is called to fill in the vector of log iovecs for the
 * given log buf item.  It fills the first entry with a buf log
 * format structure, and the rest point to contiguous chunks
 * within the buffer.
 */
STATIC void
xfs_buf_item_format(
	struct xfs_log_item	*lip,
	struct xfs_log_vec	*lv)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	struct xfs_log_iovec	*vecp = NULL;
	uint			offset = 0;
	int			i;

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_STALE));
	ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
	       (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
	        && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
	ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));


	/*
	 * If it is an inode buffer, transfer the in-memory state to the
	 * format flags and clear the in-memory state.
	 *
	 * For buffer based inode allocation, we do not transfer
	 * this state if the inode buffer allocation has not yet been committed
	 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
	 * correct replay of the inode allocation.
	 *
	 * For icreate item based inode allocation, the buffers aren't written
	 * to the journal during allocation, and hence we should always tag the
	 * buffer as an inode buffer so that the correct unlinked list replay
	 * occurs during recovery.
	 */
	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
		if (xfs_has_v3inodes(lip->li_mountp) ||
		    !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
		      xfs_log_item_in_current_chkpt(lip)))
			bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
	}

	for (i = 0; i < bip->bli_format_count; i++) {
		xfs_buf_item_format_segment(bip, lv, &vecp, offset,
					    &bip->bli_formats[i]);
		offset += BBTOB(bp->b_maps[i].bm_len);
	}

	/*
	 * Check to make sure everything is consistent.
	 */
	trace_xfs_buf_item_format(bip);
}

/*
 * This is called to pin the buffer associated with the buf log item in memory
 * so it cannot be written out.
 *
 * We also always take a reference to the buffer log item here so that the bli
 * is held while the item is pinned in memory. This means that we can
 * unconditionally drop the reference count a transaction holds when the
 * transaction is completed.
 */
STATIC void
xfs_buf_item_pin(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	ASSERT(atomic_read(&bip->bli_refcount) > 0);
	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
	       (bip->bli_flags & XFS_BLI_ORDERED) ||
	       (bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_pin(bip);

	atomic_inc(&bip->bli_refcount);
	atomic_inc(&bip->bli_buf->b_pin_count);
}

/*
 * This is called to unpin the buffer associated with the buf log item which
 * was previously pinned with a call to xfs_buf_item_pin().
 */
STATIC void
xfs_buf_item_unpin(
	struct xfs_log_item	*lip,
	int			remove)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	int			stale = bip->bli_flags & XFS_BLI_STALE;
	int			freed;

	ASSERT(bp->b_log_item == bip);
	ASSERT(atomic_read(&bip->bli_refcount) > 0);

	trace_xfs_buf_item_unpin(bip);

	/*
	 * Drop the bli ref associated with the pin and grab the hold required
	 * for the I/O simulation failure in the abort case. We have to do this
	 * before the pin count drops because the AIL doesn't acquire a bli
	 * reference. Therefore if the refcount drops to zero, the bli could
	 * still be AIL resident and the buffer submitted for I/O (and freed on
	 * completion) at any point before we return. This can be removed once
	 * the AIL properly holds a reference on the bli.
	 */
	freed = atomic_dec_and_test(&bip->bli_refcount);
	if (freed && !stale && remove)
		xfs_buf_hold(bp);
	if (atomic_dec_and_test(&bp->b_pin_count))
		wake_up_all(&bp->b_waiters);

	/* nothing to do but drop the pin count if the bli is active */
	if (!freed)
		return;

	if (stale) {
		ASSERT(bip->bli_flags & XFS_BLI_STALE);
		ASSERT(xfs_buf_islocked(bp));
		ASSERT(bp->b_flags & XBF_STALE);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		ASSERT(list_empty(&lip->li_trans));
		ASSERT(!bp->b_transp);

		trace_xfs_buf_item_unpin_stale(bip);

		/*
		 * If we get called here because of an IO error, we may or may
		 * not have the item on the AIL. xfs_trans_ail_delete() will
		 * take care of that situation. xfs_trans_ail_delete() drops
		 * the AIL lock.
		 */
		if (bip->bli_flags & XFS_BLI_STALE_INODE) {
			xfs_buf_item_done(bp);
			xfs_buf_inode_iodone(bp);
			ASSERT(list_empty(&bp->b_li_list));
		} else {
			xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
			xfs_buf_item_relse(bp);
			ASSERT(bp->b_log_item == NULL);
		}
		xfs_buf_relse(bp);
	} else if (remove) {
		/*
		 * The buffer must be locked and held by the caller to simulate
		 * an async I/O failure. We acquired the hold for this case
		 * before the buffer was unpinned.
		 */
		xfs_buf_lock(bp);
		bp->b_flags |= XBF_ASYNC;
		xfs_buf_ioend_fail(bp);
	}
}

STATIC uint
xfs_buf_item_push(
	struct xfs_log_item	*lip,
	struct list_head	*buffer_list)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	uint			rval = XFS_ITEM_SUCCESS;

	if (xfs_buf_ispinned(bp))
		return XFS_ITEM_PINNED;
	if (!xfs_buf_trylock(bp)) {
		/*
		 * If we have just raced with a buffer being pinned and it has
		 * been marked stale, we could end up stalling until someone else
		 * issues a log force to unpin the stale buffer. Check for the
		 * race condition here so xfsaild recognizes the buffer is pinned
		 * and queues a log force to move it along.
		 */
		if (xfs_buf_ispinned(bp))
			return XFS_ITEM_PINNED;
		return XFS_ITEM_LOCKED;
	}

	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_push(bip);

	/* has a previous flush failed due to IO errors? */
	if (bp->b_flags & XBF_WRITE_FAIL) {
		xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
	"Failing async write on buffer block 0x%llx. Retrying async write.",
					  (long long)xfs_buf_daddr(bp));
	}

	if (!xfs_buf_delwri_queue(bp, buffer_list))
		rval = XFS_ITEM_FLUSHING;
	xfs_buf_unlock(bp);
	return rval;
}

/*
 * Drop the buffer log item refcount and take appropriate action. This helper
 * determines whether the bli must be freed or not, since a decrement to zero
 * does not necessarily mean the bli is unused.
 *
 * Return true if the bli is freed, false otherwise.
 */
bool
xfs_buf_item_put(
	struct xfs_buf_log_item	*bip)
{
	struct xfs_log_item	*lip = &bip->bli_item;
	bool			aborted;
	bool			dirty;

	/* drop the bli ref and return if it wasn't the last one */
	if (!atomic_dec_and_test(&bip->bli_refcount))
		return false;

	/*
	 * We dropped the last ref and must free the item if clean or aborted.
	 * If the bli is dirty and non-aborted, the buffer was clean in the
	 * transaction but still awaiting writeback from previous changes. In
	 * that case, the bli is freed on buffer writeback completion.
	 */
	aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
		  xfs_is_shutdown(lip->li_mountp);
	dirty = bip->bli_flags & XFS_BLI_DIRTY;
	if (dirty && !aborted)
		return false;

	/*
	 * The bli is aborted or clean. An aborted item may be in the AIL
	 * regardless of dirty state.  For example, consider an aborted
	 * transaction that invalidated a dirty bli and cleared the dirty
	 * state.
	 */
	if (aborted)
		xfs_trans_ail_delete(lip, 0);
	xfs_buf_item_relse(bip->bli_buf);
	return true;
}

/*
 * Release the buffer associated with the buf log item. If there is no dirty
 * logged data associated with the buffer recorded in the buf log item, then
 * free the buf log item and remove the reference to it in the buffer.
 *
 * This call ignores the recursion count. It is only called when the buffer
 * should REALLY be unlocked, regardless of the recursion count.
 *
 * We unconditionally drop the transaction's reference to the log item. If the
 * item was logged, then another reference was taken when it was pinned, so we
 * can safely drop the transaction reference now. This also allows us to avoid
 * potential races with the unpin code freeing the bli by not referencing the
 * bli after we've dropped the reference count.
 *
 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
 * if necessary but do not unlock the buffer. This is for support of
 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
 * free the item.
 */
STATIC void
xfs_buf_item_release(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	bool			released;
	bool			hold = bip->bli_flags & XFS_BLI_HOLD;
	bool			stale = bip->bli_flags & XFS_BLI_STALE;
#if defined(DEBUG) || defined(XFS_WARN)
	bool			ordered = bip->bli_flags & XFS_BLI_ORDERED;
	bool			dirty = bip->bli_flags & XFS_BLI_DIRTY;
	bool			aborted = test_bit(XFS_LI_ABORTED,
						   &lip->li_flags);
#endif

	trace_xfs_buf_item_release(bip);

	/*
	 * The bli dirty state should match whether the blf has logged segments
	 * except for ordered buffers, where only the bli should be dirty.
	 */
	ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
	       (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
	ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));

	/*
	 * Clear the buffer's association with this transaction and
	 * per-transaction state from the bli, which has been copied above.
	 */
	bp->b_transp = NULL;
	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);

	/*
	 * Unref the item and unlock the buffer unless held or stale. Stale
	 * buffers remain locked until final unpin unless the bli is freed by
	 * the unref call. The latter implies shutdown because buffer
	 * invalidation dirties the bli and transaction.
	 */
	released = xfs_buf_item_put(bip);
	if (hold || (stale && !released))
		return;
	ASSERT(!stale || aborted);
	xfs_buf_relse(bp);
}

STATIC void
xfs_buf_item_committing(
	struct xfs_log_item	*lip,
	xfs_csn_t		seq)
{
	return xfs_buf_item_release(lip);
}

/*
 * This is called to find out where the oldest active copy of the
 * buf log item in the on disk log resides now that the last log
 * write of it completed at the given lsn.
 * We always re-log all the dirty data in a buffer, so usually the
 * latest copy in the on disk log is the only one that matters.  For
 * those cases we simply return the given lsn.
 *
 * The one exception to this is for buffers full of newly allocated
 * inodes.  These buffers are only relogged with the XFS_BLI_INODE_BUF
 * flag set, indicating that only the di_next_unlinked fields from the
 * inodes in the buffers will be replayed during recovery.  If the
 * original newly allocated inode images have not yet been flushed
 * when the buffer is so relogged, then we need to make sure that we
 * keep the old images in the 'active' portion of the log.  We do this
 * by returning the original lsn of that transaction here rather than
 * the current one.
 */
STATIC xfs_lsn_t
xfs_buf_item_committed(
	struct xfs_log_item	*lip,
	xfs_lsn_t		lsn)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	trace_xfs_buf_item_committed(bip);

	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
		return lip->li_lsn;
	return lsn;
}

static const struct xfs_item_ops xfs_buf_item_ops = {
	.iop_size	= xfs_buf_item_size,
	.iop_format	= xfs_buf_item_format,
	.iop_pin	= xfs_buf_item_pin,
	.iop_unpin	= xfs_buf_item_unpin,
	.iop_release	= xfs_buf_item_release,
	.iop_committing	= xfs_buf_item_committing,
	.iop_committed	= xfs_buf_item_committed,
	.iop_push	= xfs_buf_item_push,
};

STATIC void
xfs_buf_item_get_format(
	struct xfs_buf_log_item	*bip,
	int			count)
{
	ASSERT(bip->bli_formats == NULL);
	bip->bli_format_count = count;

	if (count == 1) {
		bip->bli_formats = &bip->__bli_format;
		return;
	}

	bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
				       0);
}

STATIC void
xfs_buf_item_free_format(
	struct xfs_buf_log_item	*bip)
{
	if (bip->bli_formats != &bip->__bli_format) {
		kmem_free(bip->bli_formats);
		bip->bli_formats = NULL;
	}
}

/*
 * Allocate a new buf log item to go with the given buffer.
 * Set the buffer's b_log_item field to point to the new
 * buf log item.
 */
int
xfs_buf_item_init(
	struct xfs_buf	*bp,
	struct xfs_mount *mp)
{
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	int			chunks;
	int			map_size;
	int			i;

	/*
	 * Check to see if there is already a buf log item for
	 * this buffer. If we do already have one, there is
	 * nothing to do here so return.
	 */
	ASSERT(bp->b_mount == mp);
	if (bip) {
		ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
		ASSERT(!bp->b_transp);
		ASSERT(bip->bli_buf == bp);
		return 0;
	}

	bip = kmem_cache_zalloc(xfs_buf_item_zone, GFP_KERNEL | __GFP_NOFAIL);
	xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
	bip->bli_buf = bp;

	/*
	 * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
	 * can be divided into. Make sure not to truncate any pieces.
	 * map_size is the size of the bitmap needed to describe the
	 * chunks of the buffer.
	 *
	 * Discontiguous buffer support follows the layout of the underlying
	 * buffer. This makes the implementation as simple as possible.
	 */
	xfs_buf_item_get_format(bip, bp->b_map_count);

	for (i = 0; i < bip->bli_format_count; i++) {
		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
				      XFS_BLF_CHUNK);
		map_size = DIV_ROUND_UP(chunks, NBWORD);

		if (map_size > XFS_BLF_DATAMAP_SIZE) {
			kmem_cache_free(xfs_buf_item_zone, bip);
			xfs_err(mp,
	"buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
					map_size,
					BBTOB(bp->b_maps[i].bm_len));
			return -EFSCORRUPTED;
		}

		bip->bli_formats[i].blf_type = XFS_LI_BUF;
		bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
		bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
		bip->bli_formats[i].blf_map_size = map_size;
	}

	bp->b_log_item = bip;
	xfs_buf_hold(bp);
	return 0;
}
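
/*
 * Sizing example for xfs_buf_item_init() (assuming XFS_BLF_CHUNK is 128
 * bytes and NBWORD is 32 bits): a single-map buffer of 8 basic blocks is
 * BBTOB(8) = 4096 bytes, which divides into 32 chunks and therefore needs
 * DIV_ROUND_UP(32, 32) = 1 bitmap word in blf_data_map to track its dirty
 * state.
 */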

/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
static void
xfs_buf_item_log_segment(
	uint			first,
	uint			last,
	uint			*map)
{
	uint			first_bit;
	uint			last_bit;
	uint			bits_to_set;
	uint			bits_set;
	uint			word_num;
	uint			*wordp;
	uint			bit;
	uint			end_bit;
	uint			mask;

	ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
	ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);

	/*
	 * Convert byte offsets to bit numbers.
	 */
	first_bit = first >> XFS_BLF_SHIFT;
	last_bit = last >> XFS_BLF_SHIFT;

	/*
	 * Calculate the total number of bits to be set.
	 */
	bits_to_set = last_bit - first_bit + 1;

	/*
	 * Get a pointer to the first word in the bitmap
	 * to set a bit in.
	 */
	word_num = first_bit >> BIT_TO_WORD_SHIFT;
	wordp = &map[word_num];

	/*
	 * Calculate the starting bit in the first word.
	 */
	bit = first_bit & (uint)(NBWORD - 1);

	/*
	 * First set any bits in the first word of our range.
	 * If it starts at bit 0 of the word, it will be
	 * set below rather than here. That is what the variable
	 * bit tells us. The variable bits_set tracks the number
	 * of bits that have been set so far. End_bit is the number
	 * of the last bit to be set in this word plus one.
	 */
	if (bit) {
		end_bit = min(bit + bits_to_set, (uint)NBWORD);
		mask = ((1U << (end_bit - bit)) - 1) << bit;
		*wordp |= mask;
		wordp++;
		bits_set = end_bit - bit;
	} else {
		bits_set = 0;
	}

	/*
	 * Now set bits a whole word at a time that are between
	 * first_bit and last_bit.
	 */
	while ((bits_to_set - bits_set) >= NBWORD) {
		*wordp = 0xffffffff;
		bits_set += NBWORD;
		wordp++;
	}

	/*
	 * Finally, set any bits left to be set in one last partial word.
	 */
	end_bit = bits_to_set - bits_set;
	if (end_bit) {
		mask = (1U << end_bit) - 1;
		*wordp |= mask;
	}
}
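
/*
 * Worked example (illustrative byte range, assuming XFS_BLF_CHUNK is 128
 * bytes): logging segment bytes 200..500 gives first_bit = 200 >> 7 = 1 and
 * last_bit = 500 >> 7 = 3, so bits 1..3 are set and the chunks covering
 * bytes 128..511 are relogged even though only part of them changed.
 */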

/*
 * Mark bytes first through last inclusive as dirty in the buf
 * item's bitmap.
 */
void
xfs_buf_item_log(
	struct xfs_buf_log_item	*bip,
	uint			first,
	uint			last)
{
	int			i;
	uint			start;
	uint			end;
	struct xfs_buf		*bp = bip->bli_buf;

	/*
	 * Walk each buffer segment and mark it dirty appropriately.
	 */
	start = 0;
	for (i = 0; i < bip->bli_format_count; i++) {
		if (start > last)
			break;
		end = start + BBTOB(bp->b_maps[i].bm_len) - 1;

		/* skip to the map that includes the first byte to log */
		if (first > end) {
			start += BBTOB(bp->b_maps[i].bm_len);
			continue;
		}

		/*
		 * Trim the range to this segment and mark it in the bitmap.
		 * Note that we must convert buffer offsets to segment relative
		 * offsets (e.g., the first byte of each segment is byte 0 of
		 * that segment).
		 */
		if (first < start)
			first = start;
		if (end > last)
			end = last;
		xfs_buf_item_log_segment(first - start, end - start,
					 &bip->bli_formats[i].blf_data_map[0]);

		start += BBTOB(bp->b_maps[i].bm_len);
	}
}


/*
 * Return true if the buffer has any ranges logged/dirtied by a transaction,
 * false otherwise.
 */
bool
xfs_buf_item_dirty_format(
	struct xfs_buf_log_item	*bip)
{
	int			i;

	for (i = 0; i < bip->bli_format_count; i++) {
		if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
				      bip->bli_formats[i].blf_map_size))
			return true;
	}

	return false;
}

STATIC void
xfs_buf_item_free(
	struct xfs_buf_log_item	*bip)
{
	xfs_buf_item_free_format(bip);
	kmem_free(bip->bli_item.li_lv_shadow);
	kmem_cache_free(xfs_buf_item_zone, bip);
}

/*
 * xfs_buf_item_relse() is called when the buf log item is no longer needed.
 */
void
xfs_buf_item_relse(
	struct xfs_buf	*bp)
{
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	trace_xfs_buf_item_relse(bp, _RET_IP_);
	ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));

	bp->b_log_item = NULL;
	xfs_buf_rele(bp);
	xfs_buf_item_free(bip);
}

void
xfs_buf_item_done(
	struct xfs_buf	*bp)
{
	/*
	 * If we are forcibly shutting down, this may well be off the AIL
	 * already. That's because we simulate the log-committed callbacks to
	 * unpin these buffers. Or we may never have put this item on the AIL
	 * because the transaction was aborted forcibly.
	 * xfs_trans_ail_delete() takes care of these.
	 *
	 * Either way, the AIL is useless if we're forcing a shutdown.
	 *
	 * Note that log recovery writes might have buffer items that are not
	 * on the AIL even when the file system is not shut down.
	 */
	xfs_trans_ail_delete(&bp->b_log_item->bli_item,
			     (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
			     SHUTDOWN_CORRUPT_INCORE);
	xfs_buf_item_relse(bp);
}