1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_bit.h" 13 #include "xfs_mount.h" 14 #include "xfs_trans.h" 15 #include "xfs_buf_item.h" 16 #include "xfs_trans_priv.h" 17 #include "xfs_trace.h" 18 #include "xfs_log.h" 19 #include "xfs_log_priv.h" 20 #include "xfs_log_recover.h" 21 #include "xfs_error.h" 22 #include "xfs_inode.h" 23 #include "xfs_dir2.h" 24 #include "xfs_quota.h" 25 26 /* 27 * This structure is used during recovery to record the buf log items which 28 * have been canceled and should not be replayed. 29 */ 30 struct xfs_buf_cancel { 31 xfs_daddr_t bc_blkno; 32 uint bc_len; 33 int bc_refcount; 34 struct list_head bc_list; 35 }; 36 37 static struct xfs_buf_cancel * 38 xlog_find_buffer_cancelled( 39 struct xlog *log, 40 xfs_daddr_t blkno, 41 uint len) 42 { 43 struct list_head *bucket; 44 struct xfs_buf_cancel *bcp; 45 46 if (!log->l_buf_cancel_table) 47 return NULL; 48 49 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); 50 list_for_each_entry(bcp, bucket, bc_list) { 51 if (bcp->bc_blkno == blkno && bcp->bc_len == len) 52 return bcp; 53 } 54 55 return NULL; 56 } 57 58 static bool 59 xlog_add_buffer_cancelled( 60 struct xlog *log, 61 xfs_daddr_t blkno, 62 uint len) 63 { 64 struct xfs_buf_cancel *bcp; 65 66 /* 67 * If we find an existing cancel record, this indicates that the buffer 68 * was cancelled multiple times. To ensure that during pass 2 we keep 69 * the record in the table until we reach its last occurrence in the 70 * log, a reference count is kept to tell how many times we expect to 71 * see this record during the second pass. 
72 */ 73 bcp = xlog_find_buffer_cancelled(log, blkno, len); 74 if (bcp) { 75 bcp->bc_refcount++; 76 return false; 77 } 78 79 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0); 80 bcp->bc_blkno = blkno; 81 bcp->bc_len = len; 82 bcp->bc_refcount = 1; 83 list_add_tail(&bcp->bc_list, XLOG_BUF_CANCEL_BUCKET(log, blkno)); 84 return true; 85 } 86 87 /* 88 * Check if there is and entry for blkno, len in the buffer cancel record table. 89 */ 90 bool 91 xlog_is_buffer_cancelled( 92 struct xlog *log, 93 xfs_daddr_t blkno, 94 uint len) 95 { 96 return xlog_find_buffer_cancelled(log, blkno, len) != NULL; 97 } 98 99 /* 100 * Check if there is and entry for blkno, len in the buffer cancel record table, 101 * and decremented the reference count on it if there is one. 102 * 103 * Remove the cancel record once the refcount hits zero, so that if the same 104 * buffer is re-used again after its last cancellation we actually replay the 105 * changes made at that point. 106 */ 107 static bool 108 xlog_put_buffer_cancelled( 109 struct xlog *log, 110 xfs_daddr_t blkno, 111 uint len) 112 { 113 struct xfs_buf_cancel *bcp; 114 115 bcp = xlog_find_buffer_cancelled(log, blkno, len); 116 if (!bcp) { 117 ASSERT(0); 118 return false; 119 } 120 121 if (--bcp->bc_refcount == 0) { 122 list_del(&bcp->bc_list); 123 kmem_free(bcp); 124 } 125 return true; 126 } 127 128 /* log buffer item recovery */ 129 130 /* 131 * Sort buffer items for log recovery. Most buffer items should end up on the 132 * buffer list and are recovered first, with the following exceptions: 133 * 134 * 1. XFS_BLF_CANCEL buffers must be processed last because some log items 135 * might depend on the incor ecancellation record, and replaying a cancelled 136 * buffer item can remove the incore record. 137 * 138 * 2. XFS_BLF_INODE_BUF buffers are handled after most regular items so that 139 * we replay di_next_unlinked only after flushing the inode 'free' state 140 * to the inode buffer. 
141 * 142 * See xlog_recover_reorder_trans for more details. 143 */ 144 STATIC enum xlog_recover_reorder 145 xlog_recover_buf_reorder( 146 struct xlog_recover_item *item) 147 { 148 struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; 149 150 if (buf_f->blf_flags & XFS_BLF_CANCEL) 151 return XLOG_REORDER_CANCEL_LIST; 152 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) 153 return XLOG_REORDER_INODE_BUFFER_LIST; 154 return XLOG_REORDER_BUFFER_LIST; 155 } 156 157 STATIC void 158 xlog_recover_buf_ra_pass2( 159 struct xlog *log, 160 struct xlog_recover_item *item) 161 { 162 struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; 163 164 xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL); 165 } 166 167 /* 168 * Build up the table of buf cancel records so that we don't replay cancelled 169 * data in the second pass. 170 */ 171 static int 172 xlog_recover_buf_commit_pass1( 173 struct xlog *log, 174 struct xlog_recover_item *item) 175 { 176 struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr; 177 178 if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { 179 xfs_err(log->l_mp, "bad buffer log item size (%d)", 180 item->ri_buf[0].i_len); 181 return -EFSCORRUPTED; 182 } 183 184 if (!(bf->blf_flags & XFS_BLF_CANCEL)) 185 trace_xfs_log_recover_buf_not_cancel(log, bf); 186 else if (xlog_add_buffer_cancelled(log, bf->blf_blkno, bf->blf_len)) 187 trace_xfs_log_recover_buf_cancel_add(log, bf); 188 else 189 trace_xfs_log_recover_buf_cancel_ref_inc(log, bf); 190 return 0; 191 } 192 193 /* 194 * Validate the recovered buffer is of the correct type and attach the 195 * appropriate buffer operations to them for writeback. Magic numbers are in a 196 * few places: 197 * the first 16 bits of the buffer (inode buffer, dquot buffer), 198 * the first 32 bits of the buffer (most blocks), 199 * inside a struct xfs_da_blkinfo at the start of the buffer. 
200 */ 201 static void 202 xlog_recover_validate_buf_type( 203 struct xfs_mount *mp, 204 struct xfs_buf *bp, 205 struct xfs_buf_log_format *buf_f, 206 xfs_lsn_t current_lsn) 207 { 208 struct xfs_da_blkinfo *info = bp->b_addr; 209 uint32_t magic32; 210 uint16_t magic16; 211 uint16_t magicda; 212 char *warnmsg = NULL; 213 214 /* 215 * We can only do post recovery validation on items on CRC enabled 216 * fielsystems as we need to know when the buffer was written to be able 217 * to determine if we should have replayed the item. If we replay old 218 * metadata over a newer buffer, then it will enter a temporarily 219 * inconsistent state resulting in verification failures. Hence for now 220 * just avoid the verification stage for non-crc filesystems 221 */ 222 if (!xfs_sb_version_hascrc(&mp->m_sb)) 223 return; 224 225 magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); 226 magic16 = be16_to_cpu(*(__be16*)bp->b_addr); 227 magicda = be16_to_cpu(info->magic); 228 switch (xfs_blft_from_flags(buf_f)) { 229 case XFS_BLFT_BTREE_BUF: 230 switch (magic32) { 231 case XFS_ABTB_CRC_MAGIC: 232 case XFS_ABTB_MAGIC: 233 bp->b_ops = &xfs_bnobt_buf_ops; 234 break; 235 case XFS_ABTC_CRC_MAGIC: 236 case XFS_ABTC_MAGIC: 237 bp->b_ops = &xfs_cntbt_buf_ops; 238 break; 239 case XFS_IBT_CRC_MAGIC: 240 case XFS_IBT_MAGIC: 241 bp->b_ops = &xfs_inobt_buf_ops; 242 break; 243 case XFS_FIBT_CRC_MAGIC: 244 case XFS_FIBT_MAGIC: 245 bp->b_ops = &xfs_finobt_buf_ops; 246 break; 247 case XFS_BMAP_CRC_MAGIC: 248 case XFS_BMAP_MAGIC: 249 bp->b_ops = &xfs_bmbt_buf_ops; 250 break; 251 case XFS_RMAP_CRC_MAGIC: 252 bp->b_ops = &xfs_rmapbt_buf_ops; 253 break; 254 case XFS_REFC_CRC_MAGIC: 255 bp->b_ops = &xfs_refcountbt_buf_ops; 256 break; 257 default: 258 warnmsg = "Bad btree block magic!"; 259 break; 260 } 261 break; 262 case XFS_BLFT_AGF_BUF: 263 if (magic32 != XFS_AGF_MAGIC) { 264 warnmsg = "Bad AGF block magic!"; 265 break; 266 } 267 bp->b_ops = &xfs_agf_buf_ops; 268 break; 269 case XFS_BLFT_AGFL_BUF: 270 if 
(magic32 != XFS_AGFL_MAGIC) { 271 warnmsg = "Bad AGFL block magic!"; 272 break; 273 } 274 bp->b_ops = &xfs_agfl_buf_ops; 275 break; 276 case XFS_BLFT_AGI_BUF: 277 if (magic32 != XFS_AGI_MAGIC) { 278 warnmsg = "Bad AGI block magic!"; 279 break; 280 } 281 bp->b_ops = &xfs_agi_buf_ops; 282 break; 283 case XFS_BLFT_UDQUOT_BUF: 284 case XFS_BLFT_PDQUOT_BUF: 285 case XFS_BLFT_GDQUOT_BUF: 286 #ifdef CONFIG_XFS_QUOTA 287 if (magic16 != XFS_DQUOT_MAGIC) { 288 warnmsg = "Bad DQUOT block magic!"; 289 break; 290 } 291 bp->b_ops = &xfs_dquot_buf_ops; 292 #else 293 xfs_alert(mp, 294 "Trying to recover dquots without QUOTA support built in!"); 295 ASSERT(0); 296 #endif 297 break; 298 case XFS_BLFT_DINO_BUF: 299 if (magic16 != XFS_DINODE_MAGIC) { 300 warnmsg = "Bad INODE block magic!"; 301 break; 302 } 303 bp->b_ops = &xfs_inode_buf_ops; 304 break; 305 case XFS_BLFT_SYMLINK_BUF: 306 if (magic32 != XFS_SYMLINK_MAGIC) { 307 warnmsg = "Bad symlink block magic!"; 308 break; 309 } 310 bp->b_ops = &xfs_symlink_buf_ops; 311 break; 312 case XFS_BLFT_DIR_BLOCK_BUF: 313 if (magic32 != XFS_DIR2_BLOCK_MAGIC && 314 magic32 != XFS_DIR3_BLOCK_MAGIC) { 315 warnmsg = "Bad dir block magic!"; 316 break; 317 } 318 bp->b_ops = &xfs_dir3_block_buf_ops; 319 break; 320 case XFS_BLFT_DIR_DATA_BUF: 321 if (magic32 != XFS_DIR2_DATA_MAGIC && 322 magic32 != XFS_DIR3_DATA_MAGIC) { 323 warnmsg = "Bad dir data magic!"; 324 break; 325 } 326 bp->b_ops = &xfs_dir3_data_buf_ops; 327 break; 328 case XFS_BLFT_DIR_FREE_BUF: 329 if (magic32 != XFS_DIR2_FREE_MAGIC && 330 magic32 != XFS_DIR3_FREE_MAGIC) { 331 warnmsg = "Bad dir3 free magic!"; 332 break; 333 } 334 bp->b_ops = &xfs_dir3_free_buf_ops; 335 break; 336 case XFS_BLFT_DIR_LEAF1_BUF: 337 if (magicda != XFS_DIR2_LEAF1_MAGIC && 338 magicda != XFS_DIR3_LEAF1_MAGIC) { 339 warnmsg = "Bad dir leaf1 magic!"; 340 break; 341 } 342 bp->b_ops = &xfs_dir3_leaf1_buf_ops; 343 break; 344 case XFS_BLFT_DIR_LEAFN_BUF: 345 if (magicda != XFS_DIR2_LEAFN_MAGIC && 346 magicda != 
XFS_DIR3_LEAFN_MAGIC) { 347 warnmsg = "Bad dir leafn magic!"; 348 break; 349 } 350 bp->b_ops = &xfs_dir3_leafn_buf_ops; 351 break; 352 case XFS_BLFT_DA_NODE_BUF: 353 if (magicda != XFS_DA_NODE_MAGIC && 354 magicda != XFS_DA3_NODE_MAGIC) { 355 warnmsg = "Bad da node magic!"; 356 break; 357 } 358 bp->b_ops = &xfs_da3_node_buf_ops; 359 break; 360 case XFS_BLFT_ATTR_LEAF_BUF: 361 if (magicda != XFS_ATTR_LEAF_MAGIC && 362 magicda != XFS_ATTR3_LEAF_MAGIC) { 363 warnmsg = "Bad attr leaf magic!"; 364 break; 365 } 366 bp->b_ops = &xfs_attr3_leaf_buf_ops; 367 break; 368 case XFS_BLFT_ATTR_RMT_BUF: 369 if (magic32 != XFS_ATTR3_RMT_MAGIC) { 370 warnmsg = "Bad attr remote magic!"; 371 break; 372 } 373 bp->b_ops = &xfs_attr3_rmt_buf_ops; 374 break; 375 case XFS_BLFT_SB_BUF: 376 if (magic32 != XFS_SB_MAGIC) { 377 warnmsg = "Bad SB block magic!"; 378 break; 379 } 380 bp->b_ops = &xfs_sb_buf_ops; 381 break; 382 #ifdef CONFIG_XFS_RT 383 case XFS_BLFT_RTBITMAP_BUF: 384 case XFS_BLFT_RTSUMMARY_BUF: 385 /* no magic numbers for verification of RT buffers */ 386 bp->b_ops = &xfs_rtbuf_ops; 387 break; 388 #endif /* CONFIG_XFS_RT */ 389 default: 390 xfs_warn(mp, "Unknown buffer type %d!", 391 xfs_blft_from_flags(buf_f)); 392 break; 393 } 394 395 /* 396 * Nothing else to do in the case of a NULL current LSN as this means 397 * the buffer is more recent than the change in the log and will be 398 * skipped. 399 */ 400 if (current_lsn == NULLCOMMITLSN) 401 return; 402 403 if (warnmsg) { 404 xfs_warn(mp, warnmsg); 405 ASSERT(0); 406 } 407 408 /* 409 * We must update the metadata LSN of the buffer as it is written out to 410 * ensure that older transactions never replay over this one and corrupt 411 * the buffer. This can occur if log recovery is interrupted at some 412 * point after the current transaction completes, at which point a 413 * subsequent mount starts recovery from the beginning. 414 * 415 * Write verifiers update the metadata LSN from log items attached to 416 * the buffer. 
Therefore, initialize a bli purely to carry the LSN to 417 * the verifier. We'll clean it up in our ->iodone() callback. 418 */ 419 if (bp->b_ops) { 420 struct xfs_buf_log_item *bip; 421 422 ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone); 423 bp->b_iodone = xlog_recover_iodone; 424 xfs_buf_item_init(bp, mp); 425 bip = bp->b_log_item; 426 bip->bli_item.li_lsn = current_lsn; 427 } 428 } 429 430 /* 431 * Perform a 'normal' buffer recovery. Each logged region of the 432 * buffer should be copied over the corresponding region in the 433 * given buffer. The bitmap in the buf log format structure indicates 434 * where to place the logged data. 435 */ 436 STATIC void 437 xlog_recover_do_reg_buffer( 438 struct xfs_mount *mp, 439 struct xlog_recover_item *item, 440 struct xfs_buf *bp, 441 struct xfs_buf_log_format *buf_f, 442 xfs_lsn_t current_lsn) 443 { 444 int i; 445 int bit; 446 int nbits; 447 xfs_failaddr_t fa; 448 const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot); 449 450 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); 451 452 bit = 0; 453 i = 1; /* 0 is the buf format structure */ 454 while (1) { 455 bit = xfs_next_bit(buf_f->blf_data_map, 456 buf_f->blf_map_size, bit); 457 if (bit == -1) 458 break; 459 nbits = xfs_contig_bits(buf_f->blf_data_map, 460 buf_f->blf_map_size, bit); 461 ASSERT(nbits > 0); 462 ASSERT(item->ri_buf[i].i_addr != NULL); 463 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 464 ASSERT(BBTOB(bp->b_length) >= 465 ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); 466 467 /* 468 * The dirty regions logged in the buffer, even though 469 * contiguous, may span multiple chunks. This is because the 470 * dirty region may span a physical page boundary in a buffer 471 * and hence be split into two separate vectors for writing into 472 * the log. Hence we need to trim nbits back to the length of 473 * the current region being copied out of the log. 
474 */ 475 if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) 476 nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; 477 478 /* 479 * Do a sanity check if this is a dquot buffer. Just checking 480 * the first dquot in the buffer should do. XXXThis is 481 * probably a good thing to do for other buf types also. 482 */ 483 fa = NULL; 484 if (buf_f->blf_flags & 485 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 486 if (item->ri_buf[i].i_addr == NULL) { 487 xfs_alert(mp, 488 "XFS: NULL dquot in %s.", __func__); 489 goto next; 490 } 491 if (item->ri_buf[i].i_len < size_disk_dquot) { 492 xfs_alert(mp, 493 "XFS: dquot too small (%d) in %s.", 494 item->ri_buf[i].i_len, __func__); 495 goto next; 496 } 497 fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, 498 -1, 0); 499 if (fa) { 500 xfs_alert(mp, 501 "dquot corrupt at %pS trying to replay into block 0x%llx", 502 fa, bp->b_bn); 503 goto next; 504 } 505 } 506 507 memcpy(xfs_buf_offset(bp, 508 (uint)bit << XFS_BLF_SHIFT), /* dest */ 509 item->ri_buf[i].i_addr, /* source */ 510 nbits<<XFS_BLF_SHIFT); /* length */ 511 next: 512 i++; 513 bit += nbits; 514 } 515 516 /* Shouldn't be any more regions */ 517 ASSERT(i == item->ri_total); 518 519 xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); 520 } 521 522 /* 523 * Perform a dquot buffer recovery. 524 * Simple algorithm: if we have found a QUOTAOFF log item of the same type 525 * (ie. USR or GRP), then just toss this buffer away; don't recover it. 526 * Else, treat it as a regular buffer and do recovery. 527 * 528 * Return false if the buffer was tossed and true if we recovered the buffer to 529 * indicate to the caller if the buffer needs writing. 
530 */ 531 STATIC bool 532 xlog_recover_do_dquot_buffer( 533 struct xfs_mount *mp, 534 struct xlog *log, 535 struct xlog_recover_item *item, 536 struct xfs_buf *bp, 537 struct xfs_buf_log_format *buf_f) 538 { 539 uint type; 540 541 trace_xfs_log_recover_buf_dquot_buf(log, buf_f); 542 543 /* 544 * Filesystems are required to send in quota flags at mount time. 545 */ 546 if (!mp->m_qflags) 547 return false; 548 549 type = 0; 550 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) 551 type |= XFS_DQ_USER; 552 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) 553 type |= XFS_DQ_PROJ; 554 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) 555 type |= XFS_DQ_GROUP; 556 /* 557 * This type of quotas was turned off, so ignore this buffer 558 */ 559 if (log->l_quotaoffs_flag & type) 560 return false; 561 562 xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN); 563 return true; 564 } 565 566 /* 567 * Perform recovery for a buffer full of inodes. In these buffers, the only 568 * data which should be recovered is that which corresponds to the 569 * di_next_unlinked pointers in the on disk inode structures. The rest of the 570 * data for the inodes is always logged through the inodes themselves rather 571 * than the inode buffer and is recovered in xlog_recover_inode_pass2(). 572 * 573 * The only time when buffers full of inodes are fully recovered is when the 574 * buffer is full of newly allocated inodes. In this case the buffer will 575 * not be marked as an inode buffer and so will be sent to 576 * xlog_recover_do_reg_buffer() below during recovery. 
 *
 * Returns 0 on success, or -EFSCORRUPTED if a logged di_next_unlinked value
 * is zero.
 */
STATIC int
xlog_recover_do_inode_buffer(
	struct xfs_mount		*mp,
	struct xlog_recover_item	*item,
	struct xfs_buf			*bp,
	struct xfs_buf_log_format	*buf_f)
{
	int				i;
	int				item_index = 0;
	int				bit = 0;
	int				nbits = 0;
	int				reg_buf_offset = 0;
	int				reg_buf_bytes = 0;
	int				next_unlinked_offset;
	int				inodes_per_buf;
	xfs_agino_t			*logged_nextp;
	xfs_agino_t			*buffer_nextp;

	trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);

	/*
	 * Post recovery validation only works properly on CRC enabled
	 * filesystems.
	 */
	if (xfs_sb_version_hascrc(&mp->m_sb))
		bp->b_ops = &xfs_inode_buf_ops;

	/* Number of on-disk inodes that fit in this buffer. */
	inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
	for (i = 0; i < inodes_per_buf; i++) {
		/* Byte offset of inode i's di_next_unlinked in the buffer. */
		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
			offsetof(xfs_dinode_t, di_next_unlinked);

		while (next_unlinked_offset >=
		       (reg_buf_offset + reg_buf_bytes)) {
			/*
			 * The next di_next_unlinked field is beyond
			 * the current logged region.  Find the next
			 * logged region that contains or is beyond
			 * the current di_next_unlinked field.
			 */
			bit += nbits;
			bit = xfs_next_bit(buf_f->blf_data_map,
					   buf_f->blf_map_size, bit);

			/*
			 * If there are no more logged regions in the
			 * buffer, then we're done.
			 */
			if (bit == -1)
				return 0;

			nbits = xfs_contig_bits(buf_f->blf_data_map,
						buf_f->blf_map_size, bit);
			ASSERT(nbits > 0);
			reg_buf_offset = bit << XFS_BLF_SHIFT;
			reg_buf_bytes = nbits << XFS_BLF_SHIFT;
			/* Each region found moves on to the next item buffer. */
			item_index++;
		}

		/*
		 * If the current logged region starts after the current
		 * di_next_unlinked field, then move on to the next
		 * di_next_unlinked field.
		 */
		if (next_unlinked_offset < reg_buf_offset)
			continue;

		ASSERT(item->ri_buf[item_index].i_addr != NULL);
		ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
		ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));

		/*
		 * The current logged region contains a copy of the
		 * current di_next_unlinked field.  Extract its value
		 * and copy it to the buffer copy.
		 */
		logged_nextp = item->ri_buf[item_index].i_addr +
				next_unlinked_offset - reg_buf_offset;
		/* A logged value of zero is treated as corruption. */
		if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) {
			xfs_alert(mp,
		"Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). "
		"Trying to replay bad (0) inode di_next_unlinked field.",
				item, bp);
			return -EFSCORRUPTED;
		}

		buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
		*buffer_nextp = *logged_nextp;

		/*
		 * If necessary, recalculate the CRC in the on-disk inode. We
		 * have to leave the inode in a consistent state for whoever
		 * reads it next....
		 */
		xfs_dinode_calc_crc(mp,
				xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));

	}

	return 0;
}

/*
 * V5 filesystems know the age of the buffer on disk being recovered. We can
 * have newer objects on disk than we are replaying, and so for these cases we
 * don't want to replay the current change as that will make the buffer contents
 * temporarily invalid on disk.
 *
 * The magic number might not match the buffer type we are going to recover
 * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags.  Hence
 * extract the LSN of the existing object in the buffer based on its current
 * magic number.  If we don't recognise the magic number in the buffer, then
 * return a LSN of -1 so that the caller knows it was an unrecognised block and
 * so can recover the buffer.
692 * 693 * Note: we cannot rely solely on magic number matches to determine that the 694 * buffer has a valid LSN - we also need to verify that it belongs to this 695 * filesystem, so we need to extract the object's LSN and compare it to that 696 * which we read from the superblock. If the UUIDs don't match, then we've got a 697 * stale metadata block from an old filesystem instance that we need to recover 698 * over the top of. 699 */ 700 static xfs_lsn_t 701 xlog_recover_get_buf_lsn( 702 struct xfs_mount *mp, 703 struct xfs_buf *bp) 704 { 705 uint32_t magic32; 706 uint16_t magic16; 707 uint16_t magicda; 708 void *blk = bp->b_addr; 709 uuid_t *uuid; 710 xfs_lsn_t lsn = -1; 711 712 /* v4 filesystems always recover immediately */ 713 if (!xfs_sb_version_hascrc(&mp->m_sb)) 714 goto recover_immediately; 715 716 magic32 = be32_to_cpu(*(__be32 *)blk); 717 switch (magic32) { 718 case XFS_ABTB_CRC_MAGIC: 719 case XFS_ABTC_CRC_MAGIC: 720 case XFS_ABTB_MAGIC: 721 case XFS_ABTC_MAGIC: 722 case XFS_RMAP_CRC_MAGIC: 723 case XFS_REFC_CRC_MAGIC: 724 case XFS_IBT_CRC_MAGIC: 725 case XFS_IBT_MAGIC: { 726 struct xfs_btree_block *btb = blk; 727 728 lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); 729 uuid = &btb->bb_u.s.bb_uuid; 730 break; 731 } 732 case XFS_BMAP_CRC_MAGIC: 733 case XFS_BMAP_MAGIC: { 734 struct xfs_btree_block *btb = blk; 735 736 lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); 737 uuid = &btb->bb_u.l.bb_uuid; 738 break; 739 } 740 case XFS_AGF_MAGIC: 741 lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); 742 uuid = &((struct xfs_agf *)blk)->agf_uuid; 743 break; 744 case XFS_AGFL_MAGIC: 745 lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); 746 uuid = &((struct xfs_agfl *)blk)->agfl_uuid; 747 break; 748 case XFS_AGI_MAGIC: 749 lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); 750 uuid = &((struct xfs_agi *)blk)->agi_uuid; 751 break; 752 case XFS_SYMLINK_MAGIC: 753 lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); 754 uuid = &((struct xfs_dsymlink_hdr 
*)blk)->sl_uuid; 755 break; 756 case XFS_DIR3_BLOCK_MAGIC: 757 case XFS_DIR3_DATA_MAGIC: 758 case XFS_DIR3_FREE_MAGIC: 759 lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); 760 uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; 761 break; 762 case XFS_ATTR3_RMT_MAGIC: 763 /* 764 * Remote attr blocks are written synchronously, rather than 765 * being logged. That means they do not contain a valid LSN 766 * (i.e. transactionally ordered) in them, and hence any time we 767 * see a buffer to replay over the top of a remote attribute 768 * block we should simply do so. 769 */ 770 goto recover_immediately; 771 case XFS_SB_MAGIC: 772 /* 773 * superblock uuids are magic. We may or may not have a 774 * sb_meta_uuid on disk, but it will be set in the in-core 775 * superblock. We set the uuid pointer for verification 776 * according to the superblock feature mask to ensure we check 777 * the relevant UUID in the superblock. 778 */ 779 lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); 780 if (xfs_sb_version_hasmetauuid(&mp->m_sb)) 781 uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; 782 else 783 uuid = &((struct xfs_dsb *)blk)->sb_uuid; 784 break; 785 default: 786 break; 787 } 788 789 if (lsn != (xfs_lsn_t)-1) { 790 if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) 791 goto recover_immediately; 792 return lsn; 793 } 794 795 magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); 796 switch (magicda) { 797 case XFS_DIR3_LEAF1_MAGIC: 798 case XFS_DIR3_LEAFN_MAGIC: 799 case XFS_DA3_NODE_MAGIC: 800 lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); 801 uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; 802 break; 803 default: 804 break; 805 } 806 807 if (lsn != (xfs_lsn_t)-1) { 808 if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) 809 goto recover_immediately; 810 return lsn; 811 } 812 813 /* 814 * We do individual object checks on dquot and inode buffers as they 815 * have their own individual LSN records. 
Also, we could have a stale 816 * buffer here, so we have to at least recognise these buffer types. 817 * 818 * A notd complexity here is inode unlinked list processing - it logs 819 * the inode directly in the buffer, but we don't know which inodes have 820 * been modified, and there is no global buffer LSN. Hence we need to 821 * recover all inode buffer types immediately. This problem will be 822 * fixed by logical logging of the unlinked list modifications. 823 */ 824 magic16 = be16_to_cpu(*(__be16 *)blk); 825 switch (magic16) { 826 case XFS_DQUOT_MAGIC: 827 case XFS_DINODE_MAGIC: 828 goto recover_immediately; 829 default: 830 break; 831 } 832 833 /* unknown buffer contents, recover immediately */ 834 835 recover_immediately: 836 return (xfs_lsn_t)-1; 837 838 } 839 840 /* 841 * This routine replays a modification made to a buffer at runtime. 842 * There are actually two types of buffer, regular and inode, which 843 * are handled differently. Inode buffers are handled differently 844 * in that we only recover a specific set of data from them, namely 845 * the inode di_next_unlinked fields. This is because all other inode 846 * data is actually logged via inode records and any data we replay 847 * here which overlaps that may be stale. 848 * 849 * When meta-data buffers are freed at run time we log a buffer item 850 * with the XFS_BLF_CANCEL bit set to indicate that previous copies 851 * of the buffer in the log should not be replayed at recovery time. 852 * This is so that if the blocks covered by the buffer are reused for 853 * file data before we crash we don't end up replaying old, freed 854 * meta-data into a user's file. 855 * 856 * To handle the cancellation of buffer log items, we make two passes 857 * over the log during recovery. During the first we build a table of 858 * those buffers which have been cancelled, and during the second we 859 * only replay those buffers which do not have corresponding cancel 860 * records in the table. 
See xlog_recover_buf_pass[1,2] above 861 * for more details on the implementation of the table of cancel records. 862 */ 863 STATIC int 864 xlog_recover_buf_commit_pass2( 865 struct xlog *log, 866 struct list_head *buffer_list, 867 struct xlog_recover_item *item, 868 xfs_lsn_t current_lsn) 869 { 870 struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; 871 struct xfs_mount *mp = log->l_mp; 872 struct xfs_buf *bp; 873 int error; 874 uint buf_flags; 875 xfs_lsn_t lsn; 876 877 /* 878 * In this pass we only want to recover all the buffers which have 879 * not been cancelled and are not cancellation buffers themselves. 880 */ 881 if (buf_f->blf_flags & XFS_BLF_CANCEL) { 882 if (xlog_put_buffer_cancelled(log, buf_f->blf_blkno, 883 buf_f->blf_len)) 884 goto cancelled; 885 } else { 886 887 if (xlog_is_buffer_cancelled(log, buf_f->blf_blkno, 888 buf_f->blf_len)) 889 goto cancelled; 890 } 891 892 trace_xfs_log_recover_buf_recover(log, buf_f); 893 894 buf_flags = 0; 895 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) 896 buf_flags |= XBF_UNMAPPED; 897 898 error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, 899 buf_flags, &bp, NULL); 900 if (error) 901 return error; 902 903 /* 904 * Recover the buffer only if we get an LSN from it and it's less than 905 * the lsn of the transaction we are replaying. 906 * 907 * Note that we have to be extremely careful of readahead here. 908 * Readahead does not attach verfiers to the buffers so if we don't 909 * actually do any replay after readahead because of the LSN we found 910 * in the buffer if more recent than that current transaction then we 911 * need to attach the verifier directly. Failure to do so can lead to 912 * future recovery actions (e.g. EFI and unlinked list recovery) can 913 * operate on the buffers and they won't get the verifier attached. This 914 * can lead to blocks on disk having the correct content but a stale 915 * CRC. 
916 * 917 * It is safe to assume these clean buffers are currently up to date. 918 * If the buffer is dirtied by a later transaction being replayed, then 919 * the verifier will be reset to match whatever recover turns that 920 * buffer into. 921 */ 922 lsn = xlog_recover_get_buf_lsn(mp, bp); 923 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { 924 trace_xfs_log_recover_buf_skip(log, buf_f); 925 xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); 926 goto out_release; 927 } 928 929 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { 930 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 931 if (error) 932 goto out_release; 933 } else if (buf_f->blf_flags & 934 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 935 bool dirty; 936 937 dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 938 if (!dirty) 939 goto out_release; 940 } else { 941 xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); 942 } 943 944 /* 945 * Perform delayed write on the buffer. Asynchronous writes will be 946 * slower when taking into account all the buffers to be flushed. 947 * 948 * Also make sure that only inode buffers with good sizes stay in 949 * the buffer cache. The kernel moves inodes in buffers of 1 block 950 * or inode_cluster_size bytes, whichever is bigger. The inode 951 * buffers in the log can be a different size if the log was generated 952 * by an older kernel using unclustered inode buffers or a newer kernel 953 * running with a different inode cluster size. Regardless, if the 954 * the inode buffer size isn't max(blocksize, inode_cluster_size) 955 * for *our* value of inode_cluster_size, then we need to keep 956 * the buffer out of the buffer cache so that the buffer won't 957 * overlap with future reads of those inodes. 
958 */ 959 if (XFS_DINODE_MAGIC == 960 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && 961 (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) { 962 xfs_buf_stale(bp); 963 error = xfs_bwrite(bp); 964 } else { 965 ASSERT(bp->b_mount == mp); 966 bp->b_iodone = xlog_recover_iodone; 967 xfs_buf_delwri_queue(bp, buffer_list); 968 } 969 970 out_release: 971 xfs_buf_relse(bp); 972 return error; 973 cancelled: 974 trace_xfs_log_recover_buf_cancel(log, buf_f); 975 return 0; 976 } 977 978 const struct xlog_recover_item_ops xlog_buf_item_ops = { 979 .item_type = XFS_LI_BUF, 980 .reorder = xlog_recover_buf_reorder, 981 .ra_pass2 = xlog_recover_buf_ra_pass2, 982 .commit_pass1 = xlog_recover_buf_commit_pass1, 983 .commit_pass2 = xlog_recover_buf_commit_pass2, 984 }; 985