1 /* 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 #include "xfs.h" 19 #include "xfs_fs.h" 20 #include "xfs_shared.h" 21 #include "xfs_format.h" 22 #include "xfs_log_format.h" 23 #include "xfs_trans_resv.h" 24 #include "xfs_bit.h" 25 #include "xfs_inum.h" 26 #include "xfs_sb.h" 27 #include "xfs_ag.h" 28 #include "xfs_mount.h" 29 #include "xfs_da_format.h" 30 #include "xfs_inode.h" 31 #include "xfs_trans.h" 32 #include "xfs_log.h" 33 #include "xfs_log_priv.h" 34 #include "xfs_log_recover.h" 35 #include "xfs_inode_item.h" 36 #include "xfs_extfree_item.h" 37 #include "xfs_trans_priv.h" 38 #include "xfs_alloc.h" 39 #include "xfs_ialloc.h" 40 #include "xfs_quota.h" 41 #include "xfs_cksum.h" 42 #include "xfs_trace.h" 43 #include "xfs_icache.h" 44 #include "xfs_bmap_btree.h" 45 #include "xfs_dinode.h" 46 #include "xfs_error.h" 47 #include "xfs_dir2.h" 48 49 #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) 50 51 STATIC int 52 xlog_find_zeroed( 53 struct xlog *, 54 xfs_daddr_t *); 55 STATIC int 56 xlog_clear_stale_blocks( 57 struct xlog *, 58 xfs_lsn_t); 59 #if defined(DEBUG) 60 STATIC void 61 xlog_recover_check_summary( 62 struct xlog *); 63 #else 64 #define xlog_recover_check_summary(log) 65 #endif 66 67 /* 68 * This structure is used during recovery to record the buf log items which 69 * have been canceled and should not be replayed. 70 */ 71 struct xfs_buf_cancel { 72 xfs_daddr_t bc_blkno; 73 uint bc_len; 74 int bc_refcount; 75 struct list_head bc_list; 76 }; 77 78 /* 79 * Sector aligned buffer routines for buffer create/read/write/access 80 */ 81 82 /* 83 * Verify the given count of basic blocks is valid number of blocks 84 * to specify for an operation involving the given XFS log buffer. 85 * Returns nonzero if the count is valid, 0 otherwise. 86 */ 87 88 static inline int 89 xlog_buf_bbcount_valid( 90 struct xlog *log, 91 int bbcount) 92 { 93 return bbcount > 0 && bbcount <= log->l_logBBsize; 94 } 95 96 /* 97 * Allocate a buffer to hold log data. The buffer needs to be able 98 * to map to a range of nbblks basic blocks at any valid (basic 99 * block) offset within the log. 100 */ 101 STATIC xfs_buf_t * 102 xlog_get_bp( 103 struct xlog *log, 104 int nbblks) 105 { 106 struct xfs_buf *bp; 107 108 if (!xlog_buf_bbcount_valid(log, nbblks)) { 109 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", 110 nbblks); 111 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 112 return NULL; 113 } 114 115 /* 116 * We do log I/O in units of log sectors (a power-of-2 117 * multiple of the basic block size), so we round up the 118 * requested size to accommodate the basic blocks required 119 * for complete log sectors. 120 * 121 * In addition, the buffer may be used for a non-sector- 122 * aligned block offset, in which case an I/O of the 123 * requested size could extend beyond the end of the 124 * buffer. 
If the requested size is only 1 basic block it 125 * will never straddle a sector boundary, so this won't be 126 * an issue. Nor will this be a problem if the log I/O is 127 * done in basic blocks (sector size 1). But otherwise we 128 * extend the buffer by one extra log sector to ensure 129 * there's space to accommodate this possibility. 130 */ 131 if (nbblks > 1 && log->l_sectBBsize > 1) 132 nbblks += log->l_sectBBsize; 133 nbblks = round_up(nbblks, log->l_sectBBsize); 134 135 bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0); 136 if (bp) 137 xfs_buf_unlock(bp); 138 return bp; 139 } 140 141 STATIC void 142 xlog_put_bp( 143 xfs_buf_t *bp) 144 { 145 xfs_buf_free(bp); 146 } 147 148 /* 149 * Return the address of the start of the given block number's data 150 * in a log buffer. The buffer covers a log sector-aligned region. 151 */ 152 STATIC xfs_caddr_t 153 xlog_align( 154 struct xlog *log, 155 xfs_daddr_t blk_no, 156 int nbblks, 157 struct xfs_buf *bp) 158 { 159 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); 160 161 ASSERT(offset + nbblks <= bp->b_length); 162 return bp->b_addr + BBTOB(offset); 163 } 164 165 166 /* 167 * nbblks should be uint, but oh well. Just want to catch that 32-bit length. 168 */ 169 STATIC int 170 xlog_bread_noalign( 171 struct xlog *log, 172 xfs_daddr_t blk_no, 173 int nbblks, 174 struct xfs_buf *bp) 175 { 176 int error; 177 178 if (!xlog_buf_bbcount_valid(log, nbblks)) { 179 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", 180 nbblks); 181 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 182 return -EFSCORRUPTED; 183 } 184 185 blk_no = round_down(blk_no, log->l_sectBBsize); 186 nbblks = round_up(nbblks, log->l_sectBBsize); 187 188 ASSERT(nbblks > 0); 189 ASSERT(nbblks <= bp->b_length); 190 191 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 192 XFS_BUF_READ(bp); 193 bp->b_io_length = nbblks; 194 bp->b_error = 0; 195 196 error = xfs_buf_submit_wait(bp); 197 if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) 198 xfs_buf_ioerror_alert(bp, __func__); 199 return error; 200 } 201 202 STATIC int 203 xlog_bread( 204 struct xlog *log, 205 xfs_daddr_t blk_no, 206 int nbblks, 207 struct xfs_buf *bp, 208 xfs_caddr_t *offset) 209 { 210 int error; 211 212 error = xlog_bread_noalign(log, blk_no, nbblks, bp); 213 if (error) 214 return error; 215 216 *offset = xlog_align(log, blk_no, nbblks, bp); 217 return 0; 218 } 219 220 /* 221 * Read at an offset into the buffer. Returns with the buffer in it's original 222 * state regardless of the result of the read. 223 */ 224 STATIC int 225 xlog_bread_offset( 226 struct xlog *log, 227 xfs_daddr_t blk_no, /* block to read from */ 228 int nbblks, /* blocks to read */ 229 struct xfs_buf *bp, 230 xfs_caddr_t offset) 231 { 232 xfs_caddr_t orig_offset = bp->b_addr; 233 int orig_len = BBTOB(bp->b_length); 234 int error, error2; 235 236 error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); 237 if (error) 238 return error; 239 240 error = xlog_bread_noalign(log, blk_no, nbblks, bp); 241 242 /* must reset buffer pointer even on error */ 243 error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len); 244 if (error) 245 return error; 246 return error2; 247 } 248 249 /* 250 * Write out the buffer at the given block for the given number of blocks. 251 * The buffer is kept locked across the write and is returned locked. 252 * This can only be used for synchronous log writes. 
253 */ 254 STATIC int 255 xlog_bwrite( 256 struct xlog *log, 257 xfs_daddr_t blk_no, 258 int nbblks, 259 struct xfs_buf *bp) 260 { 261 int error; 262 263 if (!xlog_buf_bbcount_valid(log, nbblks)) { 264 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", 265 nbblks); 266 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 267 return -EFSCORRUPTED; 268 } 269 270 blk_no = round_down(blk_no, log->l_sectBBsize); 271 nbblks = round_up(nbblks, log->l_sectBBsize); 272 273 ASSERT(nbblks > 0); 274 ASSERT(nbblks <= bp->b_length); 275 276 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 277 XFS_BUF_ZEROFLAGS(bp); 278 xfs_buf_hold(bp); 279 xfs_buf_lock(bp); 280 bp->b_io_length = nbblks; 281 bp->b_error = 0; 282 283 error = xfs_bwrite(bp); 284 if (error) 285 xfs_buf_ioerror_alert(bp, __func__); 286 xfs_buf_relse(bp); 287 return error; 288 } 289 290 #ifdef DEBUG 291 /* 292 * dump debug superblock and log record information 293 */ 294 STATIC void 295 xlog_header_check_dump( 296 xfs_mount_t *mp, 297 xlog_rec_header_t *head) 298 { 299 xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d", 300 __func__, &mp->m_sb.sb_uuid, XLOG_FMT); 301 xfs_debug(mp, " log : uuid = %pU, fmt = %d", 302 &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); 303 } 304 #else 305 #define xlog_header_check_dump(mp, head) 306 #endif 307 308 /* 309 * check log record header for recovery 310 */ 311 STATIC int 312 xlog_header_check_recover( 313 xfs_mount_t *mp, 314 xlog_rec_header_t *head) 315 { 316 ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); 317 318 /* 319 * IRIX doesn't write the h_fmt field and leaves it zeroed 320 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover 321 * a dirty log created in IRIX. 322 */ 323 if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) { 324 xfs_warn(mp, 325 "dirty log written in incompatible format - can't recover"); 326 xlog_header_check_dump(mp, head); 327 XFS_ERROR_REPORT("xlog_header_check_recover(1)", 328 XFS_ERRLEVEL_HIGH, mp); 329 return -EFSCORRUPTED; 330 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 331 xfs_warn(mp, 332 "dirty log entry has mismatched uuid - can't recover"); 333 xlog_header_check_dump(mp, head); 334 XFS_ERROR_REPORT("xlog_header_check_recover(2)", 335 XFS_ERRLEVEL_HIGH, mp); 336 return -EFSCORRUPTED; 337 } 338 return 0; 339 } 340 341 /* 342 * read the head block of the log and check the header 343 */ 344 STATIC int 345 xlog_header_check_mount( 346 xfs_mount_t *mp, 347 xlog_rec_header_t *head) 348 { 349 ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); 350 351 if (uuid_is_nil(&head->h_fs_uuid)) { 352 /* 353 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If 354 * h_fs_uuid is nil, we assume this log was last mounted 355 * by IRIX and continue. 356 */ 357 xfs_warn(mp, "nil uuid in log - IRIX style log"); 358 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 359 xfs_warn(mp, "log has mismatched uuid - can't recover"); 360 xlog_header_check_dump(mp, head); 361 XFS_ERROR_REPORT("xlog_header_check_mount", 362 XFS_ERRLEVEL_HIGH, mp); 363 return -EFSCORRUPTED; 364 } 365 return 0; 366 } 367 368 STATIC void 369 xlog_recover_iodone( 370 struct xfs_buf *bp) 371 { 372 if (bp->b_error) { 373 /* 374 * We're not going to bother about retrying 375 * this during recovery. One strike! 
376 */ 377 if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { 378 xfs_buf_ioerror_alert(bp, __func__); 379 xfs_force_shutdown(bp->b_target->bt_mount, 380 SHUTDOWN_META_IO_ERROR); 381 } 382 } 383 bp->b_iodone = NULL; 384 xfs_buf_ioend(bp); 385 } 386 387 /* 388 * This routine finds (to an approximation) the first block in the physical 389 * log which contains the given cycle. It uses a binary search algorithm. 390 * Note that the algorithm can not be perfect because the disk will not 391 * necessarily be perfect. 392 */ 393 STATIC int 394 xlog_find_cycle_start( 395 struct xlog *log, 396 struct xfs_buf *bp, 397 xfs_daddr_t first_blk, 398 xfs_daddr_t *last_blk, 399 uint cycle) 400 { 401 xfs_caddr_t offset; 402 xfs_daddr_t mid_blk; 403 xfs_daddr_t end_blk; 404 uint mid_cycle; 405 int error; 406 407 end_blk = *last_blk; 408 mid_blk = BLK_AVG(first_blk, end_blk); 409 while (mid_blk != first_blk && mid_blk != end_blk) { 410 error = xlog_bread(log, mid_blk, 1, bp, &offset); 411 if (error) 412 return error; 413 mid_cycle = xlog_get_cycle(offset); 414 if (mid_cycle == cycle) 415 end_blk = mid_blk; /* last_half_cycle == mid_cycle */ 416 else 417 first_blk = mid_blk; /* first_half_cycle == mid_cycle */ 418 mid_blk = BLK_AVG(first_blk, end_blk); 419 } 420 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) || 421 (mid_blk == end_blk && mid_blk-1 == first_blk)); 422 423 *last_blk = end_blk; 424 425 return 0; 426 } 427 428 /* 429 * Check that a range of blocks does not contain stop_on_cycle_no. 430 * Fill in *new_blk with the block offset where such a block is 431 * found, or with -1 (an invalid block number) if there is no such 432 * block in the range. The scan needs to occur from front to back 433 * and the pointer into the region must be updated since a later 434 * routine will need to perform another test. 435 */ 436 STATIC int 437 xlog_find_verify_cycle( 438 struct xlog *log, 439 xfs_daddr_t start_blk, 440 int nbblks, 441 uint stop_on_cycle_no, 442 xfs_daddr_t *new_blk) 443 { 444 xfs_daddr_t i, j; 445 uint cycle; 446 xfs_buf_t *bp; 447 xfs_daddr_t bufblks; 448 xfs_caddr_t buf = NULL; 449 int error = 0; 450 451 /* 452 * Greedily allocate a buffer big enough to handle the full 453 * range of basic blocks we'll be examining. If that fails, 454 * try a smaller size. We need to be able to read at least 455 * a log sector, or we're out of luck. 456 */ 457 bufblks = 1 << ffs(nbblks); 458 while (bufblks > log->l_logBBsize) 459 bufblks >>= 1; 460 while (!(bp = xlog_get_bp(log, bufblks))) { 461 bufblks >>= 1; 462 if (bufblks < log->l_sectBBsize) 463 return -ENOMEM; 464 } 465 466 for (i = start_blk; i < start_blk + nbblks; i += bufblks) { 467 int bcount; 468 469 bcount = min(bufblks, (start_blk + nbblks - i)); 470 471 error = xlog_bread(log, i, bcount, bp, &buf); 472 if (error) 473 goto out; 474 475 for (j = 0; j < bcount; j++) { 476 cycle = xlog_get_cycle(buf); 477 if (cycle == stop_on_cycle_no) { 478 *new_blk = i+j; 479 goto out; 480 } 481 482 buf += BBSIZE; 483 } 484 } 485 486 *new_blk = -1; 487 488 out: 489 xlog_put_bp(bp); 490 return error; 491 } 492 493 /* 494 * Potentially backup over partial log record write. 495 * 496 * In the typical case, last_blk is the number of the block directly after 497 * a good log record. Therefore, we subtract one to get the block number 498 * of the last block in the given buffer. extra_bblks contains the number 499 * of blocks we would have read on a previous read. This happens when the 500 * last log record is split over the end of the physical log. 
501 * 502 * extra_bblks is the number of blocks potentially verified on a previous 503 * call to this routine. 504 */ 505 STATIC int 506 xlog_find_verify_log_record( 507 struct xlog *log, 508 xfs_daddr_t start_blk, 509 xfs_daddr_t *last_blk, 510 int extra_bblks) 511 { 512 xfs_daddr_t i; 513 xfs_buf_t *bp; 514 xfs_caddr_t offset = NULL; 515 xlog_rec_header_t *head = NULL; 516 int error = 0; 517 int smallmem = 0; 518 int num_blks = *last_blk - start_blk; 519 int xhdrs; 520 521 ASSERT(start_blk != 0 || *last_blk != start_blk); 522 523 if (!(bp = xlog_get_bp(log, num_blks))) { 524 if (!(bp = xlog_get_bp(log, 1))) 525 return -ENOMEM; 526 smallmem = 1; 527 } else { 528 error = xlog_bread(log, start_blk, num_blks, bp, &offset); 529 if (error) 530 goto out; 531 offset += ((num_blks - 1) << BBSHIFT); 532 } 533 534 for (i = (*last_blk) - 1; i >= 0; i--) { 535 if (i < start_blk) { 536 /* valid log record not found */ 537 xfs_warn(log->l_mp, 538 "Log inconsistent (didn't find previous header)"); 539 ASSERT(0); 540 error = -EIO; 541 goto out; 542 } 543 544 if (smallmem) { 545 error = xlog_bread(log, i, 1, bp, &offset); 546 if (error) 547 goto out; 548 } 549 550 head = (xlog_rec_header_t *)offset; 551 552 if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) 553 break; 554 555 if (!smallmem) 556 offset -= BBSIZE; 557 } 558 559 /* 560 * We hit the beginning of the physical log & still no header. Return 561 * to caller. If caller can handle a return of -1, then this routine 562 * will be called again for the end of the physical log. 563 */ 564 if (i == -1) { 565 error = 1; 566 goto out; 567 } 568 569 /* 570 * We have the final block of the good log (the first block 571 * of the log record _before_ the head. So we check the uuid. 572 */ 573 if ((error = xlog_header_check_mount(log->l_mp, head))) 574 goto out; 575 576 /* 577 * We may have found a log record header before we expected one. 578 * last_blk will be the 1st block # with a given cycle #. We may end 579 * up reading an entire log record. In this case, we don't want to 580 * reset last_blk. Only when last_blk points in the middle of a log 581 * record do we update last_blk. 582 */ 583 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 584 uint h_size = be32_to_cpu(head->h_size); 585 586 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE; 587 if (h_size % XLOG_HEADER_CYCLE_SIZE) 588 xhdrs++; 589 } else { 590 xhdrs = 1; 591 } 592 593 if (*last_blk - i + extra_bblks != 594 BTOBB(be32_to_cpu(head->h_len)) + xhdrs) 595 *last_blk = i; 596 597 out: 598 xlog_put_bp(bp); 599 return error; 600 } 601 602 /* 603 * Head is defined to be the point of the log where the next log write 604 * could go. This means that incomplete LR writes at the end are 605 * eliminated when calculating the head. We aren't guaranteed that previous 606 * LR have complete transactions. We only know that a cycle number of 607 * current cycle number -1 won't be present in the log if we start writing 608 * from our current block number. 609 * 610 * last_blk contains the block number of the first block with a given 611 * cycle number. 612 * 613 * Return: zero if normal, non-zero if error. 614 */ 615 STATIC int 616 xlog_find_head( 617 struct xlog *log, 618 xfs_daddr_t *return_head_blk) 619 { 620 xfs_buf_t *bp; 621 xfs_caddr_t offset; 622 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; 623 int num_scan_bblks; 624 uint first_half_cycle, last_half_cycle; 625 uint stop_on_cycle; 626 int error, log_bbnum = log->l_logBBsize; 627 628 /* Is the end of the log device zeroed? 
	 */
	error = xlog_find_zeroed(log, &first_blk);
	if (error < 0) {
		xfs_warn(log->l_mp, "empty log check failed");
		return error;
	}
	if (error == 1) {
		*return_head_blk = first_blk;

		/* Is the whole lot zeroed? */
		if (!first_blk) {
			/* Linux XFS shouldn't generate totally zeroed logs -
			 * mkfs etc write a dummy unmount record to a fresh
			 * log so we can store the uuid in there
			 */
			xfs_warn(log->l_mp, "totally zeroed log");
		}

		return 0;
	}

	first_blk = 0;			/* get cycle # of 1st block */
	bp = xlog_get_bp(log, 1);
	if (!bp)
		return -ENOMEM;

	error = xlog_bread(log, 0, 1, bp, &offset);
	if (error)
		goto bp_err;

	first_half_cycle = xlog_get_cycle(offset);

	last_blk = head_blk = log_bbnum - 1;	/* get cycle # of last block */
	error = xlog_bread(log, last_blk, 1, bp, &offset);
	if (error)
		goto bp_err;

	last_half_cycle = xlog_get_cycle(offset);
	ASSERT(last_half_cycle != 0);

	/*
	 * If the 1st half cycle number is equal to the last half cycle number,
	 * then the entire log is stamped with the same cycle number. In this
	 * case, head_blk can't be set to zero (which makes sense). The below
	 * math doesn't work out properly with head_blk equal to zero. Instead,
	 * we set it to log_bbnum which is an invalid block number, but this
	 * value makes the math correct. If head_blk doesn't change through
	 * all the tests below, *head_blk is set to zero at the very end rather
	 * than log_bbnum. In a sense, log_bbnum and zero are the same block
	 * in a circular file.
	 */
	if (first_half_cycle == last_half_cycle) {
		/*
		 * In this case we believe that the entire log should have
		 * cycle number last_half_cycle. We need to scan backwards
		 * from the end verifying that there are no holes still
		 * containing last_half_cycle - 1. If we find such a hole,
		 * then the start of that hole will be the new head. The
		 * simple case looks like
		 *        x | x ... | x - 1 | x
		 * Another case that fits this picture would be
		 *        x | x + 1 | x ... | x
		 * In this case the head really is somewhere at the end of the
		 * log, as one of the latest writes at the beginning was
		 * incomplete.
		 * One more case is
		 *        x | x + 1 | x ... | x - 1 | x
		 * This is really the combination of the above two cases, and
		 * the head has to end up at the start of the x-1 hole at the
		 * end of the log.
		 *
		 * In the 256k log case, we will read from the beginning to the
		 * end of the log and search for cycle numbers equal to x-1.
		 * We don't worry about the x+1 blocks that we encounter,
		 * because we know that they cannot be the head since the log
		 * started with x.
		 */
		head_blk = log_bbnum;
		stop_on_cycle = last_half_cycle - 1;
	} else {
		/*
		 * In this case we want to find the first block with cycle
		 * number matching last_half_cycle. We expect the log to be
		 * some variation on
		 *        x + 1 ... | x ... | x
		 * The first block with cycle number x (last_half_cycle) will
		 * be where the new head belongs. First we do a binary search
		 * for the first occurrence of last_half_cycle. The binary
		 * search may not be totally accurate, so then we scan back
		 * from there looking for occurrences of last_half_cycle before
		 * us. If that backwards scan wraps around the beginning of
		 * the log, then we look for occurrences of last_half_cycle - 1
		 * at the end of the log.
The cases we're looking for look 721 * like 722 * v binary search stopped here 723 * x + 1 ... | x | x + 1 | x ... | x 724 * ^ but we want to locate this spot 725 * or 726 * <---------> less than scan distance 727 * x + 1 ... | x ... | x - 1 | x 728 * ^ we want to locate this spot 729 */ 730 stop_on_cycle = last_half_cycle; 731 if ((error = xlog_find_cycle_start(log, bp, first_blk, 732 &head_blk, last_half_cycle))) 733 goto bp_err; 734 } 735 736 /* 737 * Now validate the answer. Scan back some number of maximum possible 738 * blocks and make sure each one has the expected cycle number. The 739 * maximum is determined by the total possible amount of buffering 740 * in the in-core log. The following number can be made tighter if 741 * we actually look at the block size of the filesystem. 742 */ 743 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); 744 if (head_blk >= num_scan_bblks) { 745 /* 746 * We are guaranteed that the entire check can be performed 747 * in one buffer. 748 */ 749 start_blk = head_blk - num_scan_bblks; 750 if ((error = xlog_find_verify_cycle(log, 751 start_blk, num_scan_bblks, 752 stop_on_cycle, &new_blk))) 753 goto bp_err; 754 if (new_blk != -1) 755 head_blk = new_blk; 756 } else { /* need to read 2 parts of log */ 757 /* 758 * We are going to scan backwards in the log in two parts. 759 * First we scan the physical end of the log. In this part 760 * of the log, we are looking for blocks with cycle number 761 * last_half_cycle - 1. 762 * If we find one, then we know that the log starts there, as 763 * we've found a hole that didn't get written in going around 764 * the end of the physical log. The simple case for this is 765 * x + 1 ... | x ... | x - 1 | x 766 * <---------> less than scan distance 767 * If all of the blocks at the end of the log have cycle number 768 * last_half_cycle, then we check the blocks at the start of 769 * the log looking for occurrences of last_half_cycle. If we 770 * find one, then our current estimate for the location of the 771 * first occurrence of last_half_cycle is wrong and we move 772 * back to the hole we've found. This case looks like 773 * x + 1 ... | x | x + 1 | x ... 774 * ^ binary search stopped here 775 * Another case we need to handle that only occurs in 256k 776 * logs is 777 * x + 1 ... | x ... | x+1 | x ... 778 * ^ binary search stops here 779 * In a 256k log, the scan at the end of the log will see the 780 * x + 1 blocks. We need to skip past those since that is 781 * certainly not the head of the log. By searching for 782 * last_half_cycle-1 we accomplish that. 783 */ 784 ASSERT(head_blk <= INT_MAX && 785 (xfs_daddr_t) num_scan_bblks >= head_blk); 786 start_blk = log_bbnum - (num_scan_bblks - head_blk); 787 if ((error = xlog_find_verify_cycle(log, start_blk, 788 num_scan_bblks - (int)head_blk, 789 (stop_on_cycle - 1), &new_blk))) 790 goto bp_err; 791 if (new_blk != -1) { 792 head_blk = new_blk; 793 goto validate_head; 794 } 795 796 /* 797 * Scan beginning of log now. The last part of the physical 798 * log is good. This scan needs to verify that it doesn't find 799 * the last_half_cycle. 800 */ 801 start_blk = 0; 802 ASSERT(head_blk <= INT_MAX); 803 if ((error = xlog_find_verify_cycle(log, 804 start_blk, (int)head_blk, 805 stop_on_cycle, &new_blk))) 806 goto bp_err; 807 if (new_blk != -1) 808 head_blk = new_blk; 809 } 810 811 validate_head: 812 /* 813 * Now we need to make sure head_blk is not pointing to a block in 814 * the middle of a log record. 
815 */ 816 num_scan_bblks = XLOG_REC_SHIFT(log); 817 if (head_blk >= num_scan_bblks) { 818 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */ 819 820 /* start ptr at last block ptr before head_blk */ 821 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0); 822 if (error == 1) 823 error = -EIO; 824 if (error) 825 goto bp_err; 826 } else { 827 start_blk = 0; 828 ASSERT(head_blk <= INT_MAX); 829 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0); 830 if (error < 0) 831 goto bp_err; 832 if (error == 1) { 833 /* We hit the beginning of the log during our search */ 834 start_blk = log_bbnum - (num_scan_bblks - head_blk); 835 new_blk = log_bbnum; 836 ASSERT(start_blk <= INT_MAX && 837 (xfs_daddr_t) log_bbnum-start_blk >= 0); 838 ASSERT(head_blk <= INT_MAX); 839 error = xlog_find_verify_log_record(log, start_blk, 840 &new_blk, (int)head_blk); 841 if (error == 1) 842 error = -EIO; 843 if (error) 844 goto bp_err; 845 if (new_blk != log_bbnum) 846 head_blk = new_blk; 847 } else if (error) 848 goto bp_err; 849 } 850 851 xlog_put_bp(bp); 852 if (head_blk == log_bbnum) 853 *return_head_blk = 0; 854 else 855 *return_head_blk = head_blk; 856 /* 857 * When returning here, we have a good block number. Bad block 858 * means that during a previous crash, we didn't have a clean break 859 * from cycle number N to cycle number N-1. In this case, we need 860 * to find the first block with cycle number N-1. 861 */ 862 return 0; 863 864 bp_err: 865 xlog_put_bp(bp); 866 867 if (error) 868 xfs_warn(log->l_mp, "failed to find log head"); 869 return error; 870 } 871 872 /* 873 * Find the sync block number or the tail of the log. 874 * 875 * This will be the block number of the last record to have its 876 * associated buffers synced to disk. Every log record header has 877 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy 878 * to get a sync block number. The only concern is to figure out which 879 * log record header to believe. 880 * 881 * The following algorithm uses the log record header with the largest 882 * lsn. The entire log record does not need to be valid. We only care 883 * that the header is valid. 884 * 885 * We could speed up search by using current head_blk buffer, but it is not 886 * available. 
887 */ 888 STATIC int 889 xlog_find_tail( 890 struct xlog *log, 891 xfs_daddr_t *head_blk, 892 xfs_daddr_t *tail_blk) 893 { 894 xlog_rec_header_t *rhead; 895 xlog_op_header_t *op_head; 896 xfs_caddr_t offset = NULL; 897 xfs_buf_t *bp; 898 int error, i, found; 899 xfs_daddr_t umount_data_blk; 900 xfs_daddr_t after_umount_blk; 901 xfs_lsn_t tail_lsn; 902 int hblks; 903 904 found = 0; 905 906 /* 907 * Find previous log record 908 */ 909 if ((error = xlog_find_head(log, head_blk))) 910 return error; 911 912 bp = xlog_get_bp(log, 1); 913 if (!bp) 914 return -ENOMEM; 915 if (*head_blk == 0) { /* special case */ 916 error = xlog_bread(log, 0, 1, bp, &offset); 917 if (error) 918 goto done; 919 920 if (xlog_get_cycle(offset) == 0) { 921 *tail_blk = 0; 922 /* leave all other log inited values alone */ 923 goto done; 924 } 925 } 926 927 /* 928 * Search backwards looking for log record header block 929 */ 930 ASSERT(*head_blk < INT_MAX); 931 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 932 error = xlog_bread(log, i, 1, bp, &offset); 933 if (error) 934 goto done; 935 936 if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { 937 found = 1; 938 break; 939 } 940 } 941 /* 942 * If we haven't found the log record header block, start looking 943 * again from the end of the physical log. XXXmiken: There should be 944 * a check here to make sure we didn't search more than N blocks in 945 * the previous code. 946 */ 947 if (!found) { 948 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 949 error = xlog_bread(log, i, 1, bp, &offset); 950 if (error) 951 goto done; 952 953 if (*(__be32 *)offset == 954 cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { 955 found = 2; 956 break; 957 } 958 } 959 } 960 if (!found) { 961 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); 962 xlog_put_bp(bp); 963 ASSERT(0); 964 return -EIO; 965 } 966 967 /* find blk_no of tail of log */ 968 rhead = (xlog_rec_header_t *)offset; 969 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); 970 971 /* 972 * Reset log values according to the state of the log when we 973 * crashed. In the case where head_blk == 0, we bump curr_cycle 974 * one because the next write starts a new cycle rather than 975 * continuing the cycle of the last good log record. At this 976 * point we have guaranteed that all partial log records have been 977 * accounted for. Therefore, we know that the last good log record 978 * written was complete and ended exactly on the end boundary 979 * of the physical log. 980 */ 981 log->l_prev_block = i; 982 log->l_curr_block = (int)*head_blk; 983 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 984 if (found == 2) 985 log->l_curr_cycle++; 986 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); 987 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); 988 xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, 989 BBTOB(log->l_curr_block)); 990 xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, 991 BBTOB(log->l_curr_block)); 992 993 /* 994 * Look for unmount record. If we find it, then we know there 995 * was a clean unmount. Since 'i' could be the last block in 996 * the physical log, we convert to a log block before comparing 997 * to the head_blk. 998 * 999 * Save the current tail lsn to use to pass to 1000 * xlog_clear_stale_blocks() below. We won't want to clear the 1001 * unmount record if there is one, so we pass the lsn of the 1002 * unmount record rather than the block after it. 
1003 */ 1004 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 1005 int h_size = be32_to_cpu(rhead->h_size); 1006 int h_version = be32_to_cpu(rhead->h_version); 1007 1008 if ((h_version & XLOG_VERSION_2) && 1009 (h_size > XLOG_HEADER_CYCLE_SIZE)) { 1010 hblks = h_size / XLOG_HEADER_CYCLE_SIZE; 1011 if (h_size % XLOG_HEADER_CYCLE_SIZE) 1012 hblks++; 1013 } else { 1014 hblks = 1; 1015 } 1016 } else { 1017 hblks = 1; 1018 } 1019 after_umount_blk = (i + hblks + (int) 1020 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; 1021 tail_lsn = atomic64_read(&log->l_tail_lsn); 1022 if (*head_blk == after_umount_blk && 1023 be32_to_cpu(rhead->h_num_logops) == 1) { 1024 umount_data_blk = (i + hblks) % log->l_logBBsize; 1025 error = xlog_bread(log, umount_data_blk, 1, bp, &offset); 1026 if (error) 1027 goto done; 1028 1029 op_head = (xlog_op_header_t *)offset; 1030 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 1031 /* 1032 * Set tail and last sync so that newly written 1033 * log records will point recovery to after the 1034 * current unmount record. 1035 */ 1036 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1037 log->l_curr_cycle, after_umount_blk); 1038 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1039 log->l_curr_cycle, after_umount_blk); 1040 *tail_blk = after_umount_blk; 1041 1042 /* 1043 * Note that the unmount was clean. If the unmount 1044 * was not clean, we need to know this to rebuild the 1045 * superblock counters from the perag headers if we 1046 * have a filesystem using non-persistent counters. 1047 */ 1048 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; 1049 } 1050 } 1051 1052 /* 1053 * Make sure that there are no blocks in front of the head 1054 * with the same cycle number as the head. This can happen 1055 * because we allow multiple outstanding log writes concurrently, 1056 * and the later writes might make it out before earlier ones. 1057 * 1058 * We use the lsn from before modifying it so that we'll never 1059 * overwrite the unmount record after a clean unmount. 1060 * 1061 * Do this only if we are going to recover the filesystem 1062 * 1063 * NOTE: This used to say "if (!readonly)" 1064 * However on Linux, we can & do recover a read-only filesystem. 1065 * We only skip recovery if NORECOVERY is specified on mount, 1066 * in which case we would not be here. 1067 * 1068 * But... if the -device- itself is readonly, just skip this. 1069 * We can't recover this device anyway, so it won't matter. 1070 */ 1071 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) 1072 error = xlog_clear_stale_blocks(log, tail_lsn); 1073 1074 done: 1075 xlog_put_bp(bp); 1076 1077 if (error) 1078 xfs_warn(log->l_mp, "failed to locate log tail"); 1079 return error; 1080 } 1081 1082 /* 1083 * Is the log zeroed at all? 1084 * 1085 * The last binary search should be changed to perform an X block read 1086 * once X becomes small enough. You can then search linearly through 1087 * the X blocks. This will cut down on the number of reads we need to do. 1088 * 1089 * If the log is partially zeroed, this routine will pass back the blkno 1090 * of the first block with cycle number 0. It won't have a complete LR 1091 * preceding it. 
1092 * 1093 * Return: 1094 * 0 => the log is completely written to 1095 * 1 => use *blk_no as the first block of the log 1096 * <0 => error has occurred 1097 */ 1098 STATIC int 1099 xlog_find_zeroed( 1100 struct xlog *log, 1101 xfs_daddr_t *blk_no) 1102 { 1103 xfs_buf_t *bp; 1104 xfs_caddr_t offset; 1105 uint first_cycle, last_cycle; 1106 xfs_daddr_t new_blk, last_blk, start_blk; 1107 xfs_daddr_t num_scan_bblks; 1108 int error, log_bbnum = log->l_logBBsize; 1109 1110 *blk_no = 0; 1111 1112 /* check totally zeroed log */ 1113 bp = xlog_get_bp(log, 1); 1114 if (!bp) 1115 return -ENOMEM; 1116 error = xlog_bread(log, 0, 1, bp, &offset); 1117 if (error) 1118 goto bp_err; 1119 1120 first_cycle = xlog_get_cycle(offset); 1121 if (first_cycle == 0) { /* completely zeroed log */ 1122 *blk_no = 0; 1123 xlog_put_bp(bp); 1124 return 1; 1125 } 1126 1127 /* check partially zeroed log */ 1128 error = xlog_bread(log, log_bbnum-1, 1, bp, &offset); 1129 if (error) 1130 goto bp_err; 1131 1132 last_cycle = xlog_get_cycle(offset); 1133 if (last_cycle != 0) { /* log completely written to */ 1134 xlog_put_bp(bp); 1135 return 0; 1136 } else if (first_cycle != 1) { 1137 /* 1138 * If the cycle of the last block is zero, the cycle of 1139 * the first block must be 1. If it's not, maybe we're 1140 * not looking at a log... Bail out. 1141 */ 1142 xfs_warn(log->l_mp, 1143 "Log inconsistent or not a log (last==0, first!=1)"); 1144 error = -EINVAL; 1145 goto bp_err; 1146 } 1147 1148 /* we have a partially zeroed log */ 1149 last_blk = log_bbnum-1; 1150 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0))) 1151 goto bp_err; 1152 1153 /* 1154 * Validate the answer. Because there is no way to guarantee that 1155 * the entire log is made up of log records which are the same size, 1156 * we scan over the defined maximum blocks. At this point, the maximum 1157 * is not chosen to mean anything special. XXXmiken 1158 */ 1159 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); 1160 ASSERT(num_scan_bblks <= INT_MAX); 1161 1162 if (last_blk < num_scan_bblks) 1163 num_scan_bblks = last_blk; 1164 start_blk = last_blk - num_scan_bblks; 1165 1166 /* 1167 * We search for any instances of cycle number 0 that occur before 1168 * our current estimate of the head. What we're trying to detect is 1169 * 1 ... | 0 | 1 | 0... 1170 * ^ binary search ends here 1171 */ 1172 if ((error = xlog_find_verify_cycle(log, start_blk, 1173 (int)num_scan_bblks, 0, &new_blk))) 1174 goto bp_err; 1175 if (new_blk != -1) 1176 last_blk = new_blk; 1177 1178 /* 1179 * Potentially backup over partial log record write. We don't need 1180 * to search the end of the log because we know it is zero. 1181 */ 1182 error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0); 1183 if (error == 1) 1184 error = -EIO; 1185 if (error) 1186 goto bp_err; 1187 1188 *blk_no = last_blk; 1189 bp_err: 1190 xlog_put_bp(bp); 1191 if (error) 1192 return error; 1193 return 1; 1194 } 1195 1196 /* 1197 * These are simple subroutines used by xlog_clear_stale_blocks() below 1198 * to initialize a buffer full of empty log record headers and write 1199 * them into the log. 
1200 */ 1201 STATIC void 1202 xlog_add_record( 1203 struct xlog *log, 1204 xfs_caddr_t buf, 1205 int cycle, 1206 int block, 1207 int tail_cycle, 1208 int tail_block) 1209 { 1210 xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; 1211 1212 memset(buf, 0, BBSIZE); 1213 recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); 1214 recp->h_cycle = cpu_to_be32(cycle); 1215 recp->h_version = cpu_to_be32( 1216 xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); 1217 recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block)); 1218 recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block)); 1219 recp->h_fmt = cpu_to_be32(XLOG_FMT); 1220 memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t)); 1221 } 1222 1223 STATIC int 1224 xlog_write_log_records( 1225 struct xlog *log, 1226 int cycle, 1227 int start_block, 1228 int blocks, 1229 int tail_cycle, 1230 int tail_block) 1231 { 1232 xfs_caddr_t offset; 1233 xfs_buf_t *bp; 1234 int balign, ealign; 1235 int sectbb = log->l_sectBBsize; 1236 int end_block = start_block + blocks; 1237 int bufblks; 1238 int error = 0; 1239 int i, j = 0; 1240 1241 /* 1242 * Greedily allocate a buffer big enough to handle the full 1243 * range of basic blocks to be written. If that fails, try 1244 * a smaller size. We need to be able to write at least a 1245 * log sector, or we're out of luck. 1246 */ 1247 bufblks = 1 << ffs(blocks); 1248 while (bufblks > log->l_logBBsize) 1249 bufblks >>= 1; 1250 while (!(bp = xlog_get_bp(log, bufblks))) { 1251 bufblks >>= 1; 1252 if (bufblks < sectbb) 1253 return -ENOMEM; 1254 } 1255 1256 /* We may need to do a read at the start to fill in part of 1257 * the buffer in the starting sector not covered by the first 1258 * write below. 1259 */ 1260 balign = round_down(start_block, sectbb); 1261 if (balign != start_block) { 1262 error = xlog_bread_noalign(log, start_block, 1, bp); 1263 if (error) 1264 goto out_put_bp; 1265 1266 j = start_block - balign; 1267 } 1268 1269 for (i = start_block; i < end_block; i += bufblks) { 1270 int bcount, endcount; 1271 1272 bcount = min(bufblks, end_block - start_block); 1273 endcount = bcount - j; 1274 1275 /* We may need to do a read at the end to fill in part of 1276 * the buffer in the final sector not covered by the write. 1277 * If this is the same sector as the above read, skip it. 1278 */ 1279 ealign = round_down(end_block, sectbb); 1280 if (j == 0 && (start_block + endcount > ealign)) { 1281 offset = bp->b_addr + BBTOB(ealign - start_block); 1282 error = xlog_bread_offset(log, ealign, sectbb, 1283 bp, offset); 1284 if (error) 1285 break; 1286 1287 } 1288 1289 offset = xlog_align(log, start_block, endcount, bp); 1290 for (; j < endcount; j++) { 1291 xlog_add_record(log, offset, cycle, i+j, 1292 tail_cycle, tail_block); 1293 offset += BBSIZE; 1294 } 1295 error = xlog_bwrite(log, start_block, endcount, bp); 1296 if (error) 1297 break; 1298 start_block += endcount; 1299 j = 0; 1300 } 1301 1302 out_put_bp: 1303 xlog_put_bp(bp); 1304 return error; 1305 } 1306 1307 /* 1308 * This routine is called to blow away any incomplete log writes out 1309 * in front of the log head. We do this so that we won't become confused 1310 * if we come up, write only a little bit more, and then crash again. 1311 * If we leave the partial log records out there, this situation could 1312 * cause us to think those partial writes are valid blocks since they 1313 * have the current cycle number. 
We get rid of them by overwriting them 1314 * with empty log records with the old cycle number rather than the 1315 * current one. 1316 * 1317 * The tail lsn is passed in rather than taken from 1318 * the log so that we will not write over the unmount record after a 1319 * clean unmount in a 512 block log. Doing so would leave the log without 1320 * any valid log records in it until a new one was written. If we crashed 1321 * during that time we would not be able to recover. 1322 */ 1323 STATIC int 1324 xlog_clear_stale_blocks( 1325 struct xlog *log, 1326 xfs_lsn_t tail_lsn) 1327 { 1328 int tail_cycle, head_cycle; 1329 int tail_block, head_block; 1330 int tail_distance, max_distance; 1331 int distance; 1332 int error; 1333 1334 tail_cycle = CYCLE_LSN(tail_lsn); 1335 tail_block = BLOCK_LSN(tail_lsn); 1336 head_cycle = log->l_curr_cycle; 1337 head_block = log->l_curr_block; 1338 1339 /* 1340 * Figure out the distance between the new head of the log 1341 * and the tail. We want to write over any blocks beyond the 1342 * head that we may have written just before the crash, but 1343 * we don't want to overwrite the tail of the log. 1344 */ 1345 if (head_cycle == tail_cycle) { 1346 /* 1347 * The tail is behind the head in the physical log, 1348 * so the distance from the head to the tail is the 1349 * distance from the head to the end of the log plus 1350 * the distance from the beginning of the log to the 1351 * tail. 1352 */ 1353 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) { 1354 XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)", 1355 XFS_ERRLEVEL_LOW, log->l_mp); 1356 return -EFSCORRUPTED; 1357 } 1358 tail_distance = tail_block + (log->l_logBBsize - head_block); 1359 } else { 1360 /* 1361 * The head is behind the tail in the physical log, 1362 * so the distance from the head to the tail is just 1363 * the tail block minus the head block. 1364 */ 1365 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){ 1366 XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)", 1367 XFS_ERRLEVEL_LOW, log->l_mp); 1368 return -EFSCORRUPTED; 1369 } 1370 tail_distance = tail_block - head_block; 1371 } 1372 1373 /* 1374 * If the head is right up against the tail, we can't clear 1375 * anything. 1376 */ 1377 if (tail_distance <= 0) { 1378 ASSERT(tail_distance == 0); 1379 return 0; 1380 } 1381 1382 max_distance = XLOG_TOTAL_REC_SHIFT(log); 1383 /* 1384 * Take the smaller of the maximum amount of outstanding I/O 1385 * we could have and the distance to the tail to clear out. 1386 * We take the smaller so that we don't overwrite the tail and 1387 * we don't waste all day writing from the head to the tail 1388 * for no reason. 1389 */ 1390 max_distance = MIN(max_distance, tail_distance); 1391 1392 if ((head_block + max_distance) <= log->l_logBBsize) { 1393 /* 1394 * We can stomp all the blocks we need to without 1395 * wrapping around the end of the log. Just do it 1396 * in a single write. Use the cycle number of the 1397 * current cycle minus one so that the log will look like: 1398 * n ... | n - 1 ... 1399 */ 1400 error = xlog_write_log_records(log, (head_cycle - 1), 1401 head_block, max_distance, tail_cycle, 1402 tail_block); 1403 if (error) 1404 return error; 1405 } else { 1406 /* 1407 * We need to wrap around the end of the physical log in 1408 * order to clear all the blocks. Do it in two separate 1409 * I/Os. The first write should be from the head to the 1410 * end of the physical log, and it should use the current 1411 * cycle number minus one just like above. 
		 */
		distance = log->l_logBBsize - head_block;
		error = xlog_write_log_records(log, (head_cycle - 1),
				head_block, distance, tail_cycle,
				tail_block);

		if (error)
			return error;

		/*
		 * Now write the blocks at the start of the physical log.
		 * This writes the remainder of the blocks we want to clear.
		 * It uses the current cycle number since we're now on the
		 * same cycle as the head so that we get:
		 *    n ... n ... | n - 1 ...
		 *    ^^^^^ blocks we're writing
		 */
		distance = max_distance - (log->l_logBBsize - head_block);
		error = xlog_write_log_records(log, head_cycle, 0, distance,
				tail_cycle, tail_block);
		if (error)
			return error;
	}

	return 0;
}

/******************************************************************************
 *
 *		Log recover routines
 *
 ******************************************************************************
 */

/*
 * Sort the log items in the transaction.
 *
 * The ordering constraints are defined by the inode allocation and unlink
 * behaviour. The rules are:
 *
 *	1. Every item is only logged once in a given transaction. Hence it
 *	   represents the last logged state of the item. Hence ordering is
 *	   dependent on the order in which operations need to be performed so
 *	   required initial conditions are always met.
 *
 *	2. Cancelled buffers are recorded in pass 1 in a separate table and
 *	   there's nothing to replay from them so we can simply cull them
 *	   from the transaction. However, we can't do that until after we've
 *	   replayed all the other items because they may be dependent on the
 *	   cancelled buffer and replaying the cancelled buffer can remove it
 *	   from the cancelled buffer table. Hence they have to be done last.
 *
 *	3. Inode allocation buffers must be replayed before inode items that
 *	   read the buffer and replay changes into it. For filesystems using the
 *	   ICREATE transactions, this means XFS_LI_ICREATE objects need to get
 *	   treated the same as inode allocation buffers as they create and
 *	   initialise the buffers directly.
 *
 *	4. Inode unlink buffers must be replayed after inode items are replayed.
 *	   This ensures that inodes are completely flushed to the inode buffer
 *	   in a "free" state before we remove the unlinked inode list pointer.
 *
 * Hence the ordering needs to be inode allocation buffers first, inode items
 * second, inode unlink buffers third and cancelled buffers last.
 *
 * But there's a problem with that - we can't tell an inode allocation buffer
 * apart from a regular buffer, so we can't separate them. We can, however,
 * tell an inode unlink buffer from the others, and so we can separate them out
 * from all the other buffers and move them to last.
 *
 * Hence, 4 lists, in order from head to tail:
 *	- buffer_list for all buffers except cancelled/inode unlink buffers
 *	- item_list for all non-buffer items
 *	- inode_buffer_list for inode unlink buffers
 *	- cancel_list for the cancelled buffers
 *
 * Note that we add objects to the tail of the lists so that first-to-last
 * ordering is preserved within the lists. Adding objects to the head of the
 * list means when we traverse from the head we walk them in last-to-first
 * order.
For cancelled buffers and inode unlink buffers this doesn't matter, 1492 * but for all other items there may be specific ordering that we need to 1493 * preserve. 1494 */ 1495 STATIC int 1496 xlog_recover_reorder_trans( 1497 struct xlog *log, 1498 struct xlog_recover *trans, 1499 int pass) 1500 { 1501 xlog_recover_item_t *item, *n; 1502 int error = 0; 1503 LIST_HEAD(sort_list); 1504 LIST_HEAD(cancel_list); 1505 LIST_HEAD(buffer_list); 1506 LIST_HEAD(inode_buffer_list); 1507 LIST_HEAD(inode_list); 1508 1509 list_splice_init(&trans->r_itemq, &sort_list); 1510 list_for_each_entry_safe(item, n, &sort_list, ri_list) { 1511 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 1512 1513 switch (ITEM_TYPE(item)) { 1514 case XFS_LI_ICREATE: 1515 list_move_tail(&item->ri_list, &buffer_list); 1516 break; 1517 case XFS_LI_BUF: 1518 if (buf_f->blf_flags & XFS_BLF_CANCEL) { 1519 trace_xfs_log_recover_item_reorder_head(log, 1520 trans, item, pass); 1521 list_move(&item->ri_list, &cancel_list); 1522 break; 1523 } 1524 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { 1525 list_move(&item->ri_list, &inode_buffer_list); 1526 break; 1527 } 1528 list_move_tail(&item->ri_list, &buffer_list); 1529 break; 1530 case XFS_LI_INODE: 1531 case XFS_LI_DQUOT: 1532 case XFS_LI_QUOTAOFF: 1533 case XFS_LI_EFD: 1534 case XFS_LI_EFI: 1535 trace_xfs_log_recover_item_reorder_tail(log, 1536 trans, item, pass); 1537 list_move_tail(&item->ri_list, &inode_list); 1538 break; 1539 default: 1540 xfs_warn(log->l_mp, 1541 "%s: unrecognized type of log operation", 1542 __func__); 1543 ASSERT(0); 1544 /* 1545 * return the remaining items back to the transaction 1546 * item list so they can be freed in caller. 1547 */ 1548 if (!list_empty(&sort_list)) 1549 list_splice_init(&sort_list, &trans->r_itemq); 1550 error = -EIO; 1551 goto out; 1552 } 1553 } 1554 out: 1555 ASSERT(list_empty(&sort_list)); 1556 if (!list_empty(&buffer_list)) 1557 list_splice(&buffer_list, &trans->r_itemq); 1558 if (!list_empty(&inode_list)) 1559 list_splice_tail(&inode_list, &trans->r_itemq); 1560 if (!list_empty(&inode_buffer_list)) 1561 list_splice_tail(&inode_buffer_list, &trans->r_itemq); 1562 if (!list_empty(&cancel_list)) 1563 list_splice_tail(&cancel_list, &trans->r_itemq); 1564 return error; 1565 } 1566 1567 /* 1568 * Build up the table of buf cancel records so that we don't replay 1569 * cancelled data in the second pass. For buffer records that are 1570 * not cancel records, there is nothing to do here so we just return. 1571 * 1572 * If we get a cancel record which is already in the table, this indicates 1573 * that the buffer was cancelled multiple times. In order to ensure 1574 * that during pass 2 we keep the record in the table until we reach its 1575 * last occurrence in the log, we keep a reference count in the cancel 1576 * record in the table to tell us how many times we expect to see this 1577 * record during the second pass. 1578 */ 1579 STATIC int 1580 xlog_recover_buffer_pass1( 1581 struct xlog *log, 1582 struct xlog_recover_item *item) 1583 { 1584 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 1585 struct list_head *bucket; 1586 struct xfs_buf_cancel *bcp; 1587 1588 /* 1589 * If this isn't a cancel buffer item, then just return. 1590 */ 1591 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { 1592 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1593 return 0; 1594 } 1595 1596 /* 1597 * Insert an xfs_buf_cancel record into the hash table of them. 1598 * If there is already an identical record, bump its reference count. 
	 */
	bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
	list_for_each_entry(bcp, bucket, bc_list) {
		if (bcp->bc_blkno == buf_f->blf_blkno &&
		    bcp->bc_len == buf_f->blf_len) {
			bcp->bc_refcount++;
			trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
			return 0;
		}
	}

	bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
	bcp->bc_blkno = buf_f->blf_blkno;
	bcp->bc_len = buf_f->blf_len;
	bcp->bc_refcount = 1;
	list_add_tail(&bcp->bc_list, bucket);

	trace_xfs_log_recover_buf_cancel_add(log, buf_f);
	return 0;
}

/*
 * Check to see whether the buffer being recovered has a corresponding
 * entry in the buffer cancel record table. If it does, return the cancel
 * buffer structure to the caller.
 */
STATIC struct xfs_buf_cancel *
xlog_peek_buffer_cancelled(
	struct xlog	*log,
	xfs_daddr_t	blkno,
	uint		len,
	ushort		flags)
{
	struct list_head	*bucket;
	struct xfs_buf_cancel	*bcp;

	if (!log->l_buf_cancel_table) {
		/* empty table means no cancelled buffers in the log */
		ASSERT(!(flags & XFS_BLF_CANCEL));
		return NULL;
	}

	bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
	list_for_each_entry(bcp, bucket, bc_list) {
		if (bcp->bc_blkno == blkno && bcp->bc_len == len)
			return bcp;
	}

	/*
	 * We didn't find a corresponding entry in the table, so return 0 so
	 * that the buffer is NOT cancelled.
	 */
	ASSERT(!(flags & XFS_BLF_CANCEL));
	return NULL;
}

/*
 * If the buffer is being cancelled then return 1 so that it will be cancelled,
 * otherwise return 0. If the buffer is actually a buffer cancel item
 * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
 * table and remove it from the table if this is the last reference.
 *
 * We remove the cancel record from the table when we encounter its last
 * occurrence in the log so that if the same buffer is re-used again after its
 * last cancellation we actually replay the changes made at that point.
 */
STATIC int
xlog_check_buffer_cancelled(
	struct xlog	*log,
	xfs_daddr_t	blkno,
	uint		len,
	ushort		flags)
{
	struct xfs_buf_cancel	*bcp;

	bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
	if (!bcp)
		return 0;

	/*
	 * We've got a match, so return 1 so that the recovery of this buffer
	 * is cancelled. If this buffer is actually a buffer cancel log
	 * item, then decrement the refcount on the one in the table and
	 * remove it if this is the last reference.
	 */
	if (flags & XFS_BLF_CANCEL) {
		if (--bcp->bc_refcount == 0) {
			list_del(&bcp->bc_list);
			kmem_free(bcp);
		}
	}
	return 1;
}

/*
 * Perform recovery for a buffer full of inodes. In these buffers, the only
 * data which should be recovered is that which corresponds to the
 * di_next_unlinked pointers in the on disk inode structures. The rest of the
 * data for the inodes is always logged through the inodes themselves rather
 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
 *
 * The only time when buffers full of inodes are fully recovered is when the
 * buffer is full of newly allocated inodes.
In this case the buffer will 1702 * not be marked as an inode buffer and so will be sent to 1703 * xlog_recover_do_reg_buffer() below during recovery. 1704 */ 1705 STATIC int 1706 xlog_recover_do_inode_buffer( 1707 struct xfs_mount *mp, 1708 xlog_recover_item_t *item, 1709 struct xfs_buf *bp, 1710 xfs_buf_log_format_t *buf_f) 1711 { 1712 int i; 1713 int item_index = 0; 1714 int bit = 0; 1715 int nbits = 0; 1716 int reg_buf_offset = 0; 1717 int reg_buf_bytes = 0; 1718 int next_unlinked_offset; 1719 int inodes_per_buf; 1720 xfs_agino_t *logged_nextp; 1721 xfs_agino_t *buffer_nextp; 1722 1723 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 1724 1725 /* 1726 * Post recovery validation only works properly on CRC enabled 1727 * filesystems. 1728 */ 1729 if (xfs_sb_version_hascrc(&mp->m_sb)) 1730 bp->b_ops = &xfs_inode_buf_ops; 1731 1732 inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; 1733 for (i = 0; i < inodes_per_buf; i++) { 1734 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 1735 offsetof(xfs_dinode_t, di_next_unlinked); 1736 1737 while (next_unlinked_offset >= 1738 (reg_buf_offset + reg_buf_bytes)) { 1739 /* 1740 * The next di_next_unlinked field is beyond 1741 * the current logged region. Find the next 1742 * logged region that contains or is beyond 1743 * the current di_next_unlinked field. 1744 */ 1745 bit += nbits; 1746 bit = xfs_next_bit(buf_f->blf_data_map, 1747 buf_f->blf_map_size, bit); 1748 1749 /* 1750 * If there are no more logged regions in the 1751 * buffer, then we're done. 1752 */ 1753 if (bit == -1) 1754 return 0; 1755 1756 nbits = xfs_contig_bits(buf_f->blf_data_map, 1757 buf_f->blf_map_size, bit); 1758 ASSERT(nbits > 0); 1759 reg_buf_offset = bit << XFS_BLF_SHIFT; 1760 reg_buf_bytes = nbits << XFS_BLF_SHIFT; 1761 item_index++; 1762 } 1763 1764 /* 1765 * If the current logged region starts after the current 1766 * di_next_unlinked field, then move on to the next 1767 * di_next_unlinked field. 1768 */ 1769 if (next_unlinked_offset < reg_buf_offset) 1770 continue; 1771 1772 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1773 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); 1774 ASSERT((reg_buf_offset + reg_buf_bytes) <= 1775 BBTOB(bp->b_io_length)); 1776 1777 /* 1778 * The current logged region contains a copy of the 1779 * current di_next_unlinked field. Extract its value 1780 * and copy it to the buffer copy. 1781 */ 1782 logged_nextp = item->ri_buf[item_index].i_addr + 1783 next_unlinked_offset - reg_buf_offset; 1784 if (unlikely(*logged_nextp == 0)) { 1785 xfs_alert(mp, 1786 "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). " 1787 "Trying to replay bad (0) inode di_next_unlinked field.", 1788 item, bp); 1789 XFS_ERROR_REPORT("xlog_recover_do_inode_buf", 1790 XFS_ERRLEVEL_LOW, mp); 1791 return -EFSCORRUPTED; 1792 } 1793 1794 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, 1795 next_unlinked_offset); 1796 *buffer_nextp = *logged_nextp; 1797 1798 /* 1799 * If necessary, recalculate the CRC in the on-disk inode. We 1800 * have to leave the inode in a consistent state for whoever 1801 * reads it next.... 1802 */ 1803 xfs_dinode_calc_crc(mp, (struct xfs_dinode *) 1804 xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); 1805 1806 } 1807 1808 return 0; 1809 } 1810 1811 /* 1812 * V5 filesystems know the age of the buffer on disk being recovered. 
We can 1813 * have newer objects on disk than we are replaying, and so for these cases we 1814 * don't want to replay the current change as that will make the buffer contents 1815 * temporarily invalid on disk. 1816 * 1817 * The magic number might not match the buffer type we are going to recover 1818 * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence 1819 * extract the LSN of the existing object in the buffer based on it's current 1820 * magic number. If we don't recognise the magic number in the buffer, then 1821 * return a LSN of -1 so that the caller knows it was an unrecognised block and 1822 * so can recover the buffer. 1823 * 1824 * Note: we cannot rely solely on magic number matches to determine that the 1825 * buffer has a valid LSN - we also need to verify that it belongs to this 1826 * filesystem, so we need to extract the object's LSN and compare it to that 1827 * which we read from the superblock. If the UUIDs don't match, then we've got a 1828 * stale metadata block from an old filesystem instance that we need to recover 1829 * over the top of. 1830 */ 1831 static xfs_lsn_t 1832 xlog_recover_get_buf_lsn( 1833 struct xfs_mount *mp, 1834 struct xfs_buf *bp) 1835 { 1836 __uint32_t magic32; 1837 __uint16_t magic16; 1838 __uint16_t magicda; 1839 void *blk = bp->b_addr; 1840 uuid_t *uuid; 1841 xfs_lsn_t lsn = -1; 1842 1843 /* v4 filesystems always recover immediately */ 1844 if (!xfs_sb_version_hascrc(&mp->m_sb)) 1845 goto recover_immediately; 1846 1847 magic32 = be32_to_cpu(*(__be32 *)blk); 1848 switch (magic32) { 1849 case XFS_ABTB_CRC_MAGIC: 1850 case XFS_ABTC_CRC_MAGIC: 1851 case XFS_ABTB_MAGIC: 1852 case XFS_ABTC_MAGIC: 1853 case XFS_IBT_CRC_MAGIC: 1854 case XFS_IBT_MAGIC: { 1855 struct xfs_btree_block *btb = blk; 1856 1857 lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); 1858 uuid = &btb->bb_u.s.bb_uuid; 1859 break; 1860 } 1861 case XFS_BMAP_CRC_MAGIC: 1862 case XFS_BMAP_MAGIC: { 1863 struct xfs_btree_block *btb = blk; 1864 1865 lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); 1866 uuid = &btb->bb_u.l.bb_uuid; 1867 break; 1868 } 1869 case XFS_AGF_MAGIC: 1870 lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); 1871 uuid = &((struct xfs_agf *)blk)->agf_uuid; 1872 break; 1873 case XFS_AGFL_MAGIC: 1874 lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); 1875 uuid = &((struct xfs_agfl *)blk)->agfl_uuid; 1876 break; 1877 case XFS_AGI_MAGIC: 1878 lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); 1879 uuid = &((struct xfs_agi *)blk)->agi_uuid; 1880 break; 1881 case XFS_SYMLINK_MAGIC: 1882 lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); 1883 uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; 1884 break; 1885 case XFS_DIR3_BLOCK_MAGIC: 1886 case XFS_DIR3_DATA_MAGIC: 1887 case XFS_DIR3_FREE_MAGIC: 1888 lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); 1889 uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; 1890 break; 1891 case XFS_ATTR3_RMT_MAGIC: 1892 lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); 1893 uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid; 1894 break; 1895 case XFS_SB_MAGIC: 1896 lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); 1897 uuid = &((struct xfs_dsb *)blk)->sb_uuid; 1898 break; 1899 default: 1900 break; 1901 } 1902 1903 if (lsn != (xfs_lsn_t)-1) { 1904 if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) 1905 goto recover_immediately; 1906 return lsn; 1907 } 1908 1909 magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); 1910 switch (magicda) { 1911 case XFS_DIR3_LEAF1_MAGIC: 1912 case XFS_DIR3_LEAFN_MAGIC: 1913 case 
XFS_DA3_NODE_MAGIC: 1914 lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); 1915 uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; 1916 break; 1917 default: 1918 break; 1919 } 1920 1921 if (lsn != (xfs_lsn_t)-1) { 1922 if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) 1923 goto recover_immediately; 1924 return lsn; 1925 } 1926 1927 /* 1928 * We do individual object checks on dquot and inode buffers as they 1929 * have their own individual LSN records. Also, we could have a stale 1930 * buffer here, so we have to at least recognise these buffer types. 1931 * 1932 * A notable complexity here is inode unlinked list processing - it logs 1933 * the inode directly in the buffer, but we don't know which inodes have 1934 * been modified, and there is no global buffer LSN. Hence we need to 1935 * recover all inode buffer types immediately. This problem will be 1936 * fixed by logical logging of the unlinked list modifications. 1937 */ 1938 magic16 = be16_to_cpu(*(__be16 *)blk); 1939 switch (magic16) { 1940 case XFS_DQUOT_MAGIC: 1941 case XFS_DINODE_MAGIC: 1942 goto recover_immediately; 1943 default: 1944 break; 1945 } 1946 1947 /* unknown buffer contents, recover immediately */ 1948 1949 recover_immediately: 1950 return (xfs_lsn_t)-1; 1951 1952 } 1953 1954 /* 1955 * Validate that the recovered buffer is of the correct type and attach the 1956 * appropriate buffer operations to it for writeback. Magic numbers are in a 1957 * few places: 1958 * the first 16 bits of the buffer (inode buffer, dquot buffer), 1959 * the first 32 bits of the buffer (most blocks), 1960 * inside a struct xfs_da_blkinfo at the start of the buffer. 1961 */ 1962 static void 1963 xlog_recover_validate_buf_type( 1964 struct xfs_mount *mp, 1965 struct xfs_buf *bp, 1966 xfs_buf_log_format_t *buf_f) 1967 { 1968 struct xfs_da_blkinfo *info = bp->b_addr; 1969 __uint32_t magic32; 1970 __uint16_t magic16; 1971 __uint16_t magicda; 1972 1973 /* 1974 * We can only do post recovery validation on items on CRC enabled 1975 * filesystems as we need to know when the buffer was written to be able 1976 * to determine if we should have replayed the item. If we replay old 1977 * metadata over a newer buffer, then it will enter a temporarily 1978 * inconsistent state resulting in verification failures. Hence, for now,
Hence for now 1979 * just avoid the verification stage for non-crc filesystems 1980 */ 1981 if (!xfs_sb_version_hascrc(&mp->m_sb)) 1982 return; 1983 1984 magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); 1985 magic16 = be16_to_cpu(*(__be16*)bp->b_addr); 1986 magicda = be16_to_cpu(info->magic); 1987 switch (xfs_blft_from_flags(buf_f)) { 1988 case XFS_BLFT_BTREE_BUF: 1989 switch (magic32) { 1990 case XFS_ABTB_CRC_MAGIC: 1991 case XFS_ABTC_CRC_MAGIC: 1992 case XFS_ABTB_MAGIC: 1993 case XFS_ABTC_MAGIC: 1994 bp->b_ops = &xfs_allocbt_buf_ops; 1995 break; 1996 case XFS_IBT_CRC_MAGIC: 1997 case XFS_FIBT_CRC_MAGIC: 1998 case XFS_IBT_MAGIC: 1999 case XFS_FIBT_MAGIC: 2000 bp->b_ops = &xfs_inobt_buf_ops; 2001 break; 2002 case XFS_BMAP_CRC_MAGIC: 2003 case XFS_BMAP_MAGIC: 2004 bp->b_ops = &xfs_bmbt_buf_ops; 2005 break; 2006 default: 2007 xfs_warn(mp, "Bad btree block magic!"); 2008 ASSERT(0); 2009 break; 2010 } 2011 break; 2012 case XFS_BLFT_AGF_BUF: 2013 if (magic32 != XFS_AGF_MAGIC) { 2014 xfs_warn(mp, "Bad AGF block magic!"); 2015 ASSERT(0); 2016 break; 2017 } 2018 bp->b_ops = &xfs_agf_buf_ops; 2019 break; 2020 case XFS_BLFT_AGFL_BUF: 2021 if (magic32 != XFS_AGFL_MAGIC) { 2022 xfs_warn(mp, "Bad AGFL block magic!"); 2023 ASSERT(0); 2024 break; 2025 } 2026 bp->b_ops = &xfs_agfl_buf_ops; 2027 break; 2028 case XFS_BLFT_AGI_BUF: 2029 if (magic32 != XFS_AGI_MAGIC) { 2030 xfs_warn(mp, "Bad AGI block magic!"); 2031 ASSERT(0); 2032 break; 2033 } 2034 bp->b_ops = &xfs_agi_buf_ops; 2035 break; 2036 case XFS_BLFT_UDQUOT_BUF: 2037 case XFS_BLFT_PDQUOT_BUF: 2038 case XFS_BLFT_GDQUOT_BUF: 2039 #ifdef CONFIG_XFS_QUOTA 2040 if (magic16 != XFS_DQUOT_MAGIC) { 2041 xfs_warn(mp, "Bad DQUOT block magic!"); 2042 ASSERT(0); 2043 break; 2044 } 2045 bp->b_ops = &xfs_dquot_buf_ops; 2046 #else 2047 xfs_alert(mp, 2048 "Trying to recover dquots without QUOTA support built in!"); 2049 ASSERT(0); 2050 #endif 2051 break; 2052 case XFS_BLFT_DINO_BUF: 2053 if (magic16 != XFS_DINODE_MAGIC) { 2054 xfs_warn(mp, "Bad INODE block magic!"); 2055 ASSERT(0); 2056 break; 2057 } 2058 bp->b_ops = &xfs_inode_buf_ops; 2059 break; 2060 case XFS_BLFT_SYMLINK_BUF: 2061 if (magic32 != XFS_SYMLINK_MAGIC) { 2062 xfs_warn(mp, "Bad symlink block magic!"); 2063 ASSERT(0); 2064 break; 2065 } 2066 bp->b_ops = &xfs_symlink_buf_ops; 2067 break; 2068 case XFS_BLFT_DIR_BLOCK_BUF: 2069 if (magic32 != XFS_DIR2_BLOCK_MAGIC && 2070 magic32 != XFS_DIR3_BLOCK_MAGIC) { 2071 xfs_warn(mp, "Bad dir block magic!"); 2072 ASSERT(0); 2073 break; 2074 } 2075 bp->b_ops = &xfs_dir3_block_buf_ops; 2076 break; 2077 case XFS_BLFT_DIR_DATA_BUF: 2078 if (magic32 != XFS_DIR2_DATA_MAGIC && 2079 magic32 != XFS_DIR3_DATA_MAGIC) { 2080 xfs_warn(mp, "Bad dir data magic!"); 2081 ASSERT(0); 2082 break; 2083 } 2084 bp->b_ops = &xfs_dir3_data_buf_ops; 2085 break; 2086 case XFS_BLFT_DIR_FREE_BUF: 2087 if (magic32 != XFS_DIR2_FREE_MAGIC && 2088 magic32 != XFS_DIR3_FREE_MAGIC) { 2089 xfs_warn(mp, "Bad dir3 free magic!"); 2090 ASSERT(0); 2091 break; 2092 } 2093 bp->b_ops = &xfs_dir3_free_buf_ops; 2094 break; 2095 case XFS_BLFT_DIR_LEAF1_BUF: 2096 if (magicda != XFS_DIR2_LEAF1_MAGIC && 2097 magicda != XFS_DIR3_LEAF1_MAGIC) { 2098 xfs_warn(mp, "Bad dir leaf1 magic!"); 2099 ASSERT(0); 2100 break; 2101 } 2102 bp->b_ops = &xfs_dir3_leaf1_buf_ops; 2103 break; 2104 case XFS_BLFT_DIR_LEAFN_BUF: 2105 if (magicda != XFS_DIR2_LEAFN_MAGIC && 2106 magicda != XFS_DIR3_LEAFN_MAGIC) { 2107 xfs_warn(mp, "Bad dir leafn magic!"); 2108 ASSERT(0); 2109 break; 2110 } 2111 bp->b_ops = &xfs_dir3_leafn_buf_ops; 2112 break; 
2113 case XFS_BLFT_DA_NODE_BUF: 2114 if (magicda != XFS_DA_NODE_MAGIC && 2115 magicda != XFS_DA3_NODE_MAGIC) { 2116 xfs_warn(mp, "Bad da node magic!"); 2117 ASSERT(0); 2118 break; 2119 } 2120 bp->b_ops = &xfs_da3_node_buf_ops; 2121 break; 2122 case XFS_BLFT_ATTR_LEAF_BUF: 2123 if (magicda != XFS_ATTR_LEAF_MAGIC && 2124 magicda != XFS_ATTR3_LEAF_MAGIC) { 2125 xfs_warn(mp, "Bad attr leaf magic!"); 2126 ASSERT(0); 2127 break; 2128 } 2129 bp->b_ops = &xfs_attr3_leaf_buf_ops; 2130 break; 2131 case XFS_BLFT_ATTR_RMT_BUF: 2132 if (magic32 != XFS_ATTR3_RMT_MAGIC) { 2133 xfs_warn(mp, "Bad attr remote magic!"); 2134 ASSERT(0); 2135 break; 2136 } 2137 bp->b_ops = &xfs_attr3_rmt_buf_ops; 2138 break; 2139 case XFS_BLFT_SB_BUF: 2140 if (magic32 != XFS_SB_MAGIC) { 2141 xfs_warn(mp, "Bad SB block magic!"); 2142 ASSERT(0); 2143 break; 2144 } 2145 bp->b_ops = &xfs_sb_buf_ops; 2146 break; 2147 default: 2148 xfs_warn(mp, "Unknown buffer type %d!", 2149 xfs_blft_from_flags(buf_f)); 2150 break; 2151 } 2152 } 2153 2154 /* 2155 * Perform a 'normal' buffer recovery. Each logged region of the 2156 * buffer should be copied over the corresponding region in the 2157 * given buffer. The bitmap in the buf log format structure indicates 2158 * where to place the logged data. 2159 */ 2160 STATIC void 2161 xlog_recover_do_reg_buffer( 2162 struct xfs_mount *mp, 2163 xlog_recover_item_t *item, 2164 struct xfs_buf *bp, 2165 xfs_buf_log_format_t *buf_f) 2166 { 2167 int i; 2168 int bit; 2169 int nbits; 2170 int error; 2171 2172 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); 2173 2174 bit = 0; 2175 i = 1; /* 0 is the buf format structure */ 2176 while (1) { 2177 bit = xfs_next_bit(buf_f->blf_data_map, 2178 buf_f->blf_map_size, bit); 2179 if (bit == -1) 2180 break; 2181 nbits = xfs_contig_bits(buf_f->blf_data_map, 2182 buf_f->blf_map_size, bit); 2183 ASSERT(nbits > 0); 2184 ASSERT(item->ri_buf[i].i_addr != NULL); 2185 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 2186 ASSERT(BBTOB(bp->b_io_length) >= 2187 ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); 2188 2189 /* 2190 * The dirty regions logged in the buffer, even though 2191 * contiguous, may span multiple chunks. This is because the 2192 * dirty region may span a physical page boundary in a buffer 2193 * and hence be split into two separate vectors for writing into 2194 * the log. Hence we need to trim nbits back to the length of 2195 * the current region being copied out of the log. 2196 */ 2197 if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) 2198 nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; 2199 2200 /* 2201 * Do a sanity check if this is a dquot buffer. Just checking 2202 * the first dquot in the buffer should do. XXXThis is 2203 * probably a good thing to do for other buf types also. 
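* * As an aside, a worked example of the region map arithmetic above (assuming the usual 128 byte XFS_BLF_CHUNK regions): a run starting at bit 2 with nbits = 3 describes 384 bytes of logged data that get copied to byte offset 256 (2 << XFS_BLF_SHIFT) of the buffer.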
2204 */ 2205 error = 0; 2206 if (buf_f->blf_flags & 2207 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2208 if (item->ri_buf[i].i_addr == NULL) { 2209 xfs_alert(mp, 2210 "XFS: NULL dquot in %s.", __func__); 2211 goto next; 2212 } 2213 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { 2214 xfs_alert(mp, 2215 "XFS: dquot too small (%d) in %s.", 2216 item->ri_buf[i].i_len, __func__); 2217 goto next; 2218 } 2219 error = xfs_dqcheck(mp, item->ri_buf[i].i_addr, 2220 -1, 0, XFS_QMOPT_DOWARN, 2221 "dquot_buf_recover"); 2222 if (error) 2223 goto next; 2224 } 2225 2226 memcpy(xfs_buf_offset(bp, 2227 (uint)bit << XFS_BLF_SHIFT), /* dest */ 2228 item->ri_buf[i].i_addr, /* source */ 2229 nbits<<XFS_BLF_SHIFT); /* length */ 2230 next: 2231 i++; 2232 bit += nbits; 2233 } 2234 2235 /* Shouldn't be any more regions */ 2236 ASSERT(i == item->ri_total); 2237 2238 xlog_recover_validate_buf_type(mp, bp, buf_f); 2239 } 2240 2241 /* 2242 * Perform a dquot buffer recovery. 2243 * Simple algorithm: if we have found a QUOTAOFF log item of the same type 2244 * (ie. USR or GRP), then just toss this buffer away; don't recover it. 2245 * Else, treat it as a regular buffer and do recovery. 2246 * 2247 * Return false if the buffer was tossed and true if we recovered the buffer to 2248 * indicate to the caller if the buffer needs writing. 2249 */ 2250 STATIC bool 2251 xlog_recover_do_dquot_buffer( 2252 struct xfs_mount *mp, 2253 struct xlog *log, 2254 struct xlog_recover_item *item, 2255 struct xfs_buf *bp, 2256 struct xfs_buf_log_format *buf_f) 2257 { 2258 uint type; 2259 2260 trace_xfs_log_recover_buf_dquot_buf(log, buf_f); 2261 2262 /* 2263 * Filesystems are required to send in quota flags at mount time. 2264 */ 2265 if (!mp->m_qflags) 2266 return false; 2267 2268 type = 0; 2269 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) 2270 type |= XFS_DQ_USER; 2271 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) 2272 type |= XFS_DQ_PROJ; 2273 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) 2274 type |= XFS_DQ_GROUP; 2275 /* 2276 * This type of quotas was turned off, so ignore this buffer 2277 */ 2278 if (log->l_quotaoffs_flag & type) 2279 return false; 2280 2281 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2282 return true; 2283 } 2284 2285 /* 2286 * This routine replays a modification made to a buffer at runtime. 2287 * There are actually two types of buffer, regular and inode, which 2288 * are handled differently. Inode buffers are handled differently 2289 * in that we only recover a specific set of data from them, namely 2290 * the inode di_next_unlinked fields. This is because all other inode 2291 * data is actually logged via inode records and any data we replay 2292 * here which overlaps that may be stale. 2293 * 2294 * When meta-data buffers are freed at run time we log a buffer item 2295 * with the XFS_BLF_CANCEL bit set to indicate that previous copies 2296 * of the buffer in the log should not be replayed at recovery time. 2297 * This is so that if the blocks covered by the buffer are reused for 2298 * file data before we crash we don't end up replaying old, freed 2299 * meta-data into a user's file. 2300 * 2301 * To handle the cancellation of buffer log items, we make two passes 2302 * over the log during recovery. During the first we build a table of 2303 * those buffers which have been cancelled, and during the second we 2304 * only replay those buffers which do not have corresponding cancel 2305 * records in the table. 
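* * For example: suppose block B is logged as metadata, then freed (which logs a cancel item for B), and finally reused for file data before the crash. Pass 1 records the cancellation of B in the table, so pass 2 skips replaying the earlier metadata changes to B rather than overwriting the newer file data.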
See xlog_recover_buffer_pass[1,2] above 2306 * for more details on the implementation of the table of cancel records. 2307 */ 2308 STATIC int 2309 xlog_recover_buffer_pass2( 2310 struct xlog *log, 2311 struct list_head *buffer_list, 2312 struct xlog_recover_item *item, 2313 xfs_lsn_t current_lsn) 2314 { 2315 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2316 xfs_mount_t *mp = log->l_mp; 2317 xfs_buf_t *bp; 2318 int error; 2319 uint buf_flags; 2320 xfs_lsn_t lsn; 2321 2322 /* 2323 * In this pass we only want to recover all the buffers which have 2324 * not been cancelled and are not cancellation buffers themselves. 2325 */ 2326 if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, 2327 buf_f->blf_len, buf_f->blf_flags)) { 2328 trace_xfs_log_recover_buf_cancel(log, buf_f); 2329 return 0; 2330 } 2331 2332 trace_xfs_log_recover_buf_recover(log, buf_f); 2333 2334 buf_flags = 0; 2335 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) 2336 buf_flags |= XBF_UNMAPPED; 2337 2338 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, 2339 buf_flags, NULL); 2340 if (!bp) 2341 return -ENOMEM; 2342 error = bp->b_error; 2343 if (error) { 2344 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); 2345 goto out_release; 2346 } 2347 2348 /* 2349 * Recover the buffer only if we get an LSN from it and it's less than 2350 * the lsn of the transaction we are replaying. 2351 * 2352 * Note that we have to be extremely careful of readahead here. 2353 * Readahead does not attach verifiers to the buffers, so if we don't 2354 * actually do any replay after readahead because the LSN we found 2355 * in the buffer is more recent than the current transaction, then we 2356 * need to attach the verifier directly. Failure to do so means that 2357 * future recovery actions (e.g. EFI and unlinked list recovery) can 2358 * operate on the buffers without the verifier attached. This 2359 * can lead to blocks on disk having the correct content but a stale 2360 * CRC. 2361 * 2362 * It is safe to assume these clean buffers are currently up to date. 2363 * If the buffer is dirtied by a later transaction being replayed, then 2364 * the verifier will be reset to match whatever recovery turns that 2365 * buffer into. 2366 */ 2367 lsn = xlog_recover_get_buf_lsn(mp, bp); 2368 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { 2369 xlog_recover_validate_buf_type(mp, bp, buf_f); 2370 goto out_release; 2371 } 2372 2373 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { 2374 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2375 if (error) 2376 goto out_release; 2377 } else if (buf_f->blf_flags & 2378 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2379 bool dirty; 2380 2381 dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2382 if (!dirty) 2383 goto out_release; 2384 } else { 2385 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2386 } 2387 2388 /* 2389 * Perform delayed write on the buffer. Asynchronous writes will be 2390 * slower when taking into account all the buffers to be flushed. 2391 * 2392 * Also make sure that only inode buffers with good sizes stay in 2393 * the buffer cache. The kernel moves inodes in buffers of 1 block 2394 * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode 2395 * buffers in the log can be a different size if the log was generated 2396 * by an older kernel using unclustered inode buffers or a newer kernel 2397 * running with a different inode cluster size.
Regardless, if the 2398 * inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size) 2399 * for *our* value of mp->m_inode_cluster_size, then we need to keep 2400 * the buffer out of the buffer cache so that the buffer won't 2401 * overlap with future reads of those inodes. 2402 */ 2403 if (XFS_DINODE_MAGIC == 2404 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && 2405 (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, 2406 (__uint32_t)log->l_mp->m_inode_cluster_size))) { 2407 xfs_buf_stale(bp); 2408 error = xfs_bwrite(bp); 2409 } else { 2410 ASSERT(bp->b_target->bt_mount == mp); 2411 bp->b_iodone = xlog_recover_iodone; 2412 xfs_buf_delwri_queue(bp, buffer_list); 2413 } 2414 2415 out_release: 2416 xfs_buf_relse(bp); 2417 return error; 2418 } 2419 2420 /* 2421 * Inode fork owner changes 2422 * 2423 * If we have been told that we have to reparent the inode fork, it's because an 2424 * extent swap operation on a CRC enabled filesystem has been done and we are 2425 * replaying it. We need to walk the BMBT of the appropriate fork and change the 2426 * owner of its blocks. 2427 * 2428 * The complexity here is that we don't have an inode context to work with, so 2429 * after we've replayed the inode we need to instantiate one. This is where the 2430 * fun begins. 2431 * 2432 * We are in the middle of log recovery, so we can't run transactions. That 2433 * means we cannot use cache coherent inode instantiation via xfs_iget(), as 2434 * that will result in the corresponding iput() running the inode through 2435 * xfs_inactive(). If we've just replayed an inode core that changes the link 2436 * count to zero (i.e. it's been unlinked), then xfs_inactive() will run 2437 * transactions (bad!). 2438 * 2439 * So, to avoid this, we instantiate an inode directly from the inode core we've 2440 * just recovered. We have the buffer still locked, and all we really need to 2441 * instantiate is the inode core and the forks being modified. We can do this 2442 * manually, then run the inode btree owner change, and then tear down the 2443 * xfs_inode without having to run any transactions at all. 2444 * 2445 * Also, because we don't have a transaction context available here, but still 2446 * need to gather all the buffers we modify for writeback, we pass in the 2447 * buffer_list for the operation to use instead.
2448 */ 2449 2450 STATIC int 2451 xfs_recover_inode_owner_change( 2452 struct xfs_mount *mp, 2453 struct xfs_dinode *dip, 2454 struct xfs_inode_log_format *in_f, 2455 struct list_head *buffer_list) 2456 { 2457 struct xfs_inode *ip; 2458 int error; 2459 2460 ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); 2461 2462 ip = xfs_inode_alloc(mp, in_f->ilf_ino); 2463 if (!ip) 2464 return -ENOMEM; 2465 2466 /* instantiate the inode */ 2467 xfs_dinode_from_disk(&ip->i_d, dip); 2468 ASSERT(ip->i_d.di_version >= 3); 2469 2470 error = xfs_iformat_fork(ip, dip); 2471 if (error) 2472 goto out_free_ip; 2473 2474 2475 if (in_f->ilf_fields & XFS_ILOG_DOWNER) { 2476 ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); 2477 error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, 2478 ip->i_ino, buffer_list); 2479 if (error) 2480 goto out_free_ip; 2481 } 2482 2483 if (in_f->ilf_fields & XFS_ILOG_AOWNER) { 2484 ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); 2485 error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, 2486 ip->i_ino, buffer_list); 2487 if (error) 2488 goto out_free_ip; 2489 } 2490 2491 out_free_ip: 2492 xfs_inode_free(ip); 2493 return error; 2494 } 2495 2496 STATIC int 2497 xlog_recover_inode_pass2( 2498 struct xlog *log, 2499 struct list_head *buffer_list, 2500 struct xlog_recover_item *item, 2501 xfs_lsn_t current_lsn) 2502 { 2503 xfs_inode_log_format_t *in_f; 2504 xfs_mount_t *mp = log->l_mp; 2505 xfs_buf_t *bp; 2506 xfs_dinode_t *dip; 2507 int len; 2508 xfs_caddr_t src; 2509 xfs_caddr_t dest; 2510 int error; 2511 int attr_index; 2512 uint fields; 2513 xfs_icdinode_t *dicp; 2514 uint isize; 2515 int need_free = 0; 2516 2517 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2518 in_f = item->ri_buf[0].i_addr; 2519 } else { 2520 in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP); 2521 need_free = 1; 2522 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 2523 if (error) 2524 goto error; 2525 } 2526 2527 /* 2528 * Inode buffers can be freed, look out for it, 2529 * and do not replay the inode. 2530 */ 2531 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, 2532 in_f->ilf_len, 0)) { 2533 error = 0; 2534 trace_xfs_log_recover_inode_cancel(log, in_f); 2535 goto error; 2536 } 2537 trace_xfs_log_recover_inode_recover(log, in_f); 2538 2539 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, 2540 &xfs_inode_buf_ops); 2541 if (!bp) { 2542 error = -ENOMEM; 2543 goto error; 2544 } 2545 error = bp->b_error; 2546 if (error) { 2547 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); 2548 goto out_release; 2549 } 2550 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); 2551 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); 2552 2553 /* 2554 * Make sure the place we're flushing out to really looks 2555 * like an inode! 
2556 */ 2557 if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { 2558 xfs_alert(mp, 2559 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", 2560 __func__, dip, bp, in_f->ilf_ino); 2561 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", 2562 XFS_ERRLEVEL_LOW, mp); 2563 error = -EFSCORRUPTED; 2564 goto out_release; 2565 } 2566 dicp = item->ri_buf[1].i_addr; 2567 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2568 xfs_alert(mp, 2569 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", 2570 __func__, item, in_f->ilf_ino); 2571 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", 2572 XFS_ERRLEVEL_LOW, mp); 2573 error = -EFSCORRUPTED; 2574 goto out_release; 2575 } 2576 2577 /* 2578 * If the inode has an LSN in it, recover the inode only if it's less 2579 * than the lsn of the transaction we are replaying. Note: we still 2580 * need to replay an owner change even though the inode is more recent 2581 * than the transaction as there is no guarantee that all the btree 2582 * blocks are more recent than this transaction, too. 2583 */ 2584 if (dip->di_version >= 3) { 2585 xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); 2586 2587 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { 2588 trace_xfs_log_recover_inode_skip(log, in_f); 2589 error = 0; 2590 goto out_owner_change; 2591 } 2592 } 2593 2594 /* 2595 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes 2596 * are transactional and if ordering is necessary we can determine that 2597 * more accurately by the LSN field in the V3 inode core. Don't trust 2598 * the inode versions we might be changing them here - use the 2599 * superblock flag to determine whether we need to look at di_flushiter 2600 * to skip replay when the on disk inode is newer than the log one 2601 */ 2602 if (!xfs_sb_version_hascrc(&mp->m_sb) && 2603 dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { 2604 /* 2605 * Deal with the wrap case, DI_MAX_FLUSH is less 2606 * than smaller numbers 2607 */ 2608 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && 2609 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { 2610 /* do nothing */ 2611 } else { 2612 trace_xfs_log_recover_inode_skip(log, in_f); 2613 error = 0; 2614 goto out_release; 2615 } 2616 } 2617 2618 /* Take the opportunity to reset the flush iteration count */ 2619 dicp->di_flushiter = 0; 2620 2621 if (unlikely(S_ISREG(dicp->di_mode))) { 2622 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2623 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2624 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", 2625 XFS_ERRLEVEL_LOW, mp, dicp); 2626 xfs_alert(mp, 2627 "%s: Bad regular inode log record, rec ptr 0x%p, " 2628 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2629 __func__, item, dip, bp, in_f->ilf_ino); 2630 error = -EFSCORRUPTED; 2631 goto out_release; 2632 } 2633 } else if (unlikely(S_ISDIR(dicp->di_mode))) { 2634 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2635 (dicp->di_format != XFS_DINODE_FMT_BTREE) && 2636 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2637 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", 2638 XFS_ERRLEVEL_LOW, mp, dicp); 2639 xfs_alert(mp, 2640 "%s: Bad dir inode log record, rec ptr 0x%p, " 2641 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2642 __func__, item, dip, bp, in_f->ilf_ino); 2643 error = -EFSCORRUPTED; 2644 goto out_release; 2645 } 2646 } 2647 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2648 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", 2649 XFS_ERRLEVEL_LOW, mp, dicp); 2650 xfs_alert(mp, 2651 "%s: Bad inode log 
record, rec ptr 0x%p, dino ptr 0x%p, " 2652 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2653 __func__, item, dip, bp, in_f->ilf_ino, 2654 dicp->di_nextents + dicp->di_anextents, 2655 dicp->di_nblocks); 2656 error = -EFSCORRUPTED; 2657 goto out_release; 2658 } 2659 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2660 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", 2661 XFS_ERRLEVEL_LOW, mp, dicp); 2662 xfs_alert(mp, 2663 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " 2664 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, 2665 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); 2666 error = -EFSCORRUPTED; 2667 goto out_release; 2668 } 2669 isize = xfs_icdinode_size(dicp->di_version); 2670 if (unlikely(item->ri_buf[1].i_len > isize)) { 2671 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", 2672 XFS_ERRLEVEL_LOW, mp, dicp); 2673 xfs_alert(mp, 2674 "%s: Bad inode log record length %d, rec ptr 0x%p", 2675 __func__, item->ri_buf[1].i_len, item); 2676 error = -EFSCORRUPTED; 2677 goto out_release; 2678 } 2679 2680 /* The core is in in-core format */ 2681 xfs_dinode_to_disk(dip, dicp); 2682 2683 /* the rest is in on-disk format */ 2684 if (item->ri_buf[1].i_len > isize) { 2685 memcpy((char *)dip + isize, 2686 item->ri_buf[1].i_addr + isize, 2687 item->ri_buf[1].i_len - isize); 2688 } 2689 2690 fields = in_f->ilf_fields; 2691 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { 2692 case XFS_ILOG_DEV: 2693 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); 2694 break; 2695 case XFS_ILOG_UUID: 2696 memcpy(XFS_DFORK_DPTR(dip), 2697 &in_f->ilf_u.ilfu_uuid, 2698 sizeof(uuid_t)); 2699 break; 2700 } 2701 2702 if (in_f->ilf_size == 2) 2703 goto out_owner_change; 2704 len = item->ri_buf[2].i_len; 2705 src = item->ri_buf[2].i_addr; 2706 ASSERT(in_f->ilf_size <= 4); 2707 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); 2708 ASSERT(!(fields & XFS_ILOG_DFORK) || 2709 (len == in_f->ilf_dsize)); 2710 2711 switch (fields & XFS_ILOG_DFORK) { 2712 case XFS_ILOG_DDATA: 2713 case XFS_ILOG_DEXT: 2714 memcpy(XFS_DFORK_DPTR(dip), src, len); 2715 break; 2716 2717 case XFS_ILOG_DBROOT: 2718 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, 2719 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip), 2720 XFS_DFORK_DSIZE(dip, mp)); 2721 break; 2722 2723 default: 2724 /* 2725 * There are no data fork flags set. 2726 */ 2727 ASSERT((fields & XFS_ILOG_DFORK) == 0); 2728 break; 2729 } 2730 2731 /* 2732 * If we logged any attribute data, recover it. There may or 2733 * may not have been any other non-core data logged in this 2734 * transaction. 
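* The regions of an inode log item are laid out as: ri_buf[0] holds the log format, ri_buf[1] the inode core, ri_buf[2] the data fork (when one was logged), and the attribute fork then lands in ri_buf[2] or ri_buf[3] depending on whether a data fork region is present - hence the attr_index selection below.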
2735 */ 2736 if (in_f->ilf_fields & XFS_ILOG_AFORK) { 2737 if (in_f->ilf_fields & XFS_ILOG_DFORK) { 2738 attr_index = 3; 2739 } else { 2740 attr_index = 2; 2741 } 2742 len = item->ri_buf[attr_index].i_len; 2743 src = item->ri_buf[attr_index].i_addr; 2744 ASSERT(len == in_f->ilf_asize); 2745 2746 switch (in_f->ilf_fields & XFS_ILOG_AFORK) { 2747 case XFS_ILOG_ADATA: 2748 case XFS_ILOG_AEXT: 2749 dest = XFS_DFORK_APTR(dip); 2750 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); 2751 memcpy(dest, src, len); 2752 break; 2753 2754 case XFS_ILOG_ABROOT: 2755 dest = XFS_DFORK_APTR(dip); 2756 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, 2757 len, (xfs_bmdr_block_t*)dest, 2758 XFS_DFORK_ASIZE(dip, mp)); 2759 break; 2760 2761 default: 2762 xfs_warn(log->l_mp, "%s: Invalid flag", __func__); 2763 ASSERT(0); 2764 error = -EIO; 2765 goto out_release; 2766 } 2767 } 2768 2769 out_owner_change: 2770 if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) 2771 error = xfs_recover_inode_owner_change(mp, dip, in_f, 2772 buffer_list); 2773 /* re-generate the checksum. */ 2774 xfs_dinode_calc_crc(log->l_mp, dip); 2775 2776 ASSERT(bp->b_target->bt_mount == mp); 2777 bp->b_iodone = xlog_recover_iodone; 2778 xfs_buf_delwri_queue(bp, buffer_list); 2779 2780 out_release: 2781 xfs_buf_relse(bp); 2782 error: 2783 if (need_free) 2784 kmem_free(in_f); 2785 return error; 2786 } 2787 2788 /* 2789 * Recover QUOTAOFF records. We simply make a note of it in the xlog 2790 * structure, so that we know not to do any dquot item or dquot buffer recovery, 2791 * of that type. 2792 */ 2793 STATIC int 2794 xlog_recover_quotaoff_pass1( 2795 struct xlog *log, 2796 struct xlog_recover_item *item) 2797 { 2798 xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; 2799 ASSERT(qoff_f); 2800 2801 /* 2802 * The logitem format's flag tells us if this was user quotaoff, 2803 * group/project quotaoff or both. 2804 */ 2805 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) 2806 log->l_quotaoffs_flag |= XFS_DQ_USER; 2807 if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) 2808 log->l_quotaoffs_flag |= XFS_DQ_PROJ; 2809 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) 2810 log->l_quotaoffs_flag |= XFS_DQ_GROUP; 2811 2812 return 0; 2813 } 2814 2815 /* 2816 * Recover a dquot record 2817 */ 2818 STATIC int 2819 xlog_recover_dquot_pass2( 2820 struct xlog *log, 2821 struct list_head *buffer_list, 2822 struct xlog_recover_item *item, 2823 xfs_lsn_t current_lsn) 2824 { 2825 xfs_mount_t *mp = log->l_mp; 2826 xfs_buf_t *bp; 2827 struct xfs_disk_dquot *ddq, *recddq; 2828 int error; 2829 xfs_dq_logformat_t *dq_f; 2830 uint type; 2831 2832 2833 /* 2834 * Filesystems are required to send in quota flags at mount time. 2835 */ 2836 if (mp->m_qflags == 0) 2837 return 0; 2838 2839 recddq = item->ri_buf[1].i_addr; 2840 if (recddq == NULL) { 2841 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); 2842 return -EIO; 2843 } 2844 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { 2845 xfs_alert(log->l_mp, "dquot too small (%d) in %s.", 2846 item->ri_buf[1].i_len, __func__); 2847 return -EIO; 2848 } 2849 2850 /* 2851 * This type of quotas was turned off, so ignore this record. 2852 */ 2853 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); 2854 ASSERT(type); 2855 if (log->l_quotaoffs_flag & type) 2856 return 0; 2857 2858 /* 2859 * At this point we know that quota was _not_ turned off. 2860 * Since the mount flags are not indicating to us otherwise, this 2861 * must mean that quota is on, and the dquot needs to be replayed. 
2862 * Remember that we may not have fully recovered the superblock yet, 2863 * so we can't do the usual trick of looking at the SB quota bits. 2864 * 2865 * The other possibility, of course, is that the quota subsystem was 2866 * removed since the last mount - ENOSYS. 2867 */ 2868 dq_f = item->ri_buf[0].i_addr; 2869 ASSERT(dq_f); 2870 error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2871 "xlog_recover_dquot_pass2 (log copy)"); 2872 if (error) 2873 return -EIO; 2874 ASSERT(dq_f->qlf_len == 1); 2875 2876 /* 2877 * At this point we are assuming that the dquots have been allocated 2878 * and hence the buffer has valid dquots stamped in it. It should, 2879 * therefore, pass verifier validation. If the dquot is bad, then the 2880 * we'll return an error here, so we don't need to specifically check 2881 * the dquot in the buffer after the verifier has run. 2882 */ 2883 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, 2884 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, 2885 &xfs_dquot_buf_ops); 2886 if (error) 2887 return error; 2888 2889 ASSERT(bp); 2890 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); 2891 2892 /* 2893 * If the dquot has an LSN in it, recover the dquot only if it's less 2894 * than the lsn of the transaction we are replaying. 2895 */ 2896 if (xfs_sb_version_hascrc(&mp->m_sb)) { 2897 struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; 2898 xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); 2899 2900 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { 2901 goto out_release; 2902 } 2903 } 2904 2905 memcpy(ddq, recddq, item->ri_buf[1].i_len); 2906 if (xfs_sb_version_hascrc(&mp->m_sb)) { 2907 xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), 2908 XFS_DQUOT_CRC_OFF); 2909 } 2910 2911 ASSERT(dq_f->qlf_size == 2); 2912 ASSERT(bp->b_target->bt_mount == mp); 2913 bp->b_iodone = xlog_recover_iodone; 2914 xfs_buf_delwri_queue(bp, buffer_list); 2915 2916 out_release: 2917 xfs_buf_relse(bp); 2918 return 0; 2919 } 2920 2921 /* 2922 * This routine is called to create an in-core extent free intent 2923 * item from the efi format structure which was logged on disk. 2924 * It allocates an in-core efi, copies the extents from the format 2925 * structure into it, and adds the efi to the AIL with the given 2926 * LSN. 2927 */ 2928 STATIC int 2929 xlog_recover_efi_pass2( 2930 struct xlog *log, 2931 struct xlog_recover_item *item, 2932 xfs_lsn_t lsn) 2933 { 2934 int error; 2935 xfs_mount_t *mp = log->l_mp; 2936 xfs_efi_log_item_t *efip; 2937 xfs_efi_log_format_t *efi_formatp; 2938 2939 efi_formatp = item->ri_buf[0].i_addr; 2940 2941 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2942 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), 2943 &(efip->efi_format)))) { 2944 xfs_efi_item_free(efip); 2945 return error; 2946 } 2947 atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); 2948 2949 spin_lock(&log->l_ailp->xa_lock); 2950 /* 2951 * xfs_trans_ail_update() drops the AIL lock. 2952 */ 2953 xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); 2954 return 0; 2955 } 2956 2957 2958 /* 2959 * This routine is called when an efd format structure is found in 2960 * a committed transaction in the log. It's purpose is to cancel 2961 * the corresponding efi if it was still in the log. To do this 2962 * it searches the AIL for the efi with an id equal to that in the 2963 * efd format structure. If we find it, we remove the efi from the 2964 * AIL and free it. 
2965 */ 2966 STATIC int 2967 xlog_recover_efd_pass2( 2968 struct xlog *log, 2969 struct xlog_recover_item *item) 2970 { 2971 xfs_efd_log_format_t *efd_formatp; 2972 xfs_efi_log_item_t *efip = NULL; 2973 xfs_log_item_t *lip; 2974 __uint64_t efi_id; 2975 struct xfs_ail_cursor cur; 2976 struct xfs_ail *ailp = log->l_ailp; 2977 2978 efd_formatp = item->ri_buf[0].i_addr; 2979 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2980 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2981 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + 2982 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); 2983 efi_id = efd_formatp->efd_efi_id; 2984 2985 /* 2986 * Search the AIL for the EFI with the id carried in the EFD format 2987 * structure. 2988 */ 2989 spin_lock(&ailp->xa_lock); 2990 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); 2991 while (lip != NULL) { 2992 if (lip->li_type == XFS_LI_EFI) { 2993 efip = (xfs_efi_log_item_t *)lip; 2994 if (efip->efi_format.efi_id == efi_id) { 2995 /* 2996 * xfs_trans_ail_delete() drops the 2997 * AIL lock. 2998 */ 2999 xfs_trans_ail_delete(ailp, lip, 3000 SHUTDOWN_CORRUPT_INCORE); 3001 xfs_efi_item_free(efip); 3002 spin_lock(&ailp->xa_lock); 3003 break; 3004 } 3005 } 3006 lip = xfs_trans_ail_cursor_next(ailp, &cur); 3007 } 3008 xfs_trans_ail_cursor_done(&cur); 3009 spin_unlock(&ailp->xa_lock); 3010 3011 return 0; 3012 } 3013 3014 /* 3015 * This routine is called when an inode create format structure is found in a 3016 * committed transaction in the log. Its purpose is to initialise the inodes 3017 * being allocated on disk. This requires us to get inode cluster buffers that 3018 * match the range to be initialised, stamp them with inode templates and write 3019 * them out via delayed write so that subsequent modifications will hit the 3020 * cached buffer and only need writing out at the end of recovery.
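* * As a rough illustration (values assumed): with 4k blocks, 256 byte inodes and 64 inode chunks, each icreate record would describe 64 inodes covering 4 blocks, which is what the icl_count/icl_length sanity checks below expect to see.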
3021 */ 3022 STATIC int 3023 xlog_recover_do_icreate_pass2( 3024 struct xlog *log, 3025 struct list_head *buffer_list, 3026 xlog_recover_item_t *item) 3027 { 3028 struct xfs_mount *mp = log->l_mp; 3029 struct xfs_icreate_log *icl; 3030 xfs_agnumber_t agno; 3031 xfs_agblock_t agbno; 3032 unsigned int count; 3033 unsigned int isize; 3034 xfs_agblock_t length; 3035 3036 icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; 3037 if (icl->icl_type != XFS_LI_ICREATE) { 3038 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); 3039 return -EINVAL; 3040 } 3041 3042 if (icl->icl_size != 1) { 3043 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); 3044 return -EINVAL; 3045 } 3046 3047 agno = be32_to_cpu(icl->icl_ag); 3048 if (agno >= mp->m_sb.sb_agcount) { 3049 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); 3050 return -EINVAL; 3051 } 3052 agbno = be32_to_cpu(icl->icl_agbno); 3053 if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { 3054 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); 3055 return -EINVAL; 3056 } 3057 isize = be32_to_cpu(icl->icl_isize); 3058 if (isize != mp->m_sb.sb_inodesize) { 3059 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); 3060 return -EINVAL; 3061 } 3062 count = be32_to_cpu(icl->icl_count); 3063 if (!count) { 3064 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); 3065 return -EINVAL; 3066 } 3067 length = be32_to_cpu(icl->icl_length); 3068 if (!length || length >= mp->m_sb.sb_agblocks) { 3069 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); 3070 return -EINVAL; 3071 } 3072 3073 /* existing allocation is fixed value */ 3074 ASSERT(count == mp->m_ialloc_inos); 3075 ASSERT(length == mp->m_ialloc_blks); 3076 if (count != mp->m_ialloc_inos || 3077 length != mp->m_ialloc_blks) { 3078 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); 3079 return -EINVAL; 3080 } 3081 3082 /* 3083 * Inode buffers can be freed. Do not replay the inode initialisation as 3084 * we could be overwriting something written after this inode buffer was 3085 * cancelled. 3086 * 3087 * XXX: we need to iterate all buffers and only init those that are not 3088 * cancelled. I think that a more fine grained factoring of 3089 * xfs_ialloc_inode_init may be appropriate here to enable this to be 3090 * done easily. 
3091 */ 3092 if (xlog_check_buffer_cancelled(log, 3093 XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) 3094 return 0; 3095 3096 xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, 3097 be32_to_cpu(icl->icl_gen)); 3098 return 0; 3099 } 3100 3101 STATIC void 3102 xlog_recover_buffer_ra_pass2( 3103 struct xlog *log, 3104 struct xlog_recover_item *item) 3105 { 3106 struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; 3107 struct xfs_mount *mp = log->l_mp; 3108 3109 if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno, 3110 buf_f->blf_len, buf_f->blf_flags)) { 3111 return; 3112 } 3113 3114 xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno, 3115 buf_f->blf_len, NULL); 3116 } 3117 3118 STATIC void 3119 xlog_recover_inode_ra_pass2( 3120 struct xlog *log, 3121 struct xlog_recover_item *item) 3122 { 3123 struct xfs_inode_log_format ilf_buf; 3124 struct xfs_inode_log_format *ilfp; 3125 struct xfs_mount *mp = log->l_mp; 3126 int error; 3127 3128 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { 3129 ilfp = item->ri_buf[0].i_addr; 3130 } else { 3131 ilfp = &ilf_buf; 3132 memset(ilfp, 0, sizeof(*ilfp)); 3133 error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp); 3134 if (error) 3135 return; 3136 } 3137 3138 if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0)) 3139 return; 3140 3141 xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno, 3142 ilfp->ilf_len, &xfs_inode_buf_ra_ops); 3143 } 3144 3145 STATIC void 3146 xlog_recover_dquot_ra_pass2( 3147 struct xlog *log, 3148 struct xlog_recover_item *item) 3149 { 3150 struct xfs_mount *mp = log->l_mp; 3151 struct xfs_disk_dquot *recddq; 3152 struct xfs_dq_logformat *dq_f; 3153 uint type; 3154 3155 3156 if (mp->m_qflags == 0) 3157 return; 3158 3159 recddq = item->ri_buf[1].i_addr; 3160 if (recddq == NULL) 3161 return; 3162 if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) 3163 return; 3164 3165 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); 3166 ASSERT(type); 3167 if (log->l_quotaoffs_flag & type) 3168 return; 3169 3170 dq_f = item->ri_buf[0].i_addr; 3171 ASSERT(dq_f); 3172 ASSERT(dq_f->qlf_len == 1); 3173 3174 xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, 3175 XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL); 3176 } 3177 3178 STATIC void 3179 xlog_recover_ra_pass2( 3180 struct xlog *log, 3181 struct xlog_recover_item *item) 3182 { 3183 switch (ITEM_TYPE(item)) { 3184 case XFS_LI_BUF: 3185 xlog_recover_buffer_ra_pass2(log, item); 3186 break; 3187 case XFS_LI_INODE: 3188 xlog_recover_inode_ra_pass2(log, item); 3189 break; 3190 case XFS_LI_DQUOT: 3191 xlog_recover_dquot_ra_pass2(log, item); 3192 break; 3193 case XFS_LI_EFI: 3194 case XFS_LI_EFD: 3195 case XFS_LI_QUOTAOFF: 3196 default: 3197 break; 3198 } 3199 } 3200 3201 STATIC int 3202 xlog_recover_commit_pass1( 3203 struct xlog *log, 3204 struct xlog_recover *trans, 3205 struct xlog_recover_item *item) 3206 { 3207 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); 3208 3209 switch (ITEM_TYPE(item)) { 3210 case XFS_LI_BUF: 3211 return xlog_recover_buffer_pass1(log, item); 3212 case XFS_LI_QUOTAOFF: 3213 return xlog_recover_quotaoff_pass1(log, item); 3214 case XFS_LI_INODE: 3215 case XFS_LI_EFI: 3216 case XFS_LI_EFD: 3217 case XFS_LI_DQUOT: 3218 case XFS_LI_ICREATE: 3219 /* nothing to do in pass 1 */ 3220 return 0; 3221 default: 3222 xfs_warn(log->l_mp, "%s: invalid item type (%d)", 3223 __func__, ITEM_TYPE(item)); 3224 ASSERT(0); 3225 return -EIO; 3226 } 3227 } 3228 3229 STATIC int 3230 
xlog_recover_commit_pass2( 3231 struct xlog *log, 3232 struct xlog_recover *trans, 3233 struct list_head *buffer_list, 3234 struct xlog_recover_item *item) 3235 { 3236 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); 3237 3238 switch (ITEM_TYPE(item)) { 3239 case XFS_LI_BUF: 3240 return xlog_recover_buffer_pass2(log, buffer_list, item, 3241 trans->r_lsn); 3242 case XFS_LI_INODE: 3243 return xlog_recover_inode_pass2(log, buffer_list, item, 3244 trans->r_lsn); 3245 case XFS_LI_EFI: 3246 return xlog_recover_efi_pass2(log, item, trans->r_lsn); 3247 case XFS_LI_EFD: 3248 return xlog_recover_efd_pass2(log, item); 3249 case XFS_LI_DQUOT: 3250 return xlog_recover_dquot_pass2(log, buffer_list, item, 3251 trans->r_lsn); 3252 case XFS_LI_ICREATE: 3253 return xlog_recover_do_icreate_pass2(log, buffer_list, item); 3254 case XFS_LI_QUOTAOFF: 3255 /* nothing to do in pass2 */ 3256 return 0; 3257 default: 3258 xfs_warn(log->l_mp, "%s: invalid item type (%d)", 3259 __func__, ITEM_TYPE(item)); 3260 ASSERT(0); 3261 return -EIO; 3262 } 3263 } 3264 3265 STATIC int 3266 xlog_recover_items_pass2( 3267 struct xlog *log, 3268 struct xlog_recover *trans, 3269 struct list_head *buffer_list, 3270 struct list_head *item_list) 3271 { 3272 struct xlog_recover_item *item; 3273 int error = 0; 3274 3275 list_for_each_entry(item, item_list, ri_list) { 3276 error = xlog_recover_commit_pass2(log, trans, 3277 buffer_list, item); 3278 if (error) 3279 return error; 3280 } 3281 3282 return error; 3283 } 3284 3285 /* 3286 * Perform the transaction. 3287 * 3288 * If the transaction modifies a buffer or inode, do it now. Otherwise, 3289 * EFIs and EFDs get queued up by adding entries into the AIL for them. 3290 */ 3291 STATIC int 3292 xlog_recover_commit_trans( 3293 struct xlog *log, 3294 struct xlog_recover *trans, 3295 int pass) 3296 { 3297 int error = 0; 3298 int error2; 3299 int items_queued = 0; 3300 struct xlog_recover_item *item; 3301 struct xlog_recover_item *next; 3302 LIST_HEAD (buffer_list); 3303 LIST_HEAD (ra_list); 3304 LIST_HEAD (done_list); 3305 3306 #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 3307 3308 hlist_del(&trans->r_list); 3309 3310 error = xlog_recover_reorder_trans(log, trans, pass); 3311 if (error) 3312 return error; 3313 3314 list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) { 3315 switch (pass) { 3316 case XLOG_RECOVER_PASS1: 3317 error = xlog_recover_commit_pass1(log, trans, item); 3318 break; 3319 case XLOG_RECOVER_PASS2: 3320 xlog_recover_ra_pass2(log, item); 3321 list_move_tail(&item->ri_list, &ra_list); 3322 items_queued++; 3323 if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { 3324 error = xlog_recover_items_pass2(log, trans, 3325 &buffer_list, &ra_list); 3326 list_splice_tail_init(&ra_list, &done_list); 3327 items_queued = 0; 3328 } 3329 3330 break; 3331 default: 3332 ASSERT(0); 3333 } 3334 3335 if (error) 3336 goto out; 3337 } 3338 3339 out: 3340 if (!list_empty(&ra_list)) { 3341 if (!error) 3342 error = xlog_recover_items_pass2(log, trans, 3343 &buffer_list, &ra_list); 3344 list_splice_tail_init(&ra_list, &done_list); 3345 } 3346 3347 if (!list_empty(&done_list)) 3348 list_splice_init(&done_list, &trans->r_itemq); 3349 3350 error2 = xfs_buf_delwri_submit(&buffer_list); 3351 return error ? 
error : error2; 3352 } 3353 3354 STATIC void 3355 xlog_recover_add_item( 3356 struct list_head *head) 3357 { 3358 xlog_recover_item_t *item; 3359 3360 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 3361 INIT_LIST_HEAD(&item->ri_list); 3362 list_add_tail(&item->ri_list, head); 3363 } 3364 3365 STATIC int 3366 xlog_recover_add_to_cont_trans( 3367 struct xlog *log, 3368 struct xlog_recover *trans, 3369 xfs_caddr_t dp, 3370 int len) 3371 { 3372 xlog_recover_item_t *item; 3373 xfs_caddr_t ptr, old_ptr; 3374 int old_len; 3375 3376 if (list_empty(&trans->r_itemq)) { 3377 /* finish copying rest of trans header */ 3378 xlog_recover_add_item(&trans->r_itemq); 3379 ptr = (xfs_caddr_t) &trans->r_theader + 3380 sizeof(xfs_trans_header_t) - len; 3381 memcpy(ptr, dp, len); 3382 return 0; 3383 } 3384 /* take the tail entry */ 3385 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); 3386 3387 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 3388 old_len = item->ri_buf[item->ri_cnt-1].i_len; 3389 3390 ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); 3391 memcpy(&ptr[old_len], dp, len); 3392 item->ri_buf[item->ri_cnt-1].i_len += len; 3393 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 3394 trace_xfs_log_recover_item_add_cont(log, trans, item, 0); 3395 return 0; 3396 } 3397 3398 /* 3399 * The next region to add is the start of a new region. It could be 3400 * a whole region or it could be the first part of a new region. Because 3401 * of this, the assumption here is that the type and size fields of all 3402 * format structures fit into the first 32 bits of the structure. 3403 * 3404 * This works because all regions must be 32 bit aligned. Therefore, we 3405 * either have both fields or we have neither field. In the case we have 3406 * neither field, the data part of the region is zero length. We only have 3407 * a log_op_header and can throw away the header since a new one will appear 3408 * later. If we have at least 4 bytes, then we can determine how many regions 3409 * will appear in the current log item. 
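* * For example (relying on the layout assumption above), xfs_inode_log_format_t begins with 16 bit ilf_type and ilf_size fields, and the other log format structures start the same way, which is why the code below can cast the first region of any item type to xfs_inode_log_format_t just to read ilf_size.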
3410 */ 3411 STATIC int 3412 xlog_recover_add_to_trans( 3413 struct xlog *log, 3414 struct xlog_recover *trans, 3415 xfs_caddr_t dp, 3416 int len) 3417 { 3418 xfs_inode_log_format_t *in_f; /* any will do */ 3419 xlog_recover_item_t *item; 3420 xfs_caddr_t ptr; 3421 3422 if (!len) 3423 return 0; 3424 if (list_empty(&trans->r_itemq)) { 3425 /* we need to catch log corruptions here */ 3426 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 3427 xfs_warn(log->l_mp, "%s: bad header magic number", 3428 __func__); 3429 ASSERT(0); 3430 return -EIO; 3431 } 3432 if (len == sizeof(xfs_trans_header_t)) 3433 xlog_recover_add_item(&trans->r_itemq); 3434 memcpy(&trans->r_theader, dp, len); 3435 return 0; 3436 } 3437 3438 ptr = kmem_alloc(len, KM_SLEEP); 3439 memcpy(ptr, dp, len); 3440 in_f = (xfs_inode_log_format_t *)ptr; 3441 3442 /* take the tail entry */ 3443 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); 3444 if (item->ri_total != 0 && 3445 item->ri_total == item->ri_cnt) { 3446 /* tail item is in use, get a new one */ 3447 xlog_recover_add_item(&trans->r_itemq); 3448 item = list_entry(trans->r_itemq.prev, 3449 xlog_recover_item_t, ri_list); 3450 } 3451 3452 if (item->ri_total == 0) { /* first region to be added */ 3453 if (in_f->ilf_size == 0 || 3454 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { 3455 xfs_warn(log->l_mp, 3456 "bad number of regions (%d) in inode log format", 3457 in_f->ilf_size); 3458 ASSERT(0); 3459 kmem_free(ptr); 3460 return -EIO; 3461 } 3462 3463 item->ri_total = in_f->ilf_size; 3464 item->ri_buf = 3465 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), 3466 KM_SLEEP); 3467 } 3468 ASSERT(item->ri_total > item->ri_cnt); 3469 /* Description region is ri_buf[0] */ 3470 item->ri_buf[item->ri_cnt].i_addr = ptr; 3471 item->ri_buf[item->ri_cnt].i_len = len; 3472 item->ri_cnt++; 3473 trace_xfs_log_recover_item_add(log, trans, item, 0); 3474 return 0; 3475 } 3476 3477 /* 3478 * Free up any resources allocated by the transaction 3479 * 3480 * Remember that EFIs, EFDs, and IUNLINKs are handled later. 3481 */ 3482 STATIC void 3483 xlog_recover_free_trans( 3484 struct xlog_recover *trans) 3485 { 3486 xlog_recover_item_t *item, *n; 3487 int i; 3488 3489 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { 3490 /* Free the regions in the item. */ 3491 list_del(&item->ri_list); 3492 for (i = 0; i < item->ri_cnt; i++) 3493 kmem_free(item->ri_buf[i].i_addr); 3494 /* Free the item itself */ 3495 kmem_free(item->ri_buf); 3496 kmem_free(item); 3497 } 3498 /* Free the transaction recover structure */ 3499 kmem_free(trans); 3500 } 3501 3502 /* 3503 * On error or completion, trans is freed. 3504 */ 3505 STATIC int 3506 xlog_recovery_process_trans( 3507 struct xlog *log, 3508 struct xlog_recover *trans, 3509 xfs_caddr_t dp, 3510 unsigned int len, 3511 unsigned int flags, 3512 int pass) 3513 { 3514 int error = 0; 3515 bool freeit = false; 3516 3517 /* mask off ophdr transaction container flags */ 3518 flags &= ~XLOG_END_TRANS; 3519 if (flags & XLOG_WAS_CONT_TRANS) 3520 flags &= ~XLOG_CONTINUE_TRANS; 3521 3522 /* 3523 * Callees must not free the trans structure. We'll decide if we need to 3524 * free it or not based on the operation being done and it's result. 
3525 */ 3526 switch (flags) { 3527 /* expected flag values */ 3528 case 0: 3529 case XLOG_CONTINUE_TRANS: 3530 error = xlog_recover_add_to_trans(log, trans, dp, len); 3531 break; 3532 case XLOG_WAS_CONT_TRANS: 3533 error = xlog_recover_add_to_cont_trans(log, trans, dp, len); 3534 break; 3535 case XLOG_COMMIT_TRANS: 3536 error = xlog_recover_commit_trans(log, trans, pass); 3537 /* success or fail, we are now done with this transaction. */ 3538 freeit = true; 3539 break; 3540 3541 /* unexpected flag values */ 3542 case XLOG_UNMOUNT_TRANS: 3543 /* just skip trans */ 3544 xfs_warn(log->l_mp, "%s: Unmount LR", __func__); 3545 freeit = true; 3546 break; 3547 case XLOG_START_TRANS: 3548 default: 3549 xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags); 3550 ASSERT(0); 3551 error = -EIO; 3552 break; 3553 } 3554 if (error || freeit) 3555 xlog_recover_free_trans(trans); 3556 return error; 3557 } 3558 3559 /* 3560 * Lookup the transaction recovery structure associated with the ID in the 3561 * current ophdr. If the transaction doesn't exist and the start flag is set in 3562 * the ophdr, then allocate a new transaction for future ID matches to find. 3563 * Either way, return what we found during the lookup - an existing transaction 3564 * or nothing. 3565 */ 3566 STATIC struct xlog_recover * 3567 xlog_recover_ophdr_to_trans( 3568 struct hlist_head rhash[], 3569 struct xlog_rec_header *rhead, 3570 struct xlog_op_header *ohead) 3571 { 3572 struct xlog_recover *trans; 3573 xlog_tid_t tid; 3574 struct hlist_head *rhp; 3575 3576 tid = be32_to_cpu(ohead->oh_tid); 3577 rhp = &rhash[XLOG_RHASH(tid)]; 3578 hlist_for_each_entry(trans, rhp, r_list) { 3579 if (trans->r_log_tid == tid) 3580 return trans; 3581 } 3582 3583 /* 3584 * skip over non-start transaction headers - we could be 3585 * processing slack space before the next transaction starts 3586 */ 3587 if (!(ohead->oh_flags & XLOG_START_TRANS)) 3588 return NULL; 3589 3590 ASSERT(be32_to_cpu(ohead->oh_len) == 0); 3591 3592 /* 3593 * This is a new transaction so allocate a new recovery container to 3594 * hold the recovery ops that will follow. 3595 */ 3596 trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP); 3597 trans->r_log_tid = tid; 3598 trans->r_lsn = be64_to_cpu(rhead->h_lsn); 3599 INIT_LIST_HEAD(&trans->r_itemq); 3600 INIT_HLIST_NODE(&trans->r_list); 3601 hlist_add_head(&trans->r_list, rhp); 3602 3603 /* 3604 * Nothing more to do for this ophdr. Items to be added to this new 3605 * transaction will be in subsequent ophdr containers. 3606 */ 3607 return NULL; 3608 } 3609 3610 STATIC int 3611 xlog_recover_process_ophdr( 3612 struct xlog *log, 3613 struct hlist_head rhash[], 3614 struct xlog_rec_header *rhead, 3615 struct xlog_op_header *ohead, 3616 xfs_caddr_t dp, 3617 xfs_caddr_t end, 3618 int pass) 3619 { 3620 struct xlog_recover *trans; 3621 unsigned int len; 3622 3623 /* Do we understand who wrote this op? */ 3624 if (ohead->oh_clientid != XFS_TRANSACTION && 3625 ohead->oh_clientid != XFS_LOG) { 3626 xfs_warn(log->l_mp, "%s: bad clientid 0x%x", 3627 __func__, ohead->oh_clientid); 3628 ASSERT(0); 3629 return -EIO; 3630 } 3631 3632 /* 3633 * Check the ophdr contains all the data it is supposed to contain. 
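* Note that oh_len counts only the payload that follows the op header (dp already points past the xlog_op_header here), so the check below simply ensures the payload does not run past the end of the log record.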
3634 */ 3635 len = be32_to_cpu(ohead->oh_len); 3636 if (dp + len > end) { 3637 xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len); 3638 WARN_ON(1); 3639 return -EIO; 3640 } 3641 3642 trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead); 3643 if (!trans) { 3644 /* nothing to do, so skip over this ophdr */ 3645 return 0; 3646 } 3647 3648 return xlog_recovery_process_trans(log, trans, dp, len, 3649 ohead->oh_flags, pass); 3650 } 3651 3652 /* 3653 * There are two valid states of the r_state field. 0 indicates that the 3654 * transaction structure is in a normal state. We have either seen the 3655 * start of the transaction or the last operation we added was not a partial 3656 * operation. If the last operation we added to the transaction was a 3657 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS. 3658 * 3659 * NOTE: skip LRs with 0 data length. 3660 */ 3661 STATIC int 3662 xlog_recover_process_data( 3663 struct xlog *log, 3664 struct hlist_head rhash[], 3665 struct xlog_rec_header *rhead, 3666 xfs_caddr_t dp, 3667 int pass) 3668 { 3669 struct xlog_op_header *ohead; 3670 xfs_caddr_t end; 3671 int num_logops; 3672 int error; 3673 3674 end = dp + be32_to_cpu(rhead->h_len); 3675 num_logops = be32_to_cpu(rhead->h_num_logops); 3676 3677 /* check the log format matches our own - else we can't recover */ 3678 if (xlog_header_check_recover(log->l_mp, rhead)) 3679 return -EIO; 3680 3681 while ((dp < end) && num_logops) { 3682 3683 ohead = (struct xlog_op_header *)dp; 3684 dp += sizeof(*ohead); 3685 ASSERT(dp <= end); 3686 3687 /* errors will abort recovery */ 3688 error = xlog_recover_process_ophdr(log, rhash, rhead, ohead, 3689 dp, end, pass); 3690 if (error) 3691 return error; 3692 3693 dp += be32_to_cpu(ohead->oh_len); 3694 num_logops--; 3695 } 3696 return 0; 3697 } 3698 3699 /* 3700 * Process an extent free intent item that was recovered from 3701 * the log. We need to free the extents that it describes. 3702 */ 3703 STATIC int 3704 xlog_recover_process_efi( 3705 xfs_mount_t *mp, 3706 xfs_efi_log_item_t *efip) 3707 { 3708 xfs_efd_log_item_t *efdp; 3709 xfs_trans_t *tp; 3710 int i; 3711 int error = 0; 3712 xfs_extent_t *extp; 3713 xfs_fsblock_t startblock_fsb; 3714 3715 ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); 3716 3717 /* 3718 * First check the validity of the extents described by the 3719 * EFI. If any are bad, then assume that all are bad and 3720 * just toss the EFI. 3721 */ 3722 for (i = 0; i < efip->efi_format.efi_nextents; i++) { 3723 extp = &(efip->efi_format.efi_extents[i]); 3724 startblock_fsb = XFS_BB_TO_FSB(mp, 3725 XFS_FSB_TO_DADDR(mp, extp->ext_start)); 3726 if ((startblock_fsb == 0) || 3727 (extp->ext_len == 0) || 3728 (startblock_fsb >= mp->m_sb.sb_dblocks) || 3729 (extp->ext_len >= mp->m_sb.sb_agblocks)) { 3730 /* 3731 * This will pull the EFI from the AIL and 3732 * free the memory associated with it. 
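			 *
			 * Rearranged as a single predicate (equivalent to the
			 * checks above, shown only for clarity), an extent is
			 * accepted when:
			 *
			 *	startblock_fsb != 0 &&
			 *	extp->ext_len != 0 &&
			 *	startblock_fsb < mp->m_sb.sb_dblocks &&
			 *	extp->ext_len < mp->m_sb.sb_agblocks
			 *
			 * Any extent failing this causes the whole EFI to be
			 * tossed here.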
3733 */ 3734 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); 3735 xfs_efi_release(efip, efip->efi_format.efi_nextents); 3736 return -EIO; 3737 } 3738 } 3739 3740 tp = xfs_trans_alloc(mp, 0); 3741 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); 3742 if (error) 3743 goto abort_error; 3744 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); 3745 3746 for (i = 0; i < efip->efi_format.efi_nextents; i++) { 3747 extp = &(efip->efi_format.efi_extents[i]); 3748 error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); 3749 if (error) 3750 goto abort_error; 3751 xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, 3752 extp->ext_len); 3753 } 3754 3755 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); 3756 error = xfs_trans_commit(tp, 0); 3757 return error; 3758 3759 abort_error: 3760 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3761 return error; 3762 } 3763 3764 /* 3765 * When this is called, all of the EFIs which did not have 3766 * corresponding EFDs should be in the AIL. What we do now 3767 * is free the extents associated with each one. 3768 * 3769 * Since we process the EFIs in normal transactions, they 3770 * will be removed at some point after the commit. This prevents 3771 * us from just walking down the list processing each one. 3772 * We'll use a flag in the EFI to skip those that we've already 3773 * processed and use the AIL iteration mechanism's generation 3774 * count to try to speed this up at least a bit. 3775 * 3776 * When we start, we know that the EFIs are the only things in 3777 * the AIL. As we process them, however, other items are added 3778 * to the AIL. Since everything added to the AIL must come after 3779 * everything already in the AIL, we stop processing as soon as 3780 * we see something other than an EFI in the AIL. 3781 */ 3782 STATIC int 3783 xlog_recover_process_efis( 3784 struct xlog *log) 3785 { 3786 xfs_log_item_t *lip; 3787 xfs_efi_log_item_t *efip; 3788 int error = 0; 3789 struct xfs_ail_cursor cur; 3790 struct xfs_ail *ailp; 3791 3792 ailp = log->l_ailp; 3793 spin_lock(&ailp->xa_lock); 3794 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); 3795 while (lip != NULL) { 3796 /* 3797 * We're done when we see something other than an EFI. 3798 * There should be no EFIs left in the AIL now. 3799 */ 3800 if (lip->li_type != XFS_LI_EFI) { 3801 #ifdef DEBUG 3802 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) 3803 ASSERT(lip->li_type != XFS_LI_EFI); 3804 #endif 3805 break; 3806 } 3807 3808 /* 3809 * Skip EFIs that we've already processed. 3810 */ 3811 efip = (xfs_efi_log_item_t *)lip; 3812 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { 3813 lip = xfs_trans_ail_cursor_next(ailp, &cur); 3814 continue; 3815 } 3816 3817 spin_unlock(&ailp->xa_lock); 3818 error = xlog_recover_process_efi(log->l_mp, efip); 3819 spin_lock(&ailp->xa_lock); 3820 if (error) 3821 goto out; 3822 lip = xfs_trans_ail_cursor_next(ailp, &cur); 3823 } 3824 out: 3825 xfs_trans_ail_cursor_done(&cur); 3826 spin_unlock(&ailp->xa_lock); 3827 return error; 3828 } 3829 3830 /* 3831 * This routine performs a transaction to null out a bad inode pointer 3832 * in an agi unlinked inode hash bucket. 
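 *
 * For example (illustrative numbers only), with 4-byte agi_unlinked[]
 * entries and bucket == 3, the code below logs just that one slot of the
 * AGI:
 *
 *	offset = offsetof(xfs_agi_t, agi_unlinked) + sizeof(xfs_agino_t) * 3;
 *	xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
 *
 * i.e. a 4 byte range covering only bucket 3's entry.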
3833 */ 3834 STATIC void 3835 xlog_recover_clear_agi_bucket( 3836 xfs_mount_t *mp, 3837 xfs_agnumber_t agno, 3838 int bucket) 3839 { 3840 xfs_trans_t *tp; 3841 xfs_agi_t *agi; 3842 xfs_buf_t *agibp; 3843 int offset; 3844 int error; 3845 3846 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); 3847 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0); 3848 if (error) 3849 goto out_abort; 3850 3851 error = xfs_read_agi(mp, tp, agno, &agibp); 3852 if (error) 3853 goto out_abort; 3854 3855 agi = XFS_BUF_TO_AGI(agibp); 3856 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); 3857 offset = offsetof(xfs_agi_t, agi_unlinked) + 3858 (sizeof(xfs_agino_t) * bucket); 3859 xfs_trans_log_buf(tp, agibp, offset, 3860 (offset + sizeof(xfs_agino_t) - 1)); 3861 3862 error = xfs_trans_commit(tp, 0); 3863 if (error) 3864 goto out_error; 3865 return; 3866 3867 out_abort: 3868 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3869 out_error: 3870 xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); 3871 return; 3872 } 3873 3874 STATIC xfs_agino_t 3875 xlog_recover_process_one_iunlink( 3876 struct xfs_mount *mp, 3877 xfs_agnumber_t agno, 3878 xfs_agino_t agino, 3879 int bucket) 3880 { 3881 struct xfs_buf *ibp; 3882 struct xfs_dinode *dip; 3883 struct xfs_inode *ip; 3884 xfs_ino_t ino; 3885 int error; 3886 3887 ino = XFS_AGINO_TO_INO(mp, agno, agino); 3888 error = xfs_iget(mp, NULL, ino, 0, 0, &ip); 3889 if (error) 3890 goto fail; 3891 3892 /* 3893 * Get the on disk inode to find the next inode in the bucket. 3894 */ 3895 error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0); 3896 if (error) 3897 goto fail_iput; 3898 3899 ASSERT(ip->i_d.di_nlink == 0); 3900 ASSERT(ip->i_d.di_mode != 0); 3901 3902 /* setup for the next pass */ 3903 agino = be32_to_cpu(dip->di_next_unlinked); 3904 xfs_buf_relse(ibp); 3905 3906 /* 3907 * Prevent any DMAPI event from being sent when the reference on 3908 * the inode is dropped. 3909 */ 3910 ip->i_d.di_dmevmask = 0; 3911 3912 IRELE(ip); 3913 return agino; 3914 3915 fail_iput: 3916 IRELE(ip); 3917 fail: 3918 /* 3919 * We can't read in the inode this bucket points to, or this inode 3920 * is messed up. Just ditch this bucket of inodes. We will lose 3921 * some inodes and space, but at least we won't hang. 3922 * 3923 * Call xlog_recover_clear_agi_bucket() to perform a transaction to 3924 * clear the inode pointer in the bucket. 3925 */ 3926 xlog_recover_clear_agi_bucket(mp, agno, bucket); 3927 return NULLAGINO; 3928 } 3929 3930 /* 3931 * xlog_iunlink_recover 3932 * 3933 * This is called during recovery to process any inodes which 3934 * we unlinked but not freed when the system crashed. These 3935 * inodes will be on the lists in the AGI blocks. What we do 3936 * here is scan all the AGIs and fully truncate and free any 3937 * inodes found on the lists. Each inode is removed from the 3938 * lists when it has been fully truncated and is freed. The 3939 * freeing of the inode and its removal from the list must be 3940 * atomic. 3941 */ 3942 STATIC void 3943 xlog_recover_process_iunlinks( 3944 struct xlog *log) 3945 { 3946 xfs_mount_t *mp; 3947 xfs_agnumber_t agno; 3948 xfs_agi_t *agi; 3949 xfs_buf_t *agibp; 3950 xfs_agino_t agino; 3951 int bucket; 3952 int error; 3953 uint mp_dmevmask; 3954 3955 mp = log->l_mp; 3956 3957 /* 3958 * Prevent any DMAPI event from being sent while in this function. 
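	 *
	 * With DMAPI events masked, the loops below walk every unlinked
	 * bucket in every AG; schematically (a simplified sketch, error
	 * handling omitted):
	 *
	 *	for each AG agno:
	 *		read the AGI
	 *		for each bucket 0 .. XFS_AGI_UNLINKED_BUCKETS - 1:
	 *			agino = be32_to_cpu(agi->agi_unlinked[bucket]);
	 *			while (agino != NULLAGINO)
	 *				agino = xlog_recover_process_one_iunlink(
	 *						mp, agno, agino, bucket);
	 *
	 * Each call processes one inode and returns the next agino in that
	 * bucket's on-disk chain (read from di_next_unlinked), or NULLAGINO
	 * at the end of the chain or on failure.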
	 */
	mp_dmevmask = mp->m_dmevmask;
	mp->m_dmevmask = 0;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		/*
		 * Find the agi for this ag.
		 */
		error = xfs_read_agi(mp, NULL, agno, &agibp);
		if (error) {
			/*
			 * AGI is b0rked. Don't process it.
			 *
			 * We should probably mark the filesystem as corrupt
			 * after we've recovered all the ag's we can....
			 */
			continue;
		}
		/*
		 * Unlock the buffer so that it can be acquired in the normal
		 * course of the transaction to truncate and free each inode.
		 * Because we are not racing with anyone else here for the AGI
		 * buffer, we don't even need to hold it locked to read the
		 * initial unlinked bucket entries out of the buffer. We keep a
		 * buffer reference, though, so that it stays pinned in memory
		 * while we need the buffer.
		 */
		agi = XFS_BUF_TO_AGI(agibp);
		xfs_buf_unlock(agibp);

		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
			agino = be32_to_cpu(agi->agi_unlinked[bucket]);
			while (agino != NULLAGINO) {
				agino = xlog_recover_process_one_iunlink(mp,
							agno, agino, bucket);
			}
		}
		xfs_buf_rele(agibp);
	}

	mp->m_dmevmask = mp_dmevmask;
}

/*
 * Unpack the log buffer data and CRC check it. If the check fails, issue a
 * warning if and only if the CRC in the header is non-zero. This makes the
 * check an advisory warning, and the zero CRC check will prevent failure
 * warnings from being emitted when upgrading the kernel from one that does not
 * add CRCs by default.
 *
 * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
 * corruption failure.
 */
STATIC int
xlog_unpack_data_crc(
	struct xlog_rec_header	*rhead,
	xfs_caddr_t		dp,
	struct xlog		*log)
{
	__le32			crc;

	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
	if (crc != rhead->h_crc) {
		if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
			xfs_alert(log->l_mp,
		"log record CRC mismatch: found 0x%x, expected 0x%x.",
				le32_to_cpu(rhead->h_crc),
				le32_to_cpu(crc));
			xfs_hex_dump(dp, 32);
		}

		/*
		 * If we've detected a log record corruption, then we can't
		 * recover past this point. Abort recovery if we are enforcing
		 * CRC protection by punting an error back up the stack.
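		 *
		 * Summarised as a sketch (this is just the behaviour of this
		 * function, not new logic):
		 *
		 *	if (crc == rhead->h_crc)
		 *		recover normally;
		 *	else if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
		 *		alert and return -EFSCORRUPTED;
		 *	else if (rhead->h_crc != 0)
		 *		alert only, keep recovering;
		 *	else
		 *		keep recovering quietly (a pre-CRC kernel
		 *		wrote this record);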
4034 */ 4035 if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) 4036 return -EFSCORRUPTED; 4037 } 4038 4039 return 0; 4040 } 4041 4042 STATIC int 4043 xlog_unpack_data( 4044 struct xlog_rec_header *rhead, 4045 xfs_caddr_t dp, 4046 struct xlog *log) 4047 { 4048 int i, j, k; 4049 int error; 4050 4051 error = xlog_unpack_data_crc(rhead, dp, log); 4052 if (error) 4053 return error; 4054 4055 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && 4056 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 4057 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; 4058 dp += BBSIZE; 4059 } 4060 4061 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 4062 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; 4063 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { 4064 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 4065 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 4066 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; 4067 dp += BBSIZE; 4068 } 4069 } 4070 4071 return 0; 4072 } 4073 4074 STATIC int 4075 xlog_valid_rec_header( 4076 struct xlog *log, 4077 struct xlog_rec_header *rhead, 4078 xfs_daddr_t blkno) 4079 { 4080 int hlen; 4081 4082 if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) { 4083 XFS_ERROR_REPORT("xlog_valid_rec_header(1)", 4084 XFS_ERRLEVEL_LOW, log->l_mp); 4085 return -EFSCORRUPTED; 4086 } 4087 if (unlikely( 4088 (!rhead->h_version || 4089 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { 4090 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", 4091 __func__, be32_to_cpu(rhead->h_version)); 4092 return -EIO; 4093 } 4094 4095 /* LR body must have data or it wouldn't have been written */ 4096 hlen = be32_to_cpu(rhead->h_len); 4097 if (unlikely( hlen <= 0 || hlen > INT_MAX )) { 4098 XFS_ERROR_REPORT("xlog_valid_rec_header(2)", 4099 XFS_ERRLEVEL_LOW, log->l_mp); 4100 return -EFSCORRUPTED; 4101 } 4102 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) { 4103 XFS_ERROR_REPORT("xlog_valid_rec_header(3)", 4104 XFS_ERRLEVEL_LOW, log->l_mp); 4105 return -EFSCORRUPTED; 4106 } 4107 return 0; 4108 } 4109 4110 /* 4111 * Read the log from tail to head and process the log records found. 4112 * Handle the two cases where the tail and head are in the same cycle 4113 * and where the active portion of the log wraps around the end of 4114 * the physical log separately. The pass parameter is passed through 4115 * to the routines called to process the data and is not looked at 4116 * here. 4117 */ 4118 STATIC int 4119 xlog_do_recovery_pass( 4120 struct xlog *log, 4121 xfs_daddr_t head_blk, 4122 xfs_daddr_t tail_blk, 4123 int pass) 4124 { 4125 xlog_rec_header_t *rhead; 4126 xfs_daddr_t blk_no; 4127 xfs_caddr_t offset; 4128 xfs_buf_t *hbp, *dbp; 4129 int error = 0, h_size; 4130 int bblks, split_bblks; 4131 int hblks, split_hblks, wrapped_hblks; 4132 struct hlist_head rhash[XLOG_RHASH_SIZE]; 4133 4134 ASSERT(head_blk != tail_blk); 4135 4136 /* 4137 * Read the header of the tail block and get the iclog buffer size from 4138 * h_size. Use this to tell how many sectors make up the log header. 4139 */ 4140 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 4141 /* 4142 * When using variable length iclogs, read first sector of 4143 * iclog header and extract the header size from it. Get a 4144 * new hbp that is the correct size. 
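		 *
		 * Worked example (assuming the usual 32k XLOG_HEADER_CYCLE_SIZE
		 * and 512 byte basic blocks): a 256k iclog gives
		 * h_size = 256k, so
		 *
		 *	hblks = 256k / 32k = 8
		 *
		 * i.e. the record header spans 8 basic blocks, one per 32k of
		 * cycle data, and hbp is re-allocated below to cover them all.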
4145 */ 4146 hbp = xlog_get_bp(log, 1); 4147 if (!hbp) 4148 return -ENOMEM; 4149 4150 error = xlog_bread(log, tail_blk, 1, hbp, &offset); 4151 if (error) 4152 goto bread_err1; 4153 4154 rhead = (xlog_rec_header_t *)offset; 4155 error = xlog_valid_rec_header(log, rhead, tail_blk); 4156 if (error) 4157 goto bread_err1; 4158 h_size = be32_to_cpu(rhead->h_size); 4159 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && 4160 (h_size > XLOG_HEADER_CYCLE_SIZE)) { 4161 hblks = h_size / XLOG_HEADER_CYCLE_SIZE; 4162 if (h_size % XLOG_HEADER_CYCLE_SIZE) 4163 hblks++; 4164 xlog_put_bp(hbp); 4165 hbp = xlog_get_bp(log, hblks); 4166 } else { 4167 hblks = 1; 4168 } 4169 } else { 4170 ASSERT(log->l_sectBBsize == 1); 4171 hblks = 1; 4172 hbp = xlog_get_bp(log, 1); 4173 h_size = XLOG_BIG_RECORD_BSIZE; 4174 } 4175 4176 if (!hbp) 4177 return -ENOMEM; 4178 dbp = xlog_get_bp(log, BTOBB(h_size)); 4179 if (!dbp) { 4180 xlog_put_bp(hbp); 4181 return -ENOMEM; 4182 } 4183 4184 memset(rhash, 0, sizeof(rhash)); 4185 blk_no = tail_blk; 4186 if (tail_blk > head_blk) { 4187 /* 4188 * Perform recovery around the end of the physical log. 4189 * When the head is not on the same cycle number as the tail, 4190 * we can't do a sequential recovery. 4191 */ 4192 while (blk_no < log->l_logBBsize) { 4193 /* 4194 * Check for header wrapping around physical end-of-log 4195 */ 4196 offset = hbp->b_addr; 4197 split_hblks = 0; 4198 wrapped_hblks = 0; 4199 if (blk_no + hblks <= log->l_logBBsize) { 4200 /* Read header in one read */ 4201 error = xlog_bread(log, blk_no, hblks, hbp, 4202 &offset); 4203 if (error) 4204 goto bread_err2; 4205 } else { 4206 /* This LR is split across physical log end */ 4207 if (blk_no != log->l_logBBsize) { 4208 /* some data before physical log end */ 4209 ASSERT(blk_no <= INT_MAX); 4210 split_hblks = log->l_logBBsize - (int)blk_no; 4211 ASSERT(split_hblks > 0); 4212 error = xlog_bread(log, blk_no, 4213 split_hblks, hbp, 4214 &offset); 4215 if (error) 4216 goto bread_err2; 4217 } 4218 4219 /* 4220 * Note: this black magic still works with 4221 * large sector sizes (non-512) only because: 4222 * - we increased the buffer size originally 4223 * by 1 sector giving us enough extra space 4224 * for the second read; 4225 * - the log start is guaranteed to be sector 4226 * aligned; 4227 * - we read the log end (LR header start) 4228 * _first_, then the log start (LR header end) 4229 * - order is important. 4230 */ 4231 wrapped_hblks = hblks - split_hblks; 4232 error = xlog_bread_offset(log, 0, 4233 wrapped_hblks, hbp, 4234 offset + BBTOB(split_hblks)); 4235 if (error) 4236 goto bread_err2; 4237 } 4238 rhead = (xlog_rec_header_t *)offset; 4239 error = xlog_valid_rec_header(log, rhead, 4240 split_hblks ? 
blk_no : 0); 4241 if (error) 4242 goto bread_err2; 4243 4244 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 4245 blk_no += hblks; 4246 4247 /* Read in data for log record */ 4248 if (blk_no + bblks <= log->l_logBBsize) { 4249 error = xlog_bread(log, blk_no, bblks, dbp, 4250 &offset); 4251 if (error) 4252 goto bread_err2; 4253 } else { 4254 /* This log record is split across the 4255 * physical end of log */ 4256 offset = dbp->b_addr; 4257 split_bblks = 0; 4258 if (blk_no != log->l_logBBsize) { 4259 /* some data is before the physical 4260 * end of log */ 4261 ASSERT(!wrapped_hblks); 4262 ASSERT(blk_no <= INT_MAX); 4263 split_bblks = 4264 log->l_logBBsize - (int)blk_no; 4265 ASSERT(split_bblks > 0); 4266 error = xlog_bread(log, blk_no, 4267 split_bblks, dbp, 4268 &offset); 4269 if (error) 4270 goto bread_err2; 4271 } 4272 4273 /* 4274 * Note: this black magic still works with 4275 * large sector sizes (non-512) only because: 4276 * - we increased the buffer size originally 4277 * by 1 sector giving us enough extra space 4278 * for the second read; 4279 * - the log start is guaranteed to be sector 4280 * aligned; 4281 * - we read the log end (LR header start) 4282 * _first_, then the log start (LR header end) 4283 * - order is important. 4284 */ 4285 error = xlog_bread_offset(log, 0, 4286 bblks - split_bblks, dbp, 4287 offset + BBTOB(split_bblks)); 4288 if (error) 4289 goto bread_err2; 4290 } 4291 4292 error = xlog_unpack_data(rhead, offset, log); 4293 if (error) 4294 goto bread_err2; 4295 4296 error = xlog_recover_process_data(log, rhash, 4297 rhead, offset, pass); 4298 if (error) 4299 goto bread_err2; 4300 blk_no += bblks; 4301 } 4302 4303 ASSERT(blk_no >= log->l_logBBsize); 4304 blk_no -= log->l_logBBsize; 4305 } 4306 4307 /* read first part of physical log */ 4308 while (blk_no < head_blk) { 4309 error = xlog_bread(log, blk_no, hblks, hbp, &offset); 4310 if (error) 4311 goto bread_err2; 4312 4313 rhead = (xlog_rec_header_t *)offset; 4314 error = xlog_valid_rec_header(log, rhead, blk_no); 4315 if (error) 4316 goto bread_err2; 4317 4318 /* blocks in data section */ 4319 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 4320 error = xlog_bread(log, blk_no+hblks, bblks, dbp, 4321 &offset); 4322 if (error) 4323 goto bread_err2; 4324 4325 error = xlog_unpack_data(rhead, offset, log); 4326 if (error) 4327 goto bread_err2; 4328 4329 error = xlog_recover_process_data(log, rhash, 4330 rhead, offset, pass); 4331 if (error) 4332 goto bread_err2; 4333 blk_no += bblks + hblks; 4334 } 4335 4336 bread_err2: 4337 xlog_put_bp(dbp); 4338 bread_err1: 4339 xlog_put_bp(hbp); 4340 return error; 4341 } 4342 4343 /* 4344 * Do the recovery of the log. We actually do this in two phases. 4345 * The two passes are necessary in order to implement the function 4346 * of cancelling a record written into the log. The first pass 4347 * determines those things which have been cancelled, and the 4348 * second pass replays log items normally except for those which 4349 * have been cancelled. The handling of the replay and cancellations 4350 * takes place in the log item type specific routines. 4351 * 4352 * The table of items which have cancel records in the log is allocated 4353 * and freed at this level, since only here do we know when all of 4354 * the log recovery has been completed. 
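 *
 * As a simplified sketch (the hash bucket selection and the buf item code
 * doing the real work live elsewhere), pass 2 consults one of those lists
 * roughly like this:
 *
 *	struct xfs_buf_cancel	*bcp;
 *
 *	list_for_each_entry(bcp, bucket_list, bc_list) {
 *		if (bcp->bc_blkno == blkno && bcp->bc_len == len)
 *			return true;	(cancelled - do not replay)
 *	}
 *	return false;			(replay the buffer normally)
 *
 * where bucket_list stands in for one of the XLOG_BC_TABLE_SIZE list heads
 * allocated in xlog_do_log_recovery() below and filled in during pass 1.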
4355 */ 4356 STATIC int 4357 xlog_do_log_recovery( 4358 struct xlog *log, 4359 xfs_daddr_t head_blk, 4360 xfs_daddr_t tail_blk) 4361 { 4362 int error, i; 4363 4364 ASSERT(head_blk != tail_blk); 4365 4366 /* 4367 * First do a pass to find all of the cancelled buf log items. 4368 * Store them in the buf_cancel_table for use in the second pass. 4369 */ 4370 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * 4371 sizeof(struct list_head), 4372 KM_SLEEP); 4373 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 4374 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); 4375 4376 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 4377 XLOG_RECOVER_PASS1); 4378 if (error != 0) { 4379 kmem_free(log->l_buf_cancel_table); 4380 log->l_buf_cancel_table = NULL; 4381 return error; 4382 } 4383 /* 4384 * Then do a second pass to actually recover the items in the log. 4385 * When it is complete free the table of buf cancel items. 4386 */ 4387 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 4388 XLOG_RECOVER_PASS2); 4389 #ifdef DEBUG 4390 if (!error) { 4391 int i; 4392 4393 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 4394 ASSERT(list_empty(&log->l_buf_cancel_table[i])); 4395 } 4396 #endif /* DEBUG */ 4397 4398 kmem_free(log->l_buf_cancel_table); 4399 log->l_buf_cancel_table = NULL; 4400 4401 return error; 4402 } 4403 4404 /* 4405 * Do the actual recovery 4406 */ 4407 STATIC int 4408 xlog_do_recover( 4409 struct xlog *log, 4410 xfs_daddr_t head_blk, 4411 xfs_daddr_t tail_blk) 4412 { 4413 int error; 4414 xfs_buf_t *bp; 4415 xfs_sb_t *sbp; 4416 4417 /* 4418 * First replay the images in the log. 4419 */ 4420 error = xlog_do_log_recovery(log, head_blk, tail_blk); 4421 if (error) 4422 return error; 4423 4424 /* 4425 * If IO errors happened during recovery, bail out. 4426 */ 4427 if (XFS_FORCED_SHUTDOWN(log->l_mp)) { 4428 return -EIO; 4429 } 4430 4431 /* 4432 * We now update the tail_lsn since much of the recovery has completed 4433 * and there may be space available to use. If there were no extent 4434 * or iunlinks, we can free up the entire log and set the tail_lsn to 4435 * be the last_sync_lsn. This was set in xlog_find_tail to be the 4436 * lsn of the last known good LR on disk. If there are extent frees 4437 * or iunlinks they will have some entries in the AIL; so we look at 4438 * the AIL to determine how to set the tail_lsn. 4439 */ 4440 xlog_assign_tail_lsn(log->l_mp); 4441 4442 /* 4443 * Now that we've finished replaying all buffer and inode 4444 * updates, re-read in the superblock and reverify it. 4445 */ 4446 bp = xfs_getsb(log->l_mp, 0); 4447 XFS_BUF_UNDONE(bp); 4448 ASSERT(!(XFS_BUF_ISWRITE(bp))); 4449 XFS_BUF_READ(bp); 4450 XFS_BUF_UNASYNC(bp); 4451 bp->b_ops = &xfs_sb_buf_ops; 4452 4453 error = xfs_buf_submit_wait(bp); 4454 if (error) { 4455 if (!XFS_FORCED_SHUTDOWN(log->l_mp)) { 4456 xfs_buf_ioerror_alert(bp, __func__); 4457 ASSERT(0); 4458 } 4459 xfs_buf_relse(bp); 4460 return error; 4461 } 4462 4463 /* Convert superblock from on-disk format */ 4464 sbp = &log->l_mp->m_sb; 4465 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); 4466 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); 4467 ASSERT(xfs_sb_good_version(sbp)); 4468 xfs_buf_relse(bp); 4469 4470 /* We've re-read the superblock so re-initialize per-cpu counters */ 4471 xfs_icsb_reinit_counters(log->l_mp); 4472 4473 xlog_recover_check_summary(log); 4474 4475 /* Normal transactions can now occur */ 4476 log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 4477 return 0; 4478 } 4479 4480 /* 4481 * Perform recovery and re-initialize some log variables in xlog_find_tail. 
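 *
 * Schematically, the function below does no more than (summary only):
 *
 *	xlog_find_tail(log, &head_blk, &tail_blk);
 *	if (tail_blk != head_blk) {		(the log is dirty)
 *		fail if the device is read-only;
 *		fail if unknown v5 incompat log features are set;
 *		xlog_do_recover(log, head_blk, tail_blk);
 *		log->l_flags |= XLOG_RECOVERY_NEEDED;
 *	}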
 *
 * Return error or zero.
 */
int
xlog_recover(
	struct xlog	*log)
{
	xfs_daddr_t	head_blk, tail_blk;
	int		error;

	/* find the tail of the log */
	if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
		return error;

	if (tail_blk != head_blk) {
		/* There used to be a comment here:
		 *
		 * disallow recovery on read-only mounts. note -- mount
		 * checks for ENOSPC and turns it into an intelligent
		 * error message.
		 * ...but this is no longer true. Now, unless you specify
		 * NORECOVERY (in which case this function would never be
		 * called), we just go ahead and recover. We do this all
		 * under the vfs layer, so we can get away with it unless
		 * the device itself is read-only, in which case we fail.
		 */
		if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
			return error;
		}

		/*
		 * Version 5 superblock log feature mask validation. We know
		 * the log is dirty so check if there are any unknown log
		 * features in what we need to recover. If there are unknown
		 * features (e.g. unsupported transactions), then simply reject
		 * the attempt at recovery before touching anything.
		 */
		if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
		    xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
			xfs_warn(log->l_mp,
"Superblock has unknown incompatible log features (0x%x) enabled.\n"
"The log can not be fully and/or safely recovered by this kernel.\n"
"Please recover the log on a kernel that supports the unknown features.",
				(log->l_mp->m_sb.sb_features_log_incompat &
					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
			return -EINVAL;
		}

		/*
		 * Delay log recovery if the debug hook is set. This is debug
		 * instrumentation to coordinate simulation of I/O failures
		 * with log recovery.
		 */
		if (xfs_globals.log_recovery_delay) {
			xfs_notice(log->l_mp,
				"Delaying log recovery for %d seconds.",
				xfs_globals.log_recovery_delay);
			msleep(xfs_globals.log_recovery_delay * 1000);
		}

		xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
				log->l_mp->m_logname ? log->l_mp->m_logname
						     : "internal");

		error = xlog_do_recover(log, head_blk, tail_blk);
		log->l_flags |= XLOG_RECOVERY_NEEDED;
	}
	return error;
}

/*
 * In the first part of recovery we replay inodes and buffers and build
 * up the list of extent free items which need to be processed. Here
 * we process the extent free items and clean up the on disk unlinked
 * inode lists. This is separated from the first part of recovery so
 * that the root and real-time bitmap inodes can be read in from disk in
 * between the two stages. This is necessary so that we can free space
 * in the real-time portion of the file system.
 */
int
xlog_recover_finish(
	struct xlog	*log)
{
	/*
	 * Now we're ready to do the transactions needed for the
	 * rest of recovery. Start with completing all the extent
	 * free intent records and then process the unlinked inode
	 * lists. At this point, we essentially run in normal mode
	 * except that we're still performing recovery actions
	 * rather than accepting new requests.
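	 *
	 * In other words, the mount path drives recovery in two stages,
	 * roughly (a sketch; the actual call sites live in the mount code,
	 * not in this file):
	 *
	 *	xlog_recover(log);		(replay buffers and inodes,
	 *					 collect EFIs)
	 *	... read in root and realtime bitmap inodes ...
	 *	xlog_recover_finish(log);	(process EFIs, clean up the
	 *					 unlinked inode lists)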
4573 */ 4574 if (log->l_flags & XLOG_RECOVERY_NEEDED) { 4575 int error; 4576 error = xlog_recover_process_efis(log); 4577 if (error) { 4578 xfs_alert(log->l_mp, "Failed to recover EFIs"); 4579 return error; 4580 } 4581 /* 4582 * Sync the log to get all the EFIs out of the AIL. 4583 * This isn't absolutely necessary, but it helps in 4584 * case the unlink transactions would have problems 4585 * pushing the EFIs out of the way. 4586 */ 4587 xfs_log_force(log->l_mp, XFS_LOG_SYNC); 4588 4589 xlog_recover_process_iunlinks(log); 4590 4591 xlog_recover_check_summary(log); 4592 4593 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)", 4594 log->l_mp->m_logname ? log->l_mp->m_logname 4595 : "internal"); 4596 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 4597 } else { 4598 xfs_info(log->l_mp, "Ending clean mount"); 4599 } 4600 return 0; 4601 } 4602 4603 4604 #if defined(DEBUG) 4605 /* 4606 * Read all of the agf and agi counters and check that they 4607 * are consistent with the superblock counters. 4608 */ 4609 void 4610 xlog_recover_check_summary( 4611 struct xlog *log) 4612 { 4613 xfs_mount_t *mp; 4614 xfs_agf_t *agfp; 4615 xfs_buf_t *agfbp; 4616 xfs_buf_t *agibp; 4617 xfs_agnumber_t agno; 4618 __uint64_t freeblks; 4619 __uint64_t itotal; 4620 __uint64_t ifree; 4621 int error; 4622 4623 mp = log->l_mp; 4624 4625 freeblks = 0LL; 4626 itotal = 0LL; 4627 ifree = 0LL; 4628 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 4629 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); 4630 if (error) { 4631 xfs_alert(mp, "%s agf read failed agno %d error %d", 4632 __func__, agno, error); 4633 } else { 4634 agfp = XFS_BUF_TO_AGF(agfbp); 4635 freeblks += be32_to_cpu(agfp->agf_freeblks) + 4636 be32_to_cpu(agfp->agf_flcount); 4637 xfs_buf_relse(agfbp); 4638 } 4639 4640 error = xfs_read_agi(mp, NULL, agno, &agibp); 4641 if (error) { 4642 xfs_alert(mp, "%s agi read failed agno %d error %d", 4643 __func__, agno, error); 4644 } else { 4645 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); 4646 4647 itotal += be32_to_cpu(agi->agi_count); 4648 ifree += be32_to_cpu(agi->agi_freecount); 4649 xfs_buf_relse(agibp); 4650 } 4651 } 4652 } 4653 #endif /* DEBUG */ 4654
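
/*
 * Worked example for xlog_recover_check_summary() above, with made-up
 * numbers: two AGs reporting agf_freeblks/agf_flcount of 1000/4 and 2000/6,
 * and agi_count/agi_freecount of 64/3 and 32/1, would give
 *
 *	freeblks = (1000 + 4) + (2000 + 6) = 3010
 *	itotal   = 64 + 32 = 96
 *	ifree    = 3 + 1 = 4
 *
 * which are the per-AG totals accumulated for comparison against the
 * superblock counters.
 */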