1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_errortag.h" 14 #include "xfs_error.h" 15 #include "xfs_trans.h" 16 #include "xfs_trans_priv.h" 17 #include "xfs_log.h" 18 #include "xfs_log_priv.h" 19 #include "xfs_trace.h" 20 #include "xfs_sysfs.h" 21 #include "xfs_sb.h" 22 #include "xfs_health.h" 23 24 kmem_zone_t *xfs_log_ticket_zone; 25 26 /* Local miscellaneous function prototypes */ 27 STATIC int 28 xlog_commit_record( 29 struct xlog *log, 30 struct xlog_ticket *ticket, 31 struct xlog_in_core **iclog, 32 xfs_lsn_t *commitlsnp); 33 34 STATIC struct xlog * 35 xlog_alloc_log( 36 struct xfs_mount *mp, 37 struct xfs_buftarg *log_target, 38 xfs_daddr_t blk_offset, 39 int num_bblks); 40 STATIC int 41 xlog_space_left( 42 struct xlog *log, 43 atomic64_t *head); 44 STATIC void 45 xlog_dealloc_log( 46 struct xlog *log); 47 48 /* local state machine functions */ 49 STATIC void xlog_state_done_syncing( 50 struct xlog_in_core *iclog, 51 bool aborted); 52 STATIC int 53 xlog_state_get_iclog_space( 54 struct xlog *log, 55 int len, 56 struct xlog_in_core **iclog, 57 struct xlog_ticket *ticket, 58 int *continued_write, 59 int *logoffsetp); 60 STATIC int 61 xlog_state_release_iclog( 62 struct xlog *log, 63 struct xlog_in_core *iclog); 64 STATIC void 65 xlog_state_switch_iclogs( 66 struct xlog *log, 67 struct xlog_in_core *iclog, 68 int eventual_size); 69 STATIC void 70 xlog_state_want_sync( 71 struct xlog *log, 72 struct xlog_in_core *iclog); 73 74 STATIC void 75 xlog_grant_push_ail( 76 struct xlog *log, 77 int need_bytes); 78 STATIC void 79 xlog_regrant_reserve_log_space( 80 struct xlog *log, 81 struct xlog_ticket *ticket); 82 STATIC void 83 xlog_ungrant_log_space( 84 struct xlog *log, 85 struct xlog_ticket *ticket); 86 87 #if defined(DEBUG) 88 STATIC void 89 xlog_verify_dest_ptr( 90 struct xlog *log, 91 void *ptr); 92 STATIC void 93 xlog_verify_grant_tail( 94 struct xlog *log); 95 STATIC void 96 xlog_verify_iclog( 97 struct xlog *log, 98 struct xlog_in_core *iclog, 99 int count); 100 STATIC void 101 xlog_verify_tail_lsn( 102 struct xlog *log, 103 struct xlog_in_core *iclog, 104 xfs_lsn_t tail_lsn); 105 #else 106 #define xlog_verify_dest_ptr(a,b) 107 #define xlog_verify_grant_tail(a) 108 #define xlog_verify_iclog(a,b,c) 109 #define xlog_verify_tail_lsn(a,b,c) 110 #endif 111 112 STATIC int 113 xlog_iclogs_empty( 114 struct xlog *log); 115 116 static void 117 xlog_grant_sub_space( 118 struct xlog *log, 119 atomic64_t *head, 120 int bytes) 121 { 122 int64_t head_val = atomic64_read(head); 123 int64_t new, old; 124 125 do { 126 int cycle, space; 127 128 xlog_crack_grant_head_val(head_val, &cycle, &space); 129 130 space -= bytes; 131 if (space < 0) { 132 space += log->l_logsize; 133 cycle--; 134 } 135 136 old = head_val; 137 new = xlog_assign_grant_head_val(cycle, space); 138 head_val = atomic64_cmpxchg(head, old, new); 139 } while (head_val != old); 140 } 141 142 static void 143 xlog_grant_add_space( 144 struct xlog *log, 145 atomic64_t *head, 146 int bytes) 147 { 148 int64_t head_val = atomic64_read(head); 149 int64_t new, old; 150 151 do { 152 int tmp; 153 int cycle, space; 154 155 xlog_crack_grant_head_val(head_val, &cycle, &space); 156 157 tmp = log->l_logsize - space; 158 if (tmp > bytes) 159 space += bytes; 160 else { 161 space = 
bytes - tmp; 162 cycle++; 163 } 164 165 old = head_val; 166 new = xlog_assign_grant_head_val(cycle, space); 167 head_val = atomic64_cmpxchg(head, old, new); 168 } while (head_val != old); 169 } 170 171 STATIC void 172 xlog_grant_head_init( 173 struct xlog_grant_head *head) 174 { 175 xlog_assign_grant_head(&head->grant, 1, 0); 176 INIT_LIST_HEAD(&head->waiters); 177 spin_lock_init(&head->lock); 178 } 179 180 STATIC void 181 xlog_grant_head_wake_all( 182 struct xlog_grant_head *head) 183 { 184 struct xlog_ticket *tic; 185 186 spin_lock(&head->lock); 187 list_for_each_entry(tic, &head->waiters, t_queue) 188 wake_up_process(tic->t_task); 189 spin_unlock(&head->lock); 190 } 191 192 static inline int 193 xlog_ticket_reservation( 194 struct xlog *log, 195 struct xlog_grant_head *head, 196 struct xlog_ticket *tic) 197 { 198 if (head == &log->l_write_head) { 199 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); 200 return tic->t_unit_res; 201 } else { 202 if (tic->t_flags & XLOG_TIC_PERM_RESERV) 203 return tic->t_unit_res * tic->t_cnt; 204 else 205 return tic->t_unit_res; 206 } 207 } 208 209 STATIC bool 210 xlog_grant_head_wake( 211 struct xlog *log, 212 struct xlog_grant_head *head, 213 int *free_bytes) 214 { 215 struct xlog_ticket *tic; 216 int need_bytes; 217 218 list_for_each_entry(tic, &head->waiters, t_queue) { 219 need_bytes = xlog_ticket_reservation(log, head, tic); 220 if (*free_bytes < need_bytes) 221 return false; 222 223 *free_bytes -= need_bytes; 224 trace_xfs_log_grant_wake_up(log, tic); 225 wake_up_process(tic->t_task); 226 } 227 228 return true; 229 } 230 231 STATIC int 232 xlog_grant_head_wait( 233 struct xlog *log, 234 struct xlog_grant_head *head, 235 struct xlog_ticket *tic, 236 int need_bytes) __releases(&head->lock) 237 __acquires(&head->lock) 238 { 239 list_add_tail(&tic->t_queue, &head->waiters); 240 241 do { 242 if (XLOG_FORCED_SHUTDOWN(log)) 243 goto shutdown; 244 xlog_grant_push_ail(log, need_bytes); 245 246 __set_current_state(TASK_UNINTERRUPTIBLE); 247 spin_unlock(&head->lock); 248 249 XFS_STATS_INC(log->l_mp, xs_sleep_logspace); 250 251 trace_xfs_log_grant_sleep(log, tic); 252 schedule(); 253 trace_xfs_log_grant_wake(log, tic); 254 255 spin_lock(&head->lock); 256 if (XLOG_FORCED_SHUTDOWN(log)) 257 goto shutdown; 258 } while (xlog_space_left(log, &head->grant) < need_bytes); 259 260 list_del_init(&tic->t_queue); 261 return 0; 262 shutdown: 263 list_del_init(&tic->t_queue); 264 return -EIO; 265 } 266 267 /* 268 * Atomically get the log space required for a log ticket. 269 * 270 * Once a ticket gets put onto head->waiters, it will only return after the 271 * needed reservation is satisfied. 272 * 273 * This function is structured so that it has a lock free fast path. This is 274 * necessary because every new transaction reservation will come through this 275 * path. Hence any lock will be globally hot if we take it unconditionally on 276 * every pass. 277 * 278 * As tickets are only ever moved on and off head->waiters under head->lock, we 279 * only need to take that lock if we are going to add the ticket to the queue 280 * and sleep. We can avoid taking the lock if the ticket was never added to 281 * head->waiters because the t_queue list head will be empty and we hold the 282 * only reference to it so it can safely be checked unlocked. 
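 *
 * Illustrative fast path (the numbers are assumed for the example): a ticket
 * needing 4096 bytes against a grant head with a megabyte of free space and
 * no queued waiters takes neither head->lock nor a sleep; only when other
 * tickets are already queued, or free space is short, do we drop into
 * xlog_grant_head_wait() under head->lock.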
283 */ 284 STATIC int 285 xlog_grant_head_check( 286 struct xlog *log, 287 struct xlog_grant_head *head, 288 struct xlog_ticket *tic, 289 int *need_bytes) 290 { 291 int free_bytes; 292 int error = 0; 293 294 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); 295 296 /* 297 * If there are other waiters on the queue then give them a chance at 298 * logspace before us. Wake up the first waiters, if we do not wake 299 * up all the waiters then go to sleep waiting for more free space, 300 * otherwise try to get some space for this transaction. 301 */ 302 *need_bytes = xlog_ticket_reservation(log, head, tic); 303 free_bytes = xlog_space_left(log, &head->grant); 304 if (!list_empty_careful(&head->waiters)) { 305 spin_lock(&head->lock); 306 if (!xlog_grant_head_wake(log, head, &free_bytes) || 307 free_bytes < *need_bytes) { 308 error = xlog_grant_head_wait(log, head, tic, 309 *need_bytes); 310 } 311 spin_unlock(&head->lock); 312 } else if (free_bytes < *need_bytes) { 313 spin_lock(&head->lock); 314 error = xlog_grant_head_wait(log, head, tic, *need_bytes); 315 spin_unlock(&head->lock); 316 } 317 318 return error; 319 } 320 321 static void 322 xlog_tic_reset_res(xlog_ticket_t *tic) 323 { 324 tic->t_res_num = 0; 325 tic->t_res_arr_sum = 0; 326 tic->t_res_num_ophdrs = 0; 327 } 328 329 static void 330 xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) 331 { 332 if (tic->t_res_num == XLOG_TIC_LEN_MAX) { 333 /* add to overflow and start again */ 334 tic->t_res_o_flow += tic->t_res_arr_sum; 335 tic->t_res_num = 0; 336 tic->t_res_arr_sum = 0; 337 } 338 339 tic->t_res_arr[tic->t_res_num].r_len = len; 340 tic->t_res_arr[tic->t_res_num].r_type = type; 341 tic->t_res_arr_sum += len; 342 tic->t_res_num++; 343 } 344 345 /* 346 * Replenish the byte reservation required by moving the grant write head. 347 */ 348 int 349 xfs_log_regrant( 350 struct xfs_mount *mp, 351 struct xlog_ticket *tic) 352 { 353 struct xlog *log = mp->m_log; 354 int need_bytes; 355 int error = 0; 356 357 if (XLOG_FORCED_SHUTDOWN(log)) 358 return -EIO; 359 360 XFS_STATS_INC(mp, xs_try_logspace); 361 362 /* 363 * This is a new transaction on the ticket, so we need to change the 364 * transaction ID so that the next transaction has a different TID in 365 * the log. Just add one to the existing tid so that we can see chains 366 * of rolling transactions in the log easily. 367 */ 368 tic->t_tid++; 369 370 xlog_grant_push_ail(log, tic->t_unit_res); 371 372 tic->t_curr_res = tic->t_unit_res; 373 xlog_tic_reset_res(tic); 374 375 if (tic->t_cnt > 0) 376 return 0; 377 378 trace_xfs_log_regrant(log, tic); 379 380 error = xlog_grant_head_check(log, &log->l_write_head, tic, 381 &need_bytes); 382 if (error) 383 goto out_error; 384 385 xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); 386 trace_xfs_log_regrant_exit(log, tic); 387 xlog_verify_grant_tail(log); 388 return 0; 389 390 out_error: 391 /* 392 * If we are failing, make sure the ticket doesn't have any current 393 * reservations. We don't want to add this back when the ticket/ 394 * transaction gets cancelled. 395 */ 396 tic->t_curr_res = 0; 397 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 398 return error; 399 } 400 401 /* 402 * Reserve log space and return a ticket corresponding to the reservation. 403 * 404 * Each reservation is going to reserve extra space for a log record header. 405 * When writes happen to the on-disk log, we don't subtract the length of the 406 * log record header from any reservation. 
By wasting space in each 407 * reservation, we prevent over allocation problems. 408 */ 409 int 410 xfs_log_reserve( 411 struct xfs_mount *mp, 412 int unit_bytes, 413 int cnt, 414 struct xlog_ticket **ticp, 415 uint8_t client, 416 bool permanent) 417 { 418 struct xlog *log = mp->m_log; 419 struct xlog_ticket *tic; 420 int need_bytes; 421 int error = 0; 422 423 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); 424 425 if (XLOG_FORCED_SHUTDOWN(log)) 426 return -EIO; 427 428 XFS_STATS_INC(mp, xs_try_logspace); 429 430 ASSERT(*ticp == NULL); 431 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 432 KM_SLEEP); 433 *ticp = tic; 434 435 xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt 436 : tic->t_unit_res); 437 438 trace_xfs_log_reserve(log, tic); 439 440 error = xlog_grant_head_check(log, &log->l_reserve_head, tic, 441 &need_bytes); 442 if (error) 443 goto out_error; 444 445 xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes); 446 xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); 447 trace_xfs_log_reserve_exit(log, tic); 448 xlog_verify_grant_tail(log); 449 return 0; 450 451 out_error: 452 /* 453 * If we are failing, make sure the ticket doesn't have any current 454 * reservations. We don't want to add this back when the ticket/ 455 * transaction gets cancelled. 456 */ 457 tic->t_curr_res = 0; 458 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 459 return error; 460 } 461 462 463 /* 464 * NOTES: 465 * 466 * 1. currblock field gets updated at startup and after in-core logs 467 * marked as with WANT_SYNC. 468 */ 469 470 /* 471 * This routine is called when a user of a log manager ticket is done with 472 * the reservation. If the ticket was ever used, then a commit record for 473 * the associated transaction is written out as a log operation header with 474 * no data. The flag XLOG_TIC_INITED is set when the first write occurs with 475 * a given ticket. If the ticket was one with a permanent reservation, then 476 * a few operations are done differently. Permanent reservation tickets by 477 * default don't release the reservation. They just commit the current 478 * transaction with the belief that the reservation is still needed. A flag 479 * must be passed in before permanent reservations are actually released. 480 * When these type of tickets are not released, they need to be set into 481 * the inited state again. By doing this, a start record will be written 482 * out when the next write occurs. 483 */ 484 xfs_lsn_t 485 xfs_log_done( 486 struct xfs_mount *mp, 487 struct xlog_ticket *ticket, 488 struct xlog_in_core **iclog, 489 bool regrant) 490 { 491 struct xlog *log = mp->m_log; 492 xfs_lsn_t lsn = 0; 493 494 if (XLOG_FORCED_SHUTDOWN(log) || 495 /* 496 * If nothing was ever written, don't write out commit record. 497 * If we get an error, just continue and give back the log ticket. 498 */ 499 (((ticket->t_flags & XLOG_TIC_INITED) == 0) && 500 (xlog_commit_record(log, ticket, iclog, &lsn)))) { 501 lsn = (xfs_lsn_t) -1; 502 regrant = false; 503 } 504 505 506 if (!regrant) { 507 trace_xfs_log_done_nonperm(log, ticket); 508 509 /* 510 * Release ticket if not permanent reservation or a specific 511 * request has been made to release a permanent reservation. 
512 */ 513 xlog_ungrant_log_space(log, ticket); 514 } else { 515 trace_xfs_log_done_perm(log, ticket); 516 517 xlog_regrant_reserve_log_space(log, ticket); 518 /* If this ticket was a permanent reservation and we aren't 519 * trying to release it, reset the inited flags; so next time 520 * we write, a start record will be written out. 521 */ 522 ticket->t_flags |= XLOG_TIC_INITED; 523 } 524 525 xfs_log_ticket_put(ticket); 526 return lsn; 527 } 528 529 int 530 xfs_log_release_iclog( 531 struct xfs_mount *mp, 532 struct xlog_in_core *iclog) 533 { 534 if (xlog_state_release_iclog(mp->m_log, iclog)) { 535 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 536 return -EIO; 537 } 538 539 return 0; 540 } 541 542 /* 543 * Mount a log filesystem 544 * 545 * mp - ubiquitous xfs mount point structure 546 * log_target - buftarg of on-disk log device 547 * blk_offset - Start block # where block size is 512 bytes (BBSIZE) 548 * num_bblocks - Number of BBSIZE blocks in on-disk log 549 * 550 * Return error or zero. 551 */ 552 int 553 xfs_log_mount( 554 xfs_mount_t *mp, 555 xfs_buftarg_t *log_target, 556 xfs_daddr_t blk_offset, 557 int num_bblks) 558 { 559 bool fatal = xfs_sb_version_hascrc(&mp->m_sb); 560 int error = 0; 561 int min_logfsbs; 562 563 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { 564 xfs_notice(mp, "Mounting V%d Filesystem", 565 XFS_SB_VERSION_NUM(&mp->m_sb)); 566 } else { 567 xfs_notice(mp, 568 "Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", 569 XFS_SB_VERSION_NUM(&mp->m_sb)); 570 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 571 } 572 573 mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); 574 if (IS_ERR(mp->m_log)) { 575 error = PTR_ERR(mp->m_log); 576 goto out; 577 } 578 579 /* 580 * Validate the given log space and drop a critical message via syslog 581 * if the log size is too small that would lead to some unexpected 582 * situations in transaction log space reservation stage. 583 * 584 * Note: we can't just reject the mount if the validation fails. This 585 * would mean that people would have to downgrade their kernel just to 586 * remedy the situation as there is no way to grow the log (short of 587 * black magic surgery with xfs_db). 588 * 589 * We can, however, reject mounts for CRC format filesystems, as the 590 * mkfs binary being used to make the filesystem should never create a 591 * filesystem with a log that is too small. 592 */ 593 min_logfsbs = xfs_log_calc_minimum_size(mp); 594 595 if (mp->m_sb.sb_logblocks < min_logfsbs) { 596 xfs_warn(mp, 597 "Log size %d blocks too small, minimum size is %d blocks", 598 mp->m_sb.sb_logblocks, min_logfsbs); 599 error = -EINVAL; 600 } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { 601 xfs_warn(mp, 602 "Log size %d blocks too large, maximum size is %lld blocks", 603 mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); 604 error = -EINVAL; 605 } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { 606 xfs_warn(mp, 607 "log size %lld bytes too large, maximum size is %lld bytes", 608 XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), 609 XFS_MAX_LOG_BYTES); 610 error = -EINVAL; 611 } else if (mp->m_sb.sb_logsunit > 1 && 612 mp->m_sb.sb_logsunit % mp->m_sb.sb_blocksize) { 613 xfs_warn(mp, 614 "log stripe unit %u bytes must be a multiple of block size", 615 mp->m_sb.sb_logsunit); 616 error = -EINVAL; 617 fatal = true; 618 } 619 if (error) { 620 /* 621 * Log check errors are always fatal on v5; or whenever bad 622 * metadata leads to a crash. 623 */ 624 if (fatal) { 625 xfs_crit(mp, "AAIEEE! 
Log failed size checks. Abort!"); 626 ASSERT(0); 627 goto out_free_log; 628 } 629 xfs_crit(mp, "Log size out of supported range."); 630 xfs_crit(mp, 631 "Continuing onwards, but if log hangs are experienced then please report this message in the bug report."); 632 } 633 634 /* 635 * Initialize the AIL now we have a log. 636 */ 637 error = xfs_trans_ail_init(mp); 638 if (error) { 639 xfs_warn(mp, "AIL initialisation failed: error %d", error); 640 goto out_free_log; 641 } 642 mp->m_log->l_ailp = mp->m_ail; 643 644 /* 645 * skip log recovery on a norecovery mount. pretend it all 646 * just worked. 647 */ 648 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { 649 int readonly = (mp->m_flags & XFS_MOUNT_RDONLY); 650 651 if (readonly) 652 mp->m_flags &= ~XFS_MOUNT_RDONLY; 653 654 error = xlog_recover(mp->m_log); 655 656 if (readonly) 657 mp->m_flags |= XFS_MOUNT_RDONLY; 658 if (error) { 659 xfs_warn(mp, "log mount/recovery failed: error %d", 660 error); 661 xlog_recover_cancel(mp->m_log); 662 goto out_destroy_ail; 663 } 664 } 665 666 error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj, 667 "log"); 668 if (error) 669 goto out_destroy_ail; 670 671 /* Normal transactions can now occur */ 672 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 673 674 /* 675 * Now the log has been fully initialised and we know were our 676 * space grant counters are, we can initialise the permanent ticket 677 * needed for delayed logging to work. 678 */ 679 xlog_cil_init_post_recovery(mp->m_log); 680 681 return 0; 682 683 out_destroy_ail: 684 xfs_trans_ail_destroy(mp); 685 out_free_log: 686 xlog_dealloc_log(mp->m_log); 687 out: 688 return error; 689 } 690 691 /* 692 * Finish the recovery of the file system. This is separate from the 693 * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read 694 * in the root and real-time bitmap inodes between calling xfs_log_mount() and 695 * here. 696 * 697 * If we finish recovery successfully, start the background log work. If we are 698 * not doing recovery, then we have a RO filesystem and we don't need to start 699 * it. 700 */ 701 int 702 xfs_log_mount_finish( 703 struct xfs_mount *mp) 704 { 705 int error = 0; 706 bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); 707 bool recovered = mp->m_log->l_flags & XLOG_RECOVERY_NEEDED; 708 709 if (mp->m_flags & XFS_MOUNT_NORECOVERY) { 710 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 711 return 0; 712 } else if (readonly) { 713 /* Allow unlinked processing to proceed */ 714 mp->m_flags &= ~XFS_MOUNT_RDONLY; 715 } 716 717 /* 718 * During the second phase of log recovery, we need iget and 719 * iput to behave like they do for an active filesystem. 720 * xfs_fs_drop_inode needs to be able to prevent the deletion 721 * of inodes before we're done replaying log items on those 722 * inodes. Turn it off immediately after recovery finishes 723 * so that we don't leak the quota inodes if subsequent mount 724 * activities fail. 725 * 726 * We let all inodes involved in redo item processing end up on 727 * the LRU instead of being evicted immediately so that if we do 728 * something to an unlinked inode, the irele won't cause 729 * premature truncation and freeing of the inode, which results 730 * in log recovery failure. We have to evict the unreferenced 731 * lru inodes after clearing SB_ACTIVE because we don't 732 * otherwise clean up the lru if there's a subsequent failure in 733 * xfs_mountfs, which leads to us leaking the inodes if nothing 734 * else (e.g. 
quotacheck) references the inodes before the 735 * mount failure occurs. 736 */ 737 mp->m_super->s_flags |= SB_ACTIVE; 738 error = xlog_recover_finish(mp->m_log); 739 if (!error) 740 xfs_log_work_queue(mp); 741 mp->m_super->s_flags &= ~SB_ACTIVE; 742 evict_inodes(mp->m_super); 743 744 /* 745 * Drain the buffer LRU after log recovery. This is required for v4 746 * filesystems to avoid leaving around buffers with NULL verifier ops, 747 * but we do it unconditionally to make sure we're always in a clean 748 * cache state after mount. 749 * 750 * Don't push in the error case because the AIL may have pending intents 751 * that aren't removed until recovery is cancelled. 752 */ 753 if (!error && recovered) { 754 xfs_log_force(mp, XFS_LOG_SYNC); 755 xfs_ail_push_all_sync(mp->m_ail); 756 } 757 xfs_wait_buftarg(mp->m_ddev_targp); 758 759 if (readonly) 760 mp->m_flags |= XFS_MOUNT_RDONLY; 761 762 return error; 763 } 764 765 /* 766 * The mount has failed. Cancel the recovery if it hasn't completed and destroy 767 * the log. 768 */ 769 void 770 xfs_log_mount_cancel( 771 struct xfs_mount *mp) 772 { 773 xlog_recover_cancel(mp->m_log); 774 xfs_log_unmount(mp); 775 } 776 777 /* 778 * Final log writes as part of unmount. 779 * 780 * Mark the filesystem clean as unmount happens. Note that during relocation 781 * this routine needs to be executed as part of source-bag while the 782 * deallocation must not be done until source-end. 783 */ 784 785 /* Actually write the unmount record to disk. */ 786 static void 787 xfs_log_write_unmount_record( 788 struct xfs_mount *mp) 789 { 790 /* the data section must be 32 bit size aligned */ 791 struct xfs_unmount_log_format magic = { 792 .magic = XLOG_UNMOUNT_TYPE, 793 }; 794 struct xfs_log_iovec reg = { 795 .i_addr = &magic, 796 .i_len = sizeof(magic), 797 .i_type = XLOG_REG_TYPE_UNMOUNT, 798 }; 799 struct xfs_log_vec vec = { 800 .lv_niovecs = 1, 801 .lv_iovecp = ®, 802 }; 803 struct xlog *log = mp->m_log; 804 struct xlog_in_core *iclog; 805 struct xlog_ticket *tic = NULL; 806 xfs_lsn_t lsn; 807 uint flags = XLOG_UNMOUNT_TRANS; 808 int error; 809 810 error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0); 811 if (error) 812 goto out_err; 813 814 /* 815 * If we think the summary counters are bad, clear the unmount header 816 * flag in the unmount record so that the summary counters will be 817 * recalculated during log recovery at next mount. Refer to 818 * xlog_check_unmount_rec for more details. 819 */ 820 if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, 821 XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { 822 xfs_alert(mp, "%s: will fix summary counters at next mount", 823 __func__); 824 flags &= ~XLOG_UNMOUNT_TRANS; 825 } 826 827 /* remove inited flag, and account for space used */ 828 tic->t_flags = 0; 829 tic->t_curr_res -= sizeof(magic); 830 error = xlog_write(log, &vec, tic, &lsn, NULL, flags); 831 /* 832 * At this point, we're umounting anyway, so there's no point in 833 * transitioning log state to IOERROR. Just continue... 
834 */ 835 out_err: 836 if (error) 837 xfs_alert(mp, "%s: unmount record failed", __func__); 838 839 spin_lock(&log->l_icloglock); 840 iclog = log->l_iclog; 841 atomic_inc(&iclog->ic_refcnt); 842 xlog_state_want_sync(log, iclog); 843 spin_unlock(&log->l_icloglock); 844 error = xlog_state_release_iclog(log, iclog); 845 846 spin_lock(&log->l_icloglock); 847 switch (iclog->ic_state) { 848 default: 849 if (!XLOG_FORCED_SHUTDOWN(log)) { 850 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); 851 break; 852 } 853 /* fall through */ 854 case XLOG_STATE_ACTIVE: 855 case XLOG_STATE_DIRTY: 856 spin_unlock(&log->l_icloglock); 857 break; 858 } 859 860 if (tic) { 861 trace_xfs_log_umount_write(log, tic); 862 xlog_ungrant_log_space(log, tic); 863 xfs_log_ticket_put(tic); 864 } 865 } 866 867 /* 868 * Unmount record used to have a string "Unmount filesystem--" in the 869 * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). 870 * We just write the magic number now since that particular field isn't 871 * currently architecture converted and "Unmount" is a bit foo. 872 * As far as I know, there weren't any dependencies on the old behaviour. 873 */ 874 875 static int 876 xfs_log_unmount_write(xfs_mount_t *mp) 877 { 878 struct xlog *log = mp->m_log; 879 xlog_in_core_t *iclog; 880 #ifdef DEBUG 881 xlog_in_core_t *first_iclog; 882 #endif 883 int error; 884 885 /* 886 * Don't write out unmount record on norecovery mounts or ro devices. 887 * Or, if we are doing a forced umount (typically because of IO errors). 888 */ 889 if (mp->m_flags & XFS_MOUNT_NORECOVERY || 890 xfs_readonly_buftarg(log->l_targ)) { 891 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 892 return 0; 893 } 894 895 error = xfs_log_force(mp, XFS_LOG_SYNC); 896 ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); 897 898 #ifdef DEBUG 899 first_iclog = iclog = log->l_iclog; 900 do { 901 if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { 902 ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE); 903 ASSERT(iclog->ic_offset == 0); 904 } 905 iclog = iclog->ic_next; 906 } while (iclog != first_iclog); 907 #endif 908 if (! (XLOG_FORCED_SHUTDOWN(log))) { 909 xfs_log_write_unmount_record(mp); 910 } else { 911 /* 912 * We're already in forced_shutdown mode, couldn't 913 * even attempt to write out the unmount transaction. 914 * 915 * Go through the motions of sync'ing and releasing 916 * the iclog, even though no I/O will actually happen, 917 * we need to wait for other log I/Os that may already 918 * be in progress. Do this as a separate section of 919 * code so we'll know if we ever get stuck here that 920 * we're in this odd situation of trying to unmount 921 * a file system that went into forced_shutdown as 922 * the result of an unmount.. 923 */ 924 spin_lock(&log->l_icloglock); 925 iclog = log->l_iclog; 926 atomic_inc(&iclog->ic_refcnt); 927 928 xlog_state_want_sync(log, iclog); 929 spin_unlock(&log->l_icloglock); 930 error = xlog_state_release_iclog(log, iclog); 931 932 spin_lock(&log->l_icloglock); 933 934 if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE 935 || iclog->ic_state == XLOG_STATE_DIRTY 936 || iclog->ic_state == XLOG_STATE_IOERROR) ) { 937 938 xlog_wait(&iclog->ic_force_wait, 939 &log->l_icloglock); 940 } else { 941 spin_unlock(&log->l_icloglock); 942 } 943 } 944 945 return error; 946 } /* xfs_log_unmount_write */ 947 948 /* 949 * Empty the log for unmount/freeze. 950 * 951 * To do this, we first need to shut down the background log work so it is not 952 * trying to cover the log as we clean up. 
We then need to unpin all objects in 953 * the log so we can then flush them out. Once they have completed their IO and 954 * run the callbacks removing themselves from the AIL, we can write the unmount 955 * record. 956 */ 957 void 958 xfs_log_quiesce( 959 struct xfs_mount *mp) 960 { 961 cancel_delayed_work_sync(&mp->m_log->l_work); 962 xfs_log_force(mp, XFS_LOG_SYNC); 963 964 /* 965 * The superblock buffer is uncached and while xfs_ail_push_all_sync() 966 * will push it, xfs_wait_buftarg() will not wait for it. Further, 967 * xfs_buf_iowait() cannot be used because it was pushed with the 968 * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for 969 * the IO to complete. 970 */ 971 xfs_ail_push_all_sync(mp->m_ail); 972 xfs_wait_buftarg(mp->m_ddev_targp); 973 xfs_buf_lock(mp->m_sb_bp); 974 xfs_buf_unlock(mp->m_sb_bp); 975 976 xfs_log_unmount_write(mp); 977 } 978 979 /* 980 * Shut down and release the AIL and Log. 981 * 982 * During unmount, we need to ensure we flush all the dirty metadata objects 983 * from the AIL so that the log is empty before we write the unmount record to 984 * the log. Once this is done, we can tear down the AIL and the log. 985 */ 986 void 987 xfs_log_unmount( 988 struct xfs_mount *mp) 989 { 990 xfs_log_quiesce(mp); 991 992 xfs_trans_ail_destroy(mp); 993 994 xfs_sysfs_del(&mp->m_log->l_kobj); 995 996 xlog_dealloc_log(mp->m_log); 997 } 998 999 void 1000 xfs_log_item_init( 1001 struct xfs_mount *mp, 1002 struct xfs_log_item *item, 1003 int type, 1004 const struct xfs_item_ops *ops) 1005 { 1006 item->li_mountp = mp; 1007 item->li_ailp = mp->m_ail; 1008 item->li_type = type; 1009 item->li_ops = ops; 1010 item->li_lv = NULL; 1011 1012 INIT_LIST_HEAD(&item->li_ail); 1013 INIT_LIST_HEAD(&item->li_cil); 1014 INIT_LIST_HEAD(&item->li_bio_list); 1015 INIT_LIST_HEAD(&item->li_trans); 1016 } 1017 1018 /* 1019 * Wake up processes waiting for log space after we have moved the log tail. 1020 */ 1021 void 1022 xfs_log_space_wake( 1023 struct xfs_mount *mp) 1024 { 1025 struct xlog *log = mp->m_log; 1026 int free_bytes; 1027 1028 if (XLOG_FORCED_SHUTDOWN(log)) 1029 return; 1030 1031 if (!list_empty_careful(&log->l_write_head.waiters)) { 1032 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); 1033 1034 spin_lock(&log->l_write_head.lock); 1035 free_bytes = xlog_space_left(log, &log->l_write_head.grant); 1036 xlog_grant_head_wake(log, &log->l_write_head, &free_bytes); 1037 spin_unlock(&log->l_write_head.lock); 1038 } 1039 1040 if (!list_empty_careful(&log->l_reserve_head.waiters)) { 1041 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); 1042 1043 spin_lock(&log->l_reserve_head.lock); 1044 free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); 1045 xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes); 1046 spin_unlock(&log->l_reserve_head.lock); 1047 } 1048 } 1049 1050 /* 1051 * Determine if we have a transaction that has gone to disk that needs to be 1052 * covered. To begin the transition to the idle state firstly the log needs to 1053 * be idle. That means the CIL, the AIL and the iclogs needs to be empty before 1054 * we start attempting to cover the log. 1055 * 1056 * Only if we are then in a state where covering is needed, the caller is 1057 * informed that dummy transactions are required to move the log into the idle 1058 * state. 
 *
 * If there are any items in the AIL or CIL, then we do not want to attempt to
 * cover the log as we may be in a situation where there isn't log space
 * available to run a dummy transaction and this can lead to deadlocks when the
 * tail of the log is pinned by an item that is modified in the CIL. Hence
 * there's no point in running a dummy transaction at this point because we
 * can't start trying to idle the log until both the CIL and AIL are empty.
 */
static int
xfs_log_need_covered(xfs_mount_t *mp)
{
	struct xlog	*log = mp->m_log;
	int		needed = 0;

	if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
		return 0;

	if (!xlog_cil_empty(log))
		return 0;

	spin_lock(&log->l_icloglock);
	switch (log->l_covered_state) {
	case XLOG_STATE_COVER_DONE:
	case XLOG_STATE_COVER_DONE2:
	case XLOG_STATE_COVER_IDLE:
		break;
	case XLOG_STATE_COVER_NEED:
	case XLOG_STATE_COVER_NEED2:
		if (xfs_ail_min_lsn(log->l_ailp))
			break;
		if (!xlog_iclogs_empty(log))
			break;

		needed = 1;
		if (log->l_covered_state == XLOG_STATE_COVER_NEED)
			log->l_covered_state = XLOG_STATE_COVER_DONE;
		else
			log->l_covered_state = XLOG_STATE_COVER_DONE2;
		break;
	default:
		needed = 1;
		break;
	}
	spin_unlock(&log->l_icloglock);
	return needed;
}

/*
 * We may be holding the log iclog lock upon entering this routine.
 */
xfs_lsn_t
xlog_assign_tail_lsn_locked(
	struct xfs_mount	*mp)
{
	struct xlog		*log = mp->m_log;
	struct xfs_log_item	*lip;
	xfs_lsn_t		tail_lsn;

	assert_spin_locked(&mp->m_ail->ail_lock);

	/*
	 * To make sure we always have a valid LSN for the log tail we keep
	 * track of the last LSN which was committed in log->l_last_sync_lsn,
	 * and use that when the AIL was empty.
	 */
	lip = xfs_ail_min(mp->m_ail);
	if (lip)
		tail_lsn = lip->li_lsn;
	else
		tail_lsn = atomic64_read(&log->l_last_sync_lsn);
	trace_xfs_log_assign_tail_lsn(log, tail_lsn);
	atomic64_set(&log->l_tail_lsn, tail_lsn);
	return tail_lsn;
}

xfs_lsn_t
xlog_assign_tail_lsn(
	struct xfs_mount	*mp)
{
	xfs_lsn_t		tail_lsn;

	spin_lock(&mp->m_ail->ail_lock);
	tail_lsn = xlog_assign_tail_lsn_locked(mp);
	spin_unlock(&mp->m_ail->ail_lock);

	return tail_lsn;
}

/*
 * Return the space in the log between the tail and the head. The head
 * is passed in the cycle/bytes formal parms. In the special case where
 * the reserve head has wrapped past the tail, this calculation is no
 * longer valid. In this case, just return 0 which means there is no space
 * in the log. This works for all places where this function is called
 * with the reserve head. Of course, if the write head were to ever
 * wrap the tail, we should blow up. Rather than catch this case here,
 * we depend on other ASSERTions in other parts of the code. XXXmiken
 *
 * This code also handles the case where the reservation head is behind
 * the tail. The details of this case are described below, but the end
 * result is that we return the size of the log as the amount of space left.
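 *
 * Worked example (all values are illustrative, not taken from real state):
 * with a 16 MB log (l_logsize = 16777216), a tail at cycle 7, byte 1048576
 * and a grant head at cycle 7, byte 5242880, the space left is
 * 16777216 - (5242880 - 1048576) = 12582912 bytes. If the head had instead
 * wrapped to cycle 8 at byte 524288, the space left would be
 * 1048576 - 524288 = 524288 bytes.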
1160 */ 1161 STATIC int 1162 xlog_space_left( 1163 struct xlog *log, 1164 atomic64_t *head) 1165 { 1166 int free_bytes; 1167 int tail_bytes; 1168 int tail_cycle; 1169 int head_cycle; 1170 int head_bytes; 1171 1172 xlog_crack_grant_head(head, &head_cycle, &head_bytes); 1173 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); 1174 tail_bytes = BBTOB(tail_bytes); 1175 if (tail_cycle == head_cycle && head_bytes >= tail_bytes) 1176 free_bytes = log->l_logsize - (head_bytes - tail_bytes); 1177 else if (tail_cycle + 1 < head_cycle) 1178 return 0; 1179 else if (tail_cycle < head_cycle) { 1180 ASSERT(tail_cycle == (head_cycle - 1)); 1181 free_bytes = tail_bytes - head_bytes; 1182 } else { 1183 /* 1184 * The reservation head is behind the tail. 1185 * In this case we just want to return the size of the 1186 * log as the amount of space left. 1187 */ 1188 xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); 1189 xfs_alert(log->l_mp, 1190 " tail_cycle = %d, tail_bytes = %d", 1191 tail_cycle, tail_bytes); 1192 xfs_alert(log->l_mp, 1193 " GH cycle = %d, GH bytes = %d", 1194 head_cycle, head_bytes); 1195 ASSERT(0); 1196 free_bytes = log->l_logsize; 1197 } 1198 return free_bytes; 1199 } 1200 1201 1202 static void 1203 xlog_ioend_work( 1204 struct work_struct *work) 1205 { 1206 struct xlog_in_core *iclog = 1207 container_of(work, struct xlog_in_core, ic_end_io_work); 1208 struct xlog *log = iclog->ic_log; 1209 bool aborted = false; 1210 int error; 1211 1212 error = blk_status_to_errno(iclog->ic_bio.bi_status); 1213 #ifdef DEBUG 1214 /* treat writes with injected CRC errors as failed */ 1215 if (iclog->ic_fail_crc) 1216 error = -EIO; 1217 #endif 1218 1219 /* 1220 * Race to shutdown the filesystem if we see an error. 1221 */ 1222 if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { 1223 xfs_alert(log->l_mp, "log I/O error %d", error); 1224 xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); 1225 /* 1226 * This flag will be propagated to the trans-committed 1227 * callback routines to let them know that the log-commit 1228 * didn't succeed. 1229 */ 1230 aborted = true; 1231 } else if (iclog->ic_state & XLOG_STATE_IOERROR) { 1232 aborted = true; 1233 } 1234 1235 xlog_state_done_syncing(iclog, aborted); 1236 bio_uninit(&iclog->ic_bio); 1237 1238 /* 1239 * Drop the lock to signal that we are done. Nothing references the 1240 * iclog after this, so an unmount waiting on this lock can now tear it 1241 * down safely. As such, it is unsafe to reference the iclog after the 1242 * unlock as we could race with it being freed. 1243 */ 1244 up(&iclog->ic_sema); 1245 } 1246 1247 /* 1248 * Return size of each in-core log record buffer. 1249 * 1250 * All machines get 8 x 32kB buffers by default, unless tuned otherwise. 1251 * 1252 * If the filesystem blocksize is too large, we may need to choose a 1253 * larger size since the directory code currently logs entire blocks. 1254 */ 1255 STATIC void 1256 xlog_get_iclog_buffer_size( 1257 struct xfs_mount *mp, 1258 struct xlog *log) 1259 { 1260 if (mp->m_logbufs <= 0) 1261 mp->m_logbufs = XLOG_MAX_ICLOGS; 1262 if (mp->m_logbsize <= 0) 1263 mp->m_logbsize = XLOG_BIG_RECORD_BSIZE; 1264 1265 log->l_iclog_bufs = mp->m_logbufs; 1266 log->l_iclog_size = mp->m_logbsize; 1267 1268 /* 1269 * # headers = size / 32k - one header holds cycles from 32k of data. 
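	 *
	 * Worked example (buffer sizes are illustrative): with the default
	 * 32 KB log buffer, DIV_ROUND_UP(32768, 32768) yields one header and
	 * l_iclog_hsize is 1 << BBSHIFT = 512 bytes; a 256 KB buffer needs
	 * eight headers, giving l_iclog_hsize = 8 * 512 = 4096 bytes.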
1270 */ 1271 log->l_iclog_heads = 1272 DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE); 1273 log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT; 1274 } 1275 1276 void 1277 xfs_log_work_queue( 1278 struct xfs_mount *mp) 1279 { 1280 queue_delayed_work(mp->m_sync_workqueue, &mp->m_log->l_work, 1281 msecs_to_jiffies(xfs_syncd_centisecs * 10)); 1282 } 1283 1284 /* 1285 * Every sync period we need to unpin all items in the AIL and push them to 1286 * disk. If there is nothing dirty, then we might need to cover the log to 1287 * indicate that the filesystem is idle. 1288 */ 1289 static void 1290 xfs_log_worker( 1291 struct work_struct *work) 1292 { 1293 struct xlog *log = container_of(to_delayed_work(work), 1294 struct xlog, l_work); 1295 struct xfs_mount *mp = log->l_mp; 1296 1297 /* dgc: errors ignored - not fatal and nowhere to report them */ 1298 if (xfs_log_need_covered(mp)) { 1299 /* 1300 * Dump a transaction into the log that contains no real change. 1301 * This is needed to stamp the current tail LSN into the log 1302 * during the covering operation. 1303 * 1304 * We cannot use an inode here for this - that will push dirty 1305 * state back up into the VFS and then periodic inode flushing 1306 * will prevent log covering from making progress. Hence we 1307 * synchronously log the superblock instead to ensure the 1308 * superblock is immediately unpinned and can be written back. 1309 */ 1310 xfs_sync_sb(mp, true); 1311 } else 1312 xfs_log_force(mp, 0); 1313 1314 /* start pushing all the metadata that is currently dirty */ 1315 xfs_ail_push_all(mp->m_ail); 1316 1317 /* queue us up again */ 1318 xfs_log_work_queue(mp); 1319 } 1320 1321 /* 1322 * This routine initializes some of the log structure for a given mount point. 1323 * Its primary purpose is to fill in enough, so recovery can occur. However, 1324 * some other stuff may be filled in too. 
1325 */ 1326 STATIC struct xlog * 1327 xlog_alloc_log( 1328 struct xfs_mount *mp, 1329 struct xfs_buftarg *log_target, 1330 xfs_daddr_t blk_offset, 1331 int num_bblks) 1332 { 1333 struct xlog *log; 1334 xlog_rec_header_t *head; 1335 xlog_in_core_t **iclogp; 1336 xlog_in_core_t *iclog, *prev_iclog=NULL; 1337 int i; 1338 int error = -ENOMEM; 1339 uint log2_size = 0; 1340 1341 log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); 1342 if (!log) { 1343 xfs_warn(mp, "Log allocation failed: No memory!"); 1344 goto out; 1345 } 1346 1347 log->l_mp = mp; 1348 log->l_targ = log_target; 1349 log->l_logsize = BBTOB(num_bblks); 1350 log->l_logBBstart = blk_offset; 1351 log->l_logBBsize = num_bblks; 1352 log->l_covered_state = XLOG_STATE_COVER_IDLE; 1353 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1354 INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); 1355 1356 log->l_prev_block = -1; 1357 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1358 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); 1359 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); 1360 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ 1361 1362 xlog_grant_head_init(&log->l_reserve_head); 1363 xlog_grant_head_init(&log->l_write_head); 1364 1365 error = -EFSCORRUPTED; 1366 if (xfs_sb_version_hassector(&mp->m_sb)) { 1367 log2_size = mp->m_sb.sb_logsectlog; 1368 if (log2_size < BBSHIFT) { 1369 xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)", 1370 log2_size, BBSHIFT); 1371 goto out_free_log; 1372 } 1373 1374 log2_size -= BBSHIFT; 1375 if (log2_size > mp->m_sectbb_log) { 1376 xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)", 1377 log2_size, mp->m_sectbb_log); 1378 goto out_free_log; 1379 } 1380 1381 /* for larger sector sizes, must have v2 or external log */ 1382 if (log2_size && log->l_logBBstart > 0 && 1383 !xfs_sb_version_haslogv2(&mp->m_sb)) { 1384 xfs_warn(mp, 1385 "log sector size (0x%x) invalid for configuration.", 1386 log2_size); 1387 goto out_free_log; 1388 } 1389 } 1390 log->l_sectBBsize = 1 << log2_size; 1391 1392 xlog_get_iclog_buffer_size(mp, log); 1393 1394 spin_lock_init(&log->l_icloglock); 1395 init_waitqueue_head(&log->l_flush_wait); 1396 1397 iclogp = &log->l_iclog; 1398 /* 1399 * The amount of memory to allocate for the iclog structure is 1400 * rather funky due to the way the structure is defined. It is 1401 * done this way so that we can use different sizes for machines 1402 * with different amounts of memory. See the definition of 1403 * xlog_in_core_t in xfs_log_priv.h for details. 1404 */ 1405 ASSERT(log->l_iclog_size >= 4096); 1406 for (i = 0; i < log->l_iclog_bufs; i++) { 1407 size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * 1408 sizeof(struct bio_vec); 1409 1410 iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL); 1411 if (!iclog) 1412 goto out_free_iclog; 1413 1414 *iclogp = iclog; 1415 iclog->ic_prev = prev_iclog; 1416 prev_iclog = iclog; 1417 1418 iclog->ic_data = kmem_alloc_large(log->l_iclog_size, 1419 KM_MAYFAIL); 1420 if (!iclog->ic_data) 1421 goto out_free_iclog; 1422 #ifdef DEBUG 1423 log->l_iclog_bak[i] = &iclog->ic_header; 1424 #endif 1425 head = &iclog->ic_header; 1426 memset(head, 0, sizeof(xlog_rec_header_t)); 1427 head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); 1428 head->h_version = cpu_to_be32( 1429 xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 
2 : 1); 1430 head->h_size = cpu_to_be32(log->l_iclog_size); 1431 /* new fields */ 1432 head->h_fmt = cpu_to_be32(XLOG_FMT); 1433 memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); 1434 1435 iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize; 1436 iclog->ic_state = XLOG_STATE_ACTIVE; 1437 iclog->ic_log = log; 1438 atomic_set(&iclog->ic_refcnt, 0); 1439 spin_lock_init(&iclog->ic_callback_lock); 1440 INIT_LIST_HEAD(&iclog->ic_callbacks); 1441 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; 1442 1443 init_waitqueue_head(&iclog->ic_force_wait); 1444 init_waitqueue_head(&iclog->ic_write_wait); 1445 INIT_WORK(&iclog->ic_end_io_work, xlog_ioend_work); 1446 sema_init(&iclog->ic_sema, 1); 1447 1448 iclogp = &iclog->ic_next; 1449 } 1450 *iclogp = log->l_iclog; /* complete ring */ 1451 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ 1452 1453 log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", 1454 WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0, 1455 mp->m_fsname); 1456 if (!log->l_ioend_workqueue) 1457 goto out_free_iclog; 1458 1459 error = xlog_cil_init(log); 1460 if (error) 1461 goto out_destroy_workqueue; 1462 return log; 1463 1464 out_destroy_workqueue: 1465 destroy_workqueue(log->l_ioend_workqueue); 1466 out_free_iclog: 1467 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { 1468 prev_iclog = iclog->ic_next; 1469 kmem_free(iclog->ic_data); 1470 kmem_free(iclog); 1471 } 1472 out_free_log: 1473 kmem_free(log); 1474 out: 1475 return ERR_PTR(error); 1476 } /* xlog_alloc_log */ 1477 1478 1479 /* 1480 * Write out the commit record of a transaction associated with the given 1481 * ticket. Return the lsn of the commit record. 1482 */ 1483 STATIC int 1484 xlog_commit_record( 1485 struct xlog *log, 1486 struct xlog_ticket *ticket, 1487 struct xlog_in_core **iclog, 1488 xfs_lsn_t *commitlsnp) 1489 { 1490 struct xfs_mount *mp = log->l_mp; 1491 int error; 1492 struct xfs_log_iovec reg = { 1493 .i_addr = NULL, 1494 .i_len = 0, 1495 .i_type = XLOG_REG_TYPE_COMMIT, 1496 }; 1497 struct xfs_log_vec vec = { 1498 .lv_niovecs = 1, 1499 .lv_iovecp = ®, 1500 }; 1501 1502 ASSERT_ALWAYS(iclog); 1503 error = xlog_write(log, &vec, ticket, commitlsnp, iclog, 1504 XLOG_COMMIT_TRANS); 1505 if (error) 1506 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1507 return error; 1508 } 1509 1510 /* 1511 * Push on the buffer cache code if we ever use more than 75% of the on-disk 1512 * log space. This code pushes on the lsn which would supposedly free up 1513 * the 25% which we want to leave free. We may need to adopt a policy which 1514 * pushes on an lsn which is further along in the log once we reach the high 1515 * water mark. In this manner, we would be creating a low water mark. 1516 */ 1517 STATIC void 1518 xlog_grant_push_ail( 1519 struct xlog *log, 1520 int need_bytes) 1521 { 1522 xfs_lsn_t threshold_lsn = 0; 1523 xfs_lsn_t last_sync_lsn; 1524 int free_blocks; 1525 int free_bytes; 1526 int threshold_block; 1527 int threshold_cycle; 1528 int free_threshold; 1529 1530 ASSERT(BTOBB(need_bytes) < log->l_logBBsize); 1531 1532 free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); 1533 free_blocks = BTOBBT(free_bytes); 1534 1535 /* 1536 * Set the threshold for the minimum number of free blocks in the 1537 * log to the maximum of what the caller needs, one quarter of the 1538 * log, and 256 blocks. 
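	 *
	 * Worked example (geometry is illustrative): for a 64 MB log,
	 * l_logBBsize is 131072 basic blocks, so the quarter-log term is
	 * 32768 blocks; a caller needing 128 KB contributes BTOBB(131072)
	 * = 256 blocks, and the 256 block floor doesn't change that, so the
	 * AIL is pushed whenever fewer than 32768 free blocks (16 MB) remain.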
1539 */ 1540 free_threshold = BTOBB(need_bytes); 1541 free_threshold = max(free_threshold, (log->l_logBBsize >> 2)); 1542 free_threshold = max(free_threshold, 256); 1543 if (free_blocks >= free_threshold) 1544 return; 1545 1546 xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, 1547 &threshold_block); 1548 threshold_block += free_threshold; 1549 if (threshold_block >= log->l_logBBsize) { 1550 threshold_block -= log->l_logBBsize; 1551 threshold_cycle += 1; 1552 } 1553 threshold_lsn = xlog_assign_lsn(threshold_cycle, 1554 threshold_block); 1555 /* 1556 * Don't pass in an lsn greater than the lsn of the last 1557 * log record known to be on disk. Use a snapshot of the last sync lsn 1558 * so that it doesn't change between the compare and the set. 1559 */ 1560 last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); 1561 if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) 1562 threshold_lsn = last_sync_lsn; 1563 1564 /* 1565 * Get the transaction layer to kick the dirty buffers out to 1566 * disk asynchronously. No point in trying to do this if 1567 * the filesystem is shutting down. 1568 */ 1569 if (!XLOG_FORCED_SHUTDOWN(log)) 1570 xfs_ail_push(log->l_ailp, threshold_lsn); 1571 } 1572 1573 /* 1574 * Stamp cycle number in every block 1575 */ 1576 STATIC void 1577 xlog_pack_data( 1578 struct xlog *log, 1579 struct xlog_in_core *iclog, 1580 int roundoff) 1581 { 1582 int i, j, k; 1583 int size = iclog->ic_offset + roundoff; 1584 __be32 cycle_lsn; 1585 char *dp; 1586 1587 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); 1588 1589 dp = iclog->ic_datap; 1590 for (i = 0; i < BTOBB(size); i++) { 1591 if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) 1592 break; 1593 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; 1594 *(__be32 *)dp = cycle_lsn; 1595 dp += BBSIZE; 1596 } 1597 1598 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 1599 xlog_in_core_2_t *xhdr = iclog->ic_data; 1600 1601 for ( ; i < BTOBB(size); i++) { 1602 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 1603 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 1604 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; 1605 *(__be32 *)dp = cycle_lsn; 1606 dp += BBSIZE; 1607 } 1608 1609 for (i = 1; i < log->l_iclog_heads; i++) 1610 xhdr[i].hic_xheader.xh_cycle = cycle_lsn; 1611 } 1612 } 1613 1614 /* 1615 * Calculate the checksum for a log buffer. 1616 * 1617 * This is a little more complicated than it should be because the various 1618 * headers and the actual data are non-contiguous. 1619 */ 1620 __le32 1621 xlog_cksum( 1622 struct xlog *log, 1623 struct xlog_rec_header *rhead, 1624 char *dp, 1625 int size) 1626 { 1627 uint32_t crc; 1628 1629 /* first generate the crc for the record header ... */ 1630 crc = xfs_start_cksum_update((char *)rhead, 1631 sizeof(struct xlog_rec_header), 1632 offsetof(struct xlog_rec_header, h_crc)); 1633 1634 /* ... then for additional cycle data for v2 logs ... */ 1635 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 1636 union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; 1637 int i; 1638 int xheads; 1639 1640 xheads = size / XLOG_HEADER_CYCLE_SIZE; 1641 if (size % XLOG_HEADER_CYCLE_SIZE) 1642 xheads++; 1643 1644 for (i = 1; i < xheads; i++) { 1645 crc = crc32c(crc, &xhdr[i].hic_xheader, 1646 sizeof(struct xlog_rec_ext_header)); 1647 } 1648 } 1649 1650 /* ... 
and finally for the payload */
	crc = crc32c(crc, dp, size);

	return xfs_end_cksum(crc);
}

static void
xlog_bio_end_io(
	struct bio		*bio)
{
	struct xlog_in_core	*iclog = bio->bi_private;

	queue_work(iclog->ic_log->l_ioend_workqueue,
		   &iclog->ic_end_io_work);
}

static void
xlog_map_iclog_data(
	struct bio		*bio,
	void			*data,
	size_t			count)
{
	do {
		struct page	*page = kmem_to_page(data);
		unsigned int	off = offset_in_page(data);
		size_t		len = min_t(size_t, count, PAGE_SIZE - off);

		WARN_ON_ONCE(bio_add_page(bio, page, len, off) != len);

		data += len;
		count -= len;
	} while (count);
}

STATIC void
xlog_write_iclog(
	struct xlog		*log,
	struct xlog_in_core	*iclog,
	uint64_t		bno,
	unsigned int		count,
	bool			need_flush)
{
	ASSERT(bno < log->l_logBBsize);

	/*
	 * We lock the iclogbufs here so that we can serialise against I/O
	 * completion during unmount. We might be processing a shutdown
	 * triggered during unmount, and that can occur asynchronously to the
	 * unmount thread, and hence we need to ensure that completes before
	 * tearing down the iclogbufs. Hence we need to hold the buffer lock
	 * across the log IO to achieve that.
	 */
	down(&iclog->ic_sema);
	if (unlikely(iclog->ic_state & XLOG_STATE_IOERROR)) {
		/*
		 * It would seem logical to return EIO here, but we rely on
		 * the log state machine to propagate I/O errors instead of
		 * doing it here. We kick off the state machine and unlock
		 * the buffer manually, the code needs to be kept in sync
		 * with the I/O completion path.
		 */
		xlog_state_done_syncing(iclog, XFS_LI_ABORTED);
		up(&iclog->ic_sema);
		return;
	}

	iclog->ic_io_size = count;

	bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
	bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
	iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
	iclog->ic_bio.bi_end_io = xlog_bio_end_io;
	iclog->ic_bio.bi_private = iclog;
	iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA;
	if (need_flush)
		iclog->ic_bio.bi_opf |= REQ_PREFLUSH;

	xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, iclog->ic_io_size);
	if (is_vmalloc_addr(iclog->ic_data))
		flush_kernel_vmap_range(iclog->ic_data, iclog->ic_io_size);

	/*
	 * If this log buffer would straddle the end of the log we will have
	 * to split it up into two bios, so that we can continue at the start.
	 */
	if (bno + BTOBB(count) > log->l_logBBsize) {
		struct bio *split;

		split = bio_split(&iclog->ic_bio, log->l_logBBsize - bno,
				  GFP_NOIO, &fs_bio_set);
		bio_chain(split, &iclog->ic_bio);
		submit_bio(split);

		/* restart at logical offset zero for the remainder */
		iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart;
	}

	submit_bio(&iclog->ic_bio);
}

/*
 * We need to bump cycle number for the part of the iclog that is
 * written to the start of the log. Watch out for the header magic
 * number case, though.
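 *
 * Illustrative sketch (cycle values assumed): if the wrapped portion of an
 * iclog was stamped with cycle 41 by xlog_pack_data(), the leading cycle
 * word of each of its BBSIZE blocks is rewritten here as 42 before
 * submission; and if the increment would produce XLOG_HEADER_MAGIC_NUM
 * itself, the value is bumped once more so a data block is never mistaken
 * for a record header during recovery.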
 */
static void
xlog_split_iclog(
	struct xlog		*log,
	void			*data,
	uint64_t		bno,
	unsigned int		count)
{
	unsigned int		split_offset = BBTOB(log->l_logBBsize - bno);
	unsigned int		i;

	for (i = split_offset; i < count; i += BBSIZE) {
		uint32_t	cycle = get_unaligned_be32(data + i);

		if (++cycle == XLOG_HEADER_MAGIC_NUM)
			cycle++;
		put_unaligned_be32(cycle, data + i);
	}
}

static int
xlog_calc_iclog_size(
	struct xlog		*log,
	struct xlog_in_core	*iclog,
	uint32_t		*roundoff)
{
	uint32_t		count_init, count;
	bool			use_lsunit;

	use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
			log->l_mp->m_sb.sb_logsunit > 1;

	/* Add for LR header */
	count_init = log->l_iclog_hsize + iclog->ic_offset;

	/* Round out the log write size */
	if (use_lsunit) {
		/* we have a v2 stripe unit to use */
		count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
	} else {
		count = BBTOB(BTOBB(count_init));
	}

	ASSERT(count >= count_init);
	*roundoff = count - count_init;

	if (use_lsunit)
		ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit);
	else
		ASSERT(*roundoff < BBTOB(1));
	return count;
}

/*
 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
 * fashion. Previously, we should have moved the current iclog
 * ptr in the log to point to the next available iclog. This allows further
 * write to continue while this code syncs out an iclog ready to go.
 * Before an in-core log can be written out, the data section must be scanned
 * to save away the 1st word of each BBSIZE block into the header. We replace
 * it with the current cycle count. Each BBSIZE block is tagged with the
 * cycle count because there is an implicit assumption that drives will
 * guarantee that entire 512 byte blocks get written at once. In other words,
 * we can't have part of a 512 byte block written and part not written. By
 * tagging each block, we will know which blocks are valid when recovering
 * after an unclean shutdown.
 *
 * This routine is single threaded on the iclog. No other thread can be in
 * this routine with the same iclog. Changing contents of iclog can therefore
 * be done without grabbing the state machine lock. Updating the global log
 * will require grabbing the lock though.
 *
 * The entire log manager uses a logical block numbering scheme. Only
 * xlog_write_iclog knows about the fact that the log may not start with
 * block zero on a given device.
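 *
 * Worked example (sizes assumed): with a 512 byte iclog header and
 * ic_offset = 7000 bytes of data, and no log stripe unit in use,
 * xlog_calc_iclog_size() rounds the 7512 byte write up to
 * BBTOB(BTOBB(7512)) = 7680 bytes, so roundoff is 168 bytes and both grant
 * heads are moved forward by that amount before the write is issued.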
 */
STATIC void
xlog_sync(
	struct xlog		*log,
	struct xlog_in_core	*iclog)
{
	unsigned int		count;		/* byte count of bwrite */
	unsigned int		roundoff;	/* roundoff to BB or stripe */
	uint64_t		bno;
	unsigned int		size;
	bool			need_flush = true, split = false;

	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);

	count = xlog_calc_iclog_size(log, iclog, &roundoff);

	/* move grant heads by roundoff in sync */
	xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
	xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);

	/* put cycle number in every block */
	xlog_pack_data(log, iclog, roundoff);

	/* real byte length */
	size = iclog->ic_offset;
	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb))
		size += roundoff;
	iclog->ic_header.h_len = cpu_to_be32(size);

	XFS_STATS_INC(log->l_mp, xs_log_writes);
	XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));

	bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn));

	/* Do we need to split this write into 2 parts? */
	if (bno + BTOBB(count) > log->l_logBBsize) {
		xlog_split_iclog(log, &iclog->ic_header, bno, count);
		split = true;
	}

	/* calculate the checksum */
	iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
					    iclog->ic_datap, size);
	/*
	 * Intentionally corrupt the log record CRC based on the error injection
	 * frequency, if defined. This facilitates testing log recovery in the
	 * event of torn writes. Hence, set the IOABORT state to abort the log
	 * write on I/O completion and shutdown the fs. The subsequent mount
	 * detects the bad CRC and attempts to recover.
	 */
#ifdef DEBUG
	if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
		iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
		iclog->ic_fail_crc = true;
		xfs_warn(log->l_mp,
	"Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
			 be64_to_cpu(iclog->ic_header.h_lsn));
	}
#endif

	/*
	 * Flush the data device before flushing the log to make sure all meta
	 * data written back from the AIL actually made it to disk before
	 * stamping the new log tail LSN into the log buffer. For an external
	 * log we need to issue the flush explicitly, and unfortunately
	 * synchronously here; for an internal log we can simply use the block
	 * layer state machine for preflushes.
	 */
	if (log->l_targ != log->l_mp->m_ddev_targp || split) {
		xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
		need_flush = false;
	}

	xlog_verify_iclog(log, iclog, count);
	xlog_write_iclog(log, iclog, bno, count, need_flush);
}

/*
 * Deallocate a log structure
 */
STATIC void
xlog_dealloc_log(
	struct xlog	*log)
{
	xlog_in_core_t	*iclog, *next_iclog;
	int		i;

	xlog_cil_destroy(log);

	/*
	 * Cycle all the iclogbuf locks to make sure all log IO completion
	 * is done before we tear down these buffers.
1921 */ 1922 iclog = log->l_iclog; 1923 for (i = 0; i < log->l_iclog_bufs; i++) { 1924 down(&iclog->ic_sema); 1925 up(&iclog->ic_sema); 1926 iclog = iclog->ic_next; 1927 } 1928 1929 iclog = log->l_iclog; 1930 for (i = 0; i < log->l_iclog_bufs; i++) { 1931 next_iclog = iclog->ic_next; 1932 kmem_free(iclog->ic_data); 1933 kmem_free(iclog); 1934 iclog = next_iclog; 1935 } 1936 1937 log->l_mp->m_log = NULL; 1938 destroy_workqueue(log->l_ioend_workqueue); 1939 kmem_free(log); 1940 } /* xlog_dealloc_log */ 1941 1942 /* 1943 * Update counters atomically now that memcpy is done. 1944 */ 1945 /* ARGSUSED */ 1946 static inline void 1947 xlog_state_finish_copy( 1948 struct xlog *log, 1949 struct xlog_in_core *iclog, 1950 int record_cnt, 1951 int copy_bytes) 1952 { 1953 spin_lock(&log->l_icloglock); 1954 1955 be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); 1956 iclog->ic_offset += copy_bytes; 1957 1958 spin_unlock(&log->l_icloglock); 1959 } /* xlog_state_finish_copy */ 1960 1961 1962 1963 1964 /* 1965 * print out info relating to regions written which consume 1966 * the reservation 1967 */ 1968 void 1969 xlog_print_tic_res( 1970 struct xfs_mount *mp, 1971 struct xlog_ticket *ticket) 1972 { 1973 uint i; 1974 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); 1975 1976 /* match with XLOG_REG_TYPE_* in xfs_log.h */ 1977 #define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str 1978 static char *res_type_str[] = { 1979 REG_TYPE_STR(BFORMAT, "bformat"), 1980 REG_TYPE_STR(BCHUNK, "bchunk"), 1981 REG_TYPE_STR(EFI_FORMAT, "efi_format"), 1982 REG_TYPE_STR(EFD_FORMAT, "efd_format"), 1983 REG_TYPE_STR(IFORMAT, "iformat"), 1984 REG_TYPE_STR(ICORE, "icore"), 1985 REG_TYPE_STR(IEXT, "iext"), 1986 REG_TYPE_STR(IBROOT, "ibroot"), 1987 REG_TYPE_STR(ILOCAL, "ilocal"), 1988 REG_TYPE_STR(IATTR_EXT, "iattr_ext"), 1989 REG_TYPE_STR(IATTR_BROOT, "iattr_broot"), 1990 REG_TYPE_STR(IATTR_LOCAL, "iattr_local"), 1991 REG_TYPE_STR(QFORMAT, "qformat"), 1992 REG_TYPE_STR(DQUOT, "dquot"), 1993 REG_TYPE_STR(QUOTAOFF, "quotaoff"), 1994 REG_TYPE_STR(LRHEADER, "LR header"), 1995 REG_TYPE_STR(UNMOUNT, "unmount"), 1996 REG_TYPE_STR(COMMIT, "commit"), 1997 REG_TYPE_STR(TRANSHDR, "trans header"), 1998 REG_TYPE_STR(ICREATE, "inode create"), 1999 REG_TYPE_STR(RUI_FORMAT, "rui_format"), 2000 REG_TYPE_STR(RUD_FORMAT, "rud_format"), 2001 REG_TYPE_STR(CUI_FORMAT, "cui_format"), 2002 REG_TYPE_STR(CUD_FORMAT, "cud_format"), 2003 REG_TYPE_STR(BUI_FORMAT, "bui_format"), 2004 REG_TYPE_STR(BUD_FORMAT, "bud_format"), 2005 }; 2006 BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1); 2007 #undef REG_TYPE_STR 2008 2009 xfs_warn(mp, "ticket reservation summary:"); 2010 xfs_warn(mp, " unit res = %d bytes", 2011 ticket->t_unit_res); 2012 xfs_warn(mp, " current res = %d bytes", 2013 ticket->t_curr_res); 2014 xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)", 2015 ticket->t_res_arr_sum, ticket->t_res_o_flow); 2016 xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)", 2017 ticket->t_res_num_ophdrs, ophdr_spc); 2018 xfs_warn(mp, " ophdr + reg = %u bytes", 2019 ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc); 2020 xfs_warn(mp, " num regions = %u", 2021 ticket->t_res_num); 2022 2023 for (i = 0; i < ticket->t_res_num; i++) { 2024 uint r_type = ticket->t_res_arr[i].r_type; 2025 xfs_warn(mp, "region[%u]: %s - %u bytes", i, 2026 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? 
2027 "bad-rtype" : res_type_str[r_type]), 2028 ticket->t_res_arr[i].r_len); 2029 } 2030 } 2031 2032 /* 2033 * Print a summary of the transaction. 2034 */ 2035 void 2036 xlog_print_trans( 2037 struct xfs_trans *tp) 2038 { 2039 struct xfs_mount *mp = tp->t_mountp; 2040 struct xfs_log_item *lip; 2041 2042 /* dump core transaction and ticket info */ 2043 xfs_warn(mp, "transaction summary:"); 2044 xfs_warn(mp, " log res = %d", tp->t_log_res); 2045 xfs_warn(mp, " log count = %d", tp->t_log_count); 2046 xfs_warn(mp, " flags = 0x%x", tp->t_flags); 2047 2048 xlog_print_tic_res(mp, tp->t_ticket); 2049 2050 /* dump each log item */ 2051 list_for_each_entry(lip, &tp->t_items, li_trans) { 2052 struct xfs_log_vec *lv = lip->li_lv; 2053 struct xfs_log_iovec *vec; 2054 int i; 2055 2056 xfs_warn(mp, "log item: "); 2057 xfs_warn(mp, " type = 0x%x", lip->li_type); 2058 xfs_warn(mp, " flags = 0x%lx", lip->li_flags); 2059 if (!lv) 2060 continue; 2061 xfs_warn(mp, " niovecs = %d", lv->lv_niovecs); 2062 xfs_warn(mp, " size = %d", lv->lv_size); 2063 xfs_warn(mp, " bytes = %d", lv->lv_bytes); 2064 xfs_warn(mp, " buf len = %d", lv->lv_buf_len); 2065 2066 /* dump each iovec for the log item */ 2067 vec = lv->lv_iovecp; 2068 for (i = 0; i < lv->lv_niovecs; i++) { 2069 int dumplen = min(vec->i_len, 32); 2070 2071 xfs_warn(mp, " iovec[%d]", i); 2072 xfs_warn(mp, " type = 0x%x", vec->i_type); 2073 xfs_warn(mp, " len = %d", vec->i_len); 2074 xfs_warn(mp, " first %d bytes of iovec[%d]:", dumplen, i); 2075 xfs_hex_dump(vec->i_addr, dumplen); 2076 2077 vec++; 2078 } 2079 } 2080 } 2081 2082 /* 2083 * Calculate the potential space needed by the log vector. Each region gets 2084 * its own xlog_op_header_t and may need to be double word aligned. 2085 */ 2086 static int 2087 xlog_write_calc_vec_length( 2088 struct xlog_ticket *ticket, 2089 struct xfs_log_vec *log_vector) 2090 { 2091 struct xfs_log_vec *lv; 2092 int headers = 0; 2093 int len = 0; 2094 int i; 2095 2096 /* acct for start rec of xact */ 2097 if (ticket->t_flags & XLOG_TIC_INITED) 2098 headers++; 2099 2100 for (lv = log_vector; lv; lv = lv->lv_next) { 2101 /* we don't write ordered log vectors */ 2102 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) 2103 continue; 2104 2105 headers += lv->lv_niovecs; 2106 2107 for (i = 0; i < lv->lv_niovecs; i++) { 2108 struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; 2109 2110 len += vecp->i_len; 2111 xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); 2112 } 2113 } 2114 2115 ticket->t_res_num_ophdrs += headers; 2116 len += headers * sizeof(struct xlog_op_header); 2117 2118 return len; 2119 } 2120 2121 /* 2122 * If first write for transaction, insert start record We can't be trying to 2123 * commit if we are inited. We can't have any "partial_copy" if we are inited. 
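 * In other words, the start record is written at most once per ticket: XLOG_TIC_INITED is set when the ticket is allocated and cleared below on the first write, so later calls return 0 and emit nothing.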
2124 */ 2125 static int 2126 xlog_write_start_rec( 2127 struct xlog_op_header *ophdr, 2128 struct xlog_ticket *ticket) 2129 { 2130 if (!(ticket->t_flags & XLOG_TIC_INITED)) 2131 return 0; 2132 2133 ophdr->oh_tid = cpu_to_be32(ticket->t_tid); 2134 ophdr->oh_clientid = ticket->t_clientid; 2135 ophdr->oh_len = 0; 2136 ophdr->oh_flags = XLOG_START_TRANS; 2137 ophdr->oh_res2 = 0; 2138 2139 ticket->t_flags &= ~XLOG_TIC_INITED; 2140 2141 return sizeof(struct xlog_op_header); 2142 } 2143 2144 static xlog_op_header_t * 2145 xlog_write_setup_ophdr( 2146 struct xlog *log, 2147 struct xlog_op_header *ophdr, 2148 struct xlog_ticket *ticket, 2149 uint flags) 2150 { 2151 ophdr->oh_tid = cpu_to_be32(ticket->t_tid); 2152 ophdr->oh_clientid = ticket->t_clientid; 2153 ophdr->oh_res2 = 0; 2154 2155 /* are we copying a commit or unmount record? */ 2156 ophdr->oh_flags = flags; 2157 2158 /* 2159 * We've seen logs corrupted with bad transaction client ids. This 2160 * makes sure that XFS doesn't generate them on. Turn this into an EIO 2161 * and shut down the filesystem. 2162 */ 2163 switch (ophdr->oh_clientid) { 2164 case XFS_TRANSACTION: 2165 case XFS_VOLUME: 2166 case XFS_LOG: 2167 break; 2168 default: 2169 xfs_warn(log->l_mp, 2170 "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT, 2171 ophdr->oh_clientid, ticket); 2172 return NULL; 2173 } 2174 2175 return ophdr; 2176 } 2177 2178 /* 2179 * Set up the parameters of the region copy into the log. This has 2180 * to handle region write split across multiple log buffers - this 2181 * state is kept external to this function so that this code can 2182 * be written in an obvious, self documenting manner. 2183 */ 2184 static int 2185 xlog_write_setup_copy( 2186 struct xlog_ticket *ticket, 2187 struct xlog_op_header *ophdr, 2188 int space_available, 2189 int space_required, 2190 int *copy_off, 2191 int *copy_len, 2192 int *last_was_partial_copy, 2193 int *bytes_consumed) 2194 { 2195 int still_to_copy; 2196 2197 still_to_copy = space_required - *bytes_consumed; 2198 *copy_off = *bytes_consumed; 2199 2200 if (still_to_copy <= space_available) { 2201 /* write of region completes here */ 2202 *copy_len = still_to_copy; 2203 ophdr->oh_len = cpu_to_be32(*copy_len); 2204 if (*last_was_partial_copy) 2205 ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); 2206 *last_was_partial_copy = 0; 2207 *bytes_consumed = 0; 2208 return 0; 2209 } 2210 2211 /* partial write of region, needs extra log op header reservation */ 2212 *copy_len = space_available; 2213 ophdr->oh_len = cpu_to_be32(*copy_len); 2214 ophdr->oh_flags |= XLOG_CONTINUE_TRANS; 2215 if (*last_was_partial_copy) 2216 ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; 2217 *bytes_consumed += *copy_len; 2218 (*last_was_partial_copy)++; 2219 2220 /* account for new log op header */ 2221 ticket->t_curr_res -= sizeof(struct xlog_op_header); 2222 ticket->t_res_num_ophdrs++; 2223 2224 return sizeof(struct xlog_op_header); 2225 } 2226 2227 static int 2228 xlog_write_copy_finish( 2229 struct xlog *log, 2230 struct xlog_in_core *iclog, 2231 uint flags, 2232 int *record_cnt, 2233 int *data_cnt, 2234 int *partial_copy, 2235 int *partial_copy_len, 2236 int log_offset, 2237 struct xlog_in_core **commit_iclog) 2238 { 2239 if (*partial_copy) { 2240 /* 2241 * This iclog has already been marked WANT_SYNC by 2242 * xlog_state_get_iclog_space. 
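 * All that is left to do here is account for the records and bytes we did copy and drop our reference, so the iclog can go to disk once the remaining writers release theirs.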
2243 */ 2244 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); 2245 *record_cnt = 0; 2246 *data_cnt = 0; 2247 return xlog_state_release_iclog(log, iclog); 2248 } 2249 2250 *partial_copy = 0; 2251 *partial_copy_len = 0; 2252 2253 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { 2254 /* no more space in this iclog - push it. */ 2255 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); 2256 *record_cnt = 0; 2257 *data_cnt = 0; 2258 2259 spin_lock(&log->l_icloglock); 2260 xlog_state_want_sync(log, iclog); 2261 spin_unlock(&log->l_icloglock); 2262 2263 if (!commit_iclog) 2264 return xlog_state_release_iclog(log, iclog); 2265 ASSERT(flags & XLOG_COMMIT_TRANS); 2266 *commit_iclog = iclog; 2267 } 2268 2269 return 0; 2270 } 2271 2272 /* 2273 * Write some region out to in-core log 2274 * 2275 * This will be called when writing externally provided regions or when 2276 * writing out a commit record for a given transaction. 2277 * 2278 * General algorithm: 2279 * 1. Find total length of this write. This may include adding to the 2280 * lengths passed in. 2281 * 2. Check whether we violate the tickets reservation. 2282 * 3. While writing to this iclog 2283 * A. Reserve as much space in this iclog as can get 2284 * B. If this is first write, save away start lsn 2285 * C. While writing this region: 2286 * 1. If first write of transaction, write start record 2287 * 2. Write log operation header (header per region) 2288 * 3. Find out if we can fit entire region into this iclog 2289 * 4. Potentially, verify destination memcpy ptr 2290 * 5. Memcpy (partial) region 2291 * 6. If partial copy, release iclog; otherwise, continue 2292 * copying more regions into current iclog 2293 * 4. Mark want sync bit (in simulation mode) 2294 * 5. Release iclog for potential flush to on-disk log. 2295 * 2296 * ERRORS: 2297 * 1. Panic if reservation is overrun. This should never happen since 2298 * reservation amounts are generated internal to the filesystem. 2299 * NOTES: 2300 * 1. Tickets are single threaded data structures. 2301 * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the 2302 * syncing routine. When a single log_write region needs to span 2303 * multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set 2304 * on all log operation writes which don't contain the end of the 2305 * region. The XLOG_END_TRANS bit is used for the in-core log 2306 * operation which contains the end of the continued log_write region. 2307 * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog, 2308 * we don't really know exactly how much space will be used. As a result, 2309 * we don't update ic_offset until the end when we know exactly how many 2310 * bytes have been written out. 2311 */ 2312 int 2313 xlog_write( 2314 struct xlog *log, 2315 struct xfs_log_vec *log_vector, 2316 struct xlog_ticket *ticket, 2317 xfs_lsn_t *start_lsn, 2318 struct xlog_in_core **commit_iclog, 2319 uint flags) 2320 { 2321 struct xlog_in_core *iclog = NULL; 2322 struct xfs_log_iovec *vecp; 2323 struct xfs_log_vec *lv; 2324 int len; 2325 int index; 2326 int partial_copy = 0; 2327 int partial_copy_len = 0; 2328 int contwr = 0; 2329 int record_cnt = 0; 2330 int data_cnt = 0; 2331 int error; 2332 2333 *start_lsn = 0; 2334 2335 len = xlog_write_calc_vec_length(ticket, log_vector); 2336 2337 /* 2338 * Region headers and bytes are already accounted for. 2339 * We only need to take into account start records and 2340 * split regions in this function. 
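 * For example: the first write of a transaction pays for one extra xlog_op_header_t here for the start record, a commit or unmount record write pays for its own op header just below, and a region that ends up split across iclogs is charged a further op header by xlog_write_setup_copy() at the point of the split.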
2341 */ 2342 if (ticket->t_flags & XLOG_TIC_INITED) 2343 ticket->t_curr_res -= sizeof(xlog_op_header_t); 2344 2345 /* 2346 * Commit record headers need to be accounted for. These 2347 * come in as separate writes so are easy to detect. 2348 */ 2349 if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) 2350 ticket->t_curr_res -= sizeof(xlog_op_header_t); 2351 2352 if (ticket->t_curr_res < 0) { 2353 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, 2354 "ctx ticket reservation ran out. Need to up reservation"); 2355 xlog_print_tic_res(log->l_mp, ticket); 2356 xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); 2357 } 2358 2359 index = 0; 2360 lv = log_vector; 2361 vecp = lv->lv_iovecp; 2362 while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { 2363 void *ptr; 2364 int log_offset; 2365 2366 error = xlog_state_get_iclog_space(log, len, &iclog, ticket, 2367 &contwr, &log_offset); 2368 if (error) 2369 return error; 2370 2371 ASSERT(log_offset <= iclog->ic_size - 1); 2372 ptr = iclog->ic_datap + log_offset; 2373 2374 /* start_lsn is the first lsn written to. That's all we need. */ 2375 if (!*start_lsn) 2376 *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); 2377 2378 /* 2379 * This loop writes out as many regions as can fit in the amount 2380 * of space which was allocated by xlog_state_get_iclog_space(). 2381 */ 2382 while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { 2383 struct xfs_log_iovec *reg; 2384 struct xlog_op_header *ophdr; 2385 int start_rec_copy; 2386 int copy_len; 2387 int copy_off; 2388 bool ordered = false; 2389 2390 /* ordered log vectors have no regions to write */ 2391 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { 2392 ASSERT(lv->lv_niovecs == 0); 2393 ordered = true; 2394 goto next_lv; 2395 } 2396 2397 reg = &vecp[index]; 2398 ASSERT(reg->i_len % sizeof(int32_t) == 0); 2399 ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); 2400 2401 start_rec_copy = xlog_write_start_rec(ptr, ticket); 2402 if (start_rec_copy) { 2403 record_cnt++; 2404 xlog_write_adv_cnt(&ptr, &len, &log_offset, 2405 start_rec_copy); 2406 } 2407 2408 ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); 2409 if (!ophdr) 2410 return -EIO; 2411 2412 xlog_write_adv_cnt(&ptr, &len, &log_offset, 2413 sizeof(struct xlog_op_header)); 2414 2415 len += xlog_write_setup_copy(ticket, ophdr, 2416 iclog->ic_size-log_offset, 2417 reg->i_len, 2418 ©_off, ©_len, 2419 &partial_copy, 2420 &partial_copy_len); 2421 xlog_verify_dest_ptr(log, ptr); 2422 2423 /* 2424 * Copy region. 2425 * 2426 * Unmount records just log an opheader, so can have 2427 * empty payloads with no data region to copy. Hence we 2428 * only copy the payload if the vector says it has data 2429 * to copy. 2430 */ 2431 ASSERT(copy_len >= 0); 2432 if (copy_len > 0) { 2433 memcpy(ptr, reg->i_addr + copy_off, copy_len); 2434 xlog_write_adv_cnt(&ptr, &len, &log_offset, 2435 copy_len); 2436 } 2437 copy_len += start_rec_copy + sizeof(xlog_op_header_t); 2438 record_cnt++; 2439 data_cnt += contwr ? copy_len : 0; 2440 2441 error = xlog_write_copy_finish(log, iclog, flags, 2442 &record_cnt, &data_cnt, 2443 &partial_copy, 2444 &partial_copy_len, 2445 log_offset, 2446 commit_iclog); 2447 if (error) 2448 return error; 2449 2450 /* 2451 * if we had a partial copy, we need to get more iclog 2452 * space but we don't want to increment the region 2453 * index because there is still more is this region to 2454 * write. 
2455 * 2456 * If we completed writing this region, and we flushed 2457 * the iclog (indicated by resetting of the record 2458 * count), then we also need to get more log space. If 2459 * this was the last record, though, we are done and 2460 * can just return. 2461 */ 2462 if (partial_copy) 2463 break; 2464 2465 if (++index == lv->lv_niovecs) { 2466 next_lv: 2467 lv = lv->lv_next; 2468 index = 0; 2469 if (lv) 2470 vecp = lv->lv_iovecp; 2471 } 2472 if (record_cnt == 0 && !ordered) { 2473 if (!lv) 2474 return 0; 2475 break; 2476 } 2477 } 2478 } 2479 2480 ASSERT(len == 0); 2481 2482 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); 2483 if (!commit_iclog) 2484 return xlog_state_release_iclog(log, iclog); 2485 2486 ASSERT(flags & XLOG_COMMIT_TRANS); 2487 *commit_iclog = iclog; 2488 return 0; 2489 } 2490 2491 2492 /***************************************************************************** 2493 * 2494 * State Machine functions 2495 * 2496 ***************************************************************************** 2497 */ 2498 2499 /* Clean iclogs starting from the head. This ordering must be 2500 * maintained, so an iclog doesn't become ACTIVE beyond one that 2501 * is SYNCING. This is also required to maintain the notion that we use 2502 * a ordered wait queue to hold off would be writers to the log when every 2503 * iclog is trying to sync to disk. 2504 * 2505 * State Change: DIRTY -> ACTIVE 2506 */ 2507 STATIC void 2508 xlog_state_clean_log( 2509 struct xlog *log) 2510 { 2511 xlog_in_core_t *iclog; 2512 int changed = 0; 2513 2514 iclog = log->l_iclog; 2515 do { 2516 if (iclog->ic_state == XLOG_STATE_DIRTY) { 2517 iclog->ic_state = XLOG_STATE_ACTIVE; 2518 iclog->ic_offset = 0; 2519 ASSERT(list_empty_careful(&iclog->ic_callbacks)); 2520 /* 2521 * If the number of ops in this iclog indicate it just 2522 * contains the dummy transaction, we can 2523 * change state into IDLE (the second time around). 2524 * Otherwise we should change the state into 2525 * NEED a dummy. 2526 * We don't need to cover the dummy. 2527 */ 2528 if (!changed && 2529 (be32_to_cpu(iclog->ic_header.h_num_logops) == 2530 XLOG_COVER_OPS)) { 2531 changed = 1; 2532 } else { 2533 /* 2534 * We have two dirty iclogs so start over 2535 * This could also be num of ops indicates 2536 * this is not the dummy going out. 2537 */ 2538 changed = 2; 2539 } 2540 iclog->ic_header.h_num_logops = 0; 2541 memset(iclog->ic_header.h_cycle_data, 0, 2542 sizeof(iclog->ic_header.h_cycle_data)); 2543 iclog->ic_header.h_lsn = 0; 2544 } else if (iclog->ic_state == XLOG_STATE_ACTIVE) 2545 /* do nothing */; 2546 else 2547 break; /* stop cleaning */ 2548 iclog = iclog->ic_next; 2549 } while (iclog != log->l_iclog); 2550 2551 /* log is locked when we are called */ 2552 /* 2553 * Change state for the dummy log recording. 2554 * We usually go to NEED. But we go to NEED2 if the changed indicates 2555 * we are done writing the dummy record. 2556 * If we are done with the second dummy recored (DONE2), then 2557 * we go to IDLE. 
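 * (Taken together with xfs_log_need_covered(), the covering sequence is roughly IDLE -> NEED -> DONE -> NEED2 -> DONE2 -> IDLE, with one dummy record written for each NEED state.)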
2558 */ 2559 if (changed) { 2560 switch (log->l_covered_state) { 2561 case XLOG_STATE_COVER_IDLE: 2562 case XLOG_STATE_COVER_NEED: 2563 case XLOG_STATE_COVER_NEED2: 2564 log->l_covered_state = XLOG_STATE_COVER_NEED; 2565 break; 2566 2567 case XLOG_STATE_COVER_DONE: 2568 if (changed == 1) 2569 log->l_covered_state = XLOG_STATE_COVER_NEED2; 2570 else 2571 log->l_covered_state = XLOG_STATE_COVER_NEED; 2572 break; 2573 2574 case XLOG_STATE_COVER_DONE2: 2575 if (changed == 1) 2576 log->l_covered_state = XLOG_STATE_COVER_IDLE; 2577 else 2578 log->l_covered_state = XLOG_STATE_COVER_NEED; 2579 break; 2580 2581 default: 2582 ASSERT(0); 2583 } 2584 } 2585 } /* xlog_state_clean_log */ 2586 2587 STATIC xfs_lsn_t 2588 xlog_get_lowest_lsn( 2589 struct xlog *log) 2590 { 2591 struct xlog_in_core *iclog = log->l_iclog; 2592 xfs_lsn_t lowest_lsn = 0, lsn; 2593 2594 do { 2595 if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY)) 2596 continue; 2597 2598 lsn = be64_to_cpu(iclog->ic_header.h_lsn); 2599 if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0) 2600 lowest_lsn = lsn; 2601 } while ((iclog = iclog->ic_next) != log->l_iclog); 2602 2603 return lowest_lsn; 2604 } 2605 2606 STATIC void 2607 xlog_state_do_callback( 2608 struct xlog *log, 2609 bool aborted, 2610 struct xlog_in_core *ciclog) 2611 { 2612 xlog_in_core_t *iclog; 2613 xlog_in_core_t *first_iclog; /* used to know when we've 2614 * processed all iclogs once */ 2615 int flushcnt = 0; 2616 xfs_lsn_t lowest_lsn; 2617 int ioerrors; /* counter: iclogs with errors */ 2618 int loopdidcallbacks; /* flag: inner loop did callbacks*/ 2619 int funcdidcallbacks; /* flag: function did callbacks */ 2620 int repeats; /* for issuing console warnings if 2621 * looping too many times */ 2622 int wake = 0; 2623 2624 spin_lock(&log->l_icloglock); 2625 first_iclog = iclog = log->l_iclog; 2626 ioerrors = 0; 2627 funcdidcallbacks = 0; 2628 repeats = 0; 2629 2630 do { 2631 /* 2632 * Scan all iclogs starting with the one pointed to by the 2633 * log. Reset this starting point each time the log is 2634 * unlocked (during callbacks). 2635 * 2636 * Keep looping through iclogs until one full pass is made 2637 * without running any callbacks. 2638 */ 2639 first_iclog = log->l_iclog; 2640 iclog = log->l_iclog; 2641 loopdidcallbacks = 0; 2642 repeats++; 2643 2644 do { 2645 2646 /* skip all iclogs in the ACTIVE & DIRTY states */ 2647 if (iclog->ic_state & 2648 (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) { 2649 iclog = iclog->ic_next; 2650 continue; 2651 } 2652 2653 /* 2654 * Between marking a filesystem SHUTDOWN and stopping 2655 * the log, we do flush all iclogs to disk (if there 2656 * wasn't a log I/O error). So, we do want things to 2657 * go smoothly in case of just a SHUTDOWN w/o a 2658 * LOG_IO_ERROR. 2659 */ 2660 if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { 2661 /* 2662 * Can only perform callbacks in order. Since 2663 * this iclog is not in the DONE_SYNC/ 2664 * DO_CALLBACK state, we skip the rest and 2665 * just try to clean up. If we set our iclog 2666 * to DO_CALLBACK, we will not process it when 2667 * we retry since a previous iclog is in the 2668 * CALLBACK and the state cannot change since 2669 * we are holding the l_icloglock. 
2670 */ 2671 if (!(iclog->ic_state & 2672 (XLOG_STATE_DONE_SYNC | 2673 XLOG_STATE_DO_CALLBACK))) { 2674 if (ciclog && (ciclog->ic_state == 2675 XLOG_STATE_DONE_SYNC)) { 2676 ciclog->ic_state = XLOG_STATE_DO_CALLBACK; 2677 } 2678 break; 2679 } 2680 /* 2681 * We now have an iclog that is in either the 2682 * DO_CALLBACK or DONE_SYNC states. The other 2683 * states (WANT_SYNC, SYNCING, or CALLBACK were 2684 * caught by the above if and are going to 2685 * clean (i.e. we aren't doing their callbacks) 2686 * see the above if. 2687 */ 2688 2689 /* 2690 * We will do one more check here to see if we 2691 * have chased our tail around. 2692 */ 2693 2694 lowest_lsn = xlog_get_lowest_lsn(log); 2695 if (lowest_lsn && 2696 XFS_LSN_CMP(lowest_lsn, 2697 be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { 2698 iclog = iclog->ic_next; 2699 continue; /* Leave this iclog for 2700 * another thread */ 2701 } 2702 2703 iclog->ic_state = XLOG_STATE_CALLBACK; 2704 2705 2706 /* 2707 * Completion of a iclog IO does not imply that 2708 * a transaction has completed, as transactions 2709 * can be large enough to span many iclogs. We 2710 * cannot change the tail of the log half way 2711 * through a transaction as this may be the only 2712 * transaction in the log and moving th etail to 2713 * point to the middle of it will prevent 2714 * recovery from finding the start of the 2715 * transaction. Hence we should only update the 2716 * last_sync_lsn if this iclog contains 2717 * transaction completion callbacks on it. 2718 * 2719 * We have to do this before we drop the 2720 * icloglock to ensure we are the only one that 2721 * can update it. 2722 */ 2723 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), 2724 be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); 2725 if (!list_empty_careful(&iclog->ic_callbacks)) 2726 atomic64_set(&log->l_last_sync_lsn, 2727 be64_to_cpu(iclog->ic_header.h_lsn)); 2728 2729 } else 2730 ioerrors++; 2731 2732 spin_unlock(&log->l_icloglock); 2733 2734 /* 2735 * Keep processing entries in the callback list until 2736 * we come around and it is empty. We need to 2737 * atomically see that the list is empty and change the 2738 * state to DIRTY so that we don't miss any more 2739 * callbacks being added. 2740 */ 2741 spin_lock(&iclog->ic_callback_lock); 2742 while (!list_empty(&iclog->ic_callbacks)) { 2743 LIST_HEAD(tmp); 2744 2745 list_splice_init(&iclog->ic_callbacks, &tmp); 2746 2747 spin_unlock(&iclog->ic_callback_lock); 2748 xlog_cil_process_committed(&tmp, aborted); 2749 spin_lock(&iclog->ic_callback_lock); 2750 } 2751 2752 loopdidcallbacks++; 2753 funcdidcallbacks++; 2754 2755 spin_lock(&log->l_icloglock); 2756 spin_unlock(&iclog->ic_callback_lock); 2757 if (!(iclog->ic_state & XLOG_STATE_IOERROR)) 2758 iclog->ic_state = XLOG_STATE_DIRTY; 2759 2760 /* 2761 * Transition from DIRTY to ACTIVE if applicable. 2762 * NOP if STATE_IOERROR. 2763 */ 2764 xlog_state_clean_log(log); 2765 2766 /* wake up threads waiting in xfs_log_force() */ 2767 wake_up_all(&iclog->ic_force_wait); 2768 2769 iclog = iclog->ic_next; 2770 } while (first_iclog != iclog); 2771 2772 if (repeats > 5000) { 2773 flushcnt += repeats; 2774 repeats = 0; 2775 xfs_warn(log->l_mp, 2776 "%s: possible infinite loop (%d iterations)", 2777 __func__, flushcnt); 2778 } 2779 } while (!ioerrors && loopdidcallbacks); 2780 2781 #ifdef DEBUG 2782 /* 2783 * Make one last gasp attempt to see if iclogs are being left in limbo. 
2784 * If the above loop finds an iclog earlier than the current iclog and 2785 * in one of the syncing states, the current iclog is put into 2786 * DO_CALLBACK and the callbacks are deferred to the completion of the 2787 * earlier iclog. Walk the iclogs in order and make sure that no iclog 2788 * is in DO_CALLBACK unless an earlier iclog is in one of the syncing 2789 * states. 2790 * 2791 * Note that SYNCING|IOABORT is a valid state so we cannot just check 2792 * for ic_state == SYNCING. 2793 */ 2794 if (funcdidcallbacks) { 2795 first_iclog = iclog = log->l_iclog; 2796 do { 2797 ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); 2798 /* 2799 * Terminate the loop if iclogs are found in states 2800 * which will cause other threads to clean up iclogs. 2801 * 2802 * SYNCING - i/o completion will go through logs 2803 * DONE_SYNC - interrupt thread should be waiting for 2804 * l_icloglock 2805 * IOERROR - give up hope all ye who enter here 2806 */ 2807 if (iclog->ic_state == XLOG_STATE_WANT_SYNC || 2808 iclog->ic_state & XLOG_STATE_SYNCING || 2809 iclog->ic_state == XLOG_STATE_DONE_SYNC || 2810 iclog->ic_state == XLOG_STATE_IOERROR ) 2811 break; 2812 iclog = iclog->ic_next; 2813 } while (first_iclog != iclog); 2814 } 2815 #endif 2816 2817 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) 2818 wake = 1; 2819 spin_unlock(&log->l_icloglock); 2820 2821 if (wake) 2822 wake_up_all(&log->l_flush_wait); 2823 } 2824 2825 2826 /* 2827 * Finish transitioning this iclog to the dirty state. 2828 * 2829 * Make sure that we completely execute this routine only when this is 2830 * the last call to the iclog. There is a good chance that iclog flushes, 2831 * when we reach the end of the physical log, get turned into 2 separate 2832 * calls to bwrite. Hence, one iclog flush could generate two calls to this 2833 * routine. By using the reference count bwritecnt, we guarantee that only 2834 * the second completion goes through. 2835 * 2836 * Callbacks could take time, so they are done outside the scope of the 2837 * global state machine log lock. 2838 */ 2839 STATIC void 2840 xlog_state_done_syncing( 2841 struct xlog_in_core *iclog, 2842 bool aborted) 2843 { 2844 struct xlog *log = iclog->ic_log; 2845 2846 spin_lock(&log->l_icloglock); 2847 2848 ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || 2849 iclog->ic_state == XLOG_STATE_IOERROR); 2850 ASSERT(atomic_read(&iclog->ic_refcnt) == 0); 2851 2852 /* 2853 * If we got an error, either on the first buffer, or in the case of 2854 * split log writes, on the second, we mark ALL iclogs STATE_IOERROR, 2855 * and none should ever be attempted to be written to disk 2856 * again. 2857 */ 2858 if (iclog->ic_state != XLOG_STATE_IOERROR) 2859 iclog->ic_state = XLOG_STATE_DONE_SYNC; 2860 2861 /* 2862 * Someone could be sleeping prior to writing out the next 2863 * iclog buffer, we wake them all, one will get to do the 2864 * I/O, the others get to wait for the result. 2865 */ 2866 wake_up_all(&iclog->ic_write_wait); 2867 spin_unlock(&log->l_icloglock); 2868 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ 2869 } /* xlog_state_done_syncing */ 2870 2871 2872 /* 2873 * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must 2874 * sleep. We wait on the flush queue on the head iclog as that should be 2875 * the first iclog to complete flushing. Hence if all iclogs are syncing, 2876 * we will wait here and all new writes will sleep until a sync completes. 2877 * 2878 * The in-core logs are used in a circular fashion. 
They are not used 2879 * out-of-order even when an iclog past the head is free. 2880 * 2881 * return: 2882 * * log_offset where xlog_write() can start writing into the in-core 2883 * log's data space. 2884 * * in-core log pointer to which xlog_write() should write. 2885 * * boolean indicating this is a continued write to an in-core log. 2886 * If this is the last write, then the in-core log's offset field 2887 * needs to be incremented, depending on the amount of data which 2888 * is copied. 2889 */ 2890 STATIC int 2891 xlog_state_get_iclog_space( 2892 struct xlog *log, 2893 int len, 2894 struct xlog_in_core **iclogp, 2895 struct xlog_ticket *ticket, 2896 int *continued_write, 2897 int *logoffsetp) 2898 { 2899 int log_offset; 2900 xlog_rec_header_t *head; 2901 xlog_in_core_t *iclog; 2902 int error; 2903 2904 restart: 2905 spin_lock(&log->l_icloglock); 2906 if (XLOG_FORCED_SHUTDOWN(log)) { 2907 spin_unlock(&log->l_icloglock); 2908 return -EIO; 2909 } 2910 2911 iclog = log->l_iclog; 2912 if (iclog->ic_state != XLOG_STATE_ACTIVE) { 2913 XFS_STATS_INC(log->l_mp, xs_log_noiclogs); 2914 2915 /* Wait for log writes to have flushed */ 2916 xlog_wait(&log->l_flush_wait, &log->l_icloglock); 2917 goto restart; 2918 } 2919 2920 head = &iclog->ic_header; 2921 2922 atomic_inc(&iclog->ic_refcnt); /* prevents sync */ 2923 log_offset = iclog->ic_offset; 2924 2925 /* On the 1st write to an iclog, figure out lsn. This works 2926 * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are 2927 * committing to. If the offset is set, that's how many blocks 2928 * must be written. 2929 */ 2930 if (log_offset == 0) { 2931 ticket->t_curr_res -= log->l_iclog_hsize; 2932 xlog_tic_add_region(ticket, 2933 log->l_iclog_hsize, 2934 XLOG_REG_TYPE_LRHEADER); 2935 head->h_cycle = cpu_to_be32(log->l_curr_cycle); 2936 head->h_lsn = cpu_to_be64( 2937 xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block)); 2938 ASSERT(log->l_curr_block >= 0); 2939 } 2940 2941 /* If there is enough room to write everything, then do it. Otherwise, 2942 * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC 2943 * bit is on, so this will get flushed out. Don't update ic_offset 2944 * until you know exactly how many bytes get copied. Therefore, wait 2945 * until later to update ic_offset. 2946 * 2947 * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's 2948 * can fit into remaining data section. 2949 */ 2950 if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { 2951 xlog_state_switch_iclogs(log, iclog, iclog->ic_size); 2952 2953 /* 2954 * If I'm the only one writing to this iclog, sync it to disk. 2955 * We need to do an atomic compare and decrement here to avoid 2956 * racing with concurrent atomic_dec_and_lock() calls in 2957 * xlog_state_release_iclog() when there is more than one 2958 * reference to the iclog. 2959 */ 2960 if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { 2961 /* we are the only one */ 2962 spin_unlock(&log->l_icloglock); 2963 error = xlog_state_release_iclog(log, iclog); 2964 if (error) 2965 return error; 2966 } else { 2967 spin_unlock(&log->l_icloglock); 2968 } 2969 goto restart; 2970 } 2971 2972 /* Do we have enough room to write the full amount in the remainder 2973 * of this iclog? Or must we continue a write on the next iclog and 2974 * mark this iclog as completely taken? In the case where we switch 2975 * iclogs (to mark it taken), this particular iclog will release/sync 2976 * to disk in xlog_write(). 
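 * For example (sizes illustrative only): copying 60kB into a 32kB iclog that already holds 8kB takes the remaining space as a continued write; xlog_write() copies what fits, releases this iclog and comes back here for the next one.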
2977 */ 2978 if (len <= iclog->ic_size - iclog->ic_offset) { 2979 *continued_write = 0; 2980 iclog->ic_offset += len; 2981 } else { 2982 *continued_write = 1; 2983 xlog_state_switch_iclogs(log, iclog, iclog->ic_size); 2984 } 2985 *iclogp = iclog; 2986 2987 ASSERT(iclog->ic_offset <= iclog->ic_size); 2988 spin_unlock(&log->l_icloglock); 2989 2990 *logoffsetp = log_offset; 2991 return 0; 2992 } /* xlog_state_get_iclog_space */ 2993 2994 /* The first cnt-1 times through here we don't need to 2995 * move the grant write head because the permanent 2996 * reservation has reserved cnt times the unit amount. 2997 * Release part of current permanent unit reservation and 2998 * reset current reservation to be one units worth. Also 2999 * move grant reservation head forward. 3000 */ 3001 STATIC void 3002 xlog_regrant_reserve_log_space( 3003 struct xlog *log, 3004 struct xlog_ticket *ticket) 3005 { 3006 trace_xfs_log_regrant_reserve_enter(log, ticket); 3007 3008 if (ticket->t_cnt > 0) 3009 ticket->t_cnt--; 3010 3011 xlog_grant_sub_space(log, &log->l_reserve_head.grant, 3012 ticket->t_curr_res); 3013 xlog_grant_sub_space(log, &log->l_write_head.grant, 3014 ticket->t_curr_res); 3015 ticket->t_curr_res = ticket->t_unit_res; 3016 xlog_tic_reset_res(ticket); 3017 3018 trace_xfs_log_regrant_reserve_sub(log, ticket); 3019 3020 /* just return if we still have some of the pre-reserved space */ 3021 if (ticket->t_cnt > 0) 3022 return; 3023 3024 xlog_grant_add_space(log, &log->l_reserve_head.grant, 3025 ticket->t_unit_res); 3026 3027 trace_xfs_log_regrant_reserve_exit(log, ticket); 3028 3029 ticket->t_curr_res = ticket->t_unit_res; 3030 xlog_tic_reset_res(ticket); 3031 } /* xlog_regrant_reserve_log_space */ 3032 3033 3034 /* 3035 * Give back the space left from a reservation. 3036 * 3037 * All the information we need to make a correct determination of space left 3038 * is present. For non-permanent reservations, things are quite easy. The 3039 * count should have been decremented to zero. We only need to deal with the 3040 * space remaining in the current reservation part of the ticket. If the 3041 * ticket contains a permanent reservation, there may be left over space which 3042 * needs to be released. A count of N means that N-1 refills of the current 3043 * reservation can be done before we need to ask for more space. The first 3044 * one goes to fill up the first current reservation. Once we run out of 3045 * space, the count will stay at zero and the only space remaining will be 3046 * in the current reservation field. 3047 */ 3048 STATIC void 3049 xlog_ungrant_log_space( 3050 struct xlog *log, 3051 struct xlog_ticket *ticket) 3052 { 3053 int bytes; 3054 3055 if (ticket->t_cnt > 0) 3056 ticket->t_cnt--; 3057 3058 trace_xfs_log_ungrant_enter(log, ticket); 3059 trace_xfs_log_ungrant_sub(log, ticket); 3060 3061 /* 3062 * If this is a permanent reservation ticket, we may be able to free 3063 * up more space based on the remaining count. 3064 */ 3065 bytes = ticket->t_curr_res; 3066 if (ticket->t_cnt > 0) { 3067 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); 3068 bytes += ticket->t_unit_res*ticket->t_cnt; 3069 } 3070 3071 xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); 3072 xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); 3073 3074 trace_xfs_log_ungrant_exit(log, ticket); 3075 3076 xfs_log_space_wake(log->l_mp); 3077 } 3078 3079 /* 3080 * Flush iclog to disk if this is the last reference to the given iclog and 3081 * the WANT_SYNC bit is set. 
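 * (Each active writer holds a reference taken in xlog_state_get_iclog_space(); the last one to drop it here is the one that gets to push the iclog out to disk.)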
3082 * 3083 * When this function is entered, the iclog is not necessarily in the 3084 * WANT_SYNC state. It may be sitting around waiting to get filled. 3085 * 3086 * 3087 */ 3088 STATIC int 3089 xlog_state_release_iclog( 3090 struct xlog *log, 3091 struct xlog_in_core *iclog) 3092 { 3093 int sync = 0; /* do we sync? */ 3094 3095 if (iclog->ic_state & XLOG_STATE_IOERROR) 3096 return -EIO; 3097 3098 ASSERT(atomic_read(&iclog->ic_refcnt) > 0); 3099 if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) 3100 return 0; 3101 3102 if (iclog->ic_state & XLOG_STATE_IOERROR) { 3103 spin_unlock(&log->l_icloglock); 3104 return -EIO; 3105 } 3106 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || 3107 iclog->ic_state == XLOG_STATE_WANT_SYNC); 3108 3109 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { 3110 /* update tail before writing to iclog */ 3111 xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); 3112 sync++; 3113 iclog->ic_state = XLOG_STATE_SYNCING; 3114 iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); 3115 xlog_verify_tail_lsn(log, iclog, tail_lsn); 3116 /* cycle incremented when incrementing curr_block */ 3117 } 3118 spin_unlock(&log->l_icloglock); 3119 3120 /* 3121 * We let the log lock go, so it's possible that we hit a log I/O 3122 * error or some other SHUTDOWN condition that marks the iclog 3123 * as XLOG_STATE_IOERROR before the bwrite. However, we know that 3124 * this iclog has consistent data, so we ignore IOERROR 3125 * flags after this point. 3126 */ 3127 if (sync) 3128 xlog_sync(log, iclog); 3129 return 0; 3130 } /* xlog_state_release_iclog */ 3131 3132 3133 /* 3134 * This routine will mark the current iclog in the ring as WANT_SYNC 3135 * and move the current iclog pointer to the next iclog in the ring. 3136 * When this routine is called from xlog_state_get_iclog_space(), the 3137 * exact size of the iclog has not yet been determined. All we know is 3138 * that we have run out of space in this log record. 3139 */ 3140 STATIC void 3141 xlog_state_switch_iclogs( 3142 struct xlog *log, 3143 struct xlog_in_core *iclog, 3144 int eventual_size) 3145 { 3146 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); 3147 if (!eventual_size) 3148 eventual_size = iclog->ic_offset; 3149 iclog->ic_state = XLOG_STATE_WANT_SYNC; 3150 iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block); 3151 log->l_prev_block = log->l_curr_block; 3152 log->l_prev_cycle = log->l_curr_cycle; 3153 3154 /* roll log?: ic_offset changed later */ 3155 log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize); 3156 3157 /* Round up to next log-sunit */ 3158 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && 3159 log->l_mp->m_sb.sb_logsunit > 1) { 3160 uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); 3161 log->l_curr_block = roundup(log->l_curr_block, sunit_bb); 3162 } 3163 3164 if (log->l_curr_block >= log->l_logBBsize) { 3165 /* 3166 * Rewind the current block before the cycle is bumped to make 3167 * sure that the combined LSN never transiently moves forward 3168 * when the log wraps to the next cycle. This is to support the 3169 * unlocked sample of these fields from xlog_valid_lsn(). Most 3170 * other cases should acquire l_icloglock.
3171 */ 3172 log->l_curr_block -= log->l_logBBsize; 3173 ASSERT(log->l_curr_block >= 0); 3174 smp_wmb(); 3175 log->l_curr_cycle++; 3176 if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM) 3177 log->l_curr_cycle++; 3178 } 3179 ASSERT(iclog == log->l_iclog); 3180 log->l_iclog = iclog->ic_next; 3181 } /* xlog_state_switch_iclogs */ 3182 3183 /* 3184 * Write out all data in the in-core log as of this exact moment in time. 3185 * 3186 * Data may be written to the in-core log during this call. However, 3187 * we don't guarantee this data will be written out. A change from past 3188 * implementation means this routine will *not* write out zero length LRs. 3189 * 3190 * Basically, we try and perform an intelligent scan of the in-core logs. 3191 * If we determine there is no flushable data, we just return. There is no 3192 * flushable data if: 3193 * 3194 * 1. the current iclog is active and has no data; the previous iclog 3195 * is in the active or dirty state. 3196 * 2. the current iclog is drity, and the previous iclog is in the 3197 * active or dirty state. 3198 * 3199 * We may sleep if: 3200 * 3201 * 1. the current iclog is not in the active nor dirty state. 3202 * 2. the current iclog dirty, and the previous iclog is not in the 3203 * active nor dirty state. 3204 * 3. the current iclog is active, and there is another thread writing 3205 * to this particular iclog. 3206 * 4. a) the current iclog is active and has no other writers 3207 * b) when we return from flushing out this iclog, it is still 3208 * not in the active nor dirty state. 3209 */ 3210 int 3211 xfs_log_force( 3212 struct xfs_mount *mp, 3213 uint flags) 3214 { 3215 struct xlog *log = mp->m_log; 3216 struct xlog_in_core *iclog; 3217 xfs_lsn_t lsn; 3218 3219 XFS_STATS_INC(mp, xs_log_force); 3220 trace_xfs_log_force(mp, 0, _RET_IP_); 3221 3222 xlog_cil_force(log); 3223 3224 spin_lock(&log->l_icloglock); 3225 iclog = log->l_iclog; 3226 if (iclog->ic_state & XLOG_STATE_IOERROR) 3227 goto out_error; 3228 3229 if (iclog->ic_state == XLOG_STATE_DIRTY || 3230 (iclog->ic_state == XLOG_STATE_ACTIVE && 3231 atomic_read(&iclog->ic_refcnt) == 0 && iclog->ic_offset == 0)) { 3232 /* 3233 * If the head is dirty or (active and empty), then we need to 3234 * look at the previous iclog. 3235 * 3236 * If the previous iclog is active or dirty we are done. There 3237 * is nothing to sync out. Otherwise, we attach ourselves to the 3238 * previous iclog and go to sleep. 3239 */ 3240 iclog = iclog->ic_prev; 3241 if (iclog->ic_state == XLOG_STATE_ACTIVE || 3242 iclog->ic_state == XLOG_STATE_DIRTY) 3243 goto out_unlock; 3244 } else if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3245 if (atomic_read(&iclog->ic_refcnt) == 0) { 3246 /* 3247 * We are the only one with access to this iclog. 3248 * 3249 * Flush it out now. There should be a roundoff of zero 3250 * to show that someone has already taken care of the 3251 * roundoff from the previous sync. 3252 */ 3253 atomic_inc(&iclog->ic_refcnt); 3254 lsn = be64_to_cpu(iclog->ic_header.h_lsn); 3255 xlog_state_switch_iclogs(log, iclog, 0); 3256 spin_unlock(&log->l_icloglock); 3257 3258 if (xlog_state_release_iclog(log, iclog)) 3259 return -EIO; 3260 3261 spin_lock(&log->l_icloglock); 3262 if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn || 3263 iclog->ic_state == XLOG_STATE_DIRTY) 3264 goto out_unlock; 3265 } else { 3266 /* 3267 * Someone else is writing to this iclog. 3268 * 3269 * Use its call to flush out the data. However, the 3270 * other thread may not force out this LR, so we mark 3271 * it WANT_SYNC. 
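 * That way the iclog is pushed out when the other writer's reference finally drops, and if the caller asked for XFS_LOG_SYNC we wait for that completion below.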
3272 */ 3273 xlog_state_switch_iclogs(log, iclog, 0); 3274 } 3275 } else { 3276 /* 3277 * If the head iclog is not active nor dirty, we just attach 3278 * ourselves to the head and go to sleep if necessary. 3279 */ 3280 ; 3281 } 3282 3283 if (!(flags & XFS_LOG_SYNC)) 3284 goto out_unlock; 3285 3286 if (iclog->ic_state & XLOG_STATE_IOERROR) 3287 goto out_error; 3288 XFS_STATS_INC(mp, xs_log_force_sleep); 3289 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); 3290 if (iclog->ic_state & XLOG_STATE_IOERROR) 3291 return -EIO; 3292 return 0; 3293 3294 out_unlock: 3295 spin_unlock(&log->l_icloglock); 3296 return 0; 3297 out_error: 3298 spin_unlock(&log->l_icloglock); 3299 return -EIO; 3300 } 3301 3302 static int 3303 __xfs_log_force_lsn( 3304 struct xfs_mount *mp, 3305 xfs_lsn_t lsn, 3306 uint flags, 3307 int *log_flushed, 3308 bool already_slept) 3309 { 3310 struct xlog *log = mp->m_log; 3311 struct xlog_in_core *iclog; 3312 3313 spin_lock(&log->l_icloglock); 3314 iclog = log->l_iclog; 3315 if (iclog->ic_state & XLOG_STATE_IOERROR) 3316 goto out_error; 3317 3318 while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { 3319 iclog = iclog->ic_next; 3320 if (iclog == log->l_iclog) 3321 goto out_unlock; 3322 } 3323 3324 if (iclog->ic_state == XLOG_STATE_DIRTY) 3325 goto out_unlock; 3326 3327 if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3328 /* 3329 * We sleep here if we haven't already slept (e.g. this is the 3330 * first time we've looked at the correct iclog buf) and the 3331 * buffer before us is going to be sync'ed. The reason for this 3332 * is that if we are doing sync transactions here, by waiting 3333 * for the previous I/O to complete, we can allow a few more 3334 * transactions into this iclog before we close it down. 3335 * 3336 * Otherwise, we mark the buffer WANT_SYNC, and bump up the 3337 * refcnt so we can release the log (which drops the ref count). 3338 * The state switch keeps new transaction commits from using 3339 * this buffer. When the current commits finish writing into 3340 * the buffer, the refcount will drop to zero and the buffer 3341 * will go out then. 3342 */ 3343 if (!already_slept && 3344 (iclog->ic_prev->ic_state & 3345 (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { 3346 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); 3347 3348 XFS_STATS_INC(mp, xs_log_force_sleep); 3349 3350 xlog_wait(&iclog->ic_prev->ic_write_wait, 3351 &log->l_icloglock); 3352 return -EAGAIN; 3353 } 3354 atomic_inc(&iclog->ic_refcnt); 3355 xlog_state_switch_iclogs(log, iclog, 0); 3356 spin_unlock(&log->l_icloglock); 3357 if (xlog_state_release_iclog(log, iclog)) 3358 return -EIO; 3359 if (log_flushed) 3360 *log_flushed = 1; 3361 spin_lock(&log->l_icloglock); 3362 } 3363 3364 if (!(flags & XFS_LOG_SYNC) || 3365 (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) 3366 goto out_unlock; 3367 3368 if (iclog->ic_state & XLOG_STATE_IOERROR) 3369 goto out_error; 3370 3371 XFS_STATS_INC(mp, xs_log_force_sleep); 3372 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); 3373 if (iclog->ic_state & XLOG_STATE_IOERROR) 3374 return -EIO; 3375 return 0; 3376 3377 out_unlock: 3378 spin_unlock(&log->l_icloglock); 3379 return 0; 3380 out_error: 3381 spin_unlock(&log->l_icloglock); 3382 return -EIO; 3383 } 3384 3385 /* 3386 * Force the in-core log to disk for a specific LSN. 3387 * 3388 * Find in-core log with lsn. 3389 * If it is in the DIRTY state, just return. 3390 * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC 3391 * state and go to sleep or return. 
3392 * If it is in any other state, go to sleep or return. 3393 * 3394 * Synchronous forces are implemented with a wait queue. All callers trying 3395 * to force a given lsn to disk must wait on the queue attached to the 3396 * specific in-core log. When given in-core log finally completes its write 3397 * to disk, that thread will wake up all threads waiting on the queue. 3398 */ 3399 int 3400 xfs_log_force_lsn( 3401 struct xfs_mount *mp, 3402 xfs_lsn_t lsn, 3403 uint flags, 3404 int *log_flushed) 3405 { 3406 int ret; 3407 ASSERT(lsn != 0); 3408 3409 XFS_STATS_INC(mp, xs_log_force); 3410 trace_xfs_log_force(mp, lsn, _RET_IP_); 3411 3412 lsn = xlog_cil_force_lsn(mp->m_log, lsn); 3413 if (lsn == NULLCOMMITLSN) 3414 return 0; 3415 3416 ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false); 3417 if (ret == -EAGAIN) 3418 ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true); 3419 return ret; 3420 } 3421 3422 /* 3423 * Called when we want to mark the current iclog as being ready to sync to 3424 * disk. 3425 */ 3426 STATIC void 3427 xlog_state_want_sync( 3428 struct xlog *log, 3429 struct xlog_in_core *iclog) 3430 { 3431 assert_spin_locked(&log->l_icloglock); 3432 3433 if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3434 xlog_state_switch_iclogs(log, iclog, 0); 3435 } else { 3436 ASSERT(iclog->ic_state & 3437 (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); 3438 } 3439 } 3440 3441 3442 /***************************************************************************** 3443 * 3444 * TICKET functions 3445 * 3446 ***************************************************************************** 3447 */ 3448 3449 /* 3450 * Free a used ticket when its refcount falls to zero. 3451 */ 3452 void 3453 xfs_log_ticket_put( 3454 xlog_ticket_t *ticket) 3455 { 3456 ASSERT(atomic_read(&ticket->t_ref) > 0); 3457 if (atomic_dec_and_test(&ticket->t_ref)) 3458 kmem_zone_free(xfs_log_ticket_zone, ticket); 3459 } 3460 3461 xlog_ticket_t * 3462 xfs_log_ticket_get( 3463 xlog_ticket_t *ticket) 3464 { 3465 ASSERT(atomic_read(&ticket->t_ref) > 0); 3466 atomic_inc(&ticket->t_ref); 3467 return ticket; 3468 } 3469 3470 /* 3471 * Figure out the total log space unit (in bytes) that would be 3472 * required for a log ticket. 3473 */ 3474 int 3475 xfs_log_calc_unit_res( 3476 struct xfs_mount *mp, 3477 int unit_bytes) 3478 { 3479 struct xlog *log = mp->m_log; 3480 int iclog_space; 3481 uint num_headers; 3482 3483 /* 3484 * Permanent reservations have up to 'cnt'-1 active log operations 3485 * in the log. A unit in this case is the amount of space for one 3486 * of these log operations. Normal reservations have a cnt of 1 3487 * and their unit amount is the total amount of space required. 3488 * 3489 * The following lines of code account for non-transaction data 3490 * which occupy space in the on-disk log. 3491 * 3492 * Normal form of a transaction is: 3493 * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph> 3494 * and then there are LR hdrs, split-recs and roundoff at end of syncs. 3495 * 3496 * We need to account for all the leadup data and trailer data 3497 * around the transaction data. 3498 * And then we need to account for the worst case in terms of using 3499 * more space. 3500 * The worst case will happen if: 3501 * - the placement of the transaction happens to be such that the 3502 * roundoff is at its maximum 3503 * - the transaction data is synced before the commit record is synced 3504 * i.e. 
<transaction-data><roundoff> | <commit-rec><roundoff> 3505 * Therefore the commit record is in its own Log Record. 3506 * This can happen as the commit record is called with its 3507 * own region to xlog_write(). 3508 * This then means that in the worst case, roundoff can happen for 3509 * the commit-rec as well. 3510 * The commit-rec is smaller than padding in this scenario and so it is 3511 * not added separately. 3512 */ 3513 3514 /* for trans header */ 3515 unit_bytes += sizeof(xlog_op_header_t); 3516 unit_bytes += sizeof(xfs_trans_header_t); 3517 3518 /* for start-rec */ 3519 unit_bytes += sizeof(xlog_op_header_t); 3520 3521 /* 3522 * for LR headers - the space for data in an iclog is the size minus 3523 * the space used for the headers. If we use the iclog size, then we 3524 * undercalculate the number of headers required. 3525 * 3526 * Furthermore - the addition of op headers for split-recs might 3527 * increase the space required enough to require more log and op 3528 * headers, so take that into account too. 3529 * 3530 * IMPORTANT: This reservation makes the assumption that if this 3531 * transaction is the first in an iclog and hence has the LR headers 3532 * accounted to it, then the remaining space in the iclog is 3533 * exclusively for this transaction. i.e. if the transaction is larger 3534 * than the iclog, it will be the only thing in that iclog. 3535 * Fundamentally, this means we must pass the entire log vector to 3536 * xlog_write to guarantee this. 3537 */ 3538 iclog_space = log->l_iclog_size - log->l_iclog_hsize; 3539 num_headers = howmany(unit_bytes, iclog_space); 3540 3541 /* for split-recs - ophdrs added when data split over LRs */ 3542 unit_bytes += sizeof(xlog_op_header_t) * num_headers; 3543 3544 /* add extra header reservations if we overrun */ 3545 while (!num_headers || 3546 howmany(unit_bytes, iclog_space) > num_headers) { 3547 unit_bytes += sizeof(xlog_op_header_t); 3548 num_headers++; 3549 } 3550 unit_bytes += log->l_iclog_hsize * num_headers; 3551 3552 /* for commit-rec LR header - note: padding will subsume the ophdr */ 3553 unit_bytes += log->l_iclog_hsize; 3554 3555 /* for roundoff padding for transaction data and one for commit record */ 3556 if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) { 3557 /* log su roundoff */ 3558 unit_bytes += 2 * mp->m_sb.sb_logsunit; 3559 } else { 3560 /* BB roundoff */ 3561 unit_bytes += 2 * BBSIZE; 3562 } 3563 3564 return unit_bytes; 3565 } 3566 3567 /* 3568 * Allocate and initialise a new log ticket. 
3569 */ 3570 struct xlog_ticket * 3571 xlog_ticket_alloc( 3572 struct xlog *log, 3573 int unit_bytes, 3574 int cnt, 3575 char client, 3576 bool permanent, 3577 xfs_km_flags_t alloc_flags) 3578 { 3579 struct xlog_ticket *tic; 3580 int unit_res; 3581 3582 tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); 3583 if (!tic) 3584 return NULL; 3585 3586 unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); 3587 3588 atomic_set(&tic->t_ref, 1); 3589 tic->t_task = current; 3590 INIT_LIST_HEAD(&tic->t_queue); 3591 tic->t_unit_res = unit_res; 3592 tic->t_curr_res = unit_res; 3593 tic->t_cnt = cnt; 3594 tic->t_ocnt = cnt; 3595 tic->t_tid = prandom_u32(); 3596 tic->t_clientid = client; 3597 tic->t_flags = XLOG_TIC_INITED; 3598 if (permanent) 3599 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3600 3601 xlog_tic_reset_res(tic); 3602 3603 return tic; 3604 } 3605 3606 3607 /****************************************************************************** 3608 * 3609 * Log debug routines 3610 * 3611 ****************************************************************************** 3612 */ 3613 #if defined(DEBUG) 3614 /* 3615 * Make sure that the destination ptr is within the valid data region of 3616 * one of the iclogs. This uses backup pointers stored in a different 3617 * part of the log in case we trash the log structure. 3618 */ 3619 STATIC void 3620 xlog_verify_dest_ptr( 3621 struct xlog *log, 3622 void *ptr) 3623 { 3624 int i; 3625 int good_ptr = 0; 3626 3627 for (i = 0; i < log->l_iclog_bufs; i++) { 3628 if (ptr >= log->l_iclog_bak[i] && 3629 ptr <= log->l_iclog_bak[i] + log->l_iclog_size) 3630 good_ptr++; 3631 } 3632 3633 if (!good_ptr) 3634 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); 3635 } 3636 3637 /* 3638 * Check to make sure the grant write head didn't just overlap the tail. If 3639 * the cycles are the same, we can't be overlapping. Otherwise, make sure that 3640 * the cycles differ by exactly one and check the byte count. 3641 * 3642 * This check is run unlocked, so can give false positives. Rather than assert 3643 * on failures, use a warn-once flag and a panic tag to allow the admin to 3644 * determine if they want to panic the machine when such an error occurs. For 3645 * debug kernels this will have the same effect as using an assert but, unlike 3646 * an assert, it can be turned off at runtime.
3647 */ 3648 STATIC void 3649 xlog_verify_grant_tail( 3650 struct xlog *log) 3651 { 3652 int tail_cycle, tail_blocks; 3653 int cycle, space; 3654 3655 xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space); 3656 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); 3657 if (tail_cycle != cycle) { 3658 if (cycle - 1 != tail_cycle && 3659 !(log->l_flags & XLOG_TAIL_WARN)) { 3660 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, 3661 "%s: cycle - 1 != tail_cycle", __func__); 3662 log->l_flags |= XLOG_TAIL_WARN; 3663 } 3664 3665 if (space > BBTOB(tail_blocks) && 3666 !(log->l_flags & XLOG_TAIL_WARN)) { 3667 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, 3668 "%s: space > BBTOB(tail_blocks)", __func__); 3669 log->l_flags |= XLOG_TAIL_WARN; 3670 } 3671 } 3672 } 3673 3674 /* check if it will fit */ 3675 STATIC void 3676 xlog_verify_tail_lsn( 3677 struct xlog *log, 3678 struct xlog_in_core *iclog, 3679 xfs_lsn_t tail_lsn) 3680 { 3681 int blocks; 3682 3683 if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { 3684 blocks = 3685 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); 3686 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) 3687 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); 3688 } else { 3689 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); 3690 3691 if (BLOCK_LSN(tail_lsn) == log->l_prev_block) 3692 xfs_emerg(log->l_mp, "%s: tail wrapped", __func__); 3693 3694 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; 3695 if (blocks < BTOBB(iclog->ic_offset) + 1) 3696 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); 3697 } 3698 } /* xlog_verify_tail_lsn */ 3699 3700 /* 3701 * Perform a number of checks on the iclog before writing to disk. 3702 * 3703 * 1. Make sure the iclogs are still circular 3704 * 2. Make sure we have a good magic number 3705 * 3. Make sure we don't have magic numbers in the data 3706 * 4. Check fields of each log operation header for: 3707 * A. Valid client identifier 3708 * B. tid ptr value falls in valid ptr space (user space code) 3709 * C. Length in log record header is correct according to the 3710 * individual operation headers within record. 3711 * 5. When a bwrite will occur within 5 blocks of the front of the physical 3712 * log, check the preceding blocks of the physical log to make sure all 3713 * the cycle numbers agree with the current cycle number. 
 */
STATIC void
xlog_verify_iclog(
	struct xlog		*log,
	struct xlog_in_core	*iclog,
	int			count)
{
	xlog_op_header_t	*ophead;
	xlog_in_core_t		*icptr;
	xlog_in_core_2_t	*xhdr;
	void			*base_ptr, *ptr, *p;
	ptrdiff_t		field_offset;
	uint8_t			clientid;
	int			len, i, j, k, op_len;
	int			idx;

	/* check validity of iclog pointers */
	spin_lock(&log->l_icloglock);
	icptr = log->l_iclog;
	for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next)
		ASSERT(icptr);

	if (icptr != log->l_iclog)
		xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
	spin_unlock(&log->l_icloglock);

	/* check log magic numbers */
	if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
		xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);

	base_ptr = ptr = &iclog->ic_header;
	p = &iclog->ic_header;
	for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) {
		if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
			xfs_emerg(log->l_mp, "%s: unexpected magic num",
				__func__);
	}

	/* check fields */
	len = be32_to_cpu(iclog->ic_header.h_num_logops);
	base_ptr = ptr = iclog->ic_datap;
	ophead = ptr;
	xhdr = iclog->ic_data;
	for (i = 0; i < len; i++) {
		ophead = ptr;

		/* clientid is only 1 byte */
		p = &ophead->oh_clientid;
		field_offset = p - base_ptr;
		if (field_offset & 0x1ff) {
			clientid = ophead->oh_clientid;
		} else {
			idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
			if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
				j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
				k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
				clientid = xlog_get_client_id(
					xhdr[j].hic_xheader.xh_cycle_data[k]);
			} else {
				clientid = xlog_get_client_id(
					iclog->ic_header.h_cycle_data[idx]);
			}
		}
		if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
			xfs_warn(log->l_mp,
				"%s: invalid clientid %d op "PTR_FMT" offset 0x%lx",
				__func__, clientid, ophead,
				(unsigned long)field_offset);

		/* check length */
		p = &ophead->oh_len;
		field_offset = p - base_ptr;
		if (field_offset & 0x1ff) {
			op_len = be32_to_cpu(ophead->oh_len);
		} else {
			idx = BTOBBT((uintptr_t)&ophead->oh_len -
				    (uintptr_t)iclog->ic_datap);
			if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
				j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
				k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
				op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]);
			} else {
				op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]);
			}
		}
		ptr += sizeof(xlog_op_header_t) + op_len;
	}
}	/* xlog_verify_iclog */
#endif

/*
 * Mark all iclogs IOERROR. l_icloglock is held by the caller.
 */
STATIC int
xlog_state_ioerror(
	struct xlog	*log)
{
	xlog_in_core_t	*iclog, *ic;

	iclog = log->l_iclog;
	if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
		/*
		 * Mark all the incore logs IOERROR.
		 * From now on, no log flushes will result.
		 */
		ic = iclog;
		do {
			ic->ic_state = XLOG_STATE_IOERROR;
			ic = ic->ic_next;
		} while (ic != iclog);
		return 0;
	}
	/*
	 * Return non-zero if the state transition has already happened.
	 */
	return 1;
}

/*
 * This is called from xfs_force_shutdown, when we're forcibly
 * shutting down the filesystem, typically because of an IO error.
 * Our main objectives here are to make sure that:
 *	a. if !logerror, flush the logs to disk. Anything modified
 *	   after this is ignored.
 *	b. the filesystem gets marked 'SHUTDOWN' for all interested
 *	   parties to find out, 'atomically'.
 *	c. those who're sleeping on log reservations, pinned objects and
 *	   other resources get woken up and told the bad news.
 *	d. nothing new gets queued up after (b) and (c) are done.
 *
 * Note: for the !logerror case we need to flush the regions held in memory out
 * to disk first. This needs to be done before the log is marked as shutdown,
 * otherwise the iclog writes will fail.
 */
int
xfs_log_force_umount(
	struct xfs_mount	*mp,
	int			logerror)
{
	struct xlog	*log;
	int		retval;

	log = mp->m_log;

	/*
	 * If this happens during log recovery, don't worry about
	 * locking; the log isn't open for business yet.
	 */
	if (!log ||
	    log->l_flags & XLOG_ACTIVE_RECOVERY) {
		mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
		if (mp->m_sb_bp)
			mp->m_sb_bp->b_flags |= XBF_DONE;
		return 0;
	}

	/*
	 * Somebody could've already done the hard work for us.
	 * No need to get locks for this.
	 */
	if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) {
		ASSERT(XLOG_FORCED_SHUTDOWN(log));
		return 1;
	}

	/*
	 * Flush all the completed transactions to disk before marking the log
	 * as being shut down. We need to do it in this order to ensure that
	 * completed operations are safely on disk before we shut down, and
	 * that we don't have to issue any buffer IO after the shutdown flags
	 * are set to guarantee this.
	 */
	if (!logerror)
		xfs_log_force(mp, XFS_LOG_SYNC);

	/*
	 * Mark the filesystem and the log as in a shutdown state and wake
	 * everybody up to tell them the bad news.
	 */
	spin_lock(&log->l_icloglock);
	mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
	if (mp->m_sb_bp)
		mp->m_sb_bp->b_flags |= XBF_DONE;

	/*
	 * Mark the log and the iclogs with IO error flags to prevent any
	 * further log IO from being issued or completed.
	 */
	log->l_flags |= XLOG_IO_ERROR;
	retval = xlog_state_ioerror(log);
	spin_unlock(&log->l_icloglock);

	/*
	 * We don't want anybody waiting for log reservations after this. That
	 * means we have to wake up everybody queued up on reserveq as well as
	 * writeq. In addition, we make sure in xlog_{re}grant_log_space that
	 * we don't enqueue anything once the SHUTDOWN flag is set, and this
	 * action is protected by the grant locks.
	 */
	xlog_grant_head_wake_all(&log->l_reserve_head);
	xlog_grant_head_wake_all(&log->l_write_head);

	/*
	 * Wake up everybody waiting on xfs_log_force. Wake the CIL push first
	 * as if the log writes were completed. The abort handling in the log
	 * item committed callback functions will do this again under lock to
	 * avoid races.
	 */
	wake_up_all(&log->l_cilp->xc_commit_wait);
	xlog_state_do_callback(log, true, NULL);

#ifdef XFSERRORDEBUG
	{
		xlog_in_core_t	*iclog;

		spin_lock(&log->l_icloglock);
		iclog = log->l_iclog;
		do {
			ASSERT(iclog->ic_callback == 0);
			iclog = iclog->ic_next;
		} while (iclog != log->l_iclog);
		spin_unlock(&log->l_icloglock);
	}
#endif
	/* return non-zero if log IOERROR transition had already happened */
	return retval;
}

STATIC int
xlog_iclogs_empty(
	struct xlog	*log)
{
	xlog_in_core_t	*iclog;

	iclog = log->l_iclog;
	do {
		/*
		 * endianness does not matter here, zero is zero in
		 * any language.
		 */
		if (iclog->ic_header.h_num_logops)
			return 0;
		iclog = iclog->ic_next;
	} while (iclog != log->l_iclog);
	return 1;
}

/*
 * Verify that an LSN stamped into a piece of metadata is valid. This is
 * intended for use in read verifiers on v5 superblocks.
 */
bool
xfs_log_check_lsn(
	struct xfs_mount	*mp,
	xfs_lsn_t		lsn)
{
	struct xlog		*log = mp->m_log;
	bool			valid;

	/*
	 * norecovery mode skips mount-time log processing and unconditionally
	 * resets the in-core LSN. We can't validate in this mode, but
	 * modifications are not allowed anyway so just return true.
	 */
	if (mp->m_flags & XFS_MOUNT_NORECOVERY)
		return true;

	/*
	 * Some metadata LSNs are initialized to NULL (e.g., the agfl). This is
	 * handled by recovery and thus safe to ignore here.
	 */
	if (lsn == NULLCOMMITLSN)
		return true;

	valid = xlog_valid_lsn(mp->m_log, lsn);

	/* warn the user about what's gone wrong before verifier failure */
	if (!valid) {
		spin_lock(&log->l_icloglock);
		xfs_warn(mp,
"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). "
"Please unmount and run xfs_repair (>= v4.3) to resolve.",
			 CYCLE_LSN(lsn), BLOCK_LSN(lsn),
			 log->l_curr_cycle, log->l_curr_block);
		spin_unlock(&log->l_icloglock);
	}

	return valid;
}

bool
xfs_log_in_recovery(
	struct xfs_mount	*mp)
{
	struct xlog		*log = mp->m_log;

	return log->l_flags & XLOG_ACTIVE_RECOVERY;
}
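
/*
 * Illustrative usage sketch for xfs_log_check_lsn() above (added commentary,
 * not part of the original source): metadata read verifiers on v5
 * filesystems typically reject a buffer whose stamped LSN is ahead of the
 * current log head, roughly along these lines. xfs_foo_hdr and
 * xfs_foo_verify are hypothetical names; the real AGF and AGI verifiers
 * follow the same pattern.
 *
 *	static xfs_failaddr_t
 *	xfs_foo_verify(
 *		struct xfs_buf		*bp)
 *	{
 *		struct xfs_mount	*mp = bp->b_mount;
 *		struct xfs_foo_hdr	*hdr = bp->b_addr;
 *
 *		if (xfs_sb_version_hascrc(&mp->m_sb) &&
 *		    !xfs_log_check_lsn(mp, be64_to_cpu(hdr->lsn)))
 *			return __this_address;
 *		return NULL;
 *	}
 */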