1 /* 2 * Copyright (C) 2015 Shaohua Li <shli@fb.com> 3 * Copyright (C) 2016 Song Liu <songliubraving@fb.com> 4 * 5 * This program is free software; you can redistribute it and/or modify it 6 * under the terms and conditions of the GNU General Public License, 7 * version 2, as published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 * more details. 13 * 14 */ 15 #include <linux/kernel.h> 16 #include <linux/wait.h> 17 #include <linux/blkdev.h> 18 #include <linux/slab.h> 19 #include <linux/raid/md_p.h> 20 #include <linux/crc32c.h> 21 #include <linux/random.h> 22 #include <linux/kthread.h> 23 #include "md.h" 24 #include "raid5.h" 25 #include "bitmap.h" 26 27 /* 28 * metadata/data stored in disk with 4k size unit (a block) regardless 29 * underneath hardware sector size. only works with PAGE_SIZE == 4096 30 */ 31 #define BLOCK_SECTORS (8) 32 33 /* 34 * log->max_free_space is min(1/4 disk size, 10G reclaimable space). 35 * 36 * In write through mode, the reclaim runs every log->max_free_space. 37 * This can prevent the recovery scans for too long 38 */ 39 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ 40 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2) 41 42 /* wake up reclaim thread periodically */ 43 #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ) 44 /* start flush with these full stripes */ 45 #define R5C_FULL_STRIPE_FLUSH_BATCH 256 46 /* reclaim stripes in groups */ 47 #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2) 48 49 /* 50 * We only need 2 bios per I/O unit to make progress, but ensure we 51 * have a few more available to not get too tight. 52 */ 53 #define R5L_POOL_SIZE 4 54 55 /* 56 * r5c journal modes of the array: write-back or write-through. 57 * write-through mode has identical behavior as existing log only 58 * implementation. 59 */ 60 enum r5c_journal_mode { 61 R5C_JOURNAL_MODE_WRITE_THROUGH = 0, 62 R5C_JOURNAL_MODE_WRITE_BACK = 1, 63 }; 64 65 static char *r5c_journal_mode_str[] = {"write-through", 66 "write-back"}; 67 /* 68 * raid5 cache state machine 69 * 70 * With the RAID cache, each stripe works in two phases: 71 * - caching phase 72 * - writing-out phase 73 * 74 * These two phases are controlled by bit STRIPE_R5C_CACHING: 75 * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase 76 * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase 77 * 78 * When there is no journal, or the journal is in write-through mode, 79 * the stripe is always in writing-out phase. 80 * 81 * For write-back journal, the stripe is sent to caching phase on write 82 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off 83 * the write-out phase by clearing STRIPE_R5C_CACHING. 84 * 85 * Stripes in caching phase do not write the raid disks. Instead, all 86 * writes are committed from the log device. 
 * Therefore, a stripe in caching phase handles writes as:
 * - write to log device
 * - return IO
 *
 * Stripes in writing-out phase handle writes as:
 * - calculate parity
 * - write pending data and parity to journal
 * - write data and parity to raid disks
 * - return IO for pending writes
 */

struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, rounded to
					 * BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim runs if free space is at
					 * this size */

	sector_t last_checkpoint;	/* log tail. where recovery scan
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head. where new data appends */
	u64 seq;			/* log head sequence */

	sector_t next_checkpoint;

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet
					 * written to the RAID */
	struct list_head flushing_ios;	/* io_units which are waiting for log
					 * cache flush */
	struct list_head finished_ios;	/* io_units which have settled down in
					 * the log disk */
	struct bio flush_bio;

	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */

	struct kmem_cache *io_kc;
	mempool_t *io_pool;
	struct bio_set *bs;
	mempool_t *meta_pool;

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* amount of space that needs to be
					 * reclaimed. if it's 0, reclaim spaces
					 * used by io_units which are already in
					 * IO_UNIT_STRIPE_END state (i.e. reclaim
					 * doesn't wait for a specific io_unit
					 * to switch to the IO_UNIT_STRIPE_END
					 * state) */
	wait_queue_head_t iounit_wait;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;

	bool need_cache_flush;

	/* for r5c_cache */
	enum r5c_journal_mode r5c_journal_mode;

	/* all stripes in r5cache, in the order of seq at sh->log_start */
	struct list_head stripe_in_journal_list;

	spinlock_t stripe_in_journal_lock;
	atomic_t stripe_in_journal_count;

	/* to submit async io_units, to fulfill ordering of flush */
	struct work_struct deferred_io_work;
	/* to disable write-back while the array is degraded */
	struct work_struct disable_writeback_work;
};

/*
 * An IO range starts at a meta data block and ends at the next meta data
 * block. The io_unit's meta data block tracks the data/parity that follows it.
io 172 * unit is written to log disk with normal write, as we always flush log disk 173 * first and then start move data to raid disks, there is no requirement to 174 * write io unit with FLUSH/FUA 175 */ 176 struct r5l_io_unit { 177 struct r5l_log *log; 178 179 struct page *meta_page; /* store meta block */ 180 int meta_offset; /* current offset in meta_page */ 181 182 struct bio *current_bio;/* current_bio accepting new data */ 183 184 atomic_t pending_stripe;/* how many stripes not flushed to raid */ 185 u64 seq; /* seq number of the metablock */ 186 sector_t log_start; /* where the io_unit starts */ 187 sector_t log_end; /* where the io_unit ends */ 188 struct list_head log_sibling; /* log->running_ios */ 189 struct list_head stripe_list; /* stripes added to the io_unit */ 190 191 int state; 192 bool need_split_bio; 193 struct bio *split_bio; 194 195 unsigned int has_flush:1; /* include flush request */ 196 unsigned int has_fua:1; /* include fua request */ 197 unsigned int has_null_flush:1; /* include empty flush request */ 198 /* 199 * io isn't sent yet, flush/fua request can only be submitted till it's 200 * the first IO in running_ios list 201 */ 202 unsigned int io_deferred:1; 203 204 struct bio_list flush_barriers; /* size == 0 flush bios */ 205 }; 206 207 /* r5l_io_unit state */ 208 enum r5l_io_unit_state { 209 IO_UNIT_RUNNING = 0, /* accepting new IO */ 210 IO_UNIT_IO_START = 1, /* io_unit bio start writing to log, 211 * don't accepting new bio */ 212 IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */ 213 IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ 214 }; 215 216 bool r5c_is_writeback(struct r5l_log *log) 217 { 218 return (log != NULL && 219 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK); 220 } 221 222 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) 223 { 224 start += inc; 225 if (start >= log->device_size) 226 start = start - log->device_size; 227 return start; 228 } 229 230 static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start, 231 sector_t end) 232 { 233 if (end >= start) 234 return end - start; 235 else 236 return end + log->device_size - start; 237 } 238 239 static bool r5l_has_free_space(struct r5l_log *log, sector_t size) 240 { 241 sector_t used_size; 242 243 used_size = r5l_ring_distance(log, log->last_checkpoint, 244 log->log_start); 245 246 return log->device_size > used_size + size; 247 } 248 249 static void __r5l_set_io_unit_state(struct r5l_io_unit *io, 250 enum r5l_io_unit_state state) 251 { 252 if (WARN_ON(io->state >= state)) 253 return; 254 io->state = state; 255 } 256 257 static void 258 r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev, 259 struct bio_list *return_bi) 260 { 261 struct bio *wbi, *wbi2; 262 263 wbi = dev->written; 264 dev->written = NULL; 265 while (wbi && wbi->bi_iter.bi_sector < 266 dev->sector + STRIPE_SECTORS) { 267 wbi2 = r5_next_bio(wbi, dev->sector); 268 if (!raid5_dec_bi_active_stripes(wbi)) { 269 md_write_end(conf->mddev); 270 bio_list_add(return_bi, wbi); 271 } 272 wbi = wbi2; 273 } 274 } 275 276 void r5c_handle_cached_data_endio(struct r5conf *conf, 277 struct stripe_head *sh, int disks, struct bio_list *return_bi) 278 { 279 int i; 280 281 for (i = sh->disks; i--; ) { 282 if (sh->dev[i].written) { 283 set_bit(R5_UPTODATE, &sh->dev[i].flags); 284 r5c_return_dev_pending_writes(conf, &sh->dev[i], 285 return_bi); 286 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 287 STRIPE_SECTORS, 288 !test_bit(STRIPE_DEGRADED, &sh->state), 289 
0); 290 } 291 } 292 } 293 294 /* Check whether we should flush some stripes to free up stripe cache */ 295 void r5c_check_stripe_cache_usage(struct r5conf *conf) 296 { 297 int total_cached; 298 299 if (!r5c_is_writeback(conf->log)) 300 return; 301 302 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 303 atomic_read(&conf->r5c_cached_full_stripes); 304 305 /* 306 * The following condition is true for either of the following: 307 * - stripe cache pressure high: 308 * total_cached > 3/4 min_nr_stripes || 309 * empty_inactive_list_nr > 0 310 * - stripe cache pressure moderate: 311 * total_cached > 1/2 min_nr_stripes 312 */ 313 if (total_cached > conf->min_nr_stripes * 1 / 2 || 314 atomic_read(&conf->empty_inactive_list_nr) > 0) 315 r5l_wake_reclaim(conf->log, 0); 316 } 317 318 /* 319 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full 320 * stripes in the cache 321 */ 322 void r5c_check_cached_full_stripe(struct r5conf *conf) 323 { 324 if (!r5c_is_writeback(conf->log)) 325 return; 326 327 /* 328 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes 329 * or a full stripe (chunk size / 4k stripes). 330 */ 331 if (atomic_read(&conf->r5c_cached_full_stripes) >= 332 min(R5C_FULL_STRIPE_FLUSH_BATCH, 333 conf->chunk_sectors >> STRIPE_SHIFT)) 334 r5l_wake_reclaim(conf->log, 0); 335 } 336 337 /* 338 * Total log space (in sectors) needed to flush all data in cache 339 * 340 * Currently, writing-out phase automatically includes all pending writes 341 * to the same sector. So the reclaim of each stripe takes up to 342 * (conf->raid_disks + 1) pages of log space. 343 * 344 * To totally avoid deadlock due to log space, the code reserves 345 * (conf->raid_disks + 1) pages for each stripe in cache, which is not 346 * necessary in most cases. 347 * 348 * To improve this, we will need writing-out phase to be able to NOT include 349 * pending writes, which will reduce the requirement to 350 * (conf->max_degraded + 1) pages per stripe in cache. 351 */ 352 static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) 353 { 354 struct r5l_log *log = conf->log; 355 356 if (!r5c_is_writeback(log)) 357 return 0; 358 359 return BLOCK_SECTORS * (conf->raid_disks + 1) * 360 atomic_read(&log->stripe_in_journal_count); 361 } 362 363 /* 364 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL 365 * 366 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of 367 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log 368 * device is less than 2x of reclaim_required_space. 369 */ 370 static inline void r5c_update_log_state(struct r5l_log *log) 371 { 372 struct r5conf *conf = log->rdev->mddev->private; 373 sector_t free_space; 374 sector_t reclaim_space; 375 bool wake_reclaim = false; 376 377 if (!r5c_is_writeback(log)) 378 return; 379 380 free_space = r5l_ring_distance(log, log->log_start, 381 log->last_checkpoint); 382 reclaim_space = r5c_log_required_to_flush_cache(conf); 383 if (free_space < 2 * reclaim_space) 384 set_bit(R5C_LOG_CRITICAL, &conf->cache_state); 385 else { 386 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) 387 wake_reclaim = true; 388 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state); 389 } 390 if (free_space < 3 * reclaim_space) 391 set_bit(R5C_LOG_TIGHT, &conf->cache_state); 392 else 393 clear_bit(R5C_LOG_TIGHT, &conf->cache_state); 394 395 if (wake_reclaim) 396 r5l_wake_reclaim(log, 0); 397 } 398 399 /* 400 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING. 
401 * This function should only be called in write-back mode. 402 */ 403 void r5c_make_stripe_write_out(struct stripe_head *sh) 404 { 405 struct r5conf *conf = sh->raid_conf; 406 struct r5l_log *log = conf->log; 407 408 BUG_ON(!r5c_is_writeback(log)); 409 410 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 411 clear_bit(STRIPE_R5C_CACHING, &sh->state); 412 413 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 414 atomic_inc(&conf->preread_active_stripes); 415 416 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { 417 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); 418 atomic_dec(&conf->r5c_cached_partial_stripes); 419 } 420 421 if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { 422 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); 423 atomic_dec(&conf->r5c_cached_full_stripes); 424 } 425 } 426 427 static void r5c_handle_data_cached(struct stripe_head *sh) 428 { 429 int i; 430 431 for (i = sh->disks; i--; ) 432 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 433 set_bit(R5_InJournal, &sh->dev[i].flags); 434 clear_bit(R5_LOCKED, &sh->dev[i].flags); 435 } 436 clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 437 } 438 439 /* 440 * this journal write must contain full parity, 441 * it may also contain some data pages 442 */ 443 static void r5c_handle_parity_cached(struct stripe_head *sh) 444 { 445 int i; 446 447 for (i = sh->disks; i--; ) 448 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 449 set_bit(R5_Wantwrite, &sh->dev[i].flags); 450 } 451 452 /* 453 * Setting proper flags after writing (or flushing) data and/or parity to the 454 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio(). 455 */ 456 static void r5c_finish_cache_stripe(struct stripe_head *sh) 457 { 458 struct r5l_log *log = sh->raid_conf->log; 459 460 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 461 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 462 /* 463 * Set R5_InJournal for parity dev[pd_idx]. This means 464 * all data AND parity in the journal. For RAID 6, it is 465 * NOT necessary to set the flag for dev[qd_idx], as the 466 * two parities are written out together. 
467 */ 468 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 469 } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) { 470 r5c_handle_data_cached(sh); 471 } else { 472 r5c_handle_parity_cached(sh); 473 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 474 } 475 } 476 477 static void r5l_io_run_stripes(struct r5l_io_unit *io) 478 { 479 struct stripe_head *sh, *next; 480 481 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { 482 list_del_init(&sh->log_list); 483 484 r5c_finish_cache_stripe(sh); 485 486 set_bit(STRIPE_HANDLE, &sh->state); 487 raid5_release_stripe(sh); 488 } 489 } 490 491 static void r5l_log_run_stripes(struct r5l_log *log) 492 { 493 struct r5l_io_unit *io, *next; 494 495 assert_spin_locked(&log->io_list_lock); 496 497 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 498 /* don't change list order */ 499 if (io->state < IO_UNIT_IO_END) 500 break; 501 502 list_move_tail(&io->log_sibling, &log->finished_ios); 503 r5l_io_run_stripes(io); 504 } 505 } 506 507 static void r5l_move_to_end_ios(struct r5l_log *log) 508 { 509 struct r5l_io_unit *io, *next; 510 511 assert_spin_locked(&log->io_list_lock); 512 513 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 514 /* don't change list order */ 515 if (io->state < IO_UNIT_IO_END) 516 break; 517 list_move_tail(&io->log_sibling, &log->io_end_ios); 518 } 519 } 520 521 static void __r5l_stripe_write_finished(struct r5l_io_unit *io); 522 static void r5l_log_endio(struct bio *bio) 523 { 524 struct r5l_io_unit *io = bio->bi_private; 525 struct r5l_io_unit *io_deferred; 526 struct r5l_log *log = io->log; 527 unsigned long flags; 528 529 if (bio->bi_error) 530 md_error(log->rdev->mddev, log->rdev); 531 532 bio_put(bio); 533 mempool_free(io->meta_page, log->meta_pool); 534 535 spin_lock_irqsave(&log->io_list_lock, flags); 536 __r5l_set_io_unit_state(io, IO_UNIT_IO_END); 537 if (log->need_cache_flush) 538 r5l_move_to_end_ios(log); 539 else 540 r5l_log_run_stripes(log); 541 if (!list_empty(&log->running_ios)) { 542 /* 543 * FLUSH/FUA io_unit is deferred because of ordering, now we 544 * can dispatch it 545 */ 546 io_deferred = list_first_entry(&log->running_ios, 547 struct r5l_io_unit, log_sibling); 548 if (io_deferred->io_deferred) 549 schedule_work(&log->deferred_io_work); 550 } 551 552 spin_unlock_irqrestore(&log->io_list_lock, flags); 553 554 if (log->need_cache_flush) 555 md_wakeup_thread(log->rdev->mddev->thread); 556 557 if (io->has_null_flush) { 558 struct bio *bi; 559 560 WARN_ON(bio_list_empty(&io->flush_barriers)); 561 while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { 562 bio_endio(bi); 563 atomic_dec(&io->pending_stripe); 564 } 565 if (atomic_read(&io->pending_stripe) == 0) 566 __r5l_stripe_write_finished(io); 567 } 568 } 569 570 static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) 571 { 572 unsigned long flags; 573 574 spin_lock_irqsave(&log->io_list_lock, flags); 575 __r5l_set_io_unit_state(io, IO_UNIT_IO_START); 576 spin_unlock_irqrestore(&log->io_list_lock, flags); 577 578 if (io->has_flush) 579 io->current_bio->bi_opf |= REQ_PREFLUSH; 580 if (io->has_fua) 581 io->current_bio->bi_opf |= REQ_FUA; 582 submit_bio(io->current_bio); 583 584 if (!io->split_bio) 585 return; 586 587 if (io->has_flush) 588 io->split_bio->bi_opf |= REQ_PREFLUSH; 589 if (io->has_fua) 590 io->split_bio->bi_opf |= REQ_FUA; 591 submit_bio(io->split_bio); 592 } 593 594 /* deferred io_unit will be dispatched here */ 595 static void r5l_submit_io_async(struct work_struct *work) 596 
{ 597 struct r5l_log *log = container_of(work, struct r5l_log, 598 deferred_io_work); 599 struct r5l_io_unit *io = NULL; 600 unsigned long flags; 601 602 spin_lock_irqsave(&log->io_list_lock, flags); 603 if (!list_empty(&log->running_ios)) { 604 io = list_first_entry(&log->running_ios, struct r5l_io_unit, 605 log_sibling); 606 if (!io->io_deferred) 607 io = NULL; 608 else 609 io->io_deferred = 0; 610 } 611 spin_unlock_irqrestore(&log->io_list_lock, flags); 612 if (io) 613 r5l_do_submit_io(log, io); 614 } 615 616 static void r5c_disable_writeback_async(struct work_struct *work) 617 { 618 struct r5l_log *log = container_of(work, struct r5l_log, 619 disable_writeback_work); 620 struct mddev *mddev = log->rdev->mddev; 621 622 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 623 return; 624 pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n", 625 mdname(mddev)); 626 mddev_suspend(mddev); 627 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 628 mddev_resume(mddev); 629 } 630 631 static void r5l_submit_current_io(struct r5l_log *log) 632 { 633 struct r5l_io_unit *io = log->current_io; 634 struct bio *bio; 635 struct r5l_meta_block *block; 636 unsigned long flags; 637 u32 crc; 638 bool do_submit = true; 639 640 if (!io) 641 return; 642 643 block = page_address(io->meta_page); 644 block->meta_size = cpu_to_le32(io->meta_offset); 645 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); 646 block->checksum = cpu_to_le32(crc); 647 bio = io->current_bio; 648 649 log->current_io = NULL; 650 spin_lock_irqsave(&log->io_list_lock, flags); 651 if (io->has_flush || io->has_fua) { 652 if (io != list_first_entry(&log->running_ios, 653 struct r5l_io_unit, log_sibling)) { 654 io->io_deferred = 1; 655 do_submit = false; 656 } 657 } 658 spin_unlock_irqrestore(&log->io_list_lock, flags); 659 if (do_submit) 660 r5l_do_submit_io(log, io); 661 } 662 663 static struct bio *r5l_bio_alloc(struct r5l_log *log) 664 { 665 struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); 666 667 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 668 bio->bi_bdev = log->rdev->bdev; 669 bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; 670 671 return bio; 672 } 673 674 static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io) 675 { 676 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); 677 678 r5c_update_log_state(log); 679 /* 680 * If we filled up the log device start from the beginning again, 681 * which will require a new bio. 682 * 683 * Note: for this to work properly the log size needs to me a multiple 684 * of BLOCK_SECTORS. 
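 * (r5l_ring_add() wraps log->log_start back to exactly 0 only when the newly
 * reserved block ends right at device_size; with an unaligned log size the
 * wrap would land past 0 and the check below would never fire.)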
685 */ 686 if (log->log_start == 0) 687 io->need_split_bio = true; 688 689 io->log_end = log->log_start; 690 } 691 692 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) 693 { 694 struct r5l_io_unit *io; 695 struct r5l_meta_block *block; 696 697 io = mempool_alloc(log->io_pool, GFP_ATOMIC); 698 if (!io) 699 return NULL; 700 memset(io, 0, sizeof(*io)); 701 702 io->log = log; 703 INIT_LIST_HEAD(&io->log_sibling); 704 INIT_LIST_HEAD(&io->stripe_list); 705 bio_list_init(&io->flush_barriers); 706 io->state = IO_UNIT_RUNNING; 707 708 io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); 709 block = page_address(io->meta_page); 710 clear_page(block); 711 block->magic = cpu_to_le32(R5LOG_MAGIC); 712 block->version = R5LOG_VERSION; 713 block->seq = cpu_to_le64(log->seq); 714 block->position = cpu_to_le64(log->log_start); 715 716 io->log_start = log->log_start; 717 io->meta_offset = sizeof(struct r5l_meta_block); 718 io->seq = log->seq++; 719 720 io->current_bio = r5l_bio_alloc(log); 721 io->current_bio->bi_end_io = r5l_log_endio; 722 io->current_bio->bi_private = io; 723 bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); 724 725 r5_reserve_log_entry(log, io); 726 727 spin_lock_irq(&log->io_list_lock); 728 list_add_tail(&io->log_sibling, &log->running_ios); 729 spin_unlock_irq(&log->io_list_lock); 730 731 return io; 732 } 733 734 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) 735 { 736 if (log->current_io && 737 log->current_io->meta_offset + payload_size > PAGE_SIZE) 738 r5l_submit_current_io(log); 739 740 if (!log->current_io) { 741 log->current_io = r5l_new_meta(log); 742 if (!log->current_io) 743 return -ENOMEM; 744 } 745 746 return 0; 747 } 748 749 static void r5l_append_payload_meta(struct r5l_log *log, u16 type, 750 sector_t location, 751 u32 checksum1, u32 checksum2, 752 bool checksum2_valid) 753 { 754 struct r5l_io_unit *io = log->current_io; 755 struct r5l_payload_data_parity *payload; 756 757 payload = page_address(io->meta_page) + io->meta_offset; 758 payload->header.type = cpu_to_le16(type); 759 payload->header.flags = cpu_to_le16(0); 760 payload->size = cpu_to_le32((1 + !!checksum2_valid) << 761 (PAGE_SHIFT - 9)); 762 payload->location = cpu_to_le64(location); 763 payload->checksum[0] = cpu_to_le32(checksum1); 764 if (checksum2_valid) 765 payload->checksum[1] = cpu_to_le32(checksum2); 766 767 io->meta_offset += sizeof(struct r5l_payload_data_parity) + 768 sizeof(__le32) * (1 + !!checksum2_valid); 769 } 770 771 static void r5l_append_payload_page(struct r5l_log *log, struct page *page) 772 { 773 struct r5l_io_unit *io = log->current_io; 774 775 if (io->need_split_bio) { 776 BUG_ON(io->split_bio); 777 io->split_bio = io->current_bio; 778 io->current_bio = r5l_bio_alloc(log); 779 bio_chain(io->current_bio, io->split_bio); 780 io->need_split_bio = false; 781 } 782 783 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) 784 BUG(); 785 786 r5_reserve_log_entry(log, io); 787 } 788 789 static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, 790 int data_pages, int parity_pages) 791 { 792 int i; 793 int meta_size; 794 int ret; 795 struct r5l_io_unit *io; 796 797 meta_size = 798 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) 799 * data_pages) + 800 sizeof(struct r5l_payload_data_parity) + 801 sizeof(__le32) * parity_pages; 802 803 ret = r5l_get_meta(log, meta_size); 804 if (ret) 805 return ret; 806 807 io = log->current_io; 808 809 if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state)) 810 io->has_flush = 1; 811 812 
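	/*
	 * Append one R5LOG_PAYLOAD_DATA descriptor plus its page for every
	 * block that is R5_Wantwrite but not yet R5_InJournal; parity blocks
	 * (pd_idx/qd_idx) are skipped here and appended after this loop.
	 */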
for (i = 0; i < sh->disks; i++) { 813 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 814 test_bit(R5_InJournal, &sh->dev[i].flags)) 815 continue; 816 if (i == sh->pd_idx || i == sh->qd_idx) 817 continue; 818 if (test_bit(R5_WantFUA, &sh->dev[i].flags) && 819 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) { 820 io->has_fua = 1; 821 /* 822 * we need to flush journal to make sure recovery can 823 * reach the data with fua flag 824 */ 825 io->has_flush = 1; 826 } 827 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, 828 raid5_compute_blocknr(sh, i, 0), 829 sh->dev[i].log_checksum, 0, false); 830 r5l_append_payload_page(log, sh->dev[i].page); 831 } 832 833 if (parity_pages == 2) { 834 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 835 sh->sector, sh->dev[sh->pd_idx].log_checksum, 836 sh->dev[sh->qd_idx].log_checksum, true); 837 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 838 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); 839 } else if (parity_pages == 1) { 840 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 841 sh->sector, sh->dev[sh->pd_idx].log_checksum, 842 0, false); 843 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 844 } else /* Just writing data, not parity, in caching phase */ 845 BUG_ON(parity_pages != 0); 846 847 list_add_tail(&sh->log_list, &io->stripe_list); 848 atomic_inc(&io->pending_stripe); 849 sh->log_io = io; 850 851 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 852 return 0; 853 854 if (sh->log_start == MaxSector) { 855 BUG_ON(!list_empty(&sh->r5c)); 856 sh->log_start = io->log_start; 857 spin_lock_irq(&log->stripe_in_journal_lock); 858 list_add_tail(&sh->r5c, 859 &log->stripe_in_journal_list); 860 spin_unlock_irq(&log->stripe_in_journal_lock); 861 atomic_inc(&log->stripe_in_journal_count); 862 } 863 return 0; 864 } 865 866 /* add stripe to no_space_stripes, and then wake up reclaim */ 867 static inline void r5l_add_no_space_stripe(struct r5l_log *log, 868 struct stripe_head *sh) 869 { 870 spin_lock(&log->no_space_stripes_lock); 871 list_add_tail(&sh->log_list, &log->no_space_stripes); 872 spin_unlock(&log->no_space_stripes_lock); 873 } 874 875 /* 876 * running in raid5d, where reclaim could wait for raid5d too (when it flushes 877 * data from log to raid disks), so we shouldn't wait for reclaim here 878 */ 879 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) 880 { 881 struct r5conf *conf = sh->raid_conf; 882 int write_disks = 0; 883 int data_pages, parity_pages; 884 int reserve; 885 int i; 886 int ret = 0; 887 bool wake_reclaim = false; 888 889 if (!log) 890 return -EAGAIN; 891 /* Don't support stripe batch */ 892 if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || 893 test_bit(STRIPE_SYNCING, &sh->state)) { 894 /* the stripe is written to log, we start writing it to raid */ 895 clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 896 return -EAGAIN; 897 } 898 899 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 900 901 for (i = 0; i < sh->disks; i++) { 902 void *addr; 903 904 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 905 test_bit(R5_InJournal, &sh->dev[i].flags)) 906 continue; 907 908 write_disks++; 909 /* checksum is already calculated in last run */ 910 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 911 continue; 912 addr = kmap_atomic(sh->dev[i].page); 913 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 914 addr, PAGE_SIZE); 915 kunmap_atomic(addr); 916 } 917 parity_pages = 1 + !!(sh->qd_idx >= 0); 918 data_pages = write_disks - parity_pages; 919 920 
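	/*
	 * parity_pages is 1 for RAID4/5 and 2 for RAID6 (qd_idx >= 0); the
	 * remaining pages counted in write_disks are data pages.
	 */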
set_bit(STRIPE_LOG_TRAPPED, &sh->state); 921 /* 922 * The stripe must enter state machine again to finish the write, so 923 * don't delay. 924 */ 925 clear_bit(STRIPE_DELAYED, &sh->state); 926 atomic_inc(&sh->count); 927 928 mutex_lock(&log->io_mutex); 929 /* meta + data */ 930 reserve = (1 + write_disks) << (PAGE_SHIFT - 9); 931 932 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 933 if (!r5l_has_free_space(log, reserve)) { 934 r5l_add_no_space_stripe(log, sh); 935 wake_reclaim = true; 936 } else { 937 ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 938 if (ret) { 939 spin_lock_irq(&log->io_list_lock); 940 list_add_tail(&sh->log_list, 941 &log->no_mem_stripes); 942 spin_unlock_irq(&log->io_list_lock); 943 } 944 } 945 } else { /* R5C_JOURNAL_MODE_WRITE_BACK */ 946 /* 947 * log space critical, do not process stripes that are 948 * not in cache yet (sh->log_start == MaxSector). 949 */ 950 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 951 sh->log_start == MaxSector) { 952 r5l_add_no_space_stripe(log, sh); 953 wake_reclaim = true; 954 reserve = 0; 955 } else if (!r5l_has_free_space(log, reserve)) { 956 if (sh->log_start == log->last_checkpoint) 957 BUG(); 958 else 959 r5l_add_no_space_stripe(log, sh); 960 } else { 961 ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 962 if (ret) { 963 spin_lock_irq(&log->io_list_lock); 964 list_add_tail(&sh->log_list, 965 &log->no_mem_stripes); 966 spin_unlock_irq(&log->io_list_lock); 967 } 968 } 969 } 970 971 mutex_unlock(&log->io_mutex); 972 if (wake_reclaim) 973 r5l_wake_reclaim(log, reserve); 974 return 0; 975 } 976 977 void r5l_write_stripe_run(struct r5l_log *log) 978 { 979 if (!log) 980 return; 981 mutex_lock(&log->io_mutex); 982 r5l_submit_current_io(log); 983 mutex_unlock(&log->io_mutex); 984 } 985 986 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) 987 { 988 if (!log) 989 return -ENODEV; 990 991 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 992 /* 993 * in write through (journal only) 994 * we flush log disk cache first, then write stripe data to 995 * raid disks. So if bio is finished, the log disk cache is 996 * flushed already. 
The recovery guarantees we can recovery 997 * the bio from log disk, so we don't need to flush again 998 */ 999 if (bio->bi_iter.bi_size == 0) { 1000 bio_endio(bio); 1001 return 0; 1002 } 1003 bio->bi_opf &= ~REQ_PREFLUSH; 1004 } else { 1005 /* write back (with cache) */ 1006 if (bio->bi_iter.bi_size == 0) { 1007 mutex_lock(&log->io_mutex); 1008 r5l_get_meta(log, 0); 1009 bio_list_add(&log->current_io->flush_barriers, bio); 1010 log->current_io->has_flush = 1; 1011 log->current_io->has_null_flush = 1; 1012 atomic_inc(&log->current_io->pending_stripe); 1013 r5l_submit_current_io(log); 1014 mutex_unlock(&log->io_mutex); 1015 return 0; 1016 } 1017 } 1018 return -EAGAIN; 1019 } 1020 1021 /* This will run after log space is reclaimed */ 1022 static void r5l_run_no_space_stripes(struct r5l_log *log) 1023 { 1024 struct stripe_head *sh; 1025 1026 spin_lock(&log->no_space_stripes_lock); 1027 while (!list_empty(&log->no_space_stripes)) { 1028 sh = list_first_entry(&log->no_space_stripes, 1029 struct stripe_head, log_list); 1030 list_del_init(&sh->log_list); 1031 set_bit(STRIPE_HANDLE, &sh->state); 1032 raid5_release_stripe(sh); 1033 } 1034 spin_unlock(&log->no_space_stripes_lock); 1035 } 1036 1037 /* 1038 * calculate new last_checkpoint 1039 * for write through mode, returns log->next_checkpoint 1040 * for write back, returns log_start of first sh in stripe_in_journal_list 1041 */ 1042 static sector_t r5c_calculate_new_cp(struct r5conf *conf) 1043 { 1044 struct stripe_head *sh; 1045 struct r5l_log *log = conf->log; 1046 sector_t new_cp; 1047 unsigned long flags; 1048 1049 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 1050 return log->next_checkpoint; 1051 1052 spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 1053 if (list_empty(&conf->log->stripe_in_journal_list)) { 1054 /* all stripes flushed */ 1055 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1056 return log->next_checkpoint; 1057 } 1058 sh = list_first_entry(&conf->log->stripe_in_journal_list, 1059 struct stripe_head, r5c); 1060 new_cp = sh->log_start; 1061 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1062 return new_cp; 1063 } 1064 1065 static sector_t r5l_reclaimable_space(struct r5l_log *log) 1066 { 1067 struct r5conf *conf = log->rdev->mddev->private; 1068 1069 return r5l_ring_distance(log, log->last_checkpoint, 1070 r5c_calculate_new_cp(conf)); 1071 } 1072 1073 static void r5l_run_no_mem_stripe(struct r5l_log *log) 1074 { 1075 struct stripe_head *sh; 1076 1077 assert_spin_locked(&log->io_list_lock); 1078 1079 if (!list_empty(&log->no_mem_stripes)) { 1080 sh = list_first_entry(&log->no_mem_stripes, 1081 struct stripe_head, log_list); 1082 list_del_init(&sh->log_list); 1083 set_bit(STRIPE_HANDLE, &sh->state); 1084 raid5_release_stripe(sh); 1085 } 1086 } 1087 1088 static bool r5l_complete_finished_ios(struct r5l_log *log) 1089 { 1090 struct r5l_io_unit *io, *next; 1091 bool found = false; 1092 1093 assert_spin_locked(&log->io_list_lock); 1094 1095 list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { 1096 /* don't change list order */ 1097 if (io->state < IO_UNIT_STRIPE_END) 1098 break; 1099 1100 log->next_checkpoint = io->log_start; 1101 1102 list_del(&io->log_sibling); 1103 mempool_free(io, log->io_pool); 1104 r5l_run_no_mem_stripe(log); 1105 1106 found = true; 1107 } 1108 1109 return found; 1110 } 1111 1112 static void __r5l_stripe_write_finished(struct r5l_io_unit *io) 1113 { 1114 struct r5l_log *log = io->log; 1115 struct r5conf *conf = log->rdev->mddev->private; 1116 
unsigned long flags; 1117 1118 spin_lock_irqsave(&log->io_list_lock, flags); 1119 __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END); 1120 1121 if (!r5l_complete_finished_ios(log)) { 1122 spin_unlock_irqrestore(&log->io_list_lock, flags); 1123 return; 1124 } 1125 1126 if (r5l_reclaimable_space(log) > log->max_free_space || 1127 test_bit(R5C_LOG_TIGHT, &conf->cache_state)) 1128 r5l_wake_reclaim(log, 0); 1129 1130 spin_unlock_irqrestore(&log->io_list_lock, flags); 1131 wake_up(&log->iounit_wait); 1132 } 1133 1134 void r5l_stripe_write_finished(struct stripe_head *sh) 1135 { 1136 struct r5l_io_unit *io; 1137 1138 io = sh->log_io; 1139 sh->log_io = NULL; 1140 1141 if (io && atomic_dec_and_test(&io->pending_stripe)) 1142 __r5l_stripe_write_finished(io); 1143 } 1144 1145 static void r5l_log_flush_endio(struct bio *bio) 1146 { 1147 struct r5l_log *log = container_of(bio, struct r5l_log, 1148 flush_bio); 1149 unsigned long flags; 1150 struct r5l_io_unit *io; 1151 1152 if (bio->bi_error) 1153 md_error(log->rdev->mddev, log->rdev); 1154 1155 spin_lock_irqsave(&log->io_list_lock, flags); 1156 list_for_each_entry(io, &log->flushing_ios, log_sibling) 1157 r5l_io_run_stripes(io); 1158 list_splice_tail_init(&log->flushing_ios, &log->finished_ios); 1159 spin_unlock_irqrestore(&log->io_list_lock, flags); 1160 } 1161 1162 /* 1163 * Starting dispatch IO to raid. 1164 * io_unit(meta) consists of a log. There is one situation we want to avoid. A 1165 * broken meta in the middle of a log causes recovery can't find meta at the 1166 * head of log. If operations require meta at the head persistent in log, we 1167 * must make sure meta before it persistent in log too. A case is: 1168 * 1169 * stripe data/parity is in log, we start write stripe to raid disks. stripe 1170 * data/parity must be persistent in log before we do the write to raid disks. 1171 * 1172 * The solution is we restrictly maintain io_unit list order. In this case, we 1173 * only write stripes of an io_unit to raid disks till the io_unit is the first 1174 * one whose data/parity is in log. 1175 */ 1176 void r5l_flush_stripe_to_raid(struct r5l_log *log) 1177 { 1178 bool do_flush; 1179 1180 if (!log || !log->need_cache_flush) 1181 return; 1182 1183 spin_lock_irq(&log->io_list_lock); 1184 /* flush bio is running */ 1185 if (!list_empty(&log->flushing_ios)) { 1186 spin_unlock_irq(&log->io_list_lock); 1187 return; 1188 } 1189 list_splice_tail_init(&log->io_end_ios, &log->flushing_ios); 1190 do_flush = !list_empty(&log->flushing_ios); 1191 spin_unlock_irq(&log->io_list_lock); 1192 1193 if (!do_flush) 1194 return; 1195 bio_reset(&log->flush_bio); 1196 log->flush_bio.bi_bdev = log->rdev->bdev; 1197 log->flush_bio.bi_end_io = r5l_log_flush_endio; 1198 log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; 1199 submit_bio(&log->flush_bio); 1200 } 1201 1202 static void r5l_write_super(struct r5l_log *log, sector_t cp); 1203 static void r5l_write_super_and_discard_space(struct r5l_log *log, 1204 sector_t end) 1205 { 1206 struct block_device *bdev = log->rdev->bdev; 1207 struct mddev *mddev; 1208 1209 r5l_write_super(log, end); 1210 1211 if (!blk_queue_discard(bdev_get_queue(bdev))) 1212 return; 1213 1214 mddev = log->rdev->mddev; 1215 /* 1216 * Discard could zero data, so before discard we must make sure 1217 * superblock is updated to new log tail. Updating superblock (either 1218 * directly call md_update_sb() or depend on md thread) must hold 1219 * reconfig mutex. On the other hand, raid5_quiesce is called with 1220 * reconfig_mutex hold. 
 * The first step of raid5_quiesce() is waiting
 * for all IO to finish, hence waiting for the reclaim thread, while the
 * reclaim thread is calling this function and waiting for the reconfig
 * mutex. So there is a deadlock. We work around this issue with a trylock.
 * FIXME: we could miss discard if we can't take reconfig mutex
 */
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
	if (!mddev_trylock(mddev))
		return;
	md_update_sb(mddev, 1);
	mddev_unlock(mddev);

	/* discard IO error really doesn't matter, ignore it */
	if (log->last_checkpoint < end) {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				end - log->last_checkpoint, GFP_NOIO, 0);
	} else {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				log->device_size - log->last_checkpoint,
				GFP_NOIO, 0);
		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
				GFP_NOIO, 0);
	}
}

/*
 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
 *
 * must hold conf->device_lock
 */
static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	BUG_ON(list_empty(&sh->lru));
	BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));

	/*
	 * The stripe is not ON_RELEASE_LIST, so it is safe to call
	 * raid5_release_stripe() while holding conf->device_lock
	 */
	BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
	assert_spin_locked(&conf->device_lock);

	list_del_init(&sh->lru);
	atomic_inc(&sh->count);

	set_bit(STRIPE_HANDLE, &sh->state);
	atomic_inc(&conf->active_stripes);
	r5c_make_stripe_write_out(sh);

	raid5_release_stripe(sh);
}

/*
 * if num == 0, flush all full stripes
 * if num > 0, flush all full stripes. If fewer than num full stripes are
 * flushed, flush some partial stripes until a total of num stripes are
 * flushed, or there are no more cached stripes.
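 * For example, r5c_flush_cache(conf, 8) with 5 full and 10 partial stripes
 * cached flushes all 5 full stripes and then 3 partial stripes.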
1282 */ 1283 void r5c_flush_cache(struct r5conf *conf, int num) 1284 { 1285 int count; 1286 struct stripe_head *sh, *next; 1287 1288 assert_spin_locked(&conf->device_lock); 1289 if (!conf->log) 1290 return; 1291 1292 count = 0; 1293 list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) { 1294 r5c_flush_stripe(conf, sh); 1295 count++; 1296 } 1297 1298 if (count >= num) 1299 return; 1300 list_for_each_entry_safe(sh, next, 1301 &conf->r5c_partial_stripe_list, lru) { 1302 r5c_flush_stripe(conf, sh); 1303 if (++count >= num) 1304 break; 1305 } 1306 } 1307 1308 static void r5c_do_reclaim(struct r5conf *conf) 1309 { 1310 struct r5l_log *log = conf->log; 1311 struct stripe_head *sh; 1312 int count = 0; 1313 unsigned long flags; 1314 int total_cached; 1315 int stripes_to_flush; 1316 1317 if (!r5c_is_writeback(log)) 1318 return; 1319 1320 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 1321 atomic_read(&conf->r5c_cached_full_stripes); 1322 1323 if (total_cached > conf->min_nr_stripes * 3 / 4 || 1324 atomic_read(&conf->empty_inactive_list_nr) > 0) 1325 /* 1326 * if stripe cache pressure high, flush all full stripes and 1327 * some partial stripes 1328 */ 1329 stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; 1330 else if (total_cached > conf->min_nr_stripes * 1 / 2 || 1331 atomic_read(&conf->r5c_cached_full_stripes) > 1332 R5C_FULL_STRIPE_FLUSH_BATCH) 1333 /* 1334 * if stripe cache pressure moderate, or if there is many full 1335 * stripes,flush all full stripes 1336 */ 1337 stripes_to_flush = 0; 1338 else 1339 /* no need to flush */ 1340 stripes_to_flush = -1; 1341 1342 if (stripes_to_flush >= 0) { 1343 spin_lock_irqsave(&conf->device_lock, flags); 1344 r5c_flush_cache(conf, stripes_to_flush); 1345 spin_unlock_irqrestore(&conf->device_lock, flags); 1346 } 1347 1348 /* if log space is tight, flush stripes on stripe_in_journal_list */ 1349 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) { 1350 spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 1351 spin_lock(&conf->device_lock); 1352 list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) { 1353 /* 1354 * stripes on stripe_in_journal_list could be in any 1355 * state of the stripe_cache state machine. In this 1356 * case, we only want to flush stripe on 1357 * r5c_cached_full/partial_stripes. The following 1358 * condition makes sure the stripe is on one of the 1359 * two lists. 1360 */ 1361 if (!list_empty(&sh->lru) && 1362 !test_bit(STRIPE_HANDLE, &sh->state) && 1363 atomic_read(&sh->count) == 0) { 1364 r5c_flush_stripe(conf, sh); 1365 } 1366 if (count++ >= R5C_RECLAIM_STRIPE_GROUP) 1367 break; 1368 } 1369 spin_unlock(&conf->device_lock); 1370 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1371 } 1372 1373 if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) 1374 r5l_run_no_space_stripes(log); 1375 1376 md_wakeup_thread(conf->mddev->thread); 1377 } 1378 1379 static void r5l_do_reclaim(struct r5l_log *log) 1380 { 1381 struct r5conf *conf = log->rdev->mddev->private; 1382 sector_t reclaim_target = xchg(&log->reclaim_target, 0); 1383 sector_t reclaimable; 1384 sector_t next_checkpoint; 1385 bool write_super; 1386 1387 spin_lock_irq(&log->io_list_lock); 1388 write_super = r5l_reclaimable_space(log) > log->max_free_space || 1389 reclaim_target != 0 || !list_empty(&log->no_space_stripes); 1390 /* 1391 * move proper io_unit to reclaim list. We should not change the order. 
1392 * reclaimable/unreclaimable io_unit can be mixed in the list, we 1393 * shouldn't reuse space of an unreclaimable io_unit 1394 */ 1395 while (1) { 1396 reclaimable = r5l_reclaimable_space(log); 1397 if (reclaimable >= reclaim_target || 1398 (list_empty(&log->running_ios) && 1399 list_empty(&log->io_end_ios) && 1400 list_empty(&log->flushing_ios) && 1401 list_empty(&log->finished_ios))) 1402 break; 1403 1404 md_wakeup_thread(log->rdev->mddev->thread); 1405 wait_event_lock_irq(log->iounit_wait, 1406 r5l_reclaimable_space(log) > reclaimable, 1407 log->io_list_lock); 1408 } 1409 1410 next_checkpoint = r5c_calculate_new_cp(conf); 1411 spin_unlock_irq(&log->io_list_lock); 1412 1413 if (reclaimable == 0 || !write_super) 1414 return; 1415 1416 /* 1417 * write_super will flush cache of each raid disk. We must write super 1418 * here, because the log area might be reused soon and we don't want to 1419 * confuse recovery 1420 */ 1421 r5l_write_super_and_discard_space(log, next_checkpoint); 1422 1423 mutex_lock(&log->io_mutex); 1424 log->last_checkpoint = next_checkpoint; 1425 r5c_update_log_state(log); 1426 mutex_unlock(&log->io_mutex); 1427 1428 r5l_run_no_space_stripes(log); 1429 } 1430 1431 static void r5l_reclaim_thread(struct md_thread *thread) 1432 { 1433 struct mddev *mddev = thread->mddev; 1434 struct r5conf *conf = mddev->private; 1435 struct r5l_log *log = conf->log; 1436 1437 if (!log) 1438 return; 1439 r5c_do_reclaim(conf); 1440 r5l_do_reclaim(log); 1441 } 1442 1443 void r5l_wake_reclaim(struct r5l_log *log, sector_t space) 1444 { 1445 unsigned long target; 1446 unsigned long new = (unsigned long)space; /* overflow in theory */ 1447 1448 if (!log) 1449 return; 1450 do { 1451 target = log->reclaim_target; 1452 if (new < target) 1453 return; 1454 } while (cmpxchg(&log->reclaim_target, target, new) != target); 1455 md_wakeup_thread(log->reclaim_thread); 1456 } 1457 1458 void r5l_quiesce(struct r5l_log *log, int state) 1459 { 1460 struct mddev *mddev; 1461 if (!log || state == 2) 1462 return; 1463 if (state == 0) 1464 kthread_unpark(log->reclaim_thread->tsk); 1465 else if (state == 1) { 1466 /* make sure r5l_write_super_and_discard_space exits */ 1467 mddev = log->rdev->mddev; 1468 wake_up(&mddev->sb_wait); 1469 kthread_park(log->reclaim_thread->tsk); 1470 r5l_wake_reclaim(log, MaxSector); 1471 r5l_do_reclaim(log); 1472 } 1473 } 1474 1475 bool r5l_log_disk_error(struct r5conf *conf) 1476 { 1477 struct r5l_log *log; 1478 bool ret; 1479 /* don't allow write if journal disk is missing */ 1480 rcu_read_lock(); 1481 log = rcu_dereference(conf->log); 1482 1483 if (!log) 1484 ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 1485 else 1486 ret = test_bit(Faulty, &log->rdev->flags); 1487 rcu_read_unlock(); 1488 return ret; 1489 } 1490 1491 struct r5l_recovery_ctx { 1492 struct page *meta_page; /* current meta */ 1493 sector_t meta_total_blocks; /* total size of current meta and data */ 1494 sector_t pos; /* recovery position */ 1495 u64 seq; /* recovery position seq */ 1496 int data_parity_stripes; /* number of data_parity stripes */ 1497 int data_only_stripes; /* number of data_only stripes */ 1498 struct list_head cached_list; 1499 }; 1500 1501 static int r5l_recovery_read_meta_block(struct r5l_log *log, 1502 struct r5l_recovery_ctx *ctx) 1503 { 1504 struct page *page = ctx->meta_page; 1505 struct r5l_meta_block *mb; 1506 u32 crc, stored_crc; 1507 1508 if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0, 1509 false)) 1510 return -EIO; 1511 1512 mb = page_address(page); 
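	/*
	 * The writer computed the checksum over the whole meta page while the
	 * checksum field was still zero, so save the stored value and clear
	 * the field before recomputing it below.
	 */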
1513 stored_crc = le32_to_cpu(mb->checksum); 1514 mb->checksum = 0; 1515 1516 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 1517 le64_to_cpu(mb->seq) != ctx->seq || 1518 mb->version != R5LOG_VERSION || 1519 le64_to_cpu(mb->position) != ctx->pos) 1520 return -EINVAL; 1521 1522 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 1523 if (stored_crc != crc) 1524 return -EINVAL; 1525 1526 if (le32_to_cpu(mb->meta_size) > PAGE_SIZE) 1527 return -EINVAL; 1528 1529 ctx->meta_total_blocks = BLOCK_SECTORS; 1530 1531 return 0; 1532 } 1533 1534 static void 1535 r5l_recovery_create_empty_meta_block(struct r5l_log *log, 1536 struct page *page, 1537 sector_t pos, u64 seq) 1538 { 1539 struct r5l_meta_block *mb; 1540 1541 mb = page_address(page); 1542 clear_page(mb); 1543 mb->magic = cpu_to_le32(R5LOG_MAGIC); 1544 mb->version = R5LOG_VERSION; 1545 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); 1546 mb->seq = cpu_to_le64(seq); 1547 mb->position = cpu_to_le64(pos); 1548 } 1549 1550 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, 1551 u64 seq) 1552 { 1553 struct page *page; 1554 struct r5l_meta_block *mb; 1555 1556 page = alloc_page(GFP_KERNEL); 1557 if (!page) 1558 return -ENOMEM; 1559 r5l_recovery_create_empty_meta_block(log, page, pos, seq); 1560 mb = page_address(page); 1561 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, 1562 mb, PAGE_SIZE)); 1563 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, 1564 REQ_FUA, false)) { 1565 __free_page(page); 1566 return -EIO; 1567 } 1568 __free_page(page); 1569 return 0; 1570 } 1571 1572 /* 1573 * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite 1574 * to mark valid (potentially not flushed) data in the journal. 1575 * 1576 * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb, 1577 * so there should not be any mismatch here. 
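 * These flags are dropped again once the data is secured:
 * r5l_recovery_replay_one_stripe() resets the whole stripe, and
 * r5c_recovery_load_one_stripe() converts R5_Wantwrite into R5_InJournal.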
1578 */ 1579 static void r5l_recovery_load_data(struct r5l_log *log, 1580 struct stripe_head *sh, 1581 struct r5l_recovery_ctx *ctx, 1582 struct r5l_payload_data_parity *payload, 1583 sector_t log_offset) 1584 { 1585 struct mddev *mddev = log->rdev->mddev; 1586 struct r5conf *conf = mddev->private; 1587 int dd_idx; 1588 1589 raid5_compute_sector(conf, 1590 le64_to_cpu(payload->location), 0, 1591 &dd_idx, sh); 1592 sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1593 sh->dev[dd_idx].page, REQ_OP_READ, 0, false); 1594 sh->dev[dd_idx].log_checksum = 1595 le32_to_cpu(payload->checksum[0]); 1596 ctx->meta_total_blocks += BLOCK_SECTORS; 1597 1598 set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags); 1599 set_bit(STRIPE_R5C_CACHING, &sh->state); 1600 } 1601 1602 static void r5l_recovery_load_parity(struct r5l_log *log, 1603 struct stripe_head *sh, 1604 struct r5l_recovery_ctx *ctx, 1605 struct r5l_payload_data_parity *payload, 1606 sector_t log_offset) 1607 { 1608 struct mddev *mddev = log->rdev->mddev; 1609 struct r5conf *conf = mddev->private; 1610 1611 ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; 1612 sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1613 sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false); 1614 sh->dev[sh->pd_idx].log_checksum = 1615 le32_to_cpu(payload->checksum[0]); 1616 set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags); 1617 1618 if (sh->qd_idx >= 0) { 1619 sync_page_io(log->rdev, 1620 r5l_ring_add(log, log_offset, BLOCK_SECTORS), 1621 PAGE_SIZE, sh->dev[sh->qd_idx].page, 1622 REQ_OP_READ, 0, false); 1623 sh->dev[sh->qd_idx].log_checksum = 1624 le32_to_cpu(payload->checksum[1]); 1625 set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags); 1626 } 1627 clear_bit(STRIPE_R5C_CACHING, &sh->state); 1628 } 1629 1630 static void r5l_recovery_reset_stripe(struct stripe_head *sh) 1631 { 1632 int i; 1633 1634 sh->state = 0; 1635 sh->log_start = MaxSector; 1636 for (i = sh->disks; i--; ) 1637 sh->dev[i].flags = 0; 1638 } 1639 1640 static void 1641 r5l_recovery_replay_one_stripe(struct r5conf *conf, 1642 struct stripe_head *sh, 1643 struct r5l_recovery_ctx *ctx) 1644 { 1645 struct md_rdev *rdev, *rrdev; 1646 int disk_index; 1647 int data_count = 0; 1648 1649 for (disk_index = 0; disk_index < sh->disks; disk_index++) { 1650 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 1651 continue; 1652 if (disk_index == sh->qd_idx || disk_index == sh->pd_idx) 1653 continue; 1654 data_count++; 1655 } 1656 1657 /* 1658 * stripes that only have parity must have been flushed 1659 * before the crash that we are now recovering from, so 1660 * there is nothing more to recovery. 
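 * (data_count above counts only data blocks: the parity slots pd_idx and
 * qd_idx are skipped even when R5_Wantwrite is set on them.)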
1661 */ 1662 if (data_count == 0) 1663 goto out; 1664 1665 for (disk_index = 0; disk_index < sh->disks; disk_index++) { 1666 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 1667 continue; 1668 1669 /* in case device is broken */ 1670 rcu_read_lock(); 1671 rdev = rcu_dereference(conf->disks[disk_index].rdev); 1672 if (rdev) { 1673 atomic_inc(&rdev->nr_pending); 1674 rcu_read_unlock(); 1675 sync_page_io(rdev, sh->sector, PAGE_SIZE, 1676 sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1677 false); 1678 rdev_dec_pending(rdev, rdev->mddev); 1679 rcu_read_lock(); 1680 } 1681 rrdev = rcu_dereference(conf->disks[disk_index].replacement); 1682 if (rrdev) { 1683 atomic_inc(&rrdev->nr_pending); 1684 rcu_read_unlock(); 1685 sync_page_io(rrdev, sh->sector, PAGE_SIZE, 1686 sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1687 false); 1688 rdev_dec_pending(rrdev, rrdev->mddev); 1689 rcu_read_lock(); 1690 } 1691 rcu_read_unlock(); 1692 } 1693 ctx->data_parity_stripes++; 1694 out: 1695 r5l_recovery_reset_stripe(sh); 1696 } 1697 1698 static struct stripe_head * 1699 r5c_recovery_alloc_stripe(struct r5conf *conf, 1700 sector_t stripe_sect) 1701 { 1702 struct stripe_head *sh; 1703 1704 sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0); 1705 if (!sh) 1706 return NULL; /* no more stripe available */ 1707 1708 r5l_recovery_reset_stripe(sh); 1709 1710 return sh; 1711 } 1712 1713 static struct stripe_head * 1714 r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect) 1715 { 1716 struct stripe_head *sh; 1717 1718 list_for_each_entry(sh, list, lru) 1719 if (sh->sector == sect) 1720 return sh; 1721 return NULL; 1722 } 1723 1724 static void 1725 r5c_recovery_drop_stripes(struct list_head *cached_stripe_list, 1726 struct r5l_recovery_ctx *ctx) 1727 { 1728 struct stripe_head *sh, *next; 1729 1730 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) { 1731 r5l_recovery_reset_stripe(sh); 1732 list_del_init(&sh->lru); 1733 raid5_release_stripe(sh); 1734 } 1735 } 1736 1737 static void 1738 r5c_recovery_replay_stripes(struct list_head *cached_stripe_list, 1739 struct r5l_recovery_ctx *ctx) 1740 { 1741 struct stripe_head *sh, *next; 1742 1743 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) 1744 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 1745 r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx); 1746 list_del_init(&sh->lru); 1747 raid5_release_stripe(sh); 1748 } 1749 } 1750 1751 /* if matches return 0; otherwise return -EINVAL */ 1752 static int 1753 r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page, 1754 sector_t log_offset, __le32 log_checksum) 1755 { 1756 void *addr; 1757 u32 checksum; 1758 1759 sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1760 page, REQ_OP_READ, 0, false); 1761 addr = kmap_atomic(page); 1762 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); 1763 kunmap_atomic(addr); 1764 return (le32_to_cpu(log_checksum) == checksum) ? 
0 : -EINVAL; 1765 } 1766 1767 /* 1768 * before loading data to stripe cache, we need verify checksum for all data, 1769 * if there is mismatch for any data page, we drop all data in the mata block 1770 */ 1771 static int 1772 r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, 1773 struct r5l_recovery_ctx *ctx) 1774 { 1775 struct mddev *mddev = log->rdev->mddev; 1776 struct r5conf *conf = mddev->private; 1777 struct r5l_meta_block *mb = page_address(ctx->meta_page); 1778 sector_t mb_offset = sizeof(struct r5l_meta_block); 1779 sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 1780 struct page *page; 1781 struct r5l_payload_data_parity *payload; 1782 1783 page = alloc_page(GFP_KERNEL); 1784 if (!page) 1785 return -ENOMEM; 1786 1787 while (mb_offset < le32_to_cpu(mb->meta_size)) { 1788 payload = (void *)mb + mb_offset; 1789 1790 if (payload->header.type == R5LOG_PAYLOAD_DATA) { 1791 if (r5l_recovery_verify_data_checksum( 1792 log, page, log_offset, 1793 payload->checksum[0]) < 0) 1794 goto mismatch; 1795 } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) { 1796 if (r5l_recovery_verify_data_checksum( 1797 log, page, log_offset, 1798 payload->checksum[0]) < 0) 1799 goto mismatch; 1800 if (conf->max_degraded == 2 && /* q for RAID 6 */ 1801 r5l_recovery_verify_data_checksum( 1802 log, page, 1803 r5l_ring_add(log, log_offset, 1804 BLOCK_SECTORS), 1805 payload->checksum[1]) < 0) 1806 goto mismatch; 1807 } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */ 1808 goto mismatch; 1809 1810 log_offset = r5l_ring_add(log, log_offset, 1811 le32_to_cpu(payload->size)); 1812 1813 mb_offset += sizeof(struct r5l_payload_data_parity) + 1814 sizeof(__le32) * 1815 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); 1816 } 1817 1818 put_page(page); 1819 return 0; 1820 1821 mismatch: 1822 put_page(page); 1823 return -EINVAL; 1824 } 1825 1826 /* 1827 * Analyze all data/parity pages in one meta block 1828 * Returns: 1829 * 0 for success 1830 * -EINVAL for unknown playload type 1831 * -EAGAIN for checksum mismatch of data page 1832 * -ENOMEM for run out of memory (alloc_page failed or run out of stripes) 1833 */ 1834 static int 1835 r5c_recovery_analyze_meta_block(struct r5l_log *log, 1836 struct r5l_recovery_ctx *ctx, 1837 struct list_head *cached_stripe_list) 1838 { 1839 struct mddev *mddev = log->rdev->mddev; 1840 struct r5conf *conf = mddev->private; 1841 struct r5l_meta_block *mb; 1842 struct r5l_payload_data_parity *payload; 1843 int mb_offset; 1844 sector_t log_offset; 1845 sector_t stripe_sect; 1846 struct stripe_head *sh; 1847 int ret; 1848 1849 /* 1850 * for mismatch in data blocks, we will drop all data in this mb, but 1851 * we will still read next mb for other data with FLUSH flag, as 1852 * io_unit could finish out of order. 1853 */ 1854 ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx); 1855 if (ret == -EINVAL) 1856 return -EAGAIN; 1857 else if (ret) 1858 return ret; /* -ENOMEM duo to alloc_page() failed */ 1859 1860 mb = page_address(ctx->meta_page); 1861 mb_offset = sizeof(struct r5l_meta_block); 1862 log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 1863 1864 while (mb_offset < le32_to_cpu(mb->meta_size)) { 1865 int dd; 1866 1867 payload = (void *)mb + mb_offset; 1868 stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ? 
/*
 * Analyze all data/parity pages in one meta block
 * Returns:
 * 0 for success
 * -EINVAL for unknown payload type
 * -EAGAIN for checksum mismatch of data page
 * -ENOMEM for running out of memory (alloc_page failed or no stripes left)
 *
 * The payload-to-stripe-sector mapping used here is restated in the sketch
 * following this function.
 */
static int
r5c_recovery_analyze_meta_block(struct r5l_log *log,
				struct r5l_recovery_ctx *ctx,
				struct list_head *cached_stripe_list)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_meta_block *mb;
	struct r5l_payload_data_parity *payload;
	int mb_offset;
	sector_t log_offset;
	sector_t stripe_sect;
	struct stripe_head *sh;
	int ret;

	/*
	 * For a mismatch in the data blocks, we drop all data in this mb, but
	 * we still read the next mb for other data with the FLUSH flag, as
	 * io_units could finish out of order.
	 */
	ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
	if (ret == -EINVAL)
		return -EAGAIN;
	else if (ret)
		return ret;	/* -ENOMEM due to alloc_page() failure */

	mb = page_address(ctx->meta_page);
	mb_offset = sizeof(struct r5l_meta_block);
	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

	while (mb_offset < le32_to_cpu(mb->meta_size)) {
		int dd;

		payload = (void *)mb + mb_offset;
		stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
			raid5_compute_sector(
				conf, le64_to_cpu(payload->location), 0, &dd,
				NULL)
			: le64_to_cpu(payload->location);

		sh = r5c_recovery_lookup_stripe(cached_stripe_list,
						stripe_sect);

		if (!sh) {
			sh = r5c_recovery_alloc_stripe(conf, stripe_sect);
			/*
			 * Cannot get a stripe from raid5_get_active_stripe();
			 * try replaying some cached stripes to free a few.
			 */
			if (!sh) {
				r5c_recovery_replay_stripes(
					cached_stripe_list, ctx);
				sh = r5c_recovery_alloc_stripe(
					conf, stripe_sect);
			}
			if (!sh) {
				pr_debug("md/raid:%s: Increasing stripe cache size to %d to recover data from the journal.\n",
					 mdname(mddev),
					 conf->min_nr_stripes * 2);
				raid5_set_cache_size(mddev,
						     conf->min_nr_stripes * 2);
				sh = r5c_recovery_alloc_stripe(conf,
							       stripe_sect);
			}
			if (!sh) {
				pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
				       mdname(mddev));
				return -ENOMEM;
			}
			list_add_tail(&sh->lru, cached_stripe_list);
		}

		if (payload->header.type == R5LOG_PAYLOAD_DATA) {
			if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
			    test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
				r5l_recovery_replay_one_stripe(conf, sh, ctx);
				list_move_tail(&sh->lru, cached_stripe_list);
			}
			r5l_recovery_load_data(log, sh, ctx, payload,
					       log_offset);
		} else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
			r5l_recovery_load_parity(log, sh, ctx, payload,
						 log_offset);
		else
			return -EINVAL;

		log_offset = r5l_ring_add(log, log_offset,
					  le32_to_cpu(payload->size));

		mb_offset += sizeof(struct r5l_payload_data_parity) +
			sizeof(__le32) *
			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
	}

	return 0;
}
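/*
 * Illustrative sketch (not part of the driver): how a journal payload is
 * mapped to the stripe it belongs to, as done in
 * r5c_recovery_analyze_meta_block() above.  For R5LOG_PAYLOAD_DATA the
 * location field is a logical array sector that must be folded onto its
 * stripe with raid5_compute_sector(); for R5LOG_PAYLOAD_PARITY the location
 * already is the stripe sector.  The helper is hypothetical and never called.
 */
static sector_t __maybe_unused
r5c_payload_stripe_sector_sketch(struct r5conf *conf,
				 struct r5l_payload_data_parity *payload)
{
	int dd_idx;

	if (payload->header.type == R5LOG_PAYLOAD_DATA)
		return raid5_compute_sector(conf,
					    le64_to_cpu(payload->location),
					    0, &dd_idx, NULL);

	/* parity payloads record the stripe sector directly */
	return le64_to_cpu(payload->location);
}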
/*
 * Load the stripe into cache. The stripe will be written out later by
 * the stripe cache state machine.
 */
static void r5c_recovery_load_one_stripe(struct r5l_log *log,
					 struct stripe_head *sh)
{
	struct r5dev *dev;
	int i;

	for (i = sh->disks; i--; ) {
		dev = sh->dev + i;
		if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
			set_bit(R5_InJournal, &dev->flags);
			set_bit(R5_UPTODATE, &dev->flags);
		}
	}
}

/*
 * Scan through the log for all to-be-flushed data
 *
 * For stripes with data and parity, namely Data-Parity stripes
 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
 *
 * For stripes with only data, namely Data-Only stripes
 * (STRIPE_R5C_CACHING == 1), we load them into the stripe cache state
 * machine.
 *
 * For a stripe, if we see data after parity, we should discard all previous
 * data and parity for this stripe, as that data has already been flushed to
 * the array.
 *
 * At the end of the scan, ctx->pos becomes the new journal tail: it is where
 * the data-only stripes will be rewritten, or the next invalid meta block if
 * there are none.
 */
static int r5c_recovery_flush_log(struct r5l_log *log,
				  struct r5l_recovery_ctx *ctx)
{
	struct stripe_head *sh;
	int ret = 0;

	/* scan through the log */
	while (1) {
		if (r5l_recovery_read_meta_block(log, ctx))
			break;

		ret = r5c_recovery_analyze_meta_block(log, ctx,
						      &ctx->cached_list);
		/*
		 * -EAGAIN means a mismatch in a data block; in this case we
		 * still try to scan the next meta block.
		 */
		if (ret && ret != -EAGAIN)
			break;	/* ret == -EINVAL or -ENOMEM */
		ctx->seq++;
		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
	}

	if (ret == -ENOMEM) {
		r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
		return ret;
	}

	/* replay data-parity stripes */
	r5c_recovery_replay_stripes(&ctx->cached_list, ctx);

	/* load data-only stripes to stripe cache */
	list_for_each_entry(sh, &ctx->cached_list, lru) {
		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
		r5c_recovery_load_one_stripe(log, sh);
		ctx->data_only_stripes++;
	}

	return 0;
}

/*
 * We did a recovery. Now ctx.pos points to an invalid meta block. The new
 * log will start here, but we can't let the superblock point to the last
 * valid meta block. The log might look like:
 * | meta 1| meta 2| meta 3|
 * meta 1 is valid, meta 2 is invalid and meta 3 could be valid. If the
 * superblock points to meta 1, we later write a new valid meta block in
 * place of meta 2 (call it meta 2n). If a crash happens again, the next
 * recovery starts from meta 1 again; since meta 2n is valid now, that
 * recovery would also treat meta 3 as valid, which is wrong.
 * The solution is to give meta 2n a seq == meta 1's seq + 10000 and let the
 * superblock point to it. A later recovery will then not treat meta 3 as a
 * valid meta block, because its seq doesn't match.
 */

/*
 * Before recovery, the log looks like the following
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^
 *   |- log->last_checkpoint
 *   |- log->last_cp_seq
 *
 * Now we scan through the log until we see an invalid entry
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *          ^                            ^
 *          |- log->last_checkpoint      |- ctx->pos
 *          |- log->last_cp_seq          |- ctx->seq
 *
 * From this point, we need to increase the seq number by 10000 to avoid
 * confusing the next recovery.
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *          ^                            ^
 *          |- log->last_checkpoint      |- ctx->pos+1
 *          |- log->last_cp_seq          |- ctx->seq+10001
 *
 * However, it is not safe to start the state machine yet, because the data
 * of the data-only stripes is not yet secured in the RAID. To secure these
 * data-only stripes, we rewrite them to the journal with the bumped sequence
 * numbers.
 *
 *   -----------------------------------------------------------------
 *   |           valid log        | data only stripes | invalid log  |
 *   -----------------------------------------------------------------
 *          ^                                         ^
 *          |- log->last_checkpoint                   |- ctx->pos+n
 *          |- log->last_cp_seq                       |- ctx->seq+10000+n
 *
 * If failure happens again during this process, the recovery can safely
 * start again from log->last_checkpoint.
2063 * 2064 * Once data only stripes are rewritten to journal, we move log_tail 2065 * 2066 * ----------------------------------------------------------------- 2067 * | old log | data only stripes | invalid log | 2068 * ----------------------------------------------------------------- 2069 * ^ ^ 2070 * |- log->last_checkpoint |- ctx->pos+n 2071 * |- log->last_cp_seq |- ctx->seq+10000+n 2072 * 2073 * Then we can safely start the state machine. If failure happens from this 2074 * point on, the recovery will start from new log->last_checkpoint. 2075 */ 2076 static int 2077 r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, 2078 struct r5l_recovery_ctx *ctx) 2079 { 2080 struct stripe_head *sh; 2081 struct mddev *mddev = log->rdev->mddev; 2082 struct page *page; 2083 sector_t next_checkpoint = MaxSector; 2084 2085 page = alloc_page(GFP_KERNEL); 2086 if (!page) { 2087 pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n", 2088 mdname(mddev)); 2089 return -ENOMEM; 2090 } 2091 2092 WARN_ON(list_empty(&ctx->cached_list)); 2093 2094 list_for_each_entry(sh, &ctx->cached_list, lru) { 2095 struct r5l_meta_block *mb; 2096 int i; 2097 int offset; 2098 sector_t write_pos; 2099 2100 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 2101 r5l_recovery_create_empty_meta_block(log, page, 2102 ctx->pos, ctx->seq); 2103 mb = page_address(page); 2104 offset = le32_to_cpu(mb->meta_size); 2105 write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 2106 2107 for (i = sh->disks; i--; ) { 2108 struct r5dev *dev = &sh->dev[i]; 2109 struct r5l_payload_data_parity *payload; 2110 void *addr; 2111 2112 if (test_bit(R5_InJournal, &dev->flags)) { 2113 payload = (void *)mb + offset; 2114 payload->header.type = cpu_to_le16( 2115 R5LOG_PAYLOAD_DATA); 2116 payload->size = BLOCK_SECTORS; 2117 payload->location = cpu_to_le64( 2118 raid5_compute_blocknr(sh, i, 0)); 2119 addr = kmap_atomic(dev->page); 2120 payload->checksum[0] = cpu_to_le32( 2121 crc32c_le(log->uuid_checksum, addr, 2122 PAGE_SIZE)); 2123 kunmap_atomic(addr); 2124 sync_page_io(log->rdev, write_pos, PAGE_SIZE, 2125 dev->page, REQ_OP_WRITE, 0, false); 2126 write_pos = r5l_ring_add(log, write_pos, 2127 BLOCK_SECTORS); 2128 offset += sizeof(__le32) + 2129 sizeof(struct r5l_payload_data_parity); 2130 2131 } 2132 } 2133 mb->meta_size = cpu_to_le32(offset); 2134 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, 2135 mb, PAGE_SIZE)); 2136 sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, 2137 REQ_OP_WRITE, REQ_FUA, false); 2138 sh->log_start = ctx->pos; 2139 list_add_tail(&sh->r5c, &log->stripe_in_journal_list); 2140 atomic_inc(&log->stripe_in_journal_count); 2141 ctx->pos = write_pos; 2142 ctx->seq += 1; 2143 next_checkpoint = sh->log_start; 2144 } 2145 log->next_checkpoint = next_checkpoint; 2146 __free_page(page); 2147 return 0; 2148 } 2149 2150 static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, 2151 struct r5l_recovery_ctx *ctx) 2152 { 2153 struct mddev *mddev = log->rdev->mddev; 2154 struct r5conf *conf = mddev->private; 2155 struct stripe_head *sh, *next; 2156 2157 if (ctx->data_only_stripes == 0) 2158 return; 2159 2160 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK; 2161 2162 list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { 2163 r5c_make_stripe_write_out(sh); 2164 set_bit(STRIPE_HANDLE, &sh->state); 2165 list_del_init(&sh->lru); 2166 raid5_release_stripe(sh); 2167 } 2168 2169 md_wakeup_thread(conf->mddev->thread); 2170 /* reuse conf->wait_for_quiescent in recovery */ 2171 
wait_event(conf->wait_for_quiescent, 2172 atomic_read(&conf->active_stripes) == 0); 2173 2174 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 2175 } 2176 2177 static int r5l_recovery_log(struct r5l_log *log) 2178 { 2179 struct mddev *mddev = log->rdev->mddev; 2180 struct r5l_recovery_ctx ctx; 2181 int ret; 2182 sector_t pos; 2183 2184 ctx.pos = log->last_checkpoint; 2185 ctx.seq = log->last_cp_seq; 2186 ctx.meta_page = alloc_page(GFP_KERNEL); 2187 ctx.data_only_stripes = 0; 2188 ctx.data_parity_stripes = 0; 2189 INIT_LIST_HEAD(&ctx.cached_list); 2190 2191 if (!ctx.meta_page) 2192 return -ENOMEM; 2193 2194 ret = r5c_recovery_flush_log(log, &ctx); 2195 __free_page(ctx.meta_page); 2196 2197 if (ret) 2198 return ret; 2199 2200 pos = ctx.pos; 2201 ctx.seq += 10000; 2202 2203 2204 if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) 2205 pr_debug("md/raid:%s: starting from clean shutdown\n", 2206 mdname(mddev)); 2207 else 2208 pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", 2209 mdname(mddev), ctx.data_only_stripes, 2210 ctx.data_parity_stripes); 2211 2212 if (ctx.data_only_stripes == 0) { 2213 log->next_checkpoint = ctx.pos; 2214 r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); 2215 ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); 2216 } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { 2217 pr_err("md/raid:%s: failed to rewrite stripes to journal\n", 2218 mdname(mddev)); 2219 return -EIO; 2220 } 2221 2222 log->log_start = ctx.pos; 2223 log->seq = ctx.seq; 2224 log->last_checkpoint = pos; 2225 r5l_write_super(log, pos); 2226 2227 r5c_recovery_flush_data_only_stripes(log, &ctx); 2228 return 0; 2229 } 2230 2231 static void r5l_write_super(struct r5l_log *log, sector_t cp) 2232 { 2233 struct mddev *mddev = log->rdev->mddev; 2234 2235 log->rdev->journal_tail = cp; 2236 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2237 } 2238 2239 static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) 2240 { 2241 struct r5conf *conf = mddev->private; 2242 int ret; 2243 2244 if (!conf->log) 2245 return 0; 2246 2247 switch (conf->log->r5c_journal_mode) { 2248 case R5C_JOURNAL_MODE_WRITE_THROUGH: 2249 ret = snprintf( 2250 page, PAGE_SIZE, "[%s] %s\n", 2251 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], 2252 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); 2253 break; 2254 case R5C_JOURNAL_MODE_WRITE_BACK: 2255 ret = snprintf( 2256 page, PAGE_SIZE, "%s [%s]\n", 2257 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], 2258 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); 2259 break; 2260 default: 2261 ret = 0; 2262 } 2263 return ret; 2264 } 2265 2266 static ssize_t r5c_journal_mode_store(struct mddev *mddev, 2267 const char *page, size_t length) 2268 { 2269 struct r5conf *conf = mddev->private; 2270 struct r5l_log *log = conf->log; 2271 int val = -1, i; 2272 int len = length; 2273 2274 if (!log) 2275 return -ENODEV; 2276 2277 if (len && page[len - 1] == '\n') 2278 len -= 1; 2279 for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++) 2280 if (strlen(r5c_journal_mode_str[i]) == len && 2281 strncmp(page, r5c_journal_mode_str[i], len) == 0) { 2282 val = i; 2283 break; 2284 } 2285 if (val < R5C_JOURNAL_MODE_WRITE_THROUGH || 2286 val > R5C_JOURNAL_MODE_WRITE_BACK) 2287 return -EINVAL; 2288 2289 if (raid5_calc_degraded(conf) > 0 && 2290 val == R5C_JOURNAL_MODE_WRITE_BACK) 2291 return -EINVAL; 2292 2293 mddev_suspend(mddev); 2294 conf->log->r5c_journal_mode = val; 2295 mddev_resume(mddev); 2296 2297 
pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", 2298 mdname(mddev), val, r5c_journal_mode_str[val]); 2299 return length; 2300 } 2301 2302 struct md_sysfs_entry 2303 r5c_journal_mode = __ATTR(journal_mode, 0644, 2304 r5c_journal_mode_show, r5c_journal_mode_store); 2305 2306 /* 2307 * Try handle write operation in caching phase. This function should only 2308 * be called in write-back mode. 2309 * 2310 * If all outstanding writes can be handled in caching phase, returns 0 2311 * If writes requires write-out phase, call r5c_make_stripe_write_out() 2312 * and returns -EAGAIN 2313 */ 2314 int r5c_try_caching_write(struct r5conf *conf, 2315 struct stripe_head *sh, 2316 struct stripe_head_state *s, 2317 int disks) 2318 { 2319 struct r5l_log *log = conf->log; 2320 int i; 2321 struct r5dev *dev; 2322 int to_cache = 0; 2323 2324 BUG_ON(!r5c_is_writeback(log)); 2325 2326 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 2327 /* 2328 * There are two different scenarios here: 2329 * 1. The stripe has some data cached, and it is sent to 2330 * write-out phase for reclaim 2331 * 2. The stripe is clean, and this is the first write 2332 * 2333 * For 1, return -EAGAIN, so we continue with 2334 * handle_stripe_dirtying(). 2335 * 2336 * For 2, set STRIPE_R5C_CACHING and continue with caching 2337 * write. 2338 */ 2339 2340 /* case 1: anything injournal or anything in written */ 2341 if (s->injournal > 0 || s->written > 0) 2342 return -EAGAIN; 2343 /* case 2 */ 2344 set_bit(STRIPE_R5C_CACHING, &sh->state); 2345 } 2346 2347 /* 2348 * When run in degraded mode, array is set to write-through mode. 2349 * This check helps drain pending write safely in the transition to 2350 * write-through mode. 2351 */ 2352 if (s->failed) { 2353 r5c_make_stripe_write_out(sh); 2354 return -EAGAIN; 2355 } 2356 2357 for (i = disks; i--; ) { 2358 dev = &sh->dev[i]; 2359 /* if non-overwrite, use writing-out phase */ 2360 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) && 2361 !test_bit(R5_InJournal, &dev->flags)) { 2362 r5c_make_stripe_write_out(sh); 2363 return -EAGAIN; 2364 } 2365 } 2366 2367 for (i = disks; i--; ) { 2368 dev = &sh->dev[i]; 2369 if (dev->towrite) { 2370 set_bit(R5_Wantwrite, &dev->flags); 2371 set_bit(R5_Wantdrain, &dev->flags); 2372 set_bit(R5_LOCKED, &dev->flags); 2373 to_cache++; 2374 } 2375 } 2376 2377 if (to_cache) { 2378 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2379 /* 2380 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data() 2381 * in ops_run_io(). 
STRIPE_LOG_TRAPPED will be cleared in 2382 * r5c_handle_data_cached() 2383 */ 2384 set_bit(STRIPE_LOG_TRAPPED, &sh->state); 2385 } 2386 2387 return 0; 2388 } 2389 2390 /* 2391 * free extra pages (orig_page) we allocated for prexor 2392 */ 2393 void r5c_release_extra_page(struct stripe_head *sh) 2394 { 2395 struct r5conf *conf = sh->raid_conf; 2396 int i; 2397 bool using_disk_info_extra_page; 2398 2399 using_disk_info_extra_page = 2400 sh->dev[0].orig_page == conf->disks[0].extra_page; 2401 2402 for (i = sh->disks; i--; ) 2403 if (sh->dev[i].page != sh->dev[i].orig_page) { 2404 struct page *p = sh->dev[i].orig_page; 2405 2406 sh->dev[i].orig_page = sh->dev[i].page; 2407 clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 2408 2409 if (!using_disk_info_extra_page) 2410 put_page(p); 2411 } 2412 2413 if (using_disk_info_extra_page) { 2414 clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state); 2415 md_wakeup_thread(conf->mddev->thread); 2416 } 2417 } 2418 2419 void r5c_use_extra_page(struct stripe_head *sh) 2420 { 2421 struct r5conf *conf = sh->raid_conf; 2422 int i; 2423 struct r5dev *dev; 2424 2425 for (i = sh->disks; i--; ) { 2426 dev = &sh->dev[i]; 2427 if (dev->orig_page != dev->page) 2428 put_page(dev->orig_page); 2429 dev->orig_page = conf->disks[i].extra_page; 2430 } 2431 } 2432 2433 /* 2434 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the 2435 * stripe is committed to RAID disks. 2436 */ 2437 void r5c_finish_stripe_write_out(struct r5conf *conf, 2438 struct stripe_head *sh, 2439 struct stripe_head_state *s) 2440 { 2441 int i; 2442 int do_wakeup = 0; 2443 2444 if (!conf->log || 2445 !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) 2446 return; 2447 2448 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 2449 clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 2450 2451 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 2452 return; 2453 2454 for (i = sh->disks; i--; ) { 2455 clear_bit(R5_InJournal, &sh->dev[i].flags); 2456 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2457 do_wakeup = 1; 2458 } 2459 2460 /* 2461 * analyse_stripe() runs before r5c_finish_stripe_write_out(), 2462 * We updated R5_InJournal, so we also update s->injournal. 2463 */ 2464 s->injournal = 0; 2465 2466 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2467 if (atomic_dec_and_test(&conf->pending_full_writes)) 2468 md_wakeup_thread(conf->mddev->thread); 2469 2470 if (do_wakeup) 2471 wake_up(&conf->wait_for_overlap); 2472 2473 spin_lock_irq(&conf->log->stripe_in_journal_lock); 2474 list_del_init(&sh->r5c); 2475 spin_unlock_irq(&conf->log->stripe_in_journal_lock); 2476 sh->log_start = MaxSector; 2477 atomic_dec(&conf->log->stripe_in_journal_count); 2478 r5c_update_log_state(conf->log); 2479 } 2480 2481 int 2482 r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, 2483 struct stripe_head_state *s) 2484 { 2485 struct r5conf *conf = sh->raid_conf; 2486 int pages = 0; 2487 int reserve; 2488 int i; 2489 int ret = 0; 2490 2491 BUG_ON(!log); 2492 2493 for (i = 0; i < sh->disks; i++) { 2494 void *addr; 2495 2496 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) 2497 continue; 2498 addr = kmap_atomic(sh->dev[i].page); 2499 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 2500 addr, PAGE_SIZE); 2501 kunmap_atomic(addr); 2502 pages++; 2503 } 2504 WARN_ON(pages == 0); 2505 2506 /* 2507 * The stripe must enter state machine again to call endio, so 2508 * don't delay. 
2509 */ 2510 clear_bit(STRIPE_DELAYED, &sh->state); 2511 atomic_inc(&sh->count); 2512 2513 mutex_lock(&log->io_mutex); 2514 /* meta + data */ 2515 reserve = (1 + pages) << (PAGE_SHIFT - 9); 2516 2517 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 2518 sh->log_start == MaxSector) 2519 r5l_add_no_space_stripe(log, sh); 2520 else if (!r5l_has_free_space(log, reserve)) { 2521 if (sh->log_start == log->last_checkpoint) 2522 BUG(); 2523 else 2524 r5l_add_no_space_stripe(log, sh); 2525 } else { 2526 ret = r5l_log_stripe(log, sh, pages, 0); 2527 if (ret) { 2528 spin_lock_irq(&log->io_list_lock); 2529 list_add_tail(&sh->log_list, &log->no_mem_stripes); 2530 spin_unlock_irq(&log->io_list_lock); 2531 } 2532 } 2533 2534 mutex_unlock(&log->io_mutex); 2535 return 0; 2536 } 2537 2538 static int r5l_load_log(struct r5l_log *log) 2539 { 2540 struct md_rdev *rdev = log->rdev; 2541 struct page *page; 2542 struct r5l_meta_block *mb; 2543 sector_t cp = log->rdev->journal_tail; 2544 u32 stored_crc, expected_crc; 2545 bool create_super = false; 2546 int ret = 0; 2547 2548 /* Make sure it's valid */ 2549 if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) 2550 cp = 0; 2551 page = alloc_page(GFP_KERNEL); 2552 if (!page) 2553 return -ENOMEM; 2554 2555 if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) { 2556 ret = -EIO; 2557 goto ioerr; 2558 } 2559 mb = page_address(page); 2560 2561 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 2562 mb->version != R5LOG_VERSION) { 2563 create_super = true; 2564 goto create; 2565 } 2566 stored_crc = le32_to_cpu(mb->checksum); 2567 mb->checksum = 0; 2568 expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 2569 if (stored_crc != expected_crc) { 2570 create_super = true; 2571 goto create; 2572 } 2573 if (le64_to_cpu(mb->position) != cp) { 2574 create_super = true; 2575 goto create; 2576 } 2577 create: 2578 if (create_super) { 2579 log->last_cp_seq = prandom_u32(); 2580 cp = 0; 2581 r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq); 2582 /* 2583 * Make sure super points to correct address. Log might have 2584 * data very soon. 
If super hasn't correct log tail address, 2585 * recovery can't find the log 2586 */ 2587 r5l_write_super(log, cp); 2588 } else 2589 log->last_cp_seq = le64_to_cpu(mb->seq); 2590 2591 log->device_size = round_down(rdev->sectors, BLOCK_SECTORS); 2592 log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT; 2593 if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) 2594 log->max_free_space = RECLAIM_MAX_FREE_SPACE; 2595 log->last_checkpoint = cp; 2596 2597 __free_page(page); 2598 2599 if (create_super) { 2600 log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS); 2601 log->seq = log->last_cp_seq + 1; 2602 log->next_checkpoint = cp; 2603 } else 2604 ret = r5l_recovery_log(log); 2605 2606 r5c_update_log_state(log); 2607 return ret; 2608 ioerr: 2609 __free_page(page); 2610 return ret; 2611 } 2612 2613 void r5c_update_on_rdev_error(struct mddev *mddev) 2614 { 2615 struct r5conf *conf = mddev->private; 2616 struct r5l_log *log = conf->log; 2617 2618 if (!log) 2619 return; 2620 2621 if (raid5_calc_degraded(conf) > 0 && 2622 conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) 2623 schedule_work(&log->disable_writeback_work); 2624 } 2625 2626 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) 2627 { 2628 struct request_queue *q = bdev_get_queue(rdev->bdev); 2629 struct r5l_log *log; 2630 2631 if (PAGE_SIZE != 4096) 2632 return -EINVAL; 2633 2634 /* 2635 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and 2636 * raid_disks r5l_payload_data_parity. 2637 * 2638 * Write journal and cache does not work for very big array 2639 * (raid_disks > 203) 2640 */ 2641 if (sizeof(struct r5l_meta_block) + 2642 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) * 2643 conf->raid_disks) > PAGE_SIZE) { 2644 pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n", 2645 mdname(conf->mddev), conf->raid_disks); 2646 return -EINVAL; 2647 } 2648 2649 log = kzalloc(sizeof(*log), GFP_KERNEL); 2650 if (!log) 2651 return -ENOMEM; 2652 log->rdev = rdev; 2653 2654 log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0; 2655 2656 log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, 2657 sizeof(rdev->mddev->uuid)); 2658 2659 mutex_init(&log->io_mutex); 2660 2661 spin_lock_init(&log->io_list_lock); 2662 INIT_LIST_HEAD(&log->running_ios); 2663 INIT_LIST_HEAD(&log->io_end_ios); 2664 INIT_LIST_HEAD(&log->flushing_ios); 2665 INIT_LIST_HEAD(&log->finished_ios); 2666 bio_init(&log->flush_bio, NULL, 0); 2667 2668 log->io_kc = KMEM_CACHE(r5l_io_unit, 0); 2669 if (!log->io_kc) 2670 goto io_kc; 2671 2672 log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc); 2673 if (!log->io_pool) 2674 goto io_pool; 2675 2676 log->bs = bioset_create(R5L_POOL_SIZE, 0); 2677 if (!log->bs) 2678 goto io_bs; 2679 2680 log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0); 2681 if (!log->meta_pool) 2682 goto out_mempool; 2683 2684 log->reclaim_thread = md_register_thread(r5l_reclaim_thread, 2685 log->rdev->mddev, "reclaim"); 2686 if (!log->reclaim_thread) 2687 goto reclaim_thread; 2688 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; 2689 2690 init_waitqueue_head(&log->iounit_wait); 2691 2692 INIT_LIST_HEAD(&log->no_mem_stripes); 2693 2694 INIT_LIST_HEAD(&log->no_space_stripes); 2695 spin_lock_init(&log->no_space_stripes_lock); 2696 2697 INIT_WORK(&log->deferred_io_work, r5l_submit_io_async); 2698 INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async); 2699 2700 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 2701 
INIT_LIST_HEAD(&log->stripe_in_journal_list); 2702 spin_lock_init(&log->stripe_in_journal_lock); 2703 atomic_set(&log->stripe_in_journal_count, 0); 2704 2705 rcu_assign_pointer(conf->log, log); 2706 2707 if (r5l_load_log(log)) 2708 goto error; 2709 2710 set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 2711 return 0; 2712 2713 error: 2714 rcu_assign_pointer(conf->log, NULL); 2715 md_unregister_thread(&log->reclaim_thread); 2716 reclaim_thread: 2717 mempool_destroy(log->meta_pool); 2718 out_mempool: 2719 bioset_free(log->bs); 2720 io_bs: 2721 mempool_destroy(log->io_pool); 2722 io_pool: 2723 kmem_cache_destroy(log->io_kc); 2724 io_kc: 2725 kfree(log); 2726 return -EINVAL; 2727 } 2728 2729 void r5l_exit_log(struct r5l_log *log) 2730 { 2731 flush_work(&log->disable_writeback_work); 2732 md_unregister_thread(&log->reclaim_thread); 2733 mempool_destroy(log->meta_pool); 2734 bioset_free(log->bs); 2735 mempool_destroy(log->io_pool); 2736 kmem_cache_destroy(log->io_kc); 2737 kfree(log); 2738 } 2739
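/*
 * Illustrative sketch (not part of the driver): the raid_disks limit implied
 * by the size check in r5l_init_log().  A meta block must hold the
 * r5l_meta_block header plus, in the worst case, one payload header and one
 * __le32 checksum per member disk, all within a single 4K page.  With the
 * current structure sizes this works out to roughly 203 disks, matching the
 * comment in r5l_init_log(); the helper below only restates that arithmetic
 * and is never called.
 */
static inline int r5l_max_journal_disks_sketch(void)
{
	return (PAGE_SIZE - sizeof(struct r5l_meta_block)) /
	       (sizeof(struct r5l_payload_data_parity) + sizeof(__le32));
}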