1 /* 2 * Copyright (C) 2015 Shaohua Li <shli@fb.com> 3 * Copyright (C) 2016 Song Liu <songliubraving@fb.com> 4 * 5 * This program is free software; you can redistribute it and/or modify it 6 * under the terms and conditions of the GNU General Public License, 7 * version 2, as published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 * more details. 13 * 14 */ 15 #include <linux/kernel.h> 16 #include <linux/wait.h> 17 #include <linux/blkdev.h> 18 #include <linux/slab.h> 19 #include <linux/raid/md_p.h> 20 #include <linux/crc32c.h> 21 #include <linux/random.h> 22 #include "md.h" 23 #include "raid5.h" 24 #include "bitmap.h" 25 26 /* 27 * metadata/data are stored on disk in 4k units (blocks) regardless of the 28 * underlying hardware sector size. This only works with PAGE_SIZE == 4096 29 */ 30 #define BLOCK_SECTORS (8) 31 32 /* 33 * log->max_free_space is min(1/4 disk size, 10G reclaimable space). 34 * 35 * In write-through mode, reclaim runs every log->max_free_space, 36 * which keeps recovery from having to scan too much of the log 37 */ 38 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ 39 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2) 40 41 /* wake up reclaim thread periodically */ 42 #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ) 43 /* start flush with these full stripes */ 44 #define R5C_FULL_STRIPE_FLUSH_BATCH 256 45 /* reclaim stripes in groups */ 46 #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2) 47 48 /* 49 * We only need 2 bios per I/O unit to make progress, but ensure we 50 * have a few more available so we do not get too tight. 51 */ 52 #define R5L_POOL_SIZE 4 53 54 /* 55 * r5c journal modes of the array: write-back or write-through. 56 * write-through mode behaves identically to the existing log-only 57 * implementation. 58 */ 59 enum r5c_journal_mode { 60 R5C_JOURNAL_MODE_WRITE_THROUGH = 0, 61 R5C_JOURNAL_MODE_WRITE_BACK = 1, 62 }; 63 64 static char *r5c_journal_mode_str[] = {"write-through", 65 "write-back"}; 66 /* 67 * raid5 cache state machine 68 * 69 * With the RAID cache, each stripe works in two phases: 70 * - caching phase 71 * - writing-out phase 72 * 73 * These two phases are controlled by bit STRIPE_R5C_CACHING: 74 * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase 75 * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase 76 * 77 * When there is no journal, or the journal is in write-through mode, 78 * the stripe is always in writing-out phase. 79 * 80 * For a write-back journal, the stripe is sent to caching phase on write 81 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off 82 * the write-out phase by clearing STRIPE_R5C_CACHING. 83 * 84 * Stripes in caching phase do not write the raid disks. Instead, all 85 * writes are committed from the log device.
Therefore, a stripe in 86 * caching phase handles writes as: 87 * - write to log device 88 * - return IO 89 * 90 * Stripes in writing-out phase handle writes as: 91 * - calculate parity 92 * - write pending data and parity to journal 93 * - write data and parity to raid disks 94 * - return IO for pending writes 95 */ 96 97 struct r5l_log { 98 struct md_rdev *rdev; 99 100 u32 uuid_checksum; 101 102 sector_t device_size; /* log device size, rounded to 103 * BLOCK_SECTORS */ 104 sector_t max_free_space; /* reclaim runs once free space reaches 105 * this size */ 106 107 sector_t last_checkpoint; /* log tail. where recovery scan 108 * starts from */ 109 u64 last_cp_seq; /* log tail sequence */ 110 111 sector_t log_start; /* log head. where new data appends */ 112 u64 seq; /* log head sequence */ 113 114 sector_t next_checkpoint; 115 u64 next_cp_seq; 116 117 struct mutex io_mutex; 118 struct r5l_io_unit *current_io; /* current io_unit accepting new data */ 119 120 spinlock_t io_list_lock; 121 struct list_head running_ios; /* io_units which are still running, 122 * and have not yet been completely 123 * written to the log */ 124 struct list_head io_end_ios; /* io_units which have been completely 125 * written to the log but not yet written 126 * to the RAID */ 127 struct list_head flushing_ios; /* io_units which are waiting for log 128 * cache flush */ 129 struct list_head finished_ios; /* io_units which have settled down in the log disk */ 130 struct bio flush_bio; 131 132 struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */ 133 134 struct kmem_cache *io_kc; 135 mempool_t *io_pool; 136 struct bio_set *bs; 137 mempool_t *meta_pool; 138 139 struct md_thread *reclaim_thread; 140 unsigned long reclaim_target; /* amount of space that needs to be 141 * reclaimed. if it's 0, reclaim spaces 142 * used by io_units which are in 143 * IO_UNIT_STRIPE_END state (i.e. reclaim 144 * doesn't wait for a specific io_unit 145 * to switch to IO_UNIT_STRIPE_END 146 * state) */ 147 wait_queue_head_t iounit_wait; 148 149 struct list_head no_space_stripes; /* pending stripes, log has no space */ 150 spinlock_t no_space_stripes_lock; 151 152 bool need_cache_flush; 153 154 /* for r5c_cache */ 155 enum r5c_journal_mode r5c_journal_mode; 156 157 /* all stripes in r5cache, in the order of seq at sh->log_start */ 158 struct list_head stripe_in_journal_list; 159 160 spinlock_t stripe_in_journal_lock; 161 atomic_t stripe_in_journal_count; 162 163 /* to submit async io_units, to fulfill ordering of flush */ 164 struct work_struct deferred_io_work; 165 }; 166 167 /* 168 * an IO range starts at a meta data block and ends at the next meta data 169 * block. The io unit's meta data block tracks the data/parity that follows it.
io 170 * unit is written to log disk with normal write, as we always flush log disk 171 * first and then start move data to raid disks, there is no requirement to 172 * write io unit with FLUSH/FUA 173 */ 174 struct r5l_io_unit { 175 struct r5l_log *log; 176 177 struct page *meta_page; /* store meta block */ 178 int meta_offset; /* current offset in meta_page */ 179 180 struct bio *current_bio;/* current_bio accepting new data */ 181 182 atomic_t pending_stripe;/* how many stripes not flushed to raid */ 183 u64 seq; /* seq number of the metablock */ 184 sector_t log_start; /* where the io_unit starts */ 185 sector_t log_end; /* where the io_unit ends */ 186 struct list_head log_sibling; /* log->running_ios */ 187 struct list_head stripe_list; /* stripes added to the io_unit */ 188 189 int state; 190 bool need_split_bio; 191 struct bio *split_bio; 192 193 unsigned int has_flush:1; /* include flush request */ 194 unsigned int has_fua:1; /* include fua request */ 195 unsigned int has_null_flush:1; /* include empty flush request */ 196 /* 197 * io isn't sent yet, flush/fua request can only be submitted till it's 198 * the first IO in running_ios list 199 */ 200 unsigned int io_deferred:1; 201 202 struct bio_list flush_barriers; /* size == 0 flush bios */ 203 }; 204 205 /* r5l_io_unit state */ 206 enum r5l_io_unit_state { 207 IO_UNIT_RUNNING = 0, /* accepting new IO */ 208 IO_UNIT_IO_START = 1, /* io_unit bio start writing to log, 209 * don't accepting new bio */ 210 IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */ 211 IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ 212 }; 213 214 bool r5c_is_writeback(struct r5l_log *log) 215 { 216 return (log != NULL && 217 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK); 218 } 219 220 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) 221 { 222 start += inc; 223 if (start >= log->device_size) 224 start = start - log->device_size; 225 return start; 226 } 227 228 static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start, 229 sector_t end) 230 { 231 if (end >= start) 232 return end - start; 233 else 234 return end + log->device_size - start; 235 } 236 237 static bool r5l_has_free_space(struct r5l_log *log, sector_t size) 238 { 239 sector_t used_size; 240 241 used_size = r5l_ring_distance(log, log->last_checkpoint, 242 log->log_start); 243 244 return log->device_size > used_size + size; 245 } 246 247 static void __r5l_set_io_unit_state(struct r5l_io_unit *io, 248 enum r5l_io_unit_state state) 249 { 250 if (WARN_ON(io->state >= state)) 251 return; 252 io->state = state; 253 } 254 255 static void 256 r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev, 257 struct bio_list *return_bi) 258 { 259 struct bio *wbi, *wbi2; 260 261 wbi = dev->written; 262 dev->written = NULL; 263 while (wbi && wbi->bi_iter.bi_sector < 264 dev->sector + STRIPE_SECTORS) { 265 wbi2 = r5_next_bio(wbi, dev->sector); 266 if (!raid5_dec_bi_active_stripes(wbi)) { 267 md_write_end(conf->mddev); 268 bio_list_add(return_bi, wbi); 269 } 270 wbi = wbi2; 271 } 272 } 273 274 void r5c_handle_cached_data_endio(struct r5conf *conf, 275 struct stripe_head *sh, int disks, struct bio_list *return_bi) 276 { 277 int i; 278 279 for (i = sh->disks; i--; ) { 280 if (sh->dev[i].written) { 281 set_bit(R5_UPTODATE, &sh->dev[i].flags); 282 r5c_return_dev_pending_writes(conf, &sh->dev[i], 283 return_bi); 284 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 285 STRIPE_SECTORS, 286 !test_bit(STRIPE_DEGRADED, &sh->state), 287 
0); 288 } 289 } 290 } 291 292 /* Check whether we should flush some stripes to free up stripe cache */ 293 void r5c_check_stripe_cache_usage(struct r5conf *conf) 294 { 295 int total_cached; 296 297 if (!r5c_is_writeback(conf->log)) 298 return; 299 300 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 301 atomic_read(&conf->r5c_cached_full_stripes); 302 303 /* 304 * The following condition is true for either of the following: 305 * - stripe cache pressure high: 306 * total_cached > 3/4 min_nr_stripes || 307 * empty_inactive_list_nr > 0 308 * - stripe cache pressure moderate: 309 * total_cached > 1/2 min_nr_stripes 310 */ 311 if (total_cached > conf->min_nr_stripes * 1 / 2 || 312 atomic_read(&conf->empty_inactive_list_nr) > 0) 313 r5l_wake_reclaim(conf->log, 0); 314 } 315 316 /* 317 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full 318 * stripes in the cache 319 */ 320 void r5c_check_cached_full_stripe(struct r5conf *conf) 321 { 322 if (!r5c_is_writeback(conf->log)) 323 return; 324 325 /* 326 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes 327 * or a full stripe (chunk size / 4k stripes). 328 */ 329 if (atomic_read(&conf->r5c_cached_full_stripes) >= 330 min(R5C_FULL_STRIPE_FLUSH_BATCH, 331 conf->chunk_sectors >> STRIPE_SHIFT)) 332 r5l_wake_reclaim(conf->log, 0); 333 } 334 335 /* 336 * Total log space (in sectors) needed to flush all data in cache 337 * 338 * Currently, writing-out phase automatically includes all pending writes 339 * to the same sector. So the reclaim of each stripe takes up to 340 * (conf->raid_disks + 1) pages of log space. 341 * 342 * To totally avoid deadlock due to log space, the code reserves 343 * (conf->raid_disks + 1) pages for each stripe in cache, which is not 344 * necessary in most cases. 345 * 346 * To improve this, we will need writing-out phase to be able to NOT include 347 * pending writes, which will reduce the requirement to 348 * (conf->max_degraded + 1) pages per stripe in cache. 349 */ 350 static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) 351 { 352 struct r5l_log *log = conf->log; 353 354 if (!r5c_is_writeback(log)) 355 return 0; 356 357 return BLOCK_SECTORS * (conf->raid_disks + 1) * 358 atomic_read(&log->stripe_in_journal_count); 359 } 360 361 /* 362 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL 363 * 364 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of 365 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log 366 * device is less than 2x of reclaim_required_space. 367 */ 368 static inline void r5c_update_log_state(struct r5l_log *log) 369 { 370 struct r5conf *conf = log->rdev->mddev->private; 371 sector_t free_space; 372 sector_t reclaim_space; 373 374 if (!r5c_is_writeback(log)) 375 return; 376 377 free_space = r5l_ring_distance(log, log->log_start, 378 log->last_checkpoint); 379 reclaim_space = r5c_log_required_to_flush_cache(conf); 380 if (free_space < 2 * reclaim_space) 381 set_bit(R5C_LOG_CRITICAL, &conf->cache_state); 382 else 383 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state); 384 if (free_space < 3 * reclaim_space) 385 set_bit(R5C_LOG_TIGHT, &conf->cache_state); 386 else 387 clear_bit(R5C_LOG_TIGHT, &conf->cache_state); 388 } 389 390 /* 391 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING. 392 * This function should only be called in write-back mode. 
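 *
 * Besides clearing STRIPE_R5C_CACHING, the transition below also marks the
 * stripe STRIPE_PREREAD_ACTIVE and, if the stripe was counted as a cached
 * partial or full stripe, drops it from those counters, keeping the
 * accounting used by r5c_check_stripe_cache_usage() and
 * r5c_check_cached_full_stripe() consistent.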
393 */ 394 void r5c_make_stripe_write_out(struct stripe_head *sh) 395 { 396 struct r5conf *conf = sh->raid_conf; 397 struct r5l_log *log = conf->log; 398 399 BUG_ON(!r5c_is_writeback(log)); 400 401 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 402 clear_bit(STRIPE_R5C_CACHING, &sh->state); 403 404 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 405 atomic_inc(&conf->preread_active_stripes); 406 407 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { 408 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); 409 atomic_dec(&conf->r5c_cached_partial_stripes); 410 } 411 412 if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { 413 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); 414 atomic_dec(&conf->r5c_cached_full_stripes); 415 } 416 } 417 418 static void r5c_handle_data_cached(struct stripe_head *sh) 419 { 420 int i; 421 422 for (i = sh->disks; i--; ) 423 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 424 set_bit(R5_InJournal, &sh->dev[i].flags); 425 clear_bit(R5_LOCKED, &sh->dev[i].flags); 426 } 427 clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 428 } 429 430 /* 431 * this journal write must contain full parity, 432 * it may also contain some data pages 433 */ 434 static void r5c_handle_parity_cached(struct stripe_head *sh) 435 { 436 int i; 437 438 for (i = sh->disks; i--; ) 439 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 440 set_bit(R5_Wantwrite, &sh->dev[i].flags); 441 } 442 443 /* 444 * Setting proper flags after writing (or flushing) data and/or parity to the 445 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio(). 446 */ 447 static void r5c_finish_cache_stripe(struct stripe_head *sh) 448 { 449 struct r5l_log *log = sh->raid_conf->log; 450 451 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 452 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 453 /* 454 * Set R5_InJournal for parity dev[pd_idx]. This means 455 * all data AND parity in the journal. For RAID 6, it is 456 * NOT necessary to set the flag for dev[qd_idx], as the 457 * two parities are written out together. 
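		 *
		 * Write-back mode converges on the same marking: the final
		 * branch below sets R5_InJournal on dev[pd_idx] once the
		 * parity of a write-out stripe has reached the journal.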
458 */ 459 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 460 } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) { 461 r5c_handle_data_cached(sh); 462 } else { 463 r5c_handle_parity_cached(sh); 464 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 465 } 466 } 467 468 static void r5l_io_run_stripes(struct r5l_io_unit *io) 469 { 470 struct stripe_head *sh, *next; 471 472 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { 473 list_del_init(&sh->log_list); 474 475 r5c_finish_cache_stripe(sh); 476 477 set_bit(STRIPE_HANDLE, &sh->state); 478 raid5_release_stripe(sh); 479 } 480 } 481 482 static void r5l_log_run_stripes(struct r5l_log *log) 483 { 484 struct r5l_io_unit *io, *next; 485 486 assert_spin_locked(&log->io_list_lock); 487 488 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 489 /* don't change list order */ 490 if (io->state < IO_UNIT_IO_END) 491 break; 492 493 list_move_tail(&io->log_sibling, &log->finished_ios); 494 r5l_io_run_stripes(io); 495 } 496 } 497 498 static void r5l_move_to_end_ios(struct r5l_log *log) 499 { 500 struct r5l_io_unit *io, *next; 501 502 assert_spin_locked(&log->io_list_lock); 503 504 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 505 /* don't change list order */ 506 if (io->state < IO_UNIT_IO_END) 507 break; 508 list_move_tail(&io->log_sibling, &log->io_end_ios); 509 } 510 } 511 512 static void __r5l_stripe_write_finished(struct r5l_io_unit *io); 513 static void r5l_log_endio(struct bio *bio) 514 { 515 struct r5l_io_unit *io = bio->bi_private; 516 struct r5l_io_unit *io_deferred; 517 struct r5l_log *log = io->log; 518 unsigned long flags; 519 520 if (bio->bi_error) 521 md_error(log->rdev->mddev, log->rdev); 522 523 bio_put(bio); 524 mempool_free(io->meta_page, log->meta_pool); 525 526 spin_lock_irqsave(&log->io_list_lock, flags); 527 __r5l_set_io_unit_state(io, IO_UNIT_IO_END); 528 if (log->need_cache_flush) 529 r5l_move_to_end_ios(log); 530 else 531 r5l_log_run_stripes(log); 532 if (!list_empty(&log->running_ios)) { 533 /* 534 * FLUSH/FUA io_unit is deferred because of ordering, now we 535 * can dispatch it 536 */ 537 io_deferred = list_first_entry(&log->running_ios, 538 struct r5l_io_unit, log_sibling); 539 if (io_deferred->io_deferred) 540 schedule_work(&log->deferred_io_work); 541 } 542 543 spin_unlock_irqrestore(&log->io_list_lock, flags); 544 545 if (log->need_cache_flush) 546 md_wakeup_thread(log->rdev->mddev->thread); 547 548 if (io->has_null_flush) { 549 struct bio *bi; 550 551 WARN_ON(bio_list_empty(&io->flush_barriers)); 552 while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { 553 bio_endio(bi); 554 atomic_dec(&io->pending_stripe); 555 } 556 if (atomic_read(&io->pending_stripe) == 0) 557 __r5l_stripe_write_finished(io); 558 } 559 } 560 561 static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) 562 { 563 unsigned long flags; 564 565 spin_lock_irqsave(&log->io_list_lock, flags); 566 __r5l_set_io_unit_state(io, IO_UNIT_IO_START); 567 spin_unlock_irqrestore(&log->io_list_lock, flags); 568 569 if (io->has_flush) 570 bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FLUSH); 571 if (io->has_fua) 572 bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FUA); 573 submit_bio(io->current_bio); 574 575 if (!io->split_bio) 576 return; 577 578 if (io->has_flush) 579 bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FLUSH); 580 if (io->has_fua) 581 bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FUA); 582 submit_bio(io->split_bio); 583 } 584 585 /* deferred io_unit 
will be dispatched here */ 586 static void r5l_submit_io_async(struct work_struct *work) 587 { 588 struct r5l_log *log = container_of(work, struct r5l_log, 589 deferred_io_work); 590 struct r5l_io_unit *io = NULL; 591 unsigned long flags; 592 593 spin_lock_irqsave(&log->io_list_lock, flags); 594 if (!list_empty(&log->running_ios)) { 595 io = list_first_entry(&log->running_ios, struct r5l_io_unit, 596 log_sibling); 597 if (!io->io_deferred) 598 io = NULL; 599 else 600 io->io_deferred = 0; 601 } 602 spin_unlock_irqrestore(&log->io_list_lock, flags); 603 if (io) 604 r5l_do_submit_io(log, io); 605 } 606 607 static void r5l_submit_current_io(struct r5l_log *log) 608 { 609 struct r5l_io_unit *io = log->current_io; 610 struct bio *bio; 611 struct r5l_meta_block *block; 612 unsigned long flags; 613 u32 crc; 614 bool do_submit = true; 615 616 if (!io) 617 return; 618 619 block = page_address(io->meta_page); 620 block->meta_size = cpu_to_le32(io->meta_offset); 621 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); 622 block->checksum = cpu_to_le32(crc); 623 bio = io->current_bio; 624 625 log->current_io = NULL; 626 spin_lock_irqsave(&log->io_list_lock, flags); 627 if (io->has_flush || io->has_fua) { 628 if (io != list_first_entry(&log->running_ios, 629 struct r5l_io_unit, log_sibling)) { 630 io->io_deferred = 1; 631 do_submit = false; 632 } 633 } 634 spin_unlock_irqrestore(&log->io_list_lock, flags); 635 if (do_submit) 636 r5l_do_submit_io(log, io); 637 } 638 639 static struct bio *r5l_bio_alloc(struct r5l_log *log) 640 { 641 struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); 642 643 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 644 bio->bi_bdev = log->rdev->bdev; 645 bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; 646 647 return bio; 648 } 649 650 static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io) 651 { 652 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); 653 654 r5c_update_log_state(log); 655 /* 656 * If we filled up the log device start from the beginning again, 657 * which will require a new bio. 658 * 659 * Note: for this to work properly the log size needs to me a multiple 660 * of BLOCK_SECTORS. 
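	 *
	 * A worked example with made-up numbers: if device_size is 1024
	 * sectors and log_start was 1016, r5l_ring_add() advances it by
	 * BLOCK_SECTORS (8) to 1024, which wraps back to 0; the next payload
	 * page then needs a fresh bio because its target sector is no longer
	 * contiguous with the pages already queued.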
661 */ 662 if (log->log_start == 0) 663 io->need_split_bio = true; 664 665 io->log_end = log->log_start; 666 } 667 668 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) 669 { 670 struct r5l_io_unit *io; 671 struct r5l_meta_block *block; 672 673 io = mempool_alloc(log->io_pool, GFP_ATOMIC); 674 if (!io) 675 return NULL; 676 memset(io, 0, sizeof(*io)); 677 678 io->log = log; 679 INIT_LIST_HEAD(&io->log_sibling); 680 INIT_LIST_HEAD(&io->stripe_list); 681 bio_list_init(&io->flush_barriers); 682 io->state = IO_UNIT_RUNNING; 683 684 io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); 685 block = page_address(io->meta_page); 686 clear_page(block); 687 block->magic = cpu_to_le32(R5LOG_MAGIC); 688 block->version = R5LOG_VERSION; 689 block->seq = cpu_to_le64(log->seq); 690 block->position = cpu_to_le64(log->log_start); 691 692 io->log_start = log->log_start; 693 io->meta_offset = sizeof(struct r5l_meta_block); 694 io->seq = log->seq++; 695 696 io->current_bio = r5l_bio_alloc(log); 697 io->current_bio->bi_end_io = r5l_log_endio; 698 io->current_bio->bi_private = io; 699 bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); 700 701 r5_reserve_log_entry(log, io); 702 703 spin_lock_irq(&log->io_list_lock); 704 list_add_tail(&io->log_sibling, &log->running_ios); 705 spin_unlock_irq(&log->io_list_lock); 706 707 return io; 708 } 709 710 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) 711 { 712 if (log->current_io && 713 log->current_io->meta_offset + payload_size > PAGE_SIZE) 714 r5l_submit_current_io(log); 715 716 if (!log->current_io) { 717 log->current_io = r5l_new_meta(log); 718 if (!log->current_io) 719 return -ENOMEM; 720 } 721 722 return 0; 723 } 724 725 static void r5l_append_payload_meta(struct r5l_log *log, u16 type, 726 sector_t location, 727 u32 checksum1, u32 checksum2, 728 bool checksum2_valid) 729 { 730 struct r5l_io_unit *io = log->current_io; 731 struct r5l_payload_data_parity *payload; 732 733 payload = page_address(io->meta_page) + io->meta_offset; 734 payload->header.type = cpu_to_le16(type); 735 payload->header.flags = cpu_to_le16(0); 736 payload->size = cpu_to_le32((1 + !!checksum2_valid) << 737 (PAGE_SHIFT - 9)); 738 payload->location = cpu_to_le64(location); 739 payload->checksum[0] = cpu_to_le32(checksum1); 740 if (checksum2_valid) 741 payload->checksum[1] = cpu_to_le32(checksum2); 742 743 io->meta_offset += sizeof(struct r5l_payload_data_parity) + 744 sizeof(__le32) * (1 + !!checksum2_valid); 745 } 746 747 static void r5l_append_payload_page(struct r5l_log *log, struct page *page) 748 { 749 struct r5l_io_unit *io = log->current_io; 750 751 if (io->need_split_bio) { 752 BUG_ON(io->split_bio); 753 io->split_bio = io->current_bio; 754 io->current_bio = r5l_bio_alloc(log); 755 bio_chain(io->current_bio, io->split_bio); 756 io->need_split_bio = false; 757 } 758 759 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) 760 BUG(); 761 762 r5_reserve_log_entry(log, io); 763 } 764 765 static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, 766 int data_pages, int parity_pages) 767 { 768 int i; 769 int meta_size; 770 int ret; 771 struct r5l_io_unit *io; 772 773 meta_size = 774 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) 775 * data_pages) + 776 sizeof(struct r5l_payload_data_parity) + 777 sizeof(__le32) * parity_pages; 778 779 ret = r5l_get_meta(log, meta_size); 780 if (ret) 781 return ret; 782 783 io = log->current_io; 784 785 if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state)) 786 io->has_flush = 1; 787 788 
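	/*
	 * Append one R5LOG_PAYLOAD_DATA descriptor plus the data page itself
	 * for every block that still wants a write and is not already in the
	 * journal; parity is skipped here and added below as a single
	 * R5LOG_PAYLOAD_PARITY entry covering P (and Q for RAID 6).
	 */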
for (i = 0; i < sh->disks; i++) { 789 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 790 test_bit(R5_InJournal, &sh->dev[i].flags)) 791 continue; 792 if (i == sh->pd_idx || i == sh->qd_idx) 793 continue; 794 if (test_bit(R5_WantFUA, &sh->dev[i].flags) && 795 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) { 796 io->has_fua = 1; 797 /* 798 * we need to flush journal to make sure recovery can 799 * reach the data with fua flag 800 */ 801 io->has_flush = 1; 802 } 803 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, 804 raid5_compute_blocknr(sh, i, 0), 805 sh->dev[i].log_checksum, 0, false); 806 r5l_append_payload_page(log, sh->dev[i].page); 807 } 808 809 if (parity_pages == 2) { 810 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 811 sh->sector, sh->dev[sh->pd_idx].log_checksum, 812 sh->dev[sh->qd_idx].log_checksum, true); 813 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 814 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); 815 } else if (parity_pages == 1) { 816 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 817 sh->sector, sh->dev[sh->pd_idx].log_checksum, 818 0, false); 819 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 820 } else /* Just writing data, not parity, in caching phase */ 821 BUG_ON(parity_pages != 0); 822 823 list_add_tail(&sh->log_list, &io->stripe_list); 824 atomic_inc(&io->pending_stripe); 825 sh->log_io = io; 826 827 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 828 return 0; 829 830 if (sh->log_start == MaxSector) { 831 BUG_ON(!list_empty(&sh->r5c)); 832 sh->log_start = io->log_start; 833 spin_lock_irq(&log->stripe_in_journal_lock); 834 list_add_tail(&sh->r5c, 835 &log->stripe_in_journal_list); 836 spin_unlock_irq(&log->stripe_in_journal_lock); 837 atomic_inc(&log->stripe_in_journal_count); 838 } 839 return 0; 840 } 841 842 /* add stripe to no_space_stripes, and then wake up reclaim */ 843 static inline void r5l_add_no_space_stripe(struct r5l_log *log, 844 struct stripe_head *sh) 845 { 846 spin_lock(&log->no_space_stripes_lock); 847 list_add_tail(&sh->log_list, &log->no_space_stripes); 848 spin_unlock(&log->no_space_stripes_lock); 849 } 850 851 /* 852 * running in raid5d, where reclaim could wait for raid5d too (when it flushes 853 * data from log to raid disks), so we shouldn't wait for reclaim here 854 */ 855 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) 856 { 857 struct r5conf *conf = sh->raid_conf; 858 int write_disks = 0; 859 int data_pages, parity_pages; 860 int reserve; 861 int i; 862 int ret = 0; 863 bool wake_reclaim = false; 864 865 if (!log) 866 return -EAGAIN; 867 /* Don't support stripe batch */ 868 if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || 869 test_bit(STRIPE_SYNCING, &sh->state)) { 870 /* the stripe is written to log, we start writing it to raid */ 871 clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 872 return -EAGAIN; 873 } 874 875 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 876 877 for (i = 0; i < sh->disks; i++) { 878 void *addr; 879 880 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 881 test_bit(R5_InJournal, &sh->dev[i].flags)) 882 continue; 883 884 write_disks++; 885 /* checksum is already calculated in last run */ 886 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 887 continue; 888 addr = kmap_atomic(sh->dev[i].page); 889 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 890 addr, PAGE_SIZE); 891 kunmap_atomic(addr); 892 } 893 parity_pages = 1 + !!(sh->qd_idx >= 0); 894 data_pages = write_disks - parity_pages; 895 896 
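	/*
	 * A hypothetical example: on a 4-disk RAID5 with two dirty data
	 * blocks, write_disks == 3 (two data pages plus the parity page),
	 * so parity_pages == 1 and data_pages == 2; the reservation taken
	 * below is then (1 + 3) pages worth of sectors, i.e. one meta block
	 * plus the three payload pages.
	 */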
set_bit(STRIPE_LOG_TRAPPED, &sh->state); 897 /* 898 * The stripe must enter state machine again to finish the write, so 899 * don't delay. 900 */ 901 clear_bit(STRIPE_DELAYED, &sh->state); 902 atomic_inc(&sh->count); 903 904 mutex_lock(&log->io_mutex); 905 /* meta + data */ 906 reserve = (1 + write_disks) << (PAGE_SHIFT - 9); 907 908 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 909 if (!r5l_has_free_space(log, reserve)) { 910 r5l_add_no_space_stripe(log, sh); 911 wake_reclaim = true; 912 } else { 913 ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 914 if (ret) { 915 spin_lock_irq(&log->io_list_lock); 916 list_add_tail(&sh->log_list, 917 &log->no_mem_stripes); 918 spin_unlock_irq(&log->io_list_lock); 919 } 920 } 921 } else { /* R5C_JOURNAL_MODE_WRITE_BACK */ 922 /* 923 * log space critical, do not process stripes that are 924 * not in cache yet (sh->log_start == MaxSector). 925 */ 926 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 927 sh->log_start == MaxSector) { 928 r5l_add_no_space_stripe(log, sh); 929 wake_reclaim = true; 930 reserve = 0; 931 } else if (!r5l_has_free_space(log, reserve)) { 932 if (sh->log_start == log->last_checkpoint) 933 BUG(); 934 else 935 r5l_add_no_space_stripe(log, sh); 936 } else { 937 ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 938 if (ret) { 939 spin_lock_irq(&log->io_list_lock); 940 list_add_tail(&sh->log_list, 941 &log->no_mem_stripes); 942 spin_unlock_irq(&log->io_list_lock); 943 } 944 } 945 } 946 947 mutex_unlock(&log->io_mutex); 948 if (wake_reclaim) 949 r5l_wake_reclaim(log, reserve); 950 return 0; 951 } 952 953 void r5l_write_stripe_run(struct r5l_log *log) 954 { 955 if (!log) 956 return; 957 mutex_lock(&log->io_mutex); 958 r5l_submit_current_io(log); 959 mutex_unlock(&log->io_mutex); 960 } 961 962 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) 963 { 964 if (!log) 965 return -ENODEV; 966 967 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 968 /* 969 * in write through (journal only) 970 * we flush log disk cache first, then write stripe data to 971 * raid disks. So if bio is finished, the log disk cache is 972 * flushed already. 
The recovery guarantees we can recovery 973 * the bio from log disk, so we don't need to flush again 974 */ 975 if (bio->bi_iter.bi_size == 0) { 976 bio_endio(bio); 977 return 0; 978 } 979 bio->bi_opf &= ~REQ_PREFLUSH; 980 } else { 981 /* write back (with cache) */ 982 if (bio->bi_iter.bi_size == 0) { 983 mutex_lock(&log->io_mutex); 984 r5l_get_meta(log, 0); 985 bio_list_add(&log->current_io->flush_barriers, bio); 986 log->current_io->has_flush = 1; 987 log->current_io->has_null_flush = 1; 988 atomic_inc(&log->current_io->pending_stripe); 989 r5l_submit_current_io(log); 990 mutex_unlock(&log->io_mutex); 991 return 0; 992 } 993 } 994 return -EAGAIN; 995 } 996 997 /* This will run after log space is reclaimed */ 998 static void r5l_run_no_space_stripes(struct r5l_log *log) 999 { 1000 struct stripe_head *sh; 1001 1002 spin_lock(&log->no_space_stripes_lock); 1003 while (!list_empty(&log->no_space_stripes)) { 1004 sh = list_first_entry(&log->no_space_stripes, 1005 struct stripe_head, log_list); 1006 list_del_init(&sh->log_list); 1007 set_bit(STRIPE_HANDLE, &sh->state); 1008 raid5_release_stripe(sh); 1009 } 1010 spin_unlock(&log->no_space_stripes_lock); 1011 } 1012 1013 /* 1014 * calculate new last_checkpoint 1015 * for write through mode, returns log->next_checkpoint 1016 * for write back, returns log_start of first sh in stripe_in_journal_list 1017 */ 1018 static sector_t r5c_calculate_new_cp(struct r5conf *conf) 1019 { 1020 struct stripe_head *sh; 1021 struct r5l_log *log = conf->log; 1022 sector_t new_cp; 1023 unsigned long flags; 1024 1025 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 1026 return log->next_checkpoint; 1027 1028 spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 1029 if (list_empty(&conf->log->stripe_in_journal_list)) { 1030 /* all stripes flushed */ 1031 spin_unlock(&log->stripe_in_journal_lock); 1032 return log->next_checkpoint; 1033 } 1034 sh = list_first_entry(&conf->log->stripe_in_journal_list, 1035 struct stripe_head, r5c); 1036 new_cp = sh->log_start; 1037 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1038 return new_cp; 1039 } 1040 1041 static sector_t r5l_reclaimable_space(struct r5l_log *log) 1042 { 1043 struct r5conf *conf = log->rdev->mddev->private; 1044 1045 return r5l_ring_distance(log, log->last_checkpoint, 1046 r5c_calculate_new_cp(conf)); 1047 } 1048 1049 static void r5l_run_no_mem_stripe(struct r5l_log *log) 1050 { 1051 struct stripe_head *sh; 1052 1053 assert_spin_locked(&log->io_list_lock); 1054 1055 if (!list_empty(&log->no_mem_stripes)) { 1056 sh = list_first_entry(&log->no_mem_stripes, 1057 struct stripe_head, log_list); 1058 list_del_init(&sh->log_list); 1059 set_bit(STRIPE_HANDLE, &sh->state); 1060 raid5_release_stripe(sh); 1061 } 1062 } 1063 1064 static bool r5l_complete_finished_ios(struct r5l_log *log) 1065 { 1066 struct r5l_io_unit *io, *next; 1067 bool found = false; 1068 1069 assert_spin_locked(&log->io_list_lock); 1070 1071 list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { 1072 /* don't change list order */ 1073 if (io->state < IO_UNIT_STRIPE_END) 1074 break; 1075 1076 log->next_checkpoint = io->log_start; 1077 log->next_cp_seq = io->seq; 1078 1079 list_del(&io->log_sibling); 1080 mempool_free(io, log->io_pool); 1081 r5l_run_no_mem_stripe(log); 1082 1083 found = true; 1084 } 1085 1086 return found; 1087 } 1088 1089 static void __r5l_stripe_write_finished(struct r5l_io_unit *io) 1090 { 1091 struct r5l_log *log = io->log; 1092 struct r5conf *conf = log->rdev->mddev->private; 1093 unsigned 
long flags; 1094 1095 spin_lock_irqsave(&log->io_list_lock, flags); 1096 __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END); 1097 1098 if (!r5l_complete_finished_ios(log)) { 1099 spin_unlock_irqrestore(&log->io_list_lock, flags); 1100 return; 1101 } 1102 1103 if (r5l_reclaimable_space(log) > log->max_free_space || 1104 test_bit(R5C_LOG_TIGHT, &conf->cache_state)) 1105 r5l_wake_reclaim(log, 0); 1106 1107 spin_unlock_irqrestore(&log->io_list_lock, flags); 1108 wake_up(&log->iounit_wait); 1109 } 1110 1111 void r5l_stripe_write_finished(struct stripe_head *sh) 1112 { 1113 struct r5l_io_unit *io; 1114 1115 io = sh->log_io; 1116 sh->log_io = NULL; 1117 1118 if (io && atomic_dec_and_test(&io->pending_stripe)) 1119 __r5l_stripe_write_finished(io); 1120 } 1121 1122 static void r5l_log_flush_endio(struct bio *bio) 1123 { 1124 struct r5l_log *log = container_of(bio, struct r5l_log, 1125 flush_bio); 1126 unsigned long flags; 1127 struct r5l_io_unit *io; 1128 1129 if (bio->bi_error) 1130 md_error(log->rdev->mddev, log->rdev); 1131 1132 spin_lock_irqsave(&log->io_list_lock, flags); 1133 list_for_each_entry(io, &log->flushing_ios, log_sibling) 1134 r5l_io_run_stripes(io); 1135 list_splice_tail_init(&log->flushing_ios, &log->finished_ios); 1136 spin_unlock_irqrestore(&log->io_list_lock, flags); 1137 } 1138 1139 /* 1140 * Starting dispatch IO to raid. 1141 * io_unit(meta) consists of a log. There is one situation we want to avoid. A 1142 * broken meta in the middle of a log causes recovery can't find meta at the 1143 * head of log. If operations require meta at the head persistent in log, we 1144 * must make sure meta before it persistent in log too. A case is: 1145 * 1146 * stripe data/parity is in log, we start write stripe to raid disks. stripe 1147 * data/parity must be persistent in log before we do the write to raid disks. 1148 * 1149 * The solution is we restrictly maintain io_unit list order. In this case, we 1150 * only write stripes of an io_unit to raid disks till the io_unit is the first 1151 * one whose data/parity is in log. 1152 */ 1153 void r5l_flush_stripe_to_raid(struct r5l_log *log) 1154 { 1155 bool do_flush; 1156 1157 if (!log || !log->need_cache_flush) 1158 return; 1159 1160 spin_lock_irq(&log->io_list_lock); 1161 /* flush bio is running */ 1162 if (!list_empty(&log->flushing_ios)) { 1163 spin_unlock_irq(&log->io_list_lock); 1164 return; 1165 } 1166 list_splice_tail_init(&log->io_end_ios, &log->flushing_ios); 1167 do_flush = !list_empty(&log->flushing_ios); 1168 spin_unlock_irq(&log->io_list_lock); 1169 1170 if (!do_flush) 1171 return; 1172 bio_reset(&log->flush_bio); 1173 log->flush_bio.bi_bdev = log->rdev->bdev; 1174 log->flush_bio.bi_end_io = r5l_log_flush_endio; 1175 bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH); 1176 submit_bio(&log->flush_bio); 1177 } 1178 1179 static void r5l_write_super(struct r5l_log *log, sector_t cp); 1180 static void r5l_write_super_and_discard_space(struct r5l_log *log, 1181 sector_t end) 1182 { 1183 struct block_device *bdev = log->rdev->bdev; 1184 struct mddev *mddev; 1185 1186 r5l_write_super(log, end); 1187 1188 if (!blk_queue_discard(bdev_get_queue(bdev))) 1189 return; 1190 1191 mddev = log->rdev->mddev; 1192 /* 1193 * Discard could zero data, so before discard we must make sure 1194 * superblock is updated to new log tail. Updating superblock (either 1195 * directly call md_update_sb() or depend on md thread) must hold 1196 * reconfig mutex. On the other hand, raid5_quiesce is called with 1197 * reconfig_mutex hold. 
The first step of raid5_quiesce() is waiting 1198 for all IO to finish, hence waiting for the reclaim thread, while the reclaim 1199 thread is calling this function and waiting for the reconfig mutex. So 1200 there is a deadlock. We work around this issue with a trylock. 1201 FIXME: we could miss discard if we can't take reconfig mutex 1202 */ 1203 set_mask_bits(&mddev->flags, 0, 1204 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 1205 if (!mddev_trylock(mddev)) 1206 return; 1207 md_update_sb(mddev, 1); 1208 mddev_unlock(mddev); 1209 1210 /* discard IO error really doesn't matter, ignore it */ 1211 if (log->last_checkpoint < end) { 1212 blkdev_issue_discard(bdev, 1213 log->last_checkpoint + log->rdev->data_offset, 1214 end - log->last_checkpoint, GFP_NOIO, 0); 1215 } else { 1216 blkdev_issue_discard(bdev, 1217 log->last_checkpoint + log->rdev->data_offset, 1218 log->device_size - log->last_checkpoint, 1219 GFP_NOIO, 0); 1220 blkdev_issue_discard(bdev, log->rdev->data_offset, end, 1221 GFP_NOIO, 0); 1222 } 1223 } 1224 1225 /* 1226 * r5c_flush_stripe moves a stripe from the cached list to handle_list. When called, 1227 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes. 1228 * 1229 * must hold conf->device_lock 1230 */ 1231 static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) 1232 { 1233 BUG_ON(list_empty(&sh->lru)); 1234 BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 1235 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 1236 1237 /* 1238 * The stripe is not ON_RELEASE_LIST, so it is safe to call 1239 * raid5_release_stripe() while holding conf->device_lock 1240 */ 1241 BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); 1242 assert_spin_locked(&conf->device_lock); 1243 1244 list_del_init(&sh->lru); 1245 atomic_inc(&sh->count); 1246 1247 set_bit(STRIPE_HANDLE, &sh->state); 1248 atomic_inc(&conf->active_stripes); 1249 r5c_make_stripe_write_out(sh); 1250 1251 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 1252 atomic_inc(&conf->preread_active_stripes); 1253 raid5_release_stripe(sh); 1254 } 1255 1256 /* 1257 * if num == 0, flush all full stripes 1258 * if num > 0, flush all full stripes. If fewer than num full stripes are 1259 * flushed, flush some partial stripes until num stripes in total are 1260 * flushed or there are no more cached stripes.
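 *
 * An illustrative example (numbers are hypothetical): with 3 full and
 * 5 partial stripes cached, r5c_flush_cache(conf, 6) flushes the 3
 * full stripes and then 3 partial ones, while r5c_flush_cache(conf, 0)
 * flushes only the 3 full stripes.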
1261 */ 1262 void r5c_flush_cache(struct r5conf *conf, int num) 1263 { 1264 int count; 1265 struct stripe_head *sh, *next; 1266 1267 assert_spin_locked(&conf->device_lock); 1268 if (!conf->log) 1269 return; 1270 1271 count = 0; 1272 list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) { 1273 r5c_flush_stripe(conf, sh); 1274 count++; 1275 } 1276 1277 if (count >= num) 1278 return; 1279 list_for_each_entry_safe(sh, next, 1280 &conf->r5c_partial_stripe_list, lru) { 1281 r5c_flush_stripe(conf, sh); 1282 if (++count >= num) 1283 break; 1284 } 1285 } 1286 1287 static void r5c_do_reclaim(struct r5conf *conf) 1288 { 1289 struct r5l_log *log = conf->log; 1290 struct stripe_head *sh; 1291 int count = 0; 1292 unsigned long flags; 1293 int total_cached; 1294 int stripes_to_flush; 1295 1296 if (!r5c_is_writeback(log)) 1297 return; 1298 1299 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 1300 atomic_read(&conf->r5c_cached_full_stripes); 1301 1302 if (total_cached > conf->min_nr_stripes * 3 / 4 || 1303 atomic_read(&conf->empty_inactive_list_nr) > 0) 1304 /* 1305 * if stripe cache pressure high, flush all full stripes and 1306 * some partial stripes 1307 */ 1308 stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; 1309 else if (total_cached > conf->min_nr_stripes * 1 / 2 || 1310 atomic_read(&conf->r5c_cached_full_stripes) > 1311 R5C_FULL_STRIPE_FLUSH_BATCH) 1312 /* 1313 * if stripe cache pressure moderate, or if there is many full 1314 * stripes,flush all full stripes 1315 */ 1316 stripes_to_flush = 0; 1317 else 1318 /* no need to flush */ 1319 stripes_to_flush = -1; 1320 1321 if (stripes_to_flush >= 0) { 1322 spin_lock_irqsave(&conf->device_lock, flags); 1323 r5c_flush_cache(conf, stripes_to_flush); 1324 spin_unlock_irqrestore(&conf->device_lock, flags); 1325 } 1326 1327 /* if log space is tight, flush stripes on stripe_in_journal_list */ 1328 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) { 1329 spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 1330 spin_lock(&conf->device_lock); 1331 list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) { 1332 /* 1333 * stripes on stripe_in_journal_list could be in any 1334 * state of the stripe_cache state machine. In this 1335 * case, we only want to flush stripe on 1336 * r5c_cached_full/partial_stripes. The following 1337 * condition makes sure the stripe is on one of the 1338 * two lists. 1339 */ 1340 if (!list_empty(&sh->lru) && 1341 !test_bit(STRIPE_HANDLE, &sh->state) && 1342 atomic_read(&sh->count) == 0) { 1343 r5c_flush_stripe(conf, sh); 1344 } 1345 if (count++ >= R5C_RECLAIM_STRIPE_GROUP) 1346 break; 1347 } 1348 spin_unlock(&conf->device_lock); 1349 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1350 } 1351 md_wakeup_thread(conf->mddev->thread); 1352 } 1353 1354 static void r5l_do_reclaim(struct r5l_log *log) 1355 { 1356 struct r5conf *conf = log->rdev->mddev->private; 1357 sector_t reclaim_target = xchg(&log->reclaim_target, 0); 1358 sector_t reclaimable; 1359 sector_t next_checkpoint; 1360 bool write_super; 1361 1362 spin_lock_irq(&log->io_list_lock); 1363 write_super = r5l_reclaimable_space(log) > log->max_free_space || 1364 reclaim_target != 0 || !list_empty(&log->no_space_stripes); 1365 /* 1366 * move proper io_unit to reclaim list. We should not change the order. 
1367 * reclaimable/unreclaimable io_unit can be mixed in the list, we 1368 * shouldn't reuse space of an unreclaimable io_unit 1369 */ 1370 while (1) { 1371 reclaimable = r5l_reclaimable_space(log); 1372 if (reclaimable >= reclaim_target || 1373 (list_empty(&log->running_ios) && 1374 list_empty(&log->io_end_ios) && 1375 list_empty(&log->flushing_ios) && 1376 list_empty(&log->finished_ios))) 1377 break; 1378 1379 md_wakeup_thread(log->rdev->mddev->thread); 1380 wait_event_lock_irq(log->iounit_wait, 1381 r5l_reclaimable_space(log) > reclaimable, 1382 log->io_list_lock); 1383 } 1384 1385 next_checkpoint = r5c_calculate_new_cp(conf); 1386 spin_unlock_irq(&log->io_list_lock); 1387 1388 BUG_ON(reclaimable < 0); 1389 1390 if (reclaimable == 0 || !write_super) 1391 return; 1392 1393 /* 1394 * write_super will flush cache of each raid disk. We must write super 1395 * here, because the log area might be reused soon and we don't want to 1396 * confuse recovery 1397 */ 1398 r5l_write_super_and_discard_space(log, next_checkpoint); 1399 1400 mutex_lock(&log->io_mutex); 1401 log->last_checkpoint = next_checkpoint; 1402 r5c_update_log_state(log); 1403 mutex_unlock(&log->io_mutex); 1404 1405 r5l_run_no_space_stripes(log); 1406 } 1407 1408 static void r5l_reclaim_thread(struct md_thread *thread) 1409 { 1410 struct mddev *mddev = thread->mddev; 1411 struct r5conf *conf = mddev->private; 1412 struct r5l_log *log = conf->log; 1413 1414 if (!log) 1415 return; 1416 r5c_do_reclaim(conf); 1417 r5l_do_reclaim(log); 1418 } 1419 1420 void r5l_wake_reclaim(struct r5l_log *log, sector_t space) 1421 { 1422 unsigned long target; 1423 unsigned long new = (unsigned long)space; /* overflow in theory */ 1424 1425 if (!log) 1426 return; 1427 do { 1428 target = log->reclaim_target; 1429 if (new < target) 1430 return; 1431 } while (cmpxchg(&log->reclaim_target, target, new) != target); 1432 md_wakeup_thread(log->reclaim_thread); 1433 } 1434 1435 void r5l_quiesce(struct r5l_log *log, int state) 1436 { 1437 struct mddev *mddev; 1438 if (!log || state == 2) 1439 return; 1440 if (state == 0) { 1441 /* 1442 * This is a special case for hotadd. In suspend, the array has 1443 * no journal. In resume, journal is initialized as well as the 1444 * reclaim thread. 
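		 *
		 * If the reclaim thread already exists (the array was only
		 * suspended, not hot-removed), the check below returns early;
		 * otherwise a fresh reclaim thread is registered with the
		 * periodic R5C_RECLAIM_WAKEUP_INTERVAL timeout.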
1445 */ 1446 if (log->reclaim_thread) 1447 return; 1448 log->reclaim_thread = md_register_thread(r5l_reclaim_thread, 1449 log->rdev->mddev, "reclaim"); 1450 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; 1451 } else if (state == 1) { 1452 /* make sure r5l_write_super_and_discard_space exits */ 1453 mddev = log->rdev->mddev; 1454 wake_up(&mddev->sb_wait); 1455 r5l_wake_reclaim(log, MaxSector); 1456 md_unregister_thread(&log->reclaim_thread); 1457 r5l_do_reclaim(log); 1458 } 1459 } 1460 1461 bool r5l_log_disk_error(struct r5conf *conf) 1462 { 1463 struct r5l_log *log; 1464 bool ret; 1465 /* don't allow write if journal disk is missing */ 1466 rcu_read_lock(); 1467 log = rcu_dereference(conf->log); 1468 1469 if (!log) 1470 ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 1471 else 1472 ret = test_bit(Faulty, &log->rdev->flags); 1473 rcu_read_unlock(); 1474 return ret; 1475 } 1476 1477 struct r5l_recovery_ctx { 1478 struct page *meta_page; /* current meta */ 1479 sector_t meta_total_blocks; /* total size of current meta and data */ 1480 sector_t pos; /* recovery position */ 1481 u64 seq; /* recovery position seq */ 1482 int data_parity_stripes; /* number of data_parity stripes */ 1483 int data_only_stripes; /* number of data_only stripes */ 1484 struct list_head cached_list; 1485 }; 1486 1487 static int r5l_recovery_read_meta_block(struct r5l_log *log, 1488 struct r5l_recovery_ctx *ctx) 1489 { 1490 struct page *page = ctx->meta_page; 1491 struct r5l_meta_block *mb; 1492 u32 crc, stored_crc; 1493 1494 if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0, 1495 false)) 1496 return -EIO; 1497 1498 mb = page_address(page); 1499 stored_crc = le32_to_cpu(mb->checksum); 1500 mb->checksum = 0; 1501 1502 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 1503 le64_to_cpu(mb->seq) != ctx->seq || 1504 mb->version != R5LOG_VERSION || 1505 le64_to_cpu(mb->position) != ctx->pos) 1506 return -EINVAL; 1507 1508 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 1509 if (stored_crc != crc) 1510 return -EINVAL; 1511 1512 if (le32_to_cpu(mb->meta_size) > PAGE_SIZE) 1513 return -EINVAL; 1514 1515 ctx->meta_total_blocks = BLOCK_SECTORS; 1516 1517 return 0; 1518 } 1519 1520 static void 1521 r5l_recovery_create_empty_meta_block(struct r5l_log *log, 1522 struct page *page, 1523 sector_t pos, u64 seq) 1524 { 1525 struct r5l_meta_block *mb; 1526 u32 crc; 1527 1528 mb = page_address(page); 1529 clear_page(mb); 1530 mb->magic = cpu_to_le32(R5LOG_MAGIC); 1531 mb->version = R5LOG_VERSION; 1532 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); 1533 mb->seq = cpu_to_le64(seq); 1534 mb->position = cpu_to_le64(pos); 1535 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 1536 mb->checksum = cpu_to_le32(crc); 1537 } 1538 1539 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, 1540 u64 seq) 1541 { 1542 struct page *page; 1543 1544 page = alloc_page(GFP_KERNEL); 1545 if (!page) 1546 return -ENOMEM; 1547 r5l_recovery_create_empty_meta_block(log, page, pos, seq); 1548 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, 1549 WRITE_FUA, false)) { 1550 __free_page(page); 1551 return -EIO; 1552 } 1553 __free_page(page); 1554 return 0; 1555 } 1556 1557 /* 1558 * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite 1559 * to mark valid (potentially not flushed) data in the journal. 1560 * 1561 * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb, 1562 * so there should not be any mismatch here. 
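 *
 * Loading data also sets STRIPE_R5C_CACHING (the stripe is, so far,
 * data-only), while loading parity clears it again, turning the stripe
 * into a data-parity stripe that recovery will replay to the raid
 * disks.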
1563 */ 1564 static void r5l_recovery_load_data(struct r5l_log *log, 1565 struct stripe_head *sh, 1566 struct r5l_recovery_ctx *ctx, 1567 struct r5l_payload_data_parity *payload, 1568 sector_t log_offset) 1569 { 1570 struct mddev *mddev = log->rdev->mddev; 1571 struct r5conf *conf = mddev->private; 1572 int dd_idx; 1573 1574 raid5_compute_sector(conf, 1575 le64_to_cpu(payload->location), 0, 1576 &dd_idx, sh); 1577 sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1578 sh->dev[dd_idx].page, REQ_OP_READ, 0, false); 1579 sh->dev[dd_idx].log_checksum = 1580 le32_to_cpu(payload->checksum[0]); 1581 ctx->meta_total_blocks += BLOCK_SECTORS; 1582 1583 set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags); 1584 set_bit(STRIPE_R5C_CACHING, &sh->state); 1585 } 1586 1587 static void r5l_recovery_load_parity(struct r5l_log *log, 1588 struct stripe_head *sh, 1589 struct r5l_recovery_ctx *ctx, 1590 struct r5l_payload_data_parity *payload, 1591 sector_t log_offset) 1592 { 1593 struct mddev *mddev = log->rdev->mddev; 1594 struct r5conf *conf = mddev->private; 1595 1596 ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; 1597 sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1598 sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false); 1599 sh->dev[sh->pd_idx].log_checksum = 1600 le32_to_cpu(payload->checksum[0]); 1601 set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags); 1602 1603 if (sh->qd_idx >= 0) { 1604 sync_page_io(log->rdev, 1605 r5l_ring_add(log, log_offset, BLOCK_SECTORS), 1606 PAGE_SIZE, sh->dev[sh->qd_idx].page, 1607 REQ_OP_READ, 0, false); 1608 sh->dev[sh->qd_idx].log_checksum = 1609 le32_to_cpu(payload->checksum[1]); 1610 set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags); 1611 } 1612 clear_bit(STRIPE_R5C_CACHING, &sh->state); 1613 } 1614 1615 static void r5l_recovery_reset_stripe(struct stripe_head *sh) 1616 { 1617 int i; 1618 1619 sh->state = 0; 1620 sh->log_start = MaxSector; 1621 for (i = sh->disks; i--; ) 1622 sh->dev[i].flags = 0; 1623 } 1624 1625 static void 1626 r5l_recovery_replay_one_stripe(struct r5conf *conf, 1627 struct stripe_head *sh, 1628 struct r5l_recovery_ctx *ctx) 1629 { 1630 struct md_rdev *rdev, *rrdev; 1631 int disk_index; 1632 int data_count = 0; 1633 1634 for (disk_index = 0; disk_index < sh->disks; disk_index++) { 1635 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 1636 continue; 1637 if (disk_index == sh->qd_idx || disk_index == sh->pd_idx) 1638 continue; 1639 data_count++; 1640 } 1641 1642 /* 1643 * stripes that only have parity must have been flushed 1644 * before the crash that we are now recovering from, so 1645 * there is nothing more to recovery. 
1646 */ 1647 if (data_count == 0) 1648 goto out; 1649 1650 for (disk_index = 0; disk_index < sh->disks; disk_index++) { 1651 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 1652 continue; 1653 1654 /* in case device is broken */ 1655 rcu_read_lock(); 1656 rdev = rcu_dereference(conf->disks[disk_index].rdev); 1657 if (rdev) { 1658 atomic_inc(&rdev->nr_pending); 1659 rcu_read_unlock(); 1660 sync_page_io(rdev, sh->sector, PAGE_SIZE, 1661 sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1662 false); 1663 rdev_dec_pending(rdev, rdev->mddev); 1664 rcu_read_lock(); 1665 } 1666 rrdev = rcu_dereference(conf->disks[disk_index].replacement); 1667 if (rrdev) { 1668 atomic_inc(&rrdev->nr_pending); 1669 rcu_read_unlock(); 1670 sync_page_io(rrdev, sh->sector, PAGE_SIZE, 1671 sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1672 false); 1673 rdev_dec_pending(rrdev, rrdev->mddev); 1674 rcu_read_lock(); 1675 } 1676 rcu_read_unlock(); 1677 } 1678 ctx->data_parity_stripes++; 1679 out: 1680 r5l_recovery_reset_stripe(sh); 1681 } 1682 1683 static struct stripe_head * 1684 r5c_recovery_alloc_stripe(struct r5conf *conf, 1685 struct list_head *recovery_list, 1686 sector_t stripe_sect, 1687 sector_t log_start) 1688 { 1689 struct stripe_head *sh; 1690 1691 sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0); 1692 if (!sh) 1693 return NULL; /* no more stripe available */ 1694 1695 r5l_recovery_reset_stripe(sh); 1696 sh->log_start = log_start; 1697 1698 return sh; 1699 } 1700 1701 static struct stripe_head * 1702 r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect) 1703 { 1704 struct stripe_head *sh; 1705 1706 list_for_each_entry(sh, list, lru) 1707 if (sh->sector == sect) 1708 return sh; 1709 return NULL; 1710 } 1711 1712 static void 1713 r5c_recovery_drop_stripes(struct list_head *cached_stripe_list, 1714 struct r5l_recovery_ctx *ctx) 1715 { 1716 struct stripe_head *sh, *next; 1717 1718 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) { 1719 r5l_recovery_reset_stripe(sh); 1720 list_del_init(&sh->lru); 1721 raid5_release_stripe(sh); 1722 } 1723 } 1724 1725 static void 1726 r5c_recovery_replay_stripes(struct list_head *cached_stripe_list, 1727 struct r5l_recovery_ctx *ctx) 1728 { 1729 struct stripe_head *sh, *next; 1730 1731 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) 1732 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 1733 r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx); 1734 list_del_init(&sh->lru); 1735 raid5_release_stripe(sh); 1736 } 1737 } 1738 1739 /* if matches return 0; otherwise return -EINVAL */ 1740 static int 1741 r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page, 1742 sector_t log_offset, __le32 log_checksum) 1743 { 1744 void *addr; 1745 u32 checksum; 1746 1747 sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1748 page, REQ_OP_READ, 0, false); 1749 addr = kmap_atomic(page); 1750 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); 1751 kunmap_atomic(addr); 1752 return (le32_to_cpu(log_checksum) == checksum) ? 
0 : -EINVAL; 1753 } 1754 1755 /* 1756 * before loading data to stripe cache, we need verify checksum for all data, 1757 * if there is mismatch for any data page, we drop all data in the mata block 1758 */ 1759 static int 1760 r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, 1761 struct r5l_recovery_ctx *ctx) 1762 { 1763 struct mddev *mddev = log->rdev->mddev; 1764 struct r5conf *conf = mddev->private; 1765 struct r5l_meta_block *mb = page_address(ctx->meta_page); 1766 sector_t mb_offset = sizeof(struct r5l_meta_block); 1767 sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 1768 struct page *page; 1769 struct r5l_payload_data_parity *payload; 1770 1771 page = alloc_page(GFP_KERNEL); 1772 if (!page) 1773 return -ENOMEM; 1774 1775 while (mb_offset < le32_to_cpu(mb->meta_size)) { 1776 payload = (void *)mb + mb_offset; 1777 1778 if (payload->header.type == R5LOG_PAYLOAD_DATA) { 1779 if (r5l_recovery_verify_data_checksum( 1780 log, page, log_offset, 1781 payload->checksum[0]) < 0) 1782 goto mismatch; 1783 } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) { 1784 if (r5l_recovery_verify_data_checksum( 1785 log, page, log_offset, 1786 payload->checksum[0]) < 0) 1787 goto mismatch; 1788 if (conf->max_degraded == 2 && /* q for RAID 6 */ 1789 r5l_recovery_verify_data_checksum( 1790 log, page, 1791 r5l_ring_add(log, log_offset, 1792 BLOCK_SECTORS), 1793 payload->checksum[1]) < 0) 1794 goto mismatch; 1795 } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */ 1796 goto mismatch; 1797 1798 log_offset = r5l_ring_add(log, log_offset, 1799 le32_to_cpu(payload->size)); 1800 1801 mb_offset += sizeof(struct r5l_payload_data_parity) + 1802 sizeof(__le32) * 1803 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); 1804 } 1805 1806 put_page(page); 1807 return 0; 1808 1809 mismatch: 1810 put_page(page); 1811 return -EINVAL; 1812 } 1813 1814 /* 1815 * Analyze all data/parity pages in one meta block 1816 * Returns: 1817 * 0 for success 1818 * -EINVAL for unknown playload type 1819 * -EAGAIN for checksum mismatch of data page 1820 * -ENOMEM for run out of memory (alloc_page failed or run out of stripes) 1821 */ 1822 static int 1823 r5c_recovery_analyze_meta_block(struct r5l_log *log, 1824 struct r5l_recovery_ctx *ctx, 1825 struct list_head *cached_stripe_list) 1826 { 1827 struct mddev *mddev = log->rdev->mddev; 1828 struct r5conf *conf = mddev->private; 1829 struct r5l_meta_block *mb; 1830 struct r5l_payload_data_parity *payload; 1831 int mb_offset; 1832 sector_t log_offset; 1833 sector_t stripe_sect; 1834 struct stripe_head *sh; 1835 int ret; 1836 1837 /* 1838 * for mismatch in data blocks, we will drop all data in this mb, but 1839 * we will still read next mb for other data with FLUSH flag, as 1840 * io_unit could finish out of order. 1841 */ 1842 ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx); 1843 if (ret == -EINVAL) 1844 return -EAGAIN; 1845 else if (ret) 1846 return ret; /* -ENOMEM duo to alloc_page() failed */ 1847 1848 mb = page_address(ctx->meta_page); 1849 mb_offset = sizeof(struct r5l_meta_block); 1850 log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 1851 1852 while (mb_offset < le32_to_cpu(mb->meta_size)) { 1853 int dd; 1854 1855 payload = (void *)mb + mb_offset; 1856 stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ? 
/*
 * Analyze all data/parity pages in one meta block
 * Returns:
 * 0 for success
 * -EINVAL for unknown payload type
 * -EAGAIN for checksum mismatch of data page
 * -ENOMEM for running out of memory (alloc_page failed or ran out of stripes)
 */
static int
r5c_recovery_analyze_meta_block(struct r5l_log *log,
				struct r5l_recovery_ctx *ctx,
				struct list_head *cached_stripe_list)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_meta_block *mb;
	struct r5l_payload_data_parity *payload;
	int mb_offset;
	sector_t log_offset;
	sector_t stripe_sect;
	struct stripe_head *sh;
	int ret;

	/*
	 * For a mismatch in data blocks, we will drop all data in this mb,
	 * but we will still read the next mb for other data with the FLUSH
	 * flag, as io_units could finish out of order.
	 */
	ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
	if (ret == -EINVAL)
		return -EAGAIN;
	else if (ret)
		return ret;	/* -ENOMEM due to alloc_page() failure */

	mb = page_address(ctx->meta_page);
	mb_offset = sizeof(struct r5l_meta_block);
	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

	while (mb_offset < le32_to_cpu(mb->meta_size)) {
		int dd;

		payload = (void *)mb + mb_offset;
		stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
			raid5_compute_sector(
				conf, le64_to_cpu(payload->location), 0, &dd,
				NULL)
			: le64_to_cpu(payload->location);

		sh = r5c_recovery_lookup_stripe(cached_stripe_list,
						stripe_sect);

		if (!sh) {
			sh = r5c_recovery_alloc_stripe(conf, cached_stripe_list,
						       stripe_sect, ctx->pos);
			/*
			 * cannot get stripe from raid5_get_active_stripe();
			 * try to replay some stripes
			 */
			if (!sh) {
				r5c_recovery_replay_stripes(
					cached_stripe_list, ctx);
				sh = r5c_recovery_alloc_stripe(
					conf, cached_stripe_list,
					stripe_sect, ctx->pos);
			}
			if (!sh) {
				pr_debug("md/raid:%s: Increasing stripe cache size to %d to recover data from the journal.\n",
					 mdname(mddev),
					 conf->min_nr_stripes * 2);
				raid5_set_cache_size(mddev,
						     conf->min_nr_stripes * 2);
				sh = r5c_recovery_alloc_stripe(
					conf, cached_stripe_list, stripe_sect,
					ctx->pos);
			}
			if (!sh) {
				pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
				       mdname(mddev));
				return -ENOMEM;
			}
			list_add_tail(&sh->lru, cached_stripe_list);
		}

		if (payload->header.type == R5LOG_PAYLOAD_DATA) {
			if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
				r5l_recovery_replay_one_stripe(conf, sh, ctx);
				r5l_recovery_reset_stripe(sh);
				sh->log_start = ctx->pos;
				list_move_tail(&sh->lru, cached_stripe_list);
			}
			r5l_recovery_load_data(log, sh, ctx, payload,
					       log_offset);
		} else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
			r5l_recovery_load_parity(log, sh, ctx, payload,
						 log_offset);
		else
			return -EINVAL;

		log_offset = r5l_ring_add(log, log_offset,
					  le32_to_cpu(payload->size));

		mb_offset += sizeof(struct r5l_payload_data_parity) +
			sizeof(__le32) *
			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
	}

	return 0;
}

/*
 * Load the stripe into cache. The stripe will be written out later by
 * the stripe cache state machine.
 */
static void r5c_recovery_load_one_stripe(struct r5l_log *log,
					 struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5dev *dev;
	int i;

	for (i = sh->disks; i--; ) {
		dev = sh->dev + i;
		if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
			set_bit(R5_InJournal, &dev->flags);
			set_bit(R5_UPTODATE, &dev->flags);
		}
	}
	set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
	atomic_inc(&conf->r5c_cached_partial_stripes);
	list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
}

/*
 * Scan through the log for all to-be-flushed data.
 *
 * For stripes with data and parity, namely Data-Parity stripes
 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
 *
 * For stripes with only data, namely Data-Only stripes
 * (STRIPE_R5C_CACHING == 1), we load them to the stripe cache state machine.
 *
 * For a stripe, if we see data after parity, we should discard all previous
 * data and parity for this stripe, as these data are already flushed to
 * the array.
 *
 * At the end of the scan, we return the new journal_tail, which points to
 * the first data-only stripe on the journal device, or the next invalid
 * meta block.
 */
static int r5c_recovery_flush_log(struct r5l_log *log,
				  struct r5l_recovery_ctx *ctx)
{
	struct stripe_head *sh, *next;
	int ret = 0;

	/* scan through the log */
	while (1) {
		if (r5l_recovery_read_meta_block(log, ctx))
			break;

		ret = r5c_recovery_analyze_meta_block(log, ctx,
						      &ctx->cached_list);
		/*
		 * -EAGAIN means a mismatch in a data block; in this case,
		 * we still try to scan the next meta block.
		 */
		if (ret && ret != -EAGAIN)
			break;	/* ret == -EINVAL or -ENOMEM */
		ctx->seq++;
		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
	}

	if (ret == -ENOMEM) {
		r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
		return ret;
	}

	/* replay data-parity stripes */
	r5c_recovery_replay_stripes(&ctx->cached_list, ctx);

	/* load data-only stripes to stripe cache */
	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
		r5c_recovery_load_one_stripe(log, sh);
		list_del_init(&sh->lru);
		raid5_release_stripe(sh);
		ctx->data_only_stripes++;
	}

	return 0;
}

/*
 * We did a recovery. Now ctx.pos points to an invalid meta block. The new
 * log will start here, but we can't let the superblock point to the last
 * valid meta block. The log might look like:
 * | meta 1| meta 2| meta 3|
 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the
 * superblock points to meta 1, we write a new valid meta 2n. If a crash
 * happens again, new recovery will start from meta 1. Since meta 2n is
 * valid now, recovery will think meta 3 is valid, which is wrong.
 * The solution is: we create a new meta in meta2 with its seq == meta
 * 1's seq + 10 and let the superblock point to meta2. The next recovery
 * will not think meta 3 is a valid meta, because its seq doesn't match.
 */
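
/*
 * A concrete example of the seq bump (numbers are illustrative): suppose
 * meta 1 has seq 100 and the stale meta 3 left over from before the crash
 * has seq 102. The meta block rewritten at meta 2's position gets a seq
 * roughly 10 ahead (110 or so, depending on how many valid meta blocks
 * were scanned). If recovery runs again, after accepting that block it
 * expects the following meta block to continue from that seq; the stale
 * meta 3 still carries 102, so it is rejected as intended.
 */
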
/*
 * Before recovery, the log looks like the following
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^
 *   |- log->last_checkpoint
 *   |- log->last_cp_seq
 *
 * Now we scan through the log until we see an invalid entry
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^                            ^
 *   |- log->last_checkpoint      |- ctx->pos
 *   |- log->last_cp_seq          |- ctx->seq
 *
 * From this point, we need to increase seq number by 10 to avoid
 * confusing next recovery.
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^                              ^
 *   |- log->last_checkpoint        |- ctx->pos+1
 *   |- log->last_cp_seq            |- ctx->seq+11
 *
 * However, it is not safe to start the state machine yet, because data-only
 * stripes are not yet secured in RAID. To secure these data-only stripes,
 * we rewrite them from seq+11.
 *
 *   -----------------------------------------------------------------
 *   |           valid log        | data only stripes | invalid log  |
 *   -----------------------------------------------------------------
 *   ^                                                 ^
 *   |- log->last_checkpoint                           |- ctx->pos+n
 *   |- log->last_cp_seq                               |- ctx->seq+10+n
 *
 * If failure happens again during this process, the recovery can safely
 * start again from log->last_checkpoint.
 *
 * Once the data-only stripes are rewritten to the journal, we move the
 * log tail
 *
 *   -----------------------------------------------------------------
 *   |       old log      |       data only stripes   | invalid log  |
 *   -----------------------------------------------------------------
 *                        ^                            ^
 *                        |- log->last_checkpoint      |- ctx->pos+n
 *                        |- log->last_cp_seq          |- ctx->seq+10+n
 *
 * Then we can safely start the state machine. If failure happens from this
 * point on, the recovery will start from the new log->last_checkpoint.
 */
static int
r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
				       struct r5l_recovery_ctx *ctx)
{
	struct stripe_head *sh;
	struct mddev *mddev = log->rdev->mddev;
	struct page *page;

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
		       mdname(mddev));
		return -ENOMEM;
	}

	ctx->seq += 10;
	list_for_each_entry(sh, &ctx->cached_list, lru) {
		struct r5l_meta_block *mb;
		int i;
		int offset;
		sector_t write_pos;

		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
		r5l_recovery_create_empty_meta_block(log, page,
						     ctx->pos, ctx->seq);
		mb = page_address(page);
		offset = le32_to_cpu(mb->meta_size);
		write_pos = ctx->pos + BLOCK_SECTORS;

		for (i = sh->disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			struct r5l_payload_data_parity *payload;
			void *addr;

			if (test_bit(R5_InJournal, &dev->flags)) {
				payload = (void *)mb + offset;
				payload->header.type = cpu_to_le16(
					R5LOG_PAYLOAD_DATA);
				payload->size = cpu_to_le32(BLOCK_SECTORS);
				payload->location = cpu_to_le64(
					raid5_compute_blocknr(sh, i, 0));
				addr = kmap_atomic(dev->page);
				payload->checksum[0] = cpu_to_le32(
					crc32c_le(log->uuid_checksum, addr,
						  PAGE_SIZE));
				kunmap_atomic(addr);
				sync_page_io(log->rdev, write_pos, PAGE_SIZE,
					     dev->page, REQ_OP_WRITE, 0, false);
				write_pos = r5l_ring_add(log, write_pos,
							 BLOCK_SECTORS);
				offset += sizeof(__le32) +
					sizeof(struct r5l_payload_data_parity);
			}
		}
		mb->meta_size = cpu_to_le32(offset);
		mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
						     mb, PAGE_SIZE));
		sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
			     REQ_OP_WRITE, WRITE_FUA, false);
		sh->log_start = ctx->pos;
		ctx->pos = write_pos;
		ctx->seq += 1;
	}
	__free_page(page);
	return 0;
}
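
/*
 * Putting the pieces together, journal recovery as implemented below
 * proceeds roughly as:
 *
 *   r5l_load_log()
 *     r5l_recovery_log()
 *       r5c_recovery_flush_log()
 *         r5l_recovery_read_meta_block()      - one meta block at a time
 *         r5c_recovery_analyze_meta_block()   - cache data/parity payloads
 *         r5c_recovery_replay_stripes()       - data-parity stripes -> RAID
 *         r5c_recovery_load_one_stripe()      - data-only stripes -> cache
 *       r5c_recovery_rewrite_data_only_stripes()
 *       r5l_log_write_empty_meta_block()
 *       r5l_write_super()
 */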
static int r5l_recovery_log(struct r5l_log *log)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5l_recovery_ctx ctx;
	int ret;

	ctx.pos = log->last_checkpoint;
	ctx.seq = log->last_cp_seq;
	ctx.meta_page = alloc_page(GFP_KERNEL);
	ctx.data_only_stripes = 0;
	ctx.data_parity_stripes = 0;
	INIT_LIST_HEAD(&ctx.cached_list);

	if (!ctx.meta_page)
		return -ENOMEM;

	ret = r5c_recovery_flush_log(log, &ctx);
	__free_page(ctx.meta_page);

	if (ret)
		return ret;

	if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
		pr_debug("md/raid:%s: starting from clean shutdown\n",
			 mdname(mddev));
	else {
		pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
			 mdname(mddev), ctx.data_only_stripes,
			 ctx.data_parity_stripes);

		if (ctx.data_only_stripes > 0)
			if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
				pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
				       mdname(mddev));
				return -EIO;
			}
	}

	log->log_start = ctx.pos;
	log->next_checkpoint = ctx.pos;
	log->seq = ctx.seq;
	r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq);
	r5l_write_super(log, ctx.pos);
	return 0;
}

static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
	struct mddev *mddev = log->rdev->mddev;

	log->rdev->journal_tail = cp;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
}

static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
{
	struct r5conf *conf = mddev->private;
	int ret;

	if (!conf->log)
		return 0;

	switch (conf->log->r5c_journal_mode) {
	case R5C_JOURNAL_MODE_WRITE_THROUGH:
		ret = snprintf(
			page, PAGE_SIZE, "[%s] %s\n",
			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
		break;
	case R5C_JOURNAL_MODE_WRITE_BACK:
		ret = snprintf(
			page, PAGE_SIZE, "%s [%s]\n",
			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
		break;
	default:
		ret = 0;
	}
	return ret;
}

static ssize_t r5c_journal_mode_store(struct mddev *mddev,
				      const char *page, size_t length)
{
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;
	int val = -1, i;
	int len = length;

	if (!log)
		return -ENODEV;

	if (len && page[len - 1] == '\n')
		len -= 1;
	for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++)
		if (strlen(r5c_journal_mode_str[i]) == len &&
		    strncmp(page, r5c_journal_mode_str[i], len) == 0) {
			val = i;
			break;
		}
	if (val < R5C_JOURNAL_MODE_WRITE_THROUGH ||
	    val > R5C_JOURNAL_MODE_WRITE_BACK)
		return -EINVAL;

	mddev_suspend(mddev);
	conf->log->r5c_journal_mode = val;
	mddev_resume(mddev);

	pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
		 mdname(mddev), val, r5c_journal_mode_str[val]);
	return length;
}

struct md_sysfs_entry
r5c_journal_mode = __ATTR(journal_mode, 0644,
			  r5c_journal_mode_show, r5c_journal_mode_store);
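
/*
 * Example of switching the journal mode via sysfs (the md device name is
 * illustrative):
 *
 *   # cat /sys/block/md0/md/journal_mode
 *   [write-through] write-back
 *   # echo write-back > /sys/block/md0/md/journal_mode
 *   # cat /sys/block/md0/md/journal_mode
 *   write-through [write-back]
 *
 * The store path above suspends and resumes the array around the switch,
 * so in-flight stripes are quiesced before the mode changes.
 */
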
/*
 * Try to handle a write operation in caching phase. This function should
 * only be called in write-back mode.
 *
 * If all outstanding writes can be handled in caching phase, returns 0.
 * If the writes require the write-out phase, calls
 * r5c_make_stripe_write_out() and returns -EAGAIN.
 */
int r5c_try_caching_write(struct r5conf *conf,
			  struct stripe_head *sh,
			  struct stripe_head_state *s,
			  int disks)
{
	struct r5l_log *log = conf->log;
	int i;
	struct r5dev *dev;
	int to_cache = 0;

	BUG_ON(!r5c_is_writeback(log));

	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
		/*
		 * There are two different scenarios here:
		 * 1. The stripe has some data cached, and it is sent to
		 *    the write-out phase for reclaim
		 * 2. The stripe is clean, and this is the first write
		 *
		 * For 1, return -EAGAIN, so we continue with
		 * handle_stripe_dirtying().
		 *
		 * For 2, set STRIPE_R5C_CACHING and continue with the
		 * caching write.
		 */

		/* case 1: anything injournal or anything in written */
		if (s->injournal > 0 || s->written > 0)
			return -EAGAIN;
		/* case 2 */
		set_bit(STRIPE_R5C_CACHING, &sh->state);
	}

	for (i = disks; i--; ) {
		dev = &sh->dev[i];
		/* if non-overwrite, use the writing-out phase */
		if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
		    !test_bit(R5_InJournal, &dev->flags)) {
			r5c_make_stripe_write_out(sh);
			return -EAGAIN;
		}
	}

	for (i = disks; i--; ) {
		dev = &sh->dev[i];
		if (dev->towrite) {
			set_bit(R5_Wantwrite, &dev->flags);
			set_bit(R5_Wantdrain, &dev->flags);
			set_bit(R5_LOCKED, &dev->flags);
			to_cache++;
		}
	}

	if (to_cache) {
		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
		/*
		 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
		 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
		 * r5c_handle_data_cached()
		 */
		set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	}

	return 0;
}

/*
 * free extra pages (orig_page) we allocated for prexor
 */
void r5c_release_extra_page(struct stripe_head *sh)
{
	int i;

	for (i = sh->disks; i--; )
		if (sh->dev[i].page != sh->dev[i].orig_page) {
			struct page *p = sh->dev[i].orig_page;

			sh->dev[i].orig_page = sh->dev[i].page;
			put_page(p);
		}
}

/*
 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
 * stripe is committed to RAID disks.
 */
void r5c_finish_stripe_write_out(struct r5conf *conf,
				 struct stripe_head *sh,
				 struct stripe_head_state *s)
{
	int i;
	int do_wakeup = 0;

	if (!conf->log ||
	    !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
		return;

	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);

	if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return;

	for (i = sh->disks; i--; ) {
		clear_bit(R5_InJournal, &sh->dev[i].flags);
		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
			do_wakeup = 1;
	}

	/*
	 * analyse_stripe() runs before r5c_finish_stripe_write_out();
	 * we have updated R5_InJournal, so we also update s->injournal.
	 */
	s->injournal = 0;

	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
		if (atomic_dec_and_test(&conf->pending_full_writes))
			md_wakeup_thread(conf->mddev->thread);

	if (do_wakeup)
		wake_up(&conf->wait_for_overlap);

	if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return;

	spin_lock_irq(&conf->log->stripe_in_journal_lock);
	list_del_init(&sh->r5c);
	spin_unlock_irq(&conf->log->stripe_in_journal_lock);
	sh->log_start = MaxSector;
	atomic_dec(&conf->log->stripe_in_journal_count);
}

int
r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
	       struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int pages = 0;
	int reserve;
	int i;
	int ret = 0;

	BUG_ON(!log);

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
		pages++;
	}
	WARN_ON(pages == 0);

	/*
	 * The stripe must enter the state machine again to call endio, so
	 * don't delay.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data */
	reserve = (1 + pages) << (PAGE_SHIFT - 9);

	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
	    sh->log_start == MaxSector)
		r5l_add_no_space_stripe(log, sh);
	else if (!r5l_has_free_space(log, reserve)) {
		if (sh->log_start == log->last_checkpoint)
			BUG();
		else
			r5l_add_no_space_stripe(log, sh);
	} else {
		ret = r5l_log_stripe(log, sh, pages, 0);
		if (ret) {
			spin_lock_irq(&log->io_list_lock);
			list_add_tail(&sh->log_list, &log->no_mem_stripes);
			spin_unlock_irq(&log->io_list_lock);
		}
	}

	mutex_unlock(&log->io_mutex);
	return 0;
}
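
/*
 * Journal space accounting example for r5c_cache_data() above, assuming
 * 4K pages (so PAGE_SHIFT - 9 == 3, i.e. 8 sectors per page): caching a
 * stripe with 3 dirty data pages reserves (1 + 3) << 3 = 32 sectors --
 * one block for the meta block plus one block per cached data page.
 */
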
static int r5l_load_log(struct r5l_log *log)
{
	struct md_rdev *rdev = log->rdev;
	struct page *page;
	struct r5l_meta_block *mb;
	sector_t cp = log->rdev->journal_tail;
	u32 stored_crc, expected_crc;
	bool create_super = false;
	int ret;

	/* Make sure it's valid */
	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
		cp = 0;
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
		ret = -EIO;
		goto ioerr;
	}
	mb = page_address(page);

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    mb->version != R5LOG_VERSION) {
		create_super = true;
		goto create;
	}
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;
	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != expected_crc) {
		create_super = true;
		goto create;
	}
	if (le64_to_cpu(mb->position) != cp) {
		create_super = true;
		goto create;
	}
create:
	if (create_super) {
		log->last_cp_seq = prandom_u32();
		cp = 0;
		r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
		/*
		 * Make sure the superblock points to the correct address.
		 * The log might have data very soon. If the superblock
		 * doesn't have the correct log tail address, recovery
		 * can't find the log.
		 */
		r5l_write_super(log, cp);
	} else
		log->last_cp_seq = le64_to_cpu(mb->seq);

	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
	log->last_checkpoint = cp;
	log->next_checkpoint = cp;
	mutex_lock(&log->io_mutex);
	r5c_update_log_state(log);
	mutex_unlock(&log->io_mutex);

	__free_page(page);

	return r5l_recovery_log(log);
ioerr:
	__free_page(page);
	return ret;
}
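
/*
 * Rough arithmetic behind the raid_disks limit checked in r5l_init_log()
 * below (struct sizes assume the current layout: r5l_meta_block is about
 * 32 bytes, and r5l_payload_data_parity plus one __le32 checksum is about
 * 20 bytes per member disk):
 *
 *   32 + 203 * 20 = 4092 bytes  <= 4096 (fits in one 4K meta block)
 *   32 + 204 * 20 = 4112 bytes  >  4096 (does not fit)
 *
 * hence the "raid_disks > 203" cutoff mentioned in the comment below.
 */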
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct request_queue *q = bdev_get_queue(rdev->bdev);
	struct r5l_log *log;

	if (PAGE_SIZE != 4096)
		return -EINVAL;

	/*
	 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
	 * raid_disks r5l_payload_data_parity structures.
	 *
	 * The write journal and cache do not work for very big arrays
	 * (raid_disks > 203)
	 */
	if (sizeof(struct r5l_meta_block) +
	    ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
	     conf->raid_disks) > PAGE_SIZE) {
		pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
		       mdname(conf->mddev), conf->raid_disks);
		return -EINVAL;
	}

	log = kzalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return -ENOMEM;
	log->rdev = rdev;

	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;

	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
				       sizeof(rdev->mddev->uuid));

	mutex_init(&log->io_mutex);

	spin_lock_init(&log->io_list_lock);
	INIT_LIST_HEAD(&log->running_ios);
	INIT_LIST_HEAD(&log->io_end_ios);
	INIT_LIST_HEAD(&log->flushing_ios);
	INIT_LIST_HEAD(&log->finished_ios);
	bio_init(&log->flush_bio);

	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
	if (!log->io_kc)
		goto io_kc;

	log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
	if (!log->io_pool)
		goto io_pool;

	log->bs = bioset_create(R5L_POOL_SIZE, 0);
	if (!log->bs)
		goto io_bs;

	log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
	if (!log->meta_pool)
		goto out_mempool;

	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
						 log->rdev->mddev, "reclaim");
	if (!log->reclaim_thread)
		goto reclaim_thread;
	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;

	init_waitqueue_head(&log->iounit_wait);

	INIT_LIST_HEAD(&log->no_mem_stripes);

	INIT_LIST_HEAD(&log->no_space_stripes);
	spin_lock_init(&log->no_space_stripes_lock);

	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
	INIT_LIST_HEAD(&log->stripe_in_journal_list);
	spin_lock_init(&log->stripe_in_journal_lock);
	atomic_set(&log->stripe_in_journal_count, 0);

	if (r5l_load_log(log))
		goto error;

	rcu_assign_pointer(conf->log, log);
	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	return 0;

error:
	md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
	mempool_destroy(log->meta_pool);
out_mempool:
	bioset_free(log->bs);
io_bs:
	mempool_destroy(log->io_pool);
io_pool:
	kmem_cache_destroy(log->io_kc);
io_kc:
	kfree(log);
	return -EINVAL;
}

void r5l_exit_log(struct r5l_log *log)
{
	md_unregister_thread(&log->reclaim_thread);
	mempool_destroy(log->meta_pool);
	bioset_free(log->bs);
	mempool_destroy(log->io_pool);
	kmem_cache_destroy(log->io_kc);
	kfree(log);
}
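
/*
 * Usage note (the mdadm invocation is illustrative): the raid5 personality
 * calls r5l_init_log() when the array has a journal device configured,
 * e.g. one created with
 *
 *   mdadm --create /dev/md0 --level=5 --raid-devices=4 \
 *         /dev/sd[b-e] --write-journal=/dev/nvme0n1
 *
 * and r5l_exit_log() when the log is torn down. The journal starts in
 * write-through mode here; write-back caching is opted into via the
 * journal_mode sysfs attribute defined above.
 */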