/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include <linux/kthread.h>
#include "md.h"
#include "raid5.h"
#include "bitmap.h"

/*
 * metadata/data are stored on disk in 4k units (blocks) regardless of the
 * underlying hardware sector size.  This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)

/*
 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
 *
 * In write-through mode, the reclaim runs every log->max_free_space,
 * which keeps recovery from having to scan too much of the log.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)

/* wake up reclaim thread periodically */
#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
/* start flush with these full stripes */
#define R5C_FULL_STRIPE_FLUSH_BATCH 256
/* reclaim stripes in groups */
#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)

/*
 * We only need 2 bios per I/O unit to make progress, but ensure we
 * have a few more available to not get too tight.
 */
#define R5L_POOL_SIZE 4

/*
 * r5c journal modes of the array: write-back or write-through.
 * write-through mode behaves identically to the existing log-only
 * implementation.
 */
enum r5c_journal_mode {
	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
	R5C_JOURNAL_MODE_WRITE_BACK = 1,
};

static char *r5c_journal_mode_str[] = {"write-through",
				       "write-back"};
/*
 * raid5 cache state machine
 *
 * With the RAID cache, each stripe works in two phases:
 *	- caching phase
 *	- writing-out phase
 *
 * These two phases are controlled by bit STRIPE_R5C_CACHING:
 *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
 *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
 *
 * When there is no journal, or the journal is in write-through mode,
 * the stripe is always in writing-out phase.
 *
 * For a write-back journal, the stripe is sent to caching phase on write
 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
 * the write-out phase by clearing STRIPE_R5C_CACHING.
 *
 * Stripes in caching phase do not write the raid disks. Instead, all
 * writes are committed from the log device. Therefore, a stripe in
 * caching phase handles writes as:
 *	- write to log device
 *	- return IO
 *
 * Stripes in writing-out phase handle writes as:
 *	- calculate parity
 *	- write pending data and parity to journal
 *	- write data and parity to raid disks
 *	- return IO for pending writes
 */

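/*
 * Illustrative example (assuming write-back mode): a small write to a cached
 * stripe is acknowledged as soon as the data block and its meta data reach
 * the journal; parity for that stripe is only computed later, when
 * r5c_make_stripe_write_out() moves the stripe to the writing-out phase.
 */
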
struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, rounded to
					 * BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim runs when free space
					 * reaches this size */

	sector_t last_checkpoint;	/* log tail. where recovery scan
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head. where new data appends */
	u64 seq;			/* log head sequence */

	sector_t next_checkpoint;

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet written
					 * to the RAID */
	struct list_head flushing_ios;	/* io_units which are waiting for log
					 * cache flush */
	struct list_head finished_ios;	/* io_units which have settled down in
					 * the log disk */
	struct bio flush_bio;

	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */

	struct kmem_cache *io_kc;
	mempool_t *io_pool;
	struct bio_set *bs;
	mempool_t *meta_pool;

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* amount of space that needs to be
					 * reclaimed.  If it's 0, reclaim the
					 * space used by io_units which are
					 * already in IO_UNIT_STRIPE_END state
					 * (i.e. reclaim doesn't wait for a
					 * specific io_unit to switch to
					 * IO_UNIT_STRIPE_END) */
	wait_queue_head_t iounit_wait;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;

	bool need_cache_flush;

	/* for r5c_cache */
	enum r5c_journal_mode r5c_journal_mode;

	/* all stripes in r5cache, in the order of seq at sh->log_start */
	struct list_head stripe_in_journal_list;

	spinlock_t stripe_in_journal_lock;
	atomic_t stripe_in_journal_count;

	/* to submit async io_units, to fulfill ordering of flush */
	struct work_struct deferred_io_work;
};

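/*
 * A rough sketch of the journal layout implied by the fields above: the log
 * is a circular buffer of 4k blocks on the journal device.
 * last_checkpoint/last_cp_seq mark the tail (where recovery starts scanning),
 * log_start/seq mark the head (where new io_units are appended), and the
 * used space is the ring distance from tail to head.
 */
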
/*
 * An IO range starts at a meta data block and ends at the next meta data
 * block.  The io_unit's meta data block tracks the data/parity that follows
 * it.  An io_unit is written to the log disk with a normal write; as we
 * always flush the log disk first and only then start moving data to the
 * raid disks, there is no need to write the io_unit with FLUSH/FUA.
 */
struct r5l_io_unit {
	struct r5l_log *log;

	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio *current_bio;/* current_bio accepting new data */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of the metablock */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */

	int state;
	bool need_split_bio;
	struct bio *split_bio;

	unsigned int has_flush:1;		/* include flush request */
	unsigned int has_fua:1;			/* include fua request */
	unsigned int has_null_flush:1;		/* include empty flush request */
	/*
	 * The io hasn't been sent yet; a flush/fua request can only be
	 * submitted once it is the first IO in the running_ios list
	 */
	unsigned int io_deferred:1;

	struct bio_list flush_barriers;   /* size == 0 flush bios */
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bio started writing to the log,
				 * not accepting new bios */
	IO_UNIT_IO_END = 2,	/* io_unit bio finished writing to the log */
	IO_UNIT_STRIPE_END = 3,	/* stripe data finished writing to raid */
};

bool r5c_is_writeback(struct r5l_log *log)
{
	return (log != NULL &&
		log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
}

static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
	start += inc;
	if (start >= log->device_size)
		start = start - log->device_size;
	return start;
}

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
				  sector_t end)
{
	if (end >= start)
		return end - start;
	else
		return end + log->device_size - start;
}

static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
	sector_t used_size;

	used_size = r5l_ring_distance(log, log->last_checkpoint,
					log->log_start);

	return log->device_size > used_size + size;
}

static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
				    enum r5l_io_unit_state state)
{
	if (WARN_ON(io->state >= state))
		return;
	io->state = state;
}

static void
r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
			      struct bio_list *return_bi)
{
	struct bio *wbi, *wbi2;

	wbi = dev->written;
	dev->written = NULL;
	while (wbi && wbi->bi_iter.bi_sector <
	       dev->sector + STRIPE_SECTORS) {
		wbi2 = r5_next_bio(wbi, dev->sector);
		if (!raid5_dec_bi_active_stripes(wbi)) {
			md_write_end(conf->mddev);
			bio_list_add(return_bi, wbi);
		}
		wbi = wbi2;
	}
}

void r5c_handle_cached_data_endio(struct r5conf *conf,
	  struct stripe_head *sh, int disks, struct bio_list *return_bi)
{
	int i;

	for (i = sh->disks; i--; ) {
		if (sh->dev[i].written) {
			set_bit(R5_UPTODATE, &sh->dev[i].flags);
			r5c_return_dev_pending_writes(conf, &sh->dev[i],
						      return_bi);
			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
					STRIPE_SECTORS,
					!test_bit(STRIPE_DEGRADED, &sh->state),
0); 288 } 289 } 290 } 291 292 /* Check whether we should flush some stripes to free up stripe cache */ 293 void r5c_check_stripe_cache_usage(struct r5conf *conf) 294 { 295 int total_cached; 296 297 if (!r5c_is_writeback(conf->log)) 298 return; 299 300 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 301 atomic_read(&conf->r5c_cached_full_stripes); 302 303 /* 304 * The following condition is true for either of the following: 305 * - stripe cache pressure high: 306 * total_cached > 3/4 min_nr_stripes || 307 * empty_inactive_list_nr > 0 308 * - stripe cache pressure moderate: 309 * total_cached > 1/2 min_nr_stripes 310 */ 311 if (total_cached > conf->min_nr_stripes * 1 / 2 || 312 atomic_read(&conf->empty_inactive_list_nr) > 0) 313 r5l_wake_reclaim(conf->log, 0); 314 } 315 316 /* 317 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full 318 * stripes in the cache 319 */ 320 void r5c_check_cached_full_stripe(struct r5conf *conf) 321 { 322 if (!r5c_is_writeback(conf->log)) 323 return; 324 325 /* 326 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes 327 * or a full stripe (chunk size / 4k stripes). 328 */ 329 if (atomic_read(&conf->r5c_cached_full_stripes) >= 330 min(R5C_FULL_STRIPE_FLUSH_BATCH, 331 conf->chunk_sectors >> STRIPE_SHIFT)) 332 r5l_wake_reclaim(conf->log, 0); 333 } 334 335 /* 336 * Total log space (in sectors) needed to flush all data in cache 337 * 338 * Currently, writing-out phase automatically includes all pending writes 339 * to the same sector. So the reclaim of each stripe takes up to 340 * (conf->raid_disks + 1) pages of log space. 341 * 342 * To totally avoid deadlock due to log space, the code reserves 343 * (conf->raid_disks + 1) pages for each stripe in cache, which is not 344 * necessary in most cases. 345 * 346 * To improve this, we will need writing-out phase to be able to NOT include 347 * pending writes, which will reduce the requirement to 348 * (conf->max_degraded + 1) pages per stripe in cache. 349 */ 350 static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) 351 { 352 struct r5l_log *log = conf->log; 353 354 if (!r5c_is_writeback(log)) 355 return 0; 356 357 return BLOCK_SECTORS * (conf->raid_disks + 1) * 358 atomic_read(&log->stripe_in_journal_count); 359 } 360 361 /* 362 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL 363 * 364 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of 365 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log 366 * device is less than 2x of reclaim_required_space. 367 */ 368 static inline void r5c_update_log_state(struct r5l_log *log) 369 { 370 struct r5conf *conf = log->rdev->mddev->private; 371 sector_t free_space; 372 sector_t reclaim_space; 373 bool wake_reclaim = false; 374 375 if (!r5c_is_writeback(log)) 376 return; 377 378 free_space = r5l_ring_distance(log, log->log_start, 379 log->last_checkpoint); 380 reclaim_space = r5c_log_required_to_flush_cache(conf); 381 if (free_space < 2 * reclaim_space) 382 set_bit(R5C_LOG_CRITICAL, &conf->cache_state); 383 else { 384 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) 385 wake_reclaim = true; 386 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state); 387 } 388 if (free_space < 3 * reclaim_space) 389 set_bit(R5C_LOG_TIGHT, &conf->cache_state); 390 else 391 clear_bit(R5C_LOG_TIGHT, &conf->cache_state); 392 393 if (wake_reclaim) 394 r5l_wake_reclaim(log, 0); 395 } 396 397 /* 398 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING. 
399 * This function should only be called in write-back mode. 400 */ 401 void r5c_make_stripe_write_out(struct stripe_head *sh) 402 { 403 struct r5conf *conf = sh->raid_conf; 404 struct r5l_log *log = conf->log; 405 406 BUG_ON(!r5c_is_writeback(log)); 407 408 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 409 clear_bit(STRIPE_R5C_CACHING, &sh->state); 410 411 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 412 atomic_inc(&conf->preread_active_stripes); 413 414 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { 415 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); 416 atomic_dec(&conf->r5c_cached_partial_stripes); 417 } 418 419 if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { 420 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); 421 atomic_dec(&conf->r5c_cached_full_stripes); 422 } 423 } 424 425 static void r5c_handle_data_cached(struct stripe_head *sh) 426 { 427 int i; 428 429 for (i = sh->disks; i--; ) 430 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 431 set_bit(R5_InJournal, &sh->dev[i].flags); 432 clear_bit(R5_LOCKED, &sh->dev[i].flags); 433 } 434 clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 435 } 436 437 /* 438 * this journal write must contain full parity, 439 * it may also contain some data pages 440 */ 441 static void r5c_handle_parity_cached(struct stripe_head *sh) 442 { 443 int i; 444 445 for (i = sh->disks; i--; ) 446 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 447 set_bit(R5_Wantwrite, &sh->dev[i].flags); 448 } 449 450 /* 451 * Setting proper flags after writing (or flushing) data and/or parity to the 452 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio(). 453 */ 454 static void r5c_finish_cache_stripe(struct stripe_head *sh) 455 { 456 struct r5l_log *log = sh->raid_conf->log; 457 458 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 459 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 460 /* 461 * Set R5_InJournal for parity dev[pd_idx]. This means 462 * all data AND parity in the journal. For RAID 6, it is 463 * NOT necessary to set the flag for dev[qd_idx], as the 464 * two parities are written out together. 
465 */ 466 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 467 } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) { 468 r5c_handle_data_cached(sh); 469 } else { 470 r5c_handle_parity_cached(sh); 471 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 472 } 473 } 474 475 static void r5l_io_run_stripes(struct r5l_io_unit *io) 476 { 477 struct stripe_head *sh, *next; 478 479 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { 480 list_del_init(&sh->log_list); 481 482 r5c_finish_cache_stripe(sh); 483 484 set_bit(STRIPE_HANDLE, &sh->state); 485 raid5_release_stripe(sh); 486 } 487 } 488 489 static void r5l_log_run_stripes(struct r5l_log *log) 490 { 491 struct r5l_io_unit *io, *next; 492 493 assert_spin_locked(&log->io_list_lock); 494 495 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 496 /* don't change list order */ 497 if (io->state < IO_UNIT_IO_END) 498 break; 499 500 list_move_tail(&io->log_sibling, &log->finished_ios); 501 r5l_io_run_stripes(io); 502 } 503 } 504 505 static void r5l_move_to_end_ios(struct r5l_log *log) 506 { 507 struct r5l_io_unit *io, *next; 508 509 assert_spin_locked(&log->io_list_lock); 510 511 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 512 /* don't change list order */ 513 if (io->state < IO_UNIT_IO_END) 514 break; 515 list_move_tail(&io->log_sibling, &log->io_end_ios); 516 } 517 } 518 519 static void __r5l_stripe_write_finished(struct r5l_io_unit *io); 520 static void r5l_log_endio(struct bio *bio) 521 { 522 struct r5l_io_unit *io = bio->bi_private; 523 struct r5l_io_unit *io_deferred; 524 struct r5l_log *log = io->log; 525 unsigned long flags; 526 527 if (bio->bi_error) 528 md_error(log->rdev->mddev, log->rdev); 529 530 bio_put(bio); 531 mempool_free(io->meta_page, log->meta_pool); 532 533 spin_lock_irqsave(&log->io_list_lock, flags); 534 __r5l_set_io_unit_state(io, IO_UNIT_IO_END); 535 if (log->need_cache_flush) 536 r5l_move_to_end_ios(log); 537 else 538 r5l_log_run_stripes(log); 539 if (!list_empty(&log->running_ios)) { 540 /* 541 * FLUSH/FUA io_unit is deferred because of ordering, now we 542 * can dispatch it 543 */ 544 io_deferred = list_first_entry(&log->running_ios, 545 struct r5l_io_unit, log_sibling); 546 if (io_deferred->io_deferred) 547 schedule_work(&log->deferred_io_work); 548 } 549 550 spin_unlock_irqrestore(&log->io_list_lock, flags); 551 552 if (log->need_cache_flush) 553 md_wakeup_thread(log->rdev->mddev->thread); 554 555 if (io->has_null_flush) { 556 struct bio *bi; 557 558 WARN_ON(bio_list_empty(&io->flush_barriers)); 559 while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { 560 bio_endio(bi); 561 atomic_dec(&io->pending_stripe); 562 } 563 if (atomic_read(&io->pending_stripe) == 0) 564 __r5l_stripe_write_finished(io); 565 } 566 } 567 568 static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) 569 { 570 unsigned long flags; 571 572 spin_lock_irqsave(&log->io_list_lock, flags); 573 __r5l_set_io_unit_state(io, IO_UNIT_IO_START); 574 spin_unlock_irqrestore(&log->io_list_lock, flags); 575 576 if (io->has_flush) 577 io->current_bio->bi_opf |= REQ_PREFLUSH; 578 if (io->has_fua) 579 io->current_bio->bi_opf |= REQ_FUA; 580 submit_bio(io->current_bio); 581 582 if (!io->split_bio) 583 return; 584 585 if (io->has_flush) 586 io->split_bio->bi_opf |= REQ_PREFLUSH; 587 if (io->has_fua) 588 io->split_bio->bi_opf |= REQ_FUA; 589 submit_bio(io->split_bio); 590 } 591 592 /* deferred io_unit will be dispatched here */ 593 static void r5l_submit_io_async(struct work_struct *work) 594 
{
	struct r5l_log *log = container_of(work, struct r5l_log,
					   deferred_io_work);
	struct r5l_io_unit *io = NULL;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	if (!list_empty(&log->running_ios)) {
		io = list_first_entry(&log->running_ios, struct r5l_io_unit,
				      log_sibling);
		if (!io->io_deferred)
			io = NULL;
		else
			io->io_deferred = 0;
	}
	spin_unlock_irqrestore(&log->io_list_lock, flags);
	if (io)
		r5l_do_submit_io(log, io);
}

static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
	struct bio *bio;
	struct r5l_meta_block *block;
	unsigned long flags;
	u32 crc;
	bool do_submit = true;

	if (!io)
		return;

	block = page_address(io->meta_page);
	block->meta_size = cpu_to_le32(io->meta_offset);
	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
	block->checksum = cpu_to_le32(crc);
	bio = io->current_bio;

	log->current_io = NULL;
	spin_lock_irqsave(&log->io_list_lock, flags);
	if (io->has_flush || io->has_fua) {
		if (io != list_first_entry(&log->running_ios,
					   struct r5l_io_unit, log_sibling)) {
			io->io_deferred = 1;
			do_submit = false;
		}
	}
	spin_unlock_irqrestore(&log->io_list_lock, flags);
	if (do_submit)
		r5l_do_submit_io(log, io);
}

static struct bio *r5l_bio_alloc(struct r5l_log *log)
{
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);

	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;

	return bio;
}

static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
{
	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);

	r5c_update_log_state(log);
	/*
	 * If we filled up the log device, start from the beginning again,
	 * which will require a new bio.
	 *
	 * Note: for this to work properly the log size needs to be a multiple
	 * of BLOCK_SECTORS.
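	 *
	 * For example (illustrative numbers): a 1GiB log is 2097152 sectors,
	 * an exact multiple of BLOCK_SECTORS (8), so the head lands exactly
	 * on sector 0 when it wraps and the chained split bio picks up from
	 * there.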
668 */ 669 if (log->log_start == 0) 670 io->need_split_bio = true; 671 672 io->log_end = log->log_start; 673 } 674 675 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) 676 { 677 struct r5l_io_unit *io; 678 struct r5l_meta_block *block; 679 680 io = mempool_alloc(log->io_pool, GFP_ATOMIC); 681 if (!io) 682 return NULL; 683 memset(io, 0, sizeof(*io)); 684 685 io->log = log; 686 INIT_LIST_HEAD(&io->log_sibling); 687 INIT_LIST_HEAD(&io->stripe_list); 688 bio_list_init(&io->flush_barriers); 689 io->state = IO_UNIT_RUNNING; 690 691 io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); 692 block = page_address(io->meta_page); 693 clear_page(block); 694 block->magic = cpu_to_le32(R5LOG_MAGIC); 695 block->version = R5LOG_VERSION; 696 block->seq = cpu_to_le64(log->seq); 697 block->position = cpu_to_le64(log->log_start); 698 699 io->log_start = log->log_start; 700 io->meta_offset = sizeof(struct r5l_meta_block); 701 io->seq = log->seq++; 702 703 io->current_bio = r5l_bio_alloc(log); 704 io->current_bio->bi_end_io = r5l_log_endio; 705 io->current_bio->bi_private = io; 706 bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); 707 708 r5_reserve_log_entry(log, io); 709 710 spin_lock_irq(&log->io_list_lock); 711 list_add_tail(&io->log_sibling, &log->running_ios); 712 spin_unlock_irq(&log->io_list_lock); 713 714 return io; 715 } 716 717 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) 718 { 719 if (log->current_io && 720 log->current_io->meta_offset + payload_size > PAGE_SIZE) 721 r5l_submit_current_io(log); 722 723 if (!log->current_io) { 724 log->current_io = r5l_new_meta(log); 725 if (!log->current_io) 726 return -ENOMEM; 727 } 728 729 return 0; 730 } 731 732 static void r5l_append_payload_meta(struct r5l_log *log, u16 type, 733 sector_t location, 734 u32 checksum1, u32 checksum2, 735 bool checksum2_valid) 736 { 737 struct r5l_io_unit *io = log->current_io; 738 struct r5l_payload_data_parity *payload; 739 740 payload = page_address(io->meta_page) + io->meta_offset; 741 payload->header.type = cpu_to_le16(type); 742 payload->header.flags = cpu_to_le16(0); 743 payload->size = cpu_to_le32((1 + !!checksum2_valid) << 744 (PAGE_SHIFT - 9)); 745 payload->location = cpu_to_le64(location); 746 payload->checksum[0] = cpu_to_le32(checksum1); 747 if (checksum2_valid) 748 payload->checksum[1] = cpu_to_le32(checksum2); 749 750 io->meta_offset += sizeof(struct r5l_payload_data_parity) + 751 sizeof(__le32) * (1 + !!checksum2_valid); 752 } 753 754 static void r5l_append_payload_page(struct r5l_log *log, struct page *page) 755 { 756 struct r5l_io_unit *io = log->current_io; 757 758 if (io->need_split_bio) { 759 BUG_ON(io->split_bio); 760 io->split_bio = io->current_bio; 761 io->current_bio = r5l_bio_alloc(log); 762 bio_chain(io->current_bio, io->split_bio); 763 io->need_split_bio = false; 764 } 765 766 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) 767 BUG(); 768 769 r5_reserve_log_entry(log, io); 770 } 771 772 static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, 773 int data_pages, int parity_pages) 774 { 775 int i; 776 int meta_size; 777 int ret; 778 struct r5l_io_unit *io; 779 780 meta_size = 781 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) 782 * data_pages) + 783 sizeof(struct r5l_payload_data_parity) + 784 sizeof(__le32) * parity_pages; 785 786 ret = r5l_get_meta(log, meta_size); 787 if (ret) 788 return ret; 789 790 io = log->current_io; 791 792 if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state)) 793 io->has_flush = 1; 794 795 
for (i = 0; i < sh->disks; i++) { 796 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 797 test_bit(R5_InJournal, &sh->dev[i].flags)) 798 continue; 799 if (i == sh->pd_idx || i == sh->qd_idx) 800 continue; 801 if (test_bit(R5_WantFUA, &sh->dev[i].flags) && 802 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) { 803 io->has_fua = 1; 804 /* 805 * we need to flush journal to make sure recovery can 806 * reach the data with fua flag 807 */ 808 io->has_flush = 1; 809 } 810 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, 811 raid5_compute_blocknr(sh, i, 0), 812 sh->dev[i].log_checksum, 0, false); 813 r5l_append_payload_page(log, sh->dev[i].page); 814 } 815 816 if (parity_pages == 2) { 817 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 818 sh->sector, sh->dev[sh->pd_idx].log_checksum, 819 sh->dev[sh->qd_idx].log_checksum, true); 820 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 821 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); 822 } else if (parity_pages == 1) { 823 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 824 sh->sector, sh->dev[sh->pd_idx].log_checksum, 825 0, false); 826 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 827 } else /* Just writing data, not parity, in caching phase */ 828 BUG_ON(parity_pages != 0); 829 830 list_add_tail(&sh->log_list, &io->stripe_list); 831 atomic_inc(&io->pending_stripe); 832 sh->log_io = io; 833 834 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 835 return 0; 836 837 if (sh->log_start == MaxSector) { 838 BUG_ON(!list_empty(&sh->r5c)); 839 sh->log_start = io->log_start; 840 spin_lock_irq(&log->stripe_in_journal_lock); 841 list_add_tail(&sh->r5c, 842 &log->stripe_in_journal_list); 843 spin_unlock_irq(&log->stripe_in_journal_lock); 844 atomic_inc(&log->stripe_in_journal_count); 845 } 846 return 0; 847 } 848 849 /* add stripe to no_space_stripes, and then wake up reclaim */ 850 static inline void r5l_add_no_space_stripe(struct r5l_log *log, 851 struct stripe_head *sh) 852 { 853 spin_lock(&log->no_space_stripes_lock); 854 list_add_tail(&sh->log_list, &log->no_space_stripes); 855 spin_unlock(&log->no_space_stripes_lock); 856 } 857 858 /* 859 * running in raid5d, where reclaim could wait for raid5d too (when it flushes 860 * data from log to raid disks), so we shouldn't wait for reclaim here 861 */ 862 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) 863 { 864 struct r5conf *conf = sh->raid_conf; 865 int write_disks = 0; 866 int data_pages, parity_pages; 867 int reserve; 868 int i; 869 int ret = 0; 870 bool wake_reclaim = false; 871 872 if (!log) 873 return -EAGAIN; 874 /* Don't support stripe batch */ 875 if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || 876 test_bit(STRIPE_SYNCING, &sh->state)) { 877 /* the stripe is written to log, we start writing it to raid */ 878 clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 879 return -EAGAIN; 880 } 881 882 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 883 884 for (i = 0; i < sh->disks; i++) { 885 void *addr; 886 887 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 888 test_bit(R5_InJournal, &sh->dev[i].flags)) 889 continue; 890 891 write_disks++; 892 /* checksum is already calculated in last run */ 893 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 894 continue; 895 addr = kmap_atomic(sh->dev[i].page); 896 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 897 addr, PAGE_SIZE); 898 kunmap_atomic(addr); 899 } 900 parity_pages = 1 + !!(sh->qd_idx >= 0); 901 data_pages = write_disks - parity_pages; 902 903 
set_bit(STRIPE_LOG_TRAPPED, &sh->state); 904 /* 905 * The stripe must enter state machine again to finish the write, so 906 * don't delay. 907 */ 908 clear_bit(STRIPE_DELAYED, &sh->state); 909 atomic_inc(&sh->count); 910 911 mutex_lock(&log->io_mutex); 912 /* meta + data */ 913 reserve = (1 + write_disks) << (PAGE_SHIFT - 9); 914 915 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 916 if (!r5l_has_free_space(log, reserve)) { 917 r5l_add_no_space_stripe(log, sh); 918 wake_reclaim = true; 919 } else { 920 ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 921 if (ret) { 922 spin_lock_irq(&log->io_list_lock); 923 list_add_tail(&sh->log_list, 924 &log->no_mem_stripes); 925 spin_unlock_irq(&log->io_list_lock); 926 } 927 } 928 } else { /* R5C_JOURNAL_MODE_WRITE_BACK */ 929 /* 930 * log space critical, do not process stripes that are 931 * not in cache yet (sh->log_start == MaxSector). 932 */ 933 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 934 sh->log_start == MaxSector) { 935 r5l_add_no_space_stripe(log, sh); 936 wake_reclaim = true; 937 reserve = 0; 938 } else if (!r5l_has_free_space(log, reserve)) { 939 if (sh->log_start == log->last_checkpoint) 940 BUG(); 941 else 942 r5l_add_no_space_stripe(log, sh); 943 } else { 944 ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 945 if (ret) { 946 spin_lock_irq(&log->io_list_lock); 947 list_add_tail(&sh->log_list, 948 &log->no_mem_stripes); 949 spin_unlock_irq(&log->io_list_lock); 950 } 951 } 952 } 953 954 mutex_unlock(&log->io_mutex); 955 if (wake_reclaim) 956 r5l_wake_reclaim(log, reserve); 957 return 0; 958 } 959 960 void r5l_write_stripe_run(struct r5l_log *log) 961 { 962 if (!log) 963 return; 964 mutex_lock(&log->io_mutex); 965 r5l_submit_current_io(log); 966 mutex_unlock(&log->io_mutex); 967 } 968 969 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) 970 { 971 if (!log) 972 return -ENODEV; 973 974 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 975 /* 976 * in write through (journal only) 977 * we flush log disk cache first, then write stripe data to 978 * raid disks. So if bio is finished, the log disk cache is 979 * flushed already. 
The recovery guarantees we can recovery 980 * the bio from log disk, so we don't need to flush again 981 */ 982 if (bio->bi_iter.bi_size == 0) { 983 bio_endio(bio); 984 return 0; 985 } 986 bio->bi_opf &= ~REQ_PREFLUSH; 987 } else { 988 /* write back (with cache) */ 989 if (bio->bi_iter.bi_size == 0) { 990 mutex_lock(&log->io_mutex); 991 r5l_get_meta(log, 0); 992 bio_list_add(&log->current_io->flush_barriers, bio); 993 log->current_io->has_flush = 1; 994 log->current_io->has_null_flush = 1; 995 atomic_inc(&log->current_io->pending_stripe); 996 r5l_submit_current_io(log); 997 mutex_unlock(&log->io_mutex); 998 return 0; 999 } 1000 } 1001 return -EAGAIN; 1002 } 1003 1004 /* This will run after log space is reclaimed */ 1005 static void r5l_run_no_space_stripes(struct r5l_log *log) 1006 { 1007 struct stripe_head *sh; 1008 1009 spin_lock(&log->no_space_stripes_lock); 1010 while (!list_empty(&log->no_space_stripes)) { 1011 sh = list_first_entry(&log->no_space_stripes, 1012 struct stripe_head, log_list); 1013 list_del_init(&sh->log_list); 1014 set_bit(STRIPE_HANDLE, &sh->state); 1015 raid5_release_stripe(sh); 1016 } 1017 spin_unlock(&log->no_space_stripes_lock); 1018 } 1019 1020 /* 1021 * calculate new last_checkpoint 1022 * for write through mode, returns log->next_checkpoint 1023 * for write back, returns log_start of first sh in stripe_in_journal_list 1024 */ 1025 static sector_t r5c_calculate_new_cp(struct r5conf *conf) 1026 { 1027 struct stripe_head *sh; 1028 struct r5l_log *log = conf->log; 1029 sector_t new_cp; 1030 unsigned long flags; 1031 1032 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 1033 return log->next_checkpoint; 1034 1035 spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 1036 if (list_empty(&conf->log->stripe_in_journal_list)) { 1037 /* all stripes flushed */ 1038 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1039 return log->next_checkpoint; 1040 } 1041 sh = list_first_entry(&conf->log->stripe_in_journal_list, 1042 struct stripe_head, r5c); 1043 new_cp = sh->log_start; 1044 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1045 return new_cp; 1046 } 1047 1048 static sector_t r5l_reclaimable_space(struct r5l_log *log) 1049 { 1050 struct r5conf *conf = log->rdev->mddev->private; 1051 1052 return r5l_ring_distance(log, log->last_checkpoint, 1053 r5c_calculate_new_cp(conf)); 1054 } 1055 1056 static void r5l_run_no_mem_stripe(struct r5l_log *log) 1057 { 1058 struct stripe_head *sh; 1059 1060 assert_spin_locked(&log->io_list_lock); 1061 1062 if (!list_empty(&log->no_mem_stripes)) { 1063 sh = list_first_entry(&log->no_mem_stripes, 1064 struct stripe_head, log_list); 1065 list_del_init(&sh->log_list); 1066 set_bit(STRIPE_HANDLE, &sh->state); 1067 raid5_release_stripe(sh); 1068 } 1069 } 1070 1071 static bool r5l_complete_finished_ios(struct r5l_log *log) 1072 { 1073 struct r5l_io_unit *io, *next; 1074 bool found = false; 1075 1076 assert_spin_locked(&log->io_list_lock); 1077 1078 list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { 1079 /* don't change list order */ 1080 if (io->state < IO_UNIT_STRIPE_END) 1081 break; 1082 1083 log->next_checkpoint = io->log_start; 1084 1085 list_del(&io->log_sibling); 1086 mempool_free(io, log->io_pool); 1087 r5l_run_no_mem_stripe(log); 1088 1089 found = true; 1090 } 1091 1092 return found; 1093 } 1094 1095 static void __r5l_stripe_write_finished(struct r5l_io_unit *io) 1096 { 1097 struct r5l_log *log = io->log; 1098 struct r5conf *conf = log->rdev->mddev->private; 1099 unsigned long 
flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);

	if (!r5l_complete_finished_ios(log)) {
		spin_unlock_irqrestore(&log->io_list_lock, flags);
		return;
	}

	if (r5l_reclaimable_space(log) > log->max_free_space ||
	    test_bit(R5C_LOG_TIGHT, &conf->cache_state))
		r5l_wake_reclaim(log, 0);

	spin_unlock_irqrestore(&log->io_list_lock, flags);
	wake_up(&log->iounit_wait);
}

void r5l_stripe_write_finished(struct stripe_head *sh)
{
	struct r5l_io_unit *io;

	io = sh->log_io;
	sh->log_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripe))
		__r5l_stripe_write_finished(io);
}

static void r5l_log_flush_endio(struct bio *bio)
{
	struct r5l_log *log = container_of(bio, struct r5l_log,
		flush_bio);
	unsigned long flags;
	struct r5l_io_unit *io;

	if (bio->bi_error)
		md_error(log->rdev->mddev, log->rdev);

	spin_lock_irqsave(&log->io_list_lock, flags);
	list_for_each_entry(io, &log->flushing_ios, log_sibling)
		r5l_io_run_stripes(io);
	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

/*
 * Start dispatching IO to the raid disks.
 * The log consists of io_units, each beginning with a meta block.  There is
 * one situation we want to avoid: a broken meta block in the middle of the
 * log means recovery cannot find the meta blocks at the head of the log.  If
 * an operation requires the meta block at the head to be persistent in the
 * log, we must make sure the meta blocks before it are persistent too.  One
 * such case:
 *
 * Stripe data/parity is in the log and we start writing the stripe to the
 * raid disks.  The stripe data/parity must be persistent in the log before
 * we do the write to the raid disks.
 *
 * The solution is that we strictly maintain io_unit list order.  In this
 * case, we only write the stripes of an io_unit to the raid disks once it is
 * the first io_unit whose data/parity is in the log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
	bool do_flush;

	if (!log || !log->need_cache_flush)
		return;

	spin_lock_irq(&log->io_list_lock);
	/* flush bio is running */
	if (!list_empty(&log->flushing_ios)) {
		spin_unlock_irq(&log->io_list_lock);
		return;
	}
	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
	do_flush = !list_empty(&log->flushing_ios);
	spin_unlock_irq(&log->io_list_lock);

	if (!do_flush)
		return;
	bio_reset(&log->flush_bio);
	log->flush_bio.bi_bdev = log->rdev->bdev;
	log->flush_bio.bi_end_io = r5l_log_flush_endio;
	log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
	submit_bio(&log->flush_bio);
}

static void r5l_write_super(struct r5l_log *log, sector_t cp);
static void r5l_write_super_and_discard_space(struct r5l_log *log,
	sector_t end)
{
	struct block_device *bdev = log->rdev->bdev;
	struct mddev *mddev;

	r5l_write_super(log, end);

	if (!blk_queue_discard(bdev_get_queue(bdev)))
		return;

	mddev = log->rdev->mddev;
	/*
	 * Discard could zero data, so before discard we must make sure the
	 * superblock is updated to the new log tail.  Updating the superblock
	 * (either by calling md_update_sb() directly or depending on the md
	 * thread) must hold the reconfig mutex.  On the other hand,
	 * raid5_quiesce is called with the reconfig_mutex held.
	 * The first step of raid5_quiesce() is waiting for all IO to finish,
	 * hence waiting for the reclaim thread, while the reclaim thread is
	 * calling this function and waiting for the reconfig mutex.  So there
	 * is a deadlock.  We work around this issue with a trylock.
	 * FIXME: we could miss discard if we can't take the reconfig mutex
	 */
	set_mask_bits(&mddev->sb_flags, 0,
		BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
	if (!mddev_trylock(mddev))
		return;
	md_update_sb(mddev, 1);
	mddev_unlock(mddev);

	/* discard IO error really doesn't matter, ignore it */
	if (log->last_checkpoint < end) {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				end - log->last_checkpoint, GFP_NOIO, 0);
	} else {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				log->device_size - log->last_checkpoint,
				GFP_NOIO, 0);
		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
				GFP_NOIO, 0);
	}
}

/*
 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
 *
 * must hold conf->device_lock
 */
static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	BUG_ON(list_empty(&sh->lru));
	BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));

	/*
	 * The stripe is not ON_RELEASE_LIST, so it is safe to call
	 * raid5_release_stripe() while holding conf->device_lock
	 */
	BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
	assert_spin_locked(&conf->device_lock);

	list_del_init(&sh->lru);
	atomic_inc(&sh->count);

	set_bit(STRIPE_HANDLE, &sh->state);
	atomic_inc(&conf->active_stripes);
	r5c_make_stripe_write_out(sh);

	raid5_release_stripe(sh);
}

/*
 * if num == 0, flush all full stripes
 * if num > 0, flush all full stripes. If less than num full stripes are
 * flushed, flush some partial stripes until a total of num stripes have
 * been flushed or there are no more cached stripes.
 */
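/*
 * For example (illustrative): r5c_flush_cache(conf, 0) flushes only the full
 * stripes, while r5c_flush_cache(conf, R5C_RECLAIM_STRIPE_GROUP) flushes all
 * full stripes and then enough partial stripes to reach that total count (or
 * until both cached lists are empty).
 */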
void r5c_flush_cache(struct r5conf *conf, int num)
{
	int count;
	struct stripe_head *sh, *next;

	assert_spin_locked(&conf->device_lock);
	if (!conf->log)
		return;

	count = 0;
	list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
		r5c_flush_stripe(conf, sh);
		count++;
	}

	if (count >= num)
		return;
	list_for_each_entry_safe(sh, next,
				 &conf->r5c_partial_stripe_list, lru) {
		r5c_flush_stripe(conf, sh);
		if (++count >= num)
			break;
	}
}

static void r5c_do_reclaim(struct r5conf *conf)
{
	struct r5l_log *log = conf->log;
	struct stripe_head *sh;
	int count = 0;
	unsigned long flags;
	int total_cached;
	int stripes_to_flush;

	if (!r5c_is_writeback(log))
		return;

	total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
		atomic_read(&conf->r5c_cached_full_stripes);

	if (total_cached > conf->min_nr_stripes * 3 / 4 ||
	    atomic_read(&conf->empty_inactive_list_nr) > 0)
		/*
		 * if stripe cache pressure is high, flush all full stripes
		 * and some partial stripes
		 */
		stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
	else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
		 atomic_read(&conf->r5c_cached_full_stripes) >
		 R5C_FULL_STRIPE_FLUSH_BATCH)
		/*
		 * if stripe cache pressure is moderate, or if there are many
		 * full stripes, flush all full stripes
		 */
		stripes_to_flush = 0;
	else
		/* no need to flush */
		stripes_to_flush = -1;

	if (stripes_to_flush >= 0) {
		spin_lock_irqsave(&conf->device_lock, flags);
		r5c_flush_cache(conf, stripes_to_flush);
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}

	/* if log space is tight, flush stripes on stripe_in_journal_list */
	if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
		spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
		spin_lock(&conf->device_lock);
		list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
			/*
			 * stripes on stripe_in_journal_list could be in any
			 * state of the stripe_cache state machine. In this
			 * case, we only want to flush stripes on
			 * r5c_cached_full/partial_stripes. The following
			 * condition makes sure the stripe is on one of the
			 * two lists.
			 */
			if (!list_empty(&sh->lru) &&
			    !test_bit(STRIPE_HANDLE, &sh->state) &&
			    atomic_read(&sh->count) == 0) {
				r5c_flush_stripe(conf, sh);
			}
			if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
				break;
		}
		spin_unlock(&conf->device_lock);
		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
	}

	if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
		r5l_run_no_space_stripes(log);

	md_wakeup_thread(conf->mddev->thread);
}

static void r5l_do_reclaim(struct r5l_log *log)
{
	struct r5conf *conf = log->rdev->mddev->private;
	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
	sector_t reclaimable;
	sector_t next_checkpoint;
	bool write_super;

	spin_lock_irq(&log->io_list_lock);
	write_super = r5l_reclaimable_space(log) > log->max_free_space ||
		reclaim_target != 0 || !list_empty(&log->no_space_stripes);
	/*
	 * move the proper io_units to the reclaim list. We should not change
	 * the order.
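	 * (an io_unit only becomes reclaimable once it reaches
	 * IO_UNIT_STRIPE_END, i.e. its stripes have been written back to the
	 * raid disks)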
1375 * reclaimable/unreclaimable io_unit can be mixed in the list, we 1376 * shouldn't reuse space of an unreclaimable io_unit 1377 */ 1378 while (1) { 1379 reclaimable = r5l_reclaimable_space(log); 1380 if (reclaimable >= reclaim_target || 1381 (list_empty(&log->running_ios) && 1382 list_empty(&log->io_end_ios) && 1383 list_empty(&log->flushing_ios) && 1384 list_empty(&log->finished_ios))) 1385 break; 1386 1387 md_wakeup_thread(log->rdev->mddev->thread); 1388 wait_event_lock_irq(log->iounit_wait, 1389 r5l_reclaimable_space(log) > reclaimable, 1390 log->io_list_lock); 1391 } 1392 1393 next_checkpoint = r5c_calculate_new_cp(conf); 1394 spin_unlock_irq(&log->io_list_lock); 1395 1396 BUG_ON(reclaimable < 0); 1397 1398 if (reclaimable == 0 || !write_super) 1399 return; 1400 1401 /* 1402 * write_super will flush cache of each raid disk. We must write super 1403 * here, because the log area might be reused soon and we don't want to 1404 * confuse recovery 1405 */ 1406 r5l_write_super_and_discard_space(log, next_checkpoint); 1407 1408 mutex_lock(&log->io_mutex); 1409 log->last_checkpoint = next_checkpoint; 1410 r5c_update_log_state(log); 1411 mutex_unlock(&log->io_mutex); 1412 1413 r5l_run_no_space_stripes(log); 1414 } 1415 1416 static void r5l_reclaim_thread(struct md_thread *thread) 1417 { 1418 struct mddev *mddev = thread->mddev; 1419 struct r5conf *conf = mddev->private; 1420 struct r5l_log *log = conf->log; 1421 1422 if (!log) 1423 return; 1424 r5c_do_reclaim(conf); 1425 r5l_do_reclaim(log); 1426 } 1427 1428 void r5l_wake_reclaim(struct r5l_log *log, sector_t space) 1429 { 1430 unsigned long target; 1431 unsigned long new = (unsigned long)space; /* overflow in theory */ 1432 1433 if (!log) 1434 return; 1435 do { 1436 target = log->reclaim_target; 1437 if (new < target) 1438 return; 1439 } while (cmpxchg(&log->reclaim_target, target, new) != target); 1440 md_wakeup_thread(log->reclaim_thread); 1441 } 1442 1443 void r5l_quiesce(struct r5l_log *log, int state) 1444 { 1445 struct mddev *mddev; 1446 if (!log || state == 2) 1447 return; 1448 if (state == 0) 1449 kthread_unpark(log->reclaim_thread->tsk); 1450 else if (state == 1) { 1451 /* make sure r5l_write_super_and_discard_space exits */ 1452 mddev = log->rdev->mddev; 1453 wake_up(&mddev->sb_wait); 1454 kthread_park(log->reclaim_thread->tsk); 1455 r5l_wake_reclaim(log, MaxSector); 1456 r5l_do_reclaim(log); 1457 } 1458 } 1459 1460 bool r5l_log_disk_error(struct r5conf *conf) 1461 { 1462 struct r5l_log *log; 1463 bool ret; 1464 /* don't allow write if journal disk is missing */ 1465 rcu_read_lock(); 1466 log = rcu_dereference(conf->log); 1467 1468 if (!log) 1469 ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 1470 else 1471 ret = test_bit(Faulty, &log->rdev->flags); 1472 rcu_read_unlock(); 1473 return ret; 1474 } 1475 1476 struct r5l_recovery_ctx { 1477 struct page *meta_page; /* current meta */ 1478 sector_t meta_total_blocks; /* total size of current meta and data */ 1479 sector_t pos; /* recovery position */ 1480 u64 seq; /* recovery position seq */ 1481 int data_parity_stripes; /* number of data_parity stripes */ 1482 int data_only_stripes; /* number of data_only stripes */ 1483 struct list_head cached_list; 1484 }; 1485 1486 static int r5l_recovery_read_meta_block(struct r5l_log *log, 1487 struct r5l_recovery_ctx *ctx) 1488 { 1489 struct page *page = ctx->meta_page; 1490 struct r5l_meta_block *mb; 1491 u32 crc, stored_crc; 1492 1493 if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0, 1494 false)) 1495 return -EIO; 
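	/*
	 * Validate the candidate meta block: magic, version, expected
	 * sequence number and on-disk position must all match, and the
	 * crc32c checksum is computed over the whole 4k block with the
	 * checksum field zeroed.
	 */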
	mb = page_address(page);
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    le64_to_cpu(mb->seq) != ctx->seq ||
	    mb->version != R5LOG_VERSION ||
	    le64_to_cpu(mb->position) != ctx->pos)
		return -EINVAL;

	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != crc)
		return -EINVAL;

	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
		return -EINVAL;

	ctx->meta_total_blocks = BLOCK_SECTORS;

	return 0;
}

static void
r5l_recovery_create_empty_meta_block(struct r5l_log *log,
				     struct page *page,
				     sector_t pos, u64 seq)
{
	struct r5l_meta_block *mb;

	mb = page_address(page);
	clear_page(mb);
	mb->magic = cpu_to_le32(R5LOG_MAGIC);
	mb->version = R5LOG_VERSION;
	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
	mb->seq = cpu_to_le64(seq);
	mb->position = cpu_to_le64(pos);
}

static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
					  u64 seq)
{
	struct page *page;
	struct r5l_meta_block *mb;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;
	r5l_recovery_create_empty_meta_block(log, page, pos, seq);
	mb = page_address(page);
	mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
					     mb, PAGE_SIZE));
	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
			  REQ_FUA, false)) {
		__free_page(page);
		return -EIO;
	}
	__free_page(page);
	return 0;
}

/*
 * r5l_recovery_load_data and r5l_recovery_load_parity use flag R5_Wantwrite
 * to mark valid (potentially not flushed) data in the journal.
 *
 * We already verified the checksums in
 * r5l_recovery_verify_data_checksum_for_mb, so there should not be any
 * mismatch here.
 */
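/*
 * Roughly: pages marked R5_Wantwrite here are consumed in two ways during
 * recovery - data+parity stripes are replayed straight to the raid disks by
 * r5l_recovery_replay_one_stripe(), while data-only stripes are loaded into
 * the stripe cache by r5c_recovery_load_one_stripe() and written out later
 * by the normal state machine.
 */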
static void r5l_recovery_load_data(struct r5l_log *log,
				   struct stripe_head *sh,
				   struct r5l_recovery_ctx *ctx,
				   struct r5l_payload_data_parity *payload,
				   sector_t log_offset)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	int dd_idx;

	raid5_compute_sector(conf,
			     le64_to_cpu(payload->location), 0,
			     &dd_idx, sh);
	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
		     sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
	sh->dev[dd_idx].log_checksum =
		le32_to_cpu(payload->checksum[0]);
	ctx->meta_total_blocks += BLOCK_SECTORS;

	set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
	set_bit(STRIPE_R5C_CACHING, &sh->state);
}

static void r5l_recovery_load_parity(struct r5l_log *log,
				     struct stripe_head *sh,
				     struct r5l_recovery_ctx *ctx,
				     struct r5l_payload_data_parity *payload,
				     sector_t log_offset)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;

	ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
		     sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
	sh->dev[sh->pd_idx].log_checksum =
		le32_to_cpu(payload->checksum[0]);
	set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);

	if (sh->qd_idx >= 0) {
		sync_page_io(log->rdev,
			     r5l_ring_add(log, log_offset, BLOCK_SECTORS),
			     PAGE_SIZE, sh->dev[sh->qd_idx].page,
			     REQ_OP_READ, 0, false);
		sh->dev[sh->qd_idx].log_checksum =
			le32_to_cpu(payload->checksum[1]);
		set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
	}
	clear_bit(STRIPE_R5C_CACHING, &sh->state);
}

static void r5l_recovery_reset_stripe(struct stripe_head *sh)
{
	int i;

	sh->state = 0;
	sh->log_start = MaxSector;
	for (i = sh->disks; i--; )
		sh->dev[i].flags = 0;
}

static void
r5l_recovery_replay_one_stripe(struct r5conf *conf,
			       struct stripe_head *sh,
			       struct r5l_recovery_ctx *ctx)
{
	struct md_rdev *rdev, *rrdev;
	int disk_index;
	int data_count = 0;

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
			continue;
		if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
			continue;
		data_count++;
	}

	/*
	 * stripes that only have parity must have been flushed
	 * before the crash that we are now recovering from, so
	 * there is nothing more to recover.
1646 */ 1647 if (data_count == 0) 1648 goto out; 1649 1650 for (disk_index = 0; disk_index < sh->disks; disk_index++) { 1651 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 1652 continue; 1653 1654 /* in case device is broken */ 1655 rcu_read_lock(); 1656 rdev = rcu_dereference(conf->disks[disk_index].rdev); 1657 if (rdev) { 1658 atomic_inc(&rdev->nr_pending); 1659 rcu_read_unlock(); 1660 sync_page_io(rdev, sh->sector, PAGE_SIZE, 1661 sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1662 false); 1663 rdev_dec_pending(rdev, rdev->mddev); 1664 rcu_read_lock(); 1665 } 1666 rrdev = rcu_dereference(conf->disks[disk_index].replacement); 1667 if (rrdev) { 1668 atomic_inc(&rrdev->nr_pending); 1669 rcu_read_unlock(); 1670 sync_page_io(rrdev, sh->sector, PAGE_SIZE, 1671 sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1672 false); 1673 rdev_dec_pending(rrdev, rrdev->mddev); 1674 rcu_read_lock(); 1675 } 1676 rcu_read_unlock(); 1677 } 1678 ctx->data_parity_stripes++; 1679 out: 1680 r5l_recovery_reset_stripe(sh); 1681 } 1682 1683 static struct stripe_head * 1684 r5c_recovery_alloc_stripe(struct r5conf *conf, 1685 sector_t stripe_sect) 1686 { 1687 struct stripe_head *sh; 1688 1689 sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0); 1690 if (!sh) 1691 return NULL; /* no more stripe available */ 1692 1693 r5l_recovery_reset_stripe(sh); 1694 1695 return sh; 1696 } 1697 1698 static struct stripe_head * 1699 r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect) 1700 { 1701 struct stripe_head *sh; 1702 1703 list_for_each_entry(sh, list, lru) 1704 if (sh->sector == sect) 1705 return sh; 1706 return NULL; 1707 } 1708 1709 static void 1710 r5c_recovery_drop_stripes(struct list_head *cached_stripe_list, 1711 struct r5l_recovery_ctx *ctx) 1712 { 1713 struct stripe_head *sh, *next; 1714 1715 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) { 1716 r5l_recovery_reset_stripe(sh); 1717 list_del_init(&sh->lru); 1718 raid5_release_stripe(sh); 1719 } 1720 } 1721 1722 static void 1723 r5c_recovery_replay_stripes(struct list_head *cached_stripe_list, 1724 struct r5l_recovery_ctx *ctx) 1725 { 1726 struct stripe_head *sh, *next; 1727 1728 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) 1729 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 1730 r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx); 1731 list_del_init(&sh->lru); 1732 raid5_release_stripe(sh); 1733 } 1734 } 1735 1736 /* if matches return 0; otherwise return -EINVAL */ 1737 static int 1738 r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page, 1739 sector_t log_offset, __le32 log_checksum) 1740 { 1741 void *addr; 1742 u32 checksum; 1743 1744 sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1745 page, REQ_OP_READ, 0, false); 1746 addr = kmap_atomic(page); 1747 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); 1748 kunmap_atomic(addr); 1749 return (le32_to_cpu(log_checksum) == checksum) ? 
0 : -EINVAL;
}

/*
 * Before loading data into the stripe cache, we need to verify the checksums
 * of all data pages; if there is a mismatch for any data page, we drop all
 * data in the meta block.
 */
static int
r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
					 struct r5l_recovery_ctx *ctx)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_meta_block *mb = page_address(ctx->meta_page);
	sector_t mb_offset = sizeof(struct r5l_meta_block);
	sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
	struct page *page;
	struct r5l_payload_data_parity *payload;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	while (mb_offset < le32_to_cpu(mb->meta_size)) {
		payload = (void *)mb + mb_offset;

		if (payload->header.type == R5LOG_PAYLOAD_DATA) {
			if (r5l_recovery_verify_data_checksum(
				    log, page, log_offset,
				    payload->checksum[0]) < 0)
				goto mismatch;
		} else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
			if (r5l_recovery_verify_data_checksum(
				    log, page, log_offset,
				    payload->checksum[0]) < 0)
				goto mismatch;
			if (conf->max_degraded == 2 && /* q for RAID 6 */
			    r5l_recovery_verify_data_checksum(
				    log, page,
				    r5l_ring_add(log, log_offset,
						 BLOCK_SECTORS),
				    payload->checksum[1]) < 0)
				goto mismatch;
		} else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */
			goto mismatch;

		log_offset = r5l_ring_add(log, log_offset,
					  le32_to_cpu(payload->size));

		mb_offset += sizeof(struct r5l_payload_data_parity) +
			sizeof(__le32) *
			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
	}

	put_page(page);
	return 0;

mismatch:
	put_page(page);
	return -EINVAL;
}

/*
 * Analyze all data/parity pages in one meta block
 * Returns:
 * 0 for success
 * -EINVAL for an unknown payload type
 * -EAGAIN for a checksum mismatch of a data page
 * -ENOMEM when running out of memory (alloc_page() failed or no more stripes)
 */
static int
r5c_recovery_analyze_meta_block(struct r5l_log *log,
				struct r5l_recovery_ctx *ctx,
				struct list_head *cached_stripe_list)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_meta_block *mb;
	struct r5l_payload_data_parity *payload;
	int mb_offset;
	sector_t log_offset;
	sector_t stripe_sect;
	struct stripe_head *sh;
	int ret;

	/*
	 * On a checksum mismatch in the data blocks we drop all data in this
	 * mb, but we still read the next mb for other data with the FLUSH
	 * flag, as io_units can finish out of order.
	 */
	ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
	if (ret == -EINVAL)
		return -EAGAIN;
	else if (ret)
		return ret;   /* -ENOMEM due to alloc_page() failure */

	mb = page_address(ctx->meta_page);
	mb_offset = sizeof(struct r5l_meta_block);
	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

	while (mb_offset < le32_to_cpu(mb->meta_size)) {
		int dd;

		payload = (void *)mb + mb_offset;
		stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
1854 raid5_compute_sector(
1855 conf, le64_to_cpu(payload->location), 0, &dd,
1856 NULL)
1857 : le64_to_cpu(payload->location);
1858
1859 sh = r5c_recovery_lookup_stripe(cached_stripe_list,
1860 stripe_sect);
1861
1862 if (!sh) {
1863 sh = r5c_recovery_alloc_stripe(conf, stripe_sect);
1864 /*
1865 * cannot get a stripe from raid5_get_active_stripe;
1866 * try replaying some cached stripes to free them up
1867 */
1868 if (!sh) {
1869 r5c_recovery_replay_stripes(
1870 cached_stripe_list, ctx);
1871 sh = r5c_recovery_alloc_stripe(
1872 conf, stripe_sect);
1873 }
1874 if (!sh) {
1875 pr_debug("md/raid:%s: Increasing stripe cache size to %d to recover data on journal.\n",
1876 mdname(mddev),
1877 conf->min_nr_stripes * 2);
1878 raid5_set_cache_size(mddev,
1879 conf->min_nr_stripes * 2);
1880 sh = r5c_recovery_alloc_stripe(conf,
1881 stripe_sect);
1882 }
1883 if (!sh) {
1884 pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
1885 mdname(mddev));
1886 return -ENOMEM;
1887 }
1888 list_add_tail(&sh->lru, cached_stripe_list);
1889 }
1890
1891 if (payload->header.type == R5LOG_PAYLOAD_DATA) {
1892 if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
1893 test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
1894 r5l_recovery_replay_one_stripe(conf, sh, ctx);
1895 list_move_tail(&sh->lru, cached_stripe_list);
1896 }
1897 r5l_recovery_load_data(log, sh, ctx, payload,
1898 log_offset);
1899 } else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
1900 r5l_recovery_load_parity(log, sh, ctx, payload,
1901 log_offset);
1902 else
1903 return -EINVAL;
1904
1905 log_offset = r5l_ring_add(log, log_offset,
1906 le32_to_cpu(payload->size));
1907
1908 mb_offset += sizeof(struct r5l_payload_data_parity) +
1909 sizeof(__le32) *
1910 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1911 }
1912
1913 return 0;
1914 }
1915
1916 /*
1917 * Load the stripe into cache. The stripe will be written out later by
1918 * the stripe cache state machine.
1919 */
1920 static void r5c_recovery_load_one_stripe(struct r5l_log *log,
1921 struct stripe_head *sh)
1922 {
1923 struct r5dev *dev;
1924 int i;
1925
1926 for (i = sh->disks; i--; ) {
1927 dev = sh->dev + i;
1928 if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
1929 set_bit(R5_InJournal, &dev->flags);
1930 set_bit(R5_UPTODATE, &dev->flags);
1931 }
1932 }
1933 }
1934
1935 /*
1936 * Scan through the log for all to-be-flushed data
1937 *
1938 * For stripes with data and parity, namely Data-Parity stripes
1939 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
1940 *
1941 * For stripes with only data, namely Data-Only stripes
1942 * (STRIPE_R5C_CACHING == 1), we load them into the stripe cache state machine.
1943 *
1944 * For a stripe, if we see data after parity, we should discard all previous
1945 * data and parity for this stripe, as that data has already been flushed to
1946 * the array.
1947 *
1948 * At the end of the scan, we return the new journal_tail, which points to the
1949 * first data-only stripe on the journal device, or to the next invalid meta block.
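 *
 * For example, if the log for one stripe contains D1, P1 and then D1' (newer
 * data written after that stripe's parity), the D1/P1 pair describes a write
 * that has already reached the array, so recovery replays it to the RAID
 * disks first and then tracks the stripe again with only D1', i.e. as a
 * data-only stripe.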
1950 */
1951 static int r5c_recovery_flush_log(struct r5l_log *log,
1952 struct r5l_recovery_ctx *ctx)
1953 {
1954 struct stripe_head *sh;
1955 int ret = 0;
1956
1957 /* scan through the log */
1958 while (1) {
1959 if (r5l_recovery_read_meta_block(log, ctx))
1960 break;
1961
1962 ret = r5c_recovery_analyze_meta_block(log, ctx,
1963 &ctx->cached_list);
1964 /*
1965 * -EAGAIN means a mismatch in a data block; in this case, we
1966 * still try to scan the next meta block
1967 */
1968 if (ret && ret != -EAGAIN)
1969 break; /* ret == -EINVAL or -ENOMEM */
1970 ctx->seq++;
1971 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
1972 }
1973
1974 if (ret == -ENOMEM) {
1975 r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
1976 return ret;
1977 }
1978
1979 /* replay data-parity stripes */
1980 r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
1981
1982 /* load data-only stripes to stripe cache */
1983 list_for_each_entry(sh, &ctx->cached_list, lru) {
1984 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1985 r5c_recovery_load_one_stripe(log, sh);
1986 ctx->data_only_stripes++;
1987 }
1988
1989 return 0;
1990 }
1991
1992 /*
1993 * We did a recovery. Now ctx.pos points to an invalid meta block, and the new
1994 * log will start here. But we can't let the superblock point to the last valid
1995 * meta block. The log might look like:
1996 * | meta 1| meta 2| meta 3|
1997 * meta 1 is valid, meta 2 is invalid, and meta 3 could be valid. If the
1998 * superblock points to meta 1 and we write a new valid meta 2n, then if a crash
1999 * happens again, the next recovery will start from meta 1. Since meta 2n is
2000 * valid now, that recovery will think meta 3 is valid too, which is wrong.
2001 * The solution is to create a new meta block at meta 2's position with its
2002 * seq == meta 1's seq + 10000 and let the superblock point to it. The same
2003 * recovery will then not treat meta 3 as valid, because its seq doesn't match.
2004 */
2005
2006 /*
2007 * Before recovery, the log looks like the following
2008 *
2009 * ---------------------------------------------
2010 * | valid log | invalid log |
2011 * ---------------------------------------------
2012 * ^
2013 * |- log->last_checkpoint
2014 * |- log->last_cp_seq
2015 *
2016 * Now we scan through the log until we see an invalid entry
2017 *
2018 * ---------------------------------------------
2019 * | valid log | invalid log |
2020 * ---------------------------------------------
2021 * ^ ^
2022 * |- log->last_checkpoint |- ctx->pos
2023 * |- log->last_cp_seq |- ctx->seq
2024 *
2025 * From this point, we need to increase the seq number by 10000 to avoid
2026 * confusing the next recovery.
2027 *
2028 * ---------------------------------------------
2029 * | valid log | invalid log |
2030 * ---------------------------------------------
2031 * ^ ^
2032 * |- log->last_checkpoint |- ctx->pos+1
2033 * |- log->last_cp_seq |- ctx->seq+10001
2034 *
2035 * However, it is not safe to start the state machine yet, because the data of
2036 * the data-only stripes is not yet secured in the RAID. To save it, we rewrite
2037 * those stripes to the journal with seq numbers starting from seq+10000.
2038 *
2039 * -----------------------------------------------------------------
2040 * | valid log | data only stripes | invalid log |
2041 * -----------------------------------------------------------------
2042 * ^ ^
2043 * |- log->last_checkpoint |- ctx->pos+n
2044 * |- log->last_cp_seq |- ctx->seq+10000+n
2045 *
2046 * If failure happens again during this process, the recovery can safely start
2047 * again from log->last_checkpoint.
2048 *
2049 * Once the data-only stripes are rewritten to the journal, we move the log tail:
2050 *
2051 * -----------------------------------------------------------------
2052 * | old log | data only stripes | invalid log |
2053 * -----------------------------------------------------------------
2054 * ^ ^
2055 * |- log->last_checkpoint |- ctx->pos+n
2056 * |- log->last_cp_seq |- ctx->seq+10000+n
2057 *
2058 * Then we can safely start the state machine. If failure happens from this
2059 * point on, the recovery will start from the new log->last_checkpoint.
2060 */
2061 static int
2062 r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
2063 struct r5l_recovery_ctx *ctx)
2064 {
2065 struct stripe_head *sh, *next;
2066 struct mddev *mddev = log->rdev->mddev;
2067 struct page *page;
2068 sector_t next_checkpoint = MaxSector;
2069
2070 page = alloc_page(GFP_KERNEL);
2071 if (!page) {
2072 pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
2073 mdname(mddev));
2074 return -ENOMEM;
2075 }
2076
2077 WARN_ON(list_empty(&ctx->cached_list));
2078
2079 list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
2080 struct r5l_meta_block *mb;
2081 int i;
2082 int offset;
2083 sector_t write_pos;
2084
2085 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2086 r5l_recovery_create_empty_meta_block(log, page,
2087 ctx->pos, ctx->seq);
2088 mb = page_address(page);
2089 offset = le32_to_cpu(mb->meta_size);
2090 write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2091
2092 for (i = sh->disks; i--; ) {
2093 struct r5dev *dev = &sh->dev[i];
2094 struct r5l_payload_data_parity *payload;
2095 void *addr;
2096
2097 if (test_bit(R5_InJournal, &dev->flags)) {
2098 payload = (void *)mb + offset;
2099 payload->header.type = cpu_to_le16(
2100 R5LOG_PAYLOAD_DATA);
2101 payload->size = cpu_to_le32(BLOCK_SECTORS);
2102 payload->location = cpu_to_le64(
2103 raid5_compute_blocknr(sh, i, 0));
2104 addr = kmap_atomic(dev->page);
2105 payload->checksum[0] = cpu_to_le32(
2106 crc32c_le(log->uuid_checksum, addr,
2107 PAGE_SIZE));
2108 kunmap_atomic(addr);
2109 sync_page_io(log->rdev, write_pos, PAGE_SIZE,
2110 dev->page, REQ_OP_WRITE, 0, false);
2111 write_pos = r5l_ring_add(log, write_pos,
2112 BLOCK_SECTORS);
2113 offset += sizeof(__le32) +
2114 sizeof(struct r5l_payload_data_parity);
2115
2116 }
2117 }
2118 mb->meta_size = cpu_to_le32(offset);
2119 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
2120 mb, PAGE_SIZE));
2121 sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
2122 REQ_OP_WRITE, REQ_FUA, false);
2123 sh->log_start = ctx->pos;
2124 list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
2125 atomic_inc(&log->stripe_in_journal_count);
2126 ctx->pos = write_pos;
2127 ctx->seq += 1;
2128 next_checkpoint = sh->log_start;
2129 list_del_init(&sh->lru);
2130 raid5_release_stripe(sh);
2131 }
2132 log->next_checkpoint = next_checkpoint;
2133 __free_page(page);
2134 return 0;
2135 }
2136
2137 static int r5l_recovery_log(struct r5l_log *log)
2138 {
2139 struct mddev *mddev = log->rdev->mddev;
2140 struct r5l_recovery_ctx ctx;
2141 int ret;
2142 sector_t pos;
2143
2144 ctx.pos = log->last_checkpoint;
2145 ctx.seq = log->last_cp_seq;
2146 ctx.meta_page = alloc_page(GFP_KERNEL);
2147 ctx.data_only_stripes = 0;
2148 ctx.data_parity_stripes = 0;
2149 INIT_LIST_HEAD(&ctx.cached_list);
2150
2151 if (!ctx.meta_page)
2152 return -ENOMEM;
2153
2154 ret = r5c_recovery_flush_log(log, &ctx);
2155 __free_page(ctx.meta_page);
2156
2157 if (ret)
2158 return ret;
2159
2160 pos = ctx.pos;
2161 ctx.seq += 10000;
2162
2163 if (ctx.data_only_stripes == 0) { 2164 log->next_checkpoint = ctx.pos; 2165 r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); 2166 ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); 2167 } 2168 2169 if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) 2170 pr_debug("md/raid:%s: starting from clean shutdown\n", 2171 mdname(mddev)); 2172 else { 2173 pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", 2174 mdname(mddev), ctx.data_only_stripes, 2175 ctx.data_parity_stripes); 2176 2177 if (ctx.data_only_stripes > 0) 2178 if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { 2179 pr_err("md/raid:%s: failed to rewrite stripes to journal\n", 2180 mdname(mddev)); 2181 return -EIO; 2182 } 2183 } 2184 2185 log->log_start = ctx.pos; 2186 log->seq = ctx.seq; 2187 log->last_checkpoint = pos; 2188 r5l_write_super(log, pos); 2189 return 0; 2190 } 2191 2192 static void r5l_write_super(struct r5l_log *log, sector_t cp) 2193 { 2194 struct mddev *mddev = log->rdev->mddev; 2195 2196 log->rdev->journal_tail = cp; 2197 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2198 } 2199 2200 static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) 2201 { 2202 struct r5conf *conf = mddev->private; 2203 int ret; 2204 2205 if (!conf->log) 2206 return 0; 2207 2208 switch (conf->log->r5c_journal_mode) { 2209 case R5C_JOURNAL_MODE_WRITE_THROUGH: 2210 ret = snprintf( 2211 page, PAGE_SIZE, "[%s] %s\n", 2212 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], 2213 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); 2214 break; 2215 case R5C_JOURNAL_MODE_WRITE_BACK: 2216 ret = snprintf( 2217 page, PAGE_SIZE, "%s [%s]\n", 2218 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], 2219 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); 2220 break; 2221 default: 2222 ret = 0; 2223 } 2224 return ret; 2225 } 2226 2227 static ssize_t r5c_journal_mode_store(struct mddev *mddev, 2228 const char *page, size_t length) 2229 { 2230 struct r5conf *conf = mddev->private; 2231 struct r5l_log *log = conf->log; 2232 int val = -1, i; 2233 int len = length; 2234 2235 if (!log) 2236 return -ENODEV; 2237 2238 if (len && page[len - 1] == '\n') 2239 len -= 1; 2240 for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++) 2241 if (strlen(r5c_journal_mode_str[i]) == len && 2242 strncmp(page, r5c_journal_mode_str[i], len) == 0) { 2243 val = i; 2244 break; 2245 } 2246 if (val < R5C_JOURNAL_MODE_WRITE_THROUGH || 2247 val > R5C_JOURNAL_MODE_WRITE_BACK) 2248 return -EINVAL; 2249 2250 mddev_suspend(mddev); 2251 conf->log->r5c_journal_mode = val; 2252 mddev_resume(mddev); 2253 2254 pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", 2255 mdname(mddev), val, r5c_journal_mode_str[val]); 2256 return length; 2257 } 2258 2259 struct md_sysfs_entry 2260 r5c_journal_mode = __ATTR(journal_mode, 0644, 2261 r5c_journal_mode_show, r5c_journal_mode_store); 2262 2263 /* 2264 * Try handle write operation in caching phase. This function should only 2265 * be called in write-back mode. 
2266 * 2267 * If all outstanding writes can be handled in caching phase, returns 0 2268 * If writes requires write-out phase, call r5c_make_stripe_write_out() 2269 * and returns -EAGAIN 2270 */ 2271 int r5c_try_caching_write(struct r5conf *conf, 2272 struct stripe_head *sh, 2273 struct stripe_head_state *s, 2274 int disks) 2275 { 2276 struct r5l_log *log = conf->log; 2277 int i; 2278 struct r5dev *dev; 2279 int to_cache = 0; 2280 2281 BUG_ON(!r5c_is_writeback(log)); 2282 2283 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 2284 /* 2285 * There are two different scenarios here: 2286 * 1. The stripe has some data cached, and it is sent to 2287 * write-out phase for reclaim 2288 * 2. The stripe is clean, and this is the first write 2289 * 2290 * For 1, return -EAGAIN, so we continue with 2291 * handle_stripe_dirtying(). 2292 * 2293 * For 2, set STRIPE_R5C_CACHING and continue with caching 2294 * write. 2295 */ 2296 2297 /* case 1: anything injournal or anything in written */ 2298 if (s->injournal > 0 || s->written > 0) 2299 return -EAGAIN; 2300 /* case 2 */ 2301 set_bit(STRIPE_R5C_CACHING, &sh->state); 2302 } 2303 2304 for (i = disks; i--; ) { 2305 dev = &sh->dev[i]; 2306 /* if non-overwrite, use writing-out phase */ 2307 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) && 2308 !test_bit(R5_InJournal, &dev->flags)) { 2309 r5c_make_stripe_write_out(sh); 2310 return -EAGAIN; 2311 } 2312 } 2313 2314 for (i = disks; i--; ) { 2315 dev = &sh->dev[i]; 2316 if (dev->towrite) { 2317 set_bit(R5_Wantwrite, &dev->flags); 2318 set_bit(R5_Wantdrain, &dev->flags); 2319 set_bit(R5_LOCKED, &dev->flags); 2320 to_cache++; 2321 } 2322 } 2323 2324 if (to_cache) { 2325 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2326 /* 2327 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data() 2328 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in 2329 * r5c_handle_data_cached() 2330 */ 2331 set_bit(STRIPE_LOG_TRAPPED, &sh->state); 2332 } 2333 2334 return 0; 2335 } 2336 2337 /* 2338 * free extra pages (orig_page) we allocated for prexor 2339 */ 2340 void r5c_release_extra_page(struct stripe_head *sh) 2341 { 2342 struct r5conf *conf = sh->raid_conf; 2343 int i; 2344 bool using_disk_info_extra_page; 2345 2346 using_disk_info_extra_page = 2347 sh->dev[0].orig_page == conf->disks[0].extra_page; 2348 2349 for (i = sh->disks; i--; ) 2350 if (sh->dev[i].page != sh->dev[i].orig_page) { 2351 struct page *p = sh->dev[i].orig_page; 2352 2353 sh->dev[i].orig_page = sh->dev[i].page; 2354 if (!using_disk_info_extra_page) 2355 put_page(p); 2356 } 2357 2358 if (using_disk_info_extra_page) { 2359 clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state); 2360 md_wakeup_thread(conf->mddev->thread); 2361 } 2362 } 2363 2364 void r5c_use_extra_page(struct stripe_head *sh) 2365 { 2366 struct r5conf *conf = sh->raid_conf; 2367 int i; 2368 struct r5dev *dev; 2369 2370 for (i = sh->disks; i--; ) { 2371 dev = &sh->dev[i]; 2372 if (dev->orig_page != dev->page) 2373 put_page(dev->orig_page); 2374 dev->orig_page = conf->disks[i].extra_page; 2375 } 2376 } 2377 2378 /* 2379 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the 2380 * stripe is committed to RAID disks. 
2381 */ 2382 void r5c_finish_stripe_write_out(struct r5conf *conf, 2383 struct stripe_head *sh, 2384 struct stripe_head_state *s) 2385 { 2386 int i; 2387 int do_wakeup = 0; 2388 2389 if (!conf->log || 2390 !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) 2391 return; 2392 2393 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 2394 clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 2395 2396 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 2397 return; 2398 2399 for (i = sh->disks; i--; ) { 2400 clear_bit(R5_InJournal, &sh->dev[i].flags); 2401 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2402 do_wakeup = 1; 2403 } 2404 2405 /* 2406 * analyse_stripe() runs before r5c_finish_stripe_write_out(), 2407 * We updated R5_InJournal, so we also update s->injournal. 2408 */ 2409 s->injournal = 0; 2410 2411 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2412 if (atomic_dec_and_test(&conf->pending_full_writes)) 2413 md_wakeup_thread(conf->mddev->thread); 2414 2415 if (do_wakeup) 2416 wake_up(&conf->wait_for_overlap); 2417 2418 spin_lock_irq(&conf->log->stripe_in_journal_lock); 2419 list_del_init(&sh->r5c); 2420 spin_unlock_irq(&conf->log->stripe_in_journal_lock); 2421 sh->log_start = MaxSector; 2422 atomic_dec(&conf->log->stripe_in_journal_count); 2423 r5c_update_log_state(conf->log); 2424 } 2425 2426 int 2427 r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, 2428 struct stripe_head_state *s) 2429 { 2430 struct r5conf *conf = sh->raid_conf; 2431 int pages = 0; 2432 int reserve; 2433 int i; 2434 int ret = 0; 2435 2436 BUG_ON(!log); 2437 2438 for (i = 0; i < sh->disks; i++) { 2439 void *addr; 2440 2441 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) 2442 continue; 2443 addr = kmap_atomic(sh->dev[i].page); 2444 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 2445 addr, PAGE_SIZE); 2446 kunmap_atomic(addr); 2447 pages++; 2448 } 2449 WARN_ON(pages == 0); 2450 2451 /* 2452 * The stripe must enter state machine again to call endio, so 2453 * don't delay. 
2454 */ 2455 clear_bit(STRIPE_DELAYED, &sh->state); 2456 atomic_inc(&sh->count); 2457 2458 mutex_lock(&log->io_mutex); 2459 /* meta + data */ 2460 reserve = (1 + pages) << (PAGE_SHIFT - 9); 2461 2462 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 2463 sh->log_start == MaxSector) 2464 r5l_add_no_space_stripe(log, sh); 2465 else if (!r5l_has_free_space(log, reserve)) { 2466 if (sh->log_start == log->last_checkpoint) 2467 BUG(); 2468 else 2469 r5l_add_no_space_stripe(log, sh); 2470 } else { 2471 ret = r5l_log_stripe(log, sh, pages, 0); 2472 if (ret) { 2473 spin_lock_irq(&log->io_list_lock); 2474 list_add_tail(&sh->log_list, &log->no_mem_stripes); 2475 spin_unlock_irq(&log->io_list_lock); 2476 } 2477 } 2478 2479 mutex_unlock(&log->io_mutex); 2480 return 0; 2481 } 2482 2483 static int r5l_load_log(struct r5l_log *log) 2484 { 2485 struct md_rdev *rdev = log->rdev; 2486 struct page *page; 2487 struct r5l_meta_block *mb; 2488 sector_t cp = log->rdev->journal_tail; 2489 u32 stored_crc, expected_crc; 2490 bool create_super = false; 2491 int ret = 0; 2492 2493 /* Make sure it's valid */ 2494 if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) 2495 cp = 0; 2496 page = alloc_page(GFP_KERNEL); 2497 if (!page) 2498 return -ENOMEM; 2499 2500 if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) { 2501 ret = -EIO; 2502 goto ioerr; 2503 } 2504 mb = page_address(page); 2505 2506 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 2507 mb->version != R5LOG_VERSION) { 2508 create_super = true; 2509 goto create; 2510 } 2511 stored_crc = le32_to_cpu(mb->checksum); 2512 mb->checksum = 0; 2513 expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 2514 if (stored_crc != expected_crc) { 2515 create_super = true; 2516 goto create; 2517 } 2518 if (le64_to_cpu(mb->position) != cp) { 2519 create_super = true; 2520 goto create; 2521 } 2522 create: 2523 if (create_super) { 2524 log->last_cp_seq = prandom_u32(); 2525 cp = 0; 2526 r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq); 2527 /* 2528 * Make sure super points to correct address. Log might have 2529 * data very soon. If super hasn't correct log tail address, 2530 * recovery can't find the log 2531 */ 2532 r5l_write_super(log, cp); 2533 } else 2534 log->last_cp_seq = le64_to_cpu(mb->seq); 2535 2536 log->device_size = round_down(rdev->sectors, BLOCK_SECTORS); 2537 log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT; 2538 if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) 2539 log->max_free_space = RECLAIM_MAX_FREE_SPACE; 2540 log->last_checkpoint = cp; 2541 2542 __free_page(page); 2543 2544 if (create_super) { 2545 log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS); 2546 log->seq = log->last_cp_seq + 1; 2547 log->next_checkpoint = cp; 2548 } else 2549 ret = r5l_recovery_log(log); 2550 2551 r5c_update_log_state(log); 2552 return ret; 2553 ioerr: 2554 __free_page(page); 2555 return ret; 2556 } 2557 2558 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) 2559 { 2560 struct request_queue *q = bdev_get_queue(rdev->bdev); 2561 struct r5l_log *log; 2562 2563 if (PAGE_SIZE != 4096) 2564 return -EINVAL; 2565 2566 /* 2567 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and 2568 * raid_disks r5l_payload_data_parity. 
2569 * 2570 * Write journal and cache does not work for very big array 2571 * (raid_disks > 203) 2572 */ 2573 if (sizeof(struct r5l_meta_block) + 2574 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) * 2575 conf->raid_disks) > PAGE_SIZE) { 2576 pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n", 2577 mdname(conf->mddev), conf->raid_disks); 2578 return -EINVAL; 2579 } 2580 2581 log = kzalloc(sizeof(*log), GFP_KERNEL); 2582 if (!log) 2583 return -ENOMEM; 2584 log->rdev = rdev; 2585 2586 log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0; 2587 2588 log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, 2589 sizeof(rdev->mddev->uuid)); 2590 2591 mutex_init(&log->io_mutex); 2592 2593 spin_lock_init(&log->io_list_lock); 2594 INIT_LIST_HEAD(&log->running_ios); 2595 INIT_LIST_HEAD(&log->io_end_ios); 2596 INIT_LIST_HEAD(&log->flushing_ios); 2597 INIT_LIST_HEAD(&log->finished_ios); 2598 bio_init(&log->flush_bio, NULL, 0); 2599 2600 log->io_kc = KMEM_CACHE(r5l_io_unit, 0); 2601 if (!log->io_kc) 2602 goto io_kc; 2603 2604 log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc); 2605 if (!log->io_pool) 2606 goto io_pool; 2607 2608 log->bs = bioset_create(R5L_POOL_SIZE, 0); 2609 if (!log->bs) 2610 goto io_bs; 2611 2612 log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0); 2613 if (!log->meta_pool) 2614 goto out_mempool; 2615 2616 log->reclaim_thread = md_register_thread(r5l_reclaim_thread, 2617 log->rdev->mddev, "reclaim"); 2618 if (!log->reclaim_thread) 2619 goto reclaim_thread; 2620 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; 2621 2622 init_waitqueue_head(&log->iounit_wait); 2623 2624 INIT_LIST_HEAD(&log->no_mem_stripes); 2625 2626 INIT_LIST_HEAD(&log->no_space_stripes); 2627 spin_lock_init(&log->no_space_stripes_lock); 2628 2629 INIT_WORK(&log->deferred_io_work, r5l_submit_io_async); 2630 2631 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 2632 INIT_LIST_HEAD(&log->stripe_in_journal_list); 2633 spin_lock_init(&log->stripe_in_journal_lock); 2634 atomic_set(&log->stripe_in_journal_count, 0); 2635 2636 rcu_assign_pointer(conf->log, log); 2637 2638 if (r5l_load_log(log)) 2639 goto error; 2640 2641 set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 2642 return 0; 2643 2644 error: 2645 rcu_assign_pointer(conf->log, NULL); 2646 md_unregister_thread(&log->reclaim_thread); 2647 reclaim_thread: 2648 mempool_destroy(log->meta_pool); 2649 out_mempool: 2650 bioset_free(log->bs); 2651 io_bs: 2652 mempool_destroy(log->io_pool); 2653 io_pool: 2654 kmem_cache_destroy(log->io_kc); 2655 io_kc: 2656 kfree(log); 2657 return -EINVAL; 2658 } 2659 2660 void r5l_exit_log(struct r5l_log *log) 2661 { 2662 md_unregister_thread(&log->reclaim_thread); 2663 mempool_destroy(log->meta_pool); 2664 bioset_free(log->bs); 2665 mempool_destroy(log->io_pool); 2666 kmem_cache_destroy(log->io_kc); 2667 kfree(log); 2668 } 2669