1 /* 2 * Copyright (C) 2015 Shaohua Li <shli@fb.com> 3 * Copyright (C) 2016 Song Liu <songliubraving@fb.com> 4 * 5 * This program is free software; you can redistribute it and/or modify it 6 * under the terms and conditions of the GNU General Public License, 7 * version 2, as published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 * more details. 13 * 14 */ 15 #include <linux/kernel.h> 16 #include <linux/wait.h> 17 #include <linux/blkdev.h> 18 #include <linux/slab.h> 19 #include <linux/raid/md_p.h> 20 #include <linux/crc32c.h> 21 #include <linux/random.h> 22 #include <linux/kthread.h> 23 #include "md.h" 24 #include "raid5.h" 25 #include "bitmap.h" 26 27 /* 28 * metadata/data stored in disk with 4k size unit (a block) regardless 29 * underneath hardware sector size. only works with PAGE_SIZE == 4096 30 */ 31 #define BLOCK_SECTORS (8) 32 33 /* 34 * log->max_free_space is min(1/4 disk size, 10G reclaimable space). 35 * 36 * In write through mode, the reclaim runs every log->max_free_space. 37 * This can prevent the recovery scans for too long 38 */ 39 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ 40 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2) 41 42 /* wake up reclaim thread periodically */ 43 #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ) 44 /* start flush with these full stripes */ 45 #define R5C_FULL_STRIPE_FLUSH_BATCH 256 46 /* reclaim stripes in groups */ 47 #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2) 48 49 /* 50 * We only need 2 bios per I/O unit to make progress, but ensure we 51 * have a few more available to not get too tight. 52 */ 53 #define R5L_POOL_SIZE 4 54 55 /* 56 * r5c journal modes of the array: write-back or write-through. 
* write-through mode has identical behavior as existing log only
 * implementation.
 */
enum r5c_journal_mode {
	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
	R5C_JOURNAL_MODE_WRITE_BACK = 1,
};

/* human readable mode names, listed in the same order as the enum values */
static char *r5c_journal_mode_str[] = {"write-through",
				       "write-back"};
/*
 * raid5 cache state machine
 *
 * With the RAID cache, each stripe works in two phases:
 *	- caching phase
 *	- writing-out phase
 *
 * These two phases are controlled by bit STRIPE_R5C_CACHING:
 *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
 *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
 *
 * When there is no journal, or the journal is in write-through mode,
 * the stripe is always in writing-out phase.
 *
 * For write-back journal, the stripe is sent to caching phase on write
 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
 * the write-out phase by clearing STRIPE_R5C_CACHING.
 *
 * Stripes in caching phase do not write the raid disks. Instead, all
 * writes are committed from the log device. Therefore, a stripe in
 * caching phase handles writes as:
 *	- write to log device
 *	- return IO
 *
 * Stripes in writing-out phase handle writes as:
 *	- calculate parity
 *	- write pending data and parity to journal
 *	- write data and parity to raid disks
 *	- return IO for pending writes
 */

/* Per-array state of the journal (log) device. */
struct r5l_log {
	struct md_rdev *rdev;		/* device the log bios are issued to */

	u32 uuid_checksum;		/* crc32c seed used for meta blocks and
					 * for the logged data/parity pages */

	sector_t device_size;		/* log device size, round to
					 * BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim run if free space is at
					 * this size */

	sector_t last_checkpoint;	/* log tail. where recovery scan
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head. where new data appends */
	u64 seq;			/* log head sequence */

	sector_t next_checkpoint;	/* log_start of the io_unit most
					 * recently released by
					 * r5l_complete_finished_ios() */

	struct mutex io_mutex;		/* held while current_io is built or
					 * submitted */
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;	/* taken around every access to the
					 * io_unit lists below and to
					 * no_mem_stripes */
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet written
					 * to the RAID */
	struct list_head flushing_ios;	/* io_units which are waiting for log
					 * cache flush */
	struct list_head finished_ios;	/* io_units which settle down in log disk */
	struct bio flush_bio;		/* embedded bio reused by
					 * r5l_flush_stripe_to_raid() for the
					 * log cache flush */

	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */

	struct kmem_cache *io_kc;	/* NOTE(review): presumably the slab
					 * behind io_pool - set up outside
					 * this view, confirm */
	mempool_t *io_pool;		/* struct r5l_io_unit allocations */
	struct bio_set *bs;		/* bio_set the log bios come from */
	mempool_t *meta_pool;		/* pages holding meta blocks */

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* amount of space that needs to be
					 * reclaimed.  if it's 0, reclaim spaces
					 * used by io_units which are in
					 * IO_UNIT_STRIPE_END state (eg, reclaim
					 * doesn't wait for specific io_unit
					 * switching to IO_UNIT_STRIPE_END
					 * state) */
	wait_queue_head_t iounit_wait;	/* woken by
					 * __r5l_stripe_write_finished() after
					 * io_units have been released */

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;

	bool need_cache_flush;		/* when set, stripes of a completed
					 * io_unit are only run after an extra
					 * flush of the log device */

	/* for r5c_cache */
	enum r5c_journal_mode r5c_journal_mode;

	/* all stripes in r5cache, in the order of seq at sh->log_start */
	struct list_head stripe_in_journal_list;

	spinlock_t stripe_in_journal_lock;
	atomic_t stripe_in_journal_count;

	/* to submit async io_units, to fulfill ordering of flush */
	struct work_struct deferred_io_work;
};

/*
 * an IO range starts from a meta data block and end at the next meta data
 * block. The io unit's the meta data block tracks data/parity followed it.
io 170 * unit is written to log disk with normal write, as we always flush log disk 171 * first and then start move data to raid disks, there is no requirement to 172 * write io unit with FLUSH/FUA 173 */ 174 struct r5l_io_unit { 175 struct r5l_log *log; 176 177 struct page *meta_page; /* store meta block */ 178 int meta_offset; /* current offset in meta_page */ 179 180 struct bio *current_bio;/* current_bio accepting new data */ 181 182 atomic_t pending_stripe;/* how many stripes not flushed to raid */ 183 u64 seq; /* seq number of the metablock */ 184 sector_t log_start; /* where the io_unit starts */ 185 sector_t log_end; /* where the io_unit ends */ 186 struct list_head log_sibling; /* log->running_ios */ 187 struct list_head stripe_list; /* stripes added to the io_unit */ 188 189 int state; 190 bool need_split_bio; 191 struct bio *split_bio; 192 193 unsigned int has_flush:1; /* include flush request */ 194 unsigned int has_fua:1; /* include fua request */ 195 unsigned int has_null_flush:1; /* include empty flush request */ 196 /* 197 * io isn't sent yet, flush/fua request can only be submitted till it's 198 * the first IO in running_ios list 199 */ 200 unsigned int io_deferred:1; 201 202 struct bio_list flush_barriers; /* size == 0 flush bios */ 203 }; 204 205 /* r5l_io_unit state */ 206 enum r5l_io_unit_state { 207 IO_UNIT_RUNNING = 0, /* accepting new IO */ 208 IO_UNIT_IO_START = 1, /* io_unit bio start writing to log, 209 * don't accepting new bio */ 210 IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */ 211 IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ 212 }; 213 214 bool r5c_is_writeback(struct r5l_log *log) 215 { 216 return (log != NULL && 217 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK); 218 } 219 220 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) 221 { 222 start += inc; 223 if (start >= log->device_size) 224 start = start - log->device_size; 225 return start; 226 } 227 228 static 
sector_t r5l_ring_distance(struct r5l_log *log, sector_t start, 229 sector_t end) 230 { 231 if (end >= start) 232 return end - start; 233 else 234 return end + log->device_size - start; 235 } 236 237 static bool r5l_has_free_space(struct r5l_log *log, sector_t size) 238 { 239 sector_t used_size; 240 241 used_size = r5l_ring_distance(log, log->last_checkpoint, 242 log->log_start); 243 244 return log->device_size > used_size + size; 245 } 246 247 static void __r5l_set_io_unit_state(struct r5l_io_unit *io, 248 enum r5l_io_unit_state state) 249 { 250 if (WARN_ON(io->state >= state)) 251 return; 252 io->state = state; 253 } 254 255 static void 256 r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev, 257 struct bio_list *return_bi) 258 { 259 struct bio *wbi, *wbi2; 260 261 wbi = dev->written; 262 dev->written = NULL; 263 while (wbi && wbi->bi_iter.bi_sector < 264 dev->sector + STRIPE_SECTORS) { 265 wbi2 = r5_next_bio(wbi, dev->sector); 266 if (!raid5_dec_bi_active_stripes(wbi)) { 267 md_write_end(conf->mddev); 268 bio_list_add(return_bi, wbi); 269 } 270 wbi = wbi2; 271 } 272 } 273 274 void r5c_handle_cached_data_endio(struct r5conf *conf, 275 struct stripe_head *sh, int disks, struct bio_list *return_bi) 276 { 277 int i; 278 279 for (i = sh->disks; i--; ) { 280 if (sh->dev[i].written) { 281 set_bit(R5_UPTODATE, &sh->dev[i].flags); 282 r5c_return_dev_pending_writes(conf, &sh->dev[i], 283 return_bi); 284 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 285 STRIPE_SECTORS, 286 !test_bit(STRIPE_DEGRADED, &sh->state), 287 0); 288 } 289 } 290 } 291 292 /* Check whether we should flush some stripes to free up stripe cache */ 293 void r5c_check_stripe_cache_usage(struct r5conf *conf) 294 { 295 int total_cached; 296 297 if (!r5c_is_writeback(conf->log)) 298 return; 299 300 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 301 atomic_read(&conf->r5c_cached_full_stripes); 302 303 /* 304 * The following condition is true for either of the following: 
305 * - stripe cache pressure high: 306 * total_cached > 3/4 min_nr_stripes || 307 * empty_inactive_list_nr > 0 308 * - stripe cache pressure moderate: 309 * total_cached > 1/2 min_nr_stripes 310 */ 311 if (total_cached > conf->min_nr_stripes * 1 / 2 || 312 atomic_read(&conf->empty_inactive_list_nr) > 0) 313 r5l_wake_reclaim(conf->log, 0); 314 } 315 316 /* 317 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full 318 * stripes in the cache 319 */ 320 void r5c_check_cached_full_stripe(struct r5conf *conf) 321 { 322 if (!r5c_is_writeback(conf->log)) 323 return; 324 325 /* 326 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes 327 * or a full stripe (chunk size / 4k stripes). 328 */ 329 if (atomic_read(&conf->r5c_cached_full_stripes) >= 330 min(R5C_FULL_STRIPE_FLUSH_BATCH, 331 conf->chunk_sectors >> STRIPE_SHIFT)) 332 r5l_wake_reclaim(conf->log, 0); 333 } 334 335 /* 336 * Total log space (in sectors) needed to flush all data in cache 337 * 338 * Currently, writing-out phase automatically includes all pending writes 339 * to the same sector. So the reclaim of each stripe takes up to 340 * (conf->raid_disks + 1) pages of log space. 341 * 342 * To totally avoid deadlock due to log space, the code reserves 343 * (conf->raid_disks + 1) pages for each stripe in cache, which is not 344 * necessary in most cases. 345 * 346 * To improve this, we will need writing-out phase to be able to NOT include 347 * pending writes, which will reduce the requirement to 348 * (conf->max_degraded + 1) pages per stripe in cache. 
349 */ 350 static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) 351 { 352 struct r5l_log *log = conf->log; 353 354 if (!r5c_is_writeback(log)) 355 return 0; 356 357 return BLOCK_SECTORS * (conf->raid_disks + 1) * 358 atomic_read(&log->stripe_in_journal_count); 359 } 360 361 /* 362 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL 363 * 364 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of 365 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log 366 * device is less than 2x of reclaim_required_space. 367 */ 368 static inline void r5c_update_log_state(struct r5l_log *log) 369 { 370 struct r5conf *conf = log->rdev->mddev->private; 371 sector_t free_space; 372 sector_t reclaim_space; 373 bool wake_reclaim = false; 374 375 if (!r5c_is_writeback(log)) 376 return; 377 378 free_space = r5l_ring_distance(log, log->log_start, 379 log->last_checkpoint); 380 reclaim_space = r5c_log_required_to_flush_cache(conf); 381 if (free_space < 2 * reclaim_space) 382 set_bit(R5C_LOG_CRITICAL, &conf->cache_state); 383 else { 384 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) 385 wake_reclaim = true; 386 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state); 387 } 388 if (free_space < 3 * reclaim_space) 389 set_bit(R5C_LOG_TIGHT, &conf->cache_state); 390 else 391 clear_bit(R5C_LOG_TIGHT, &conf->cache_state); 392 393 if (wake_reclaim) 394 r5l_wake_reclaim(log, 0); 395 } 396 397 /* 398 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING. 399 * This function should only be called in write-back mode. 
400 */ 401 void r5c_make_stripe_write_out(struct stripe_head *sh) 402 { 403 struct r5conf *conf = sh->raid_conf; 404 struct r5l_log *log = conf->log; 405 406 BUG_ON(!r5c_is_writeback(log)); 407 408 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 409 clear_bit(STRIPE_R5C_CACHING, &sh->state); 410 411 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 412 atomic_inc(&conf->preread_active_stripes); 413 414 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { 415 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); 416 atomic_dec(&conf->r5c_cached_partial_stripes); 417 } 418 419 if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { 420 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); 421 atomic_dec(&conf->r5c_cached_full_stripes); 422 } 423 } 424 425 static void r5c_handle_data_cached(struct stripe_head *sh) 426 { 427 int i; 428 429 for (i = sh->disks; i--; ) 430 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 431 set_bit(R5_InJournal, &sh->dev[i].flags); 432 clear_bit(R5_LOCKED, &sh->dev[i].flags); 433 } 434 clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 435 } 436 437 /* 438 * this journal write must contain full parity, 439 * it may also contain some data pages 440 */ 441 static void r5c_handle_parity_cached(struct stripe_head *sh) 442 { 443 int i; 444 445 for (i = sh->disks; i--; ) 446 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 447 set_bit(R5_Wantwrite, &sh->dev[i].flags); 448 } 449 450 /* 451 * Setting proper flags after writing (or flushing) data and/or parity to the 452 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio(). 453 */ 454 static void r5c_finish_cache_stripe(struct stripe_head *sh) 455 { 456 struct r5l_log *log = sh->raid_conf->log; 457 458 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 459 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 460 /* 461 * Set R5_InJournal for parity dev[pd_idx]. This means 462 * all data AND parity in the journal. 
For RAID 6, it is 463 * NOT necessary to set the flag for dev[qd_idx], as the 464 * two parities are written out together. 465 */ 466 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 467 } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) { 468 r5c_handle_data_cached(sh); 469 } else { 470 r5c_handle_parity_cached(sh); 471 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 472 } 473 } 474 475 static void r5l_io_run_stripes(struct r5l_io_unit *io) 476 { 477 struct stripe_head *sh, *next; 478 479 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { 480 list_del_init(&sh->log_list); 481 482 r5c_finish_cache_stripe(sh); 483 484 set_bit(STRIPE_HANDLE, &sh->state); 485 raid5_release_stripe(sh); 486 } 487 } 488 489 static void r5l_log_run_stripes(struct r5l_log *log) 490 { 491 struct r5l_io_unit *io, *next; 492 493 assert_spin_locked(&log->io_list_lock); 494 495 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 496 /* don't change list order */ 497 if (io->state < IO_UNIT_IO_END) 498 break; 499 500 list_move_tail(&io->log_sibling, &log->finished_ios); 501 r5l_io_run_stripes(io); 502 } 503 } 504 505 static void r5l_move_to_end_ios(struct r5l_log *log) 506 { 507 struct r5l_io_unit *io, *next; 508 509 assert_spin_locked(&log->io_list_lock); 510 511 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 512 /* don't change list order */ 513 if (io->state < IO_UNIT_IO_END) 514 break; 515 list_move_tail(&io->log_sibling, &log->io_end_ios); 516 } 517 } 518 519 static void __r5l_stripe_write_finished(struct r5l_io_unit *io); 520 static void r5l_log_endio(struct bio *bio) 521 { 522 struct r5l_io_unit *io = bio->bi_private; 523 struct r5l_io_unit *io_deferred; 524 struct r5l_log *log = io->log; 525 unsigned long flags; 526 527 if (bio->bi_error) 528 md_error(log->rdev->mddev, log->rdev); 529 530 bio_put(bio); 531 mempool_free(io->meta_page, log->meta_pool); 532 533 spin_lock_irqsave(&log->io_list_lock, flags); 534 
__r5l_set_io_unit_state(io, IO_UNIT_IO_END); 535 if (log->need_cache_flush) 536 r5l_move_to_end_ios(log); 537 else 538 r5l_log_run_stripes(log); 539 if (!list_empty(&log->running_ios)) { 540 /* 541 * FLUSH/FUA io_unit is deferred because of ordering, now we 542 * can dispatch it 543 */ 544 io_deferred = list_first_entry(&log->running_ios, 545 struct r5l_io_unit, log_sibling); 546 if (io_deferred->io_deferred) 547 schedule_work(&log->deferred_io_work); 548 } 549 550 spin_unlock_irqrestore(&log->io_list_lock, flags); 551 552 if (log->need_cache_flush) 553 md_wakeup_thread(log->rdev->mddev->thread); 554 555 if (io->has_null_flush) { 556 struct bio *bi; 557 558 WARN_ON(bio_list_empty(&io->flush_barriers)); 559 while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { 560 bio_endio(bi); 561 atomic_dec(&io->pending_stripe); 562 } 563 if (atomic_read(&io->pending_stripe) == 0) 564 __r5l_stripe_write_finished(io); 565 } 566 } 567 568 static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) 569 { 570 unsigned long flags; 571 572 spin_lock_irqsave(&log->io_list_lock, flags); 573 __r5l_set_io_unit_state(io, IO_UNIT_IO_START); 574 spin_unlock_irqrestore(&log->io_list_lock, flags); 575 576 if (io->has_flush) 577 io->current_bio->bi_opf |= REQ_PREFLUSH; 578 if (io->has_fua) 579 io->current_bio->bi_opf |= REQ_FUA; 580 submit_bio(io->current_bio); 581 582 if (!io->split_bio) 583 return; 584 585 if (io->has_flush) 586 io->split_bio->bi_opf |= REQ_PREFLUSH; 587 if (io->has_fua) 588 io->split_bio->bi_opf |= REQ_FUA; 589 submit_bio(io->split_bio); 590 } 591 592 /* deferred io_unit will be dispatched here */ 593 static void r5l_submit_io_async(struct work_struct *work) 594 { 595 struct r5l_log *log = container_of(work, struct r5l_log, 596 deferred_io_work); 597 struct r5l_io_unit *io = NULL; 598 unsigned long flags; 599 600 spin_lock_irqsave(&log->io_list_lock, flags); 601 if (!list_empty(&log->running_ios)) { 602 io = list_first_entry(&log->running_ios, struct 
r5l_io_unit, 603 log_sibling); 604 if (!io->io_deferred) 605 io = NULL; 606 else 607 io->io_deferred = 0; 608 } 609 spin_unlock_irqrestore(&log->io_list_lock, flags); 610 if (io) 611 r5l_do_submit_io(log, io); 612 } 613 614 static void r5l_submit_current_io(struct r5l_log *log) 615 { 616 struct r5l_io_unit *io = log->current_io; 617 struct bio *bio; 618 struct r5l_meta_block *block; 619 unsigned long flags; 620 u32 crc; 621 bool do_submit = true; 622 623 if (!io) 624 return; 625 626 block = page_address(io->meta_page); 627 block->meta_size = cpu_to_le32(io->meta_offset); 628 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); 629 block->checksum = cpu_to_le32(crc); 630 bio = io->current_bio; 631 632 log->current_io = NULL; 633 spin_lock_irqsave(&log->io_list_lock, flags); 634 if (io->has_flush || io->has_fua) { 635 if (io != list_first_entry(&log->running_ios, 636 struct r5l_io_unit, log_sibling)) { 637 io->io_deferred = 1; 638 do_submit = false; 639 } 640 } 641 spin_unlock_irqrestore(&log->io_list_lock, flags); 642 if (do_submit) 643 r5l_do_submit_io(log, io); 644 } 645 646 static struct bio *r5l_bio_alloc(struct r5l_log *log) 647 { 648 struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); 649 650 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 651 bio->bi_bdev = log->rdev->bdev; 652 bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; 653 654 return bio; 655 } 656 657 static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io) 658 { 659 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); 660 661 r5c_update_log_state(log); 662 /* 663 * If we filled up the log device start from the beginning again, 664 * which will require a new bio. 665 * 666 * Note: for this to work properly the log size needs to me a multiple 667 * of BLOCK_SECTORS. 
668 */ 669 if (log->log_start == 0) 670 io->need_split_bio = true; 671 672 io->log_end = log->log_start; 673 } 674 675 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) 676 { 677 struct r5l_io_unit *io; 678 struct r5l_meta_block *block; 679 680 io = mempool_alloc(log->io_pool, GFP_ATOMIC); 681 if (!io) 682 return NULL; 683 memset(io, 0, sizeof(*io)); 684 685 io->log = log; 686 INIT_LIST_HEAD(&io->log_sibling); 687 INIT_LIST_HEAD(&io->stripe_list); 688 bio_list_init(&io->flush_barriers); 689 io->state = IO_UNIT_RUNNING; 690 691 io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); 692 block = page_address(io->meta_page); 693 clear_page(block); 694 block->magic = cpu_to_le32(R5LOG_MAGIC); 695 block->version = R5LOG_VERSION; 696 block->seq = cpu_to_le64(log->seq); 697 block->position = cpu_to_le64(log->log_start); 698 699 io->log_start = log->log_start; 700 io->meta_offset = sizeof(struct r5l_meta_block); 701 io->seq = log->seq++; 702 703 io->current_bio = r5l_bio_alloc(log); 704 io->current_bio->bi_end_io = r5l_log_endio; 705 io->current_bio->bi_private = io; 706 bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); 707 708 r5_reserve_log_entry(log, io); 709 710 spin_lock_irq(&log->io_list_lock); 711 list_add_tail(&io->log_sibling, &log->running_ios); 712 spin_unlock_irq(&log->io_list_lock); 713 714 return io; 715 } 716 717 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) 718 { 719 if (log->current_io && 720 log->current_io->meta_offset + payload_size > PAGE_SIZE) 721 r5l_submit_current_io(log); 722 723 if (!log->current_io) { 724 log->current_io = r5l_new_meta(log); 725 if (!log->current_io) 726 return -ENOMEM; 727 } 728 729 return 0; 730 } 731 732 static void r5l_append_payload_meta(struct r5l_log *log, u16 type, 733 sector_t location, 734 u32 checksum1, u32 checksum2, 735 bool checksum2_valid) 736 { 737 struct r5l_io_unit *io = log->current_io; 738 struct r5l_payload_data_parity *payload; 739 740 payload = 
page_address(io->meta_page) + io->meta_offset; 741 payload->header.type = cpu_to_le16(type); 742 payload->header.flags = cpu_to_le16(0); 743 payload->size = cpu_to_le32((1 + !!checksum2_valid) << 744 (PAGE_SHIFT - 9)); 745 payload->location = cpu_to_le64(location); 746 payload->checksum[0] = cpu_to_le32(checksum1); 747 if (checksum2_valid) 748 payload->checksum[1] = cpu_to_le32(checksum2); 749 750 io->meta_offset += sizeof(struct r5l_payload_data_parity) + 751 sizeof(__le32) * (1 + !!checksum2_valid); 752 } 753 754 static void r5l_append_payload_page(struct r5l_log *log, struct page *page) 755 { 756 struct r5l_io_unit *io = log->current_io; 757 758 if (io->need_split_bio) { 759 BUG_ON(io->split_bio); 760 io->split_bio = io->current_bio; 761 io->current_bio = r5l_bio_alloc(log); 762 bio_chain(io->current_bio, io->split_bio); 763 io->need_split_bio = false; 764 } 765 766 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) 767 BUG(); 768 769 r5_reserve_log_entry(log, io); 770 } 771 772 static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, 773 int data_pages, int parity_pages) 774 { 775 int i; 776 int meta_size; 777 int ret; 778 struct r5l_io_unit *io; 779 780 meta_size = 781 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) 782 * data_pages) + 783 sizeof(struct r5l_payload_data_parity) + 784 sizeof(__le32) * parity_pages; 785 786 ret = r5l_get_meta(log, meta_size); 787 if (ret) 788 return ret; 789 790 io = log->current_io; 791 792 if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state)) 793 io->has_flush = 1; 794 795 for (i = 0; i < sh->disks; i++) { 796 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 797 test_bit(R5_InJournal, &sh->dev[i].flags)) 798 continue; 799 if (i == sh->pd_idx || i == sh->qd_idx) 800 continue; 801 if (test_bit(R5_WantFUA, &sh->dev[i].flags) && 802 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) { 803 io->has_fua = 1; 804 /* 805 * we need to flush journal to make sure recovery can 806 * reach the data with 
fua flag 807 */ 808 io->has_flush = 1; 809 } 810 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, 811 raid5_compute_blocknr(sh, i, 0), 812 sh->dev[i].log_checksum, 0, false); 813 r5l_append_payload_page(log, sh->dev[i].page); 814 } 815 816 if (parity_pages == 2) { 817 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 818 sh->sector, sh->dev[sh->pd_idx].log_checksum, 819 sh->dev[sh->qd_idx].log_checksum, true); 820 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 821 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); 822 } else if (parity_pages == 1) { 823 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 824 sh->sector, sh->dev[sh->pd_idx].log_checksum, 825 0, false); 826 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 827 } else /* Just writing data, not parity, in caching phase */ 828 BUG_ON(parity_pages != 0); 829 830 list_add_tail(&sh->log_list, &io->stripe_list); 831 atomic_inc(&io->pending_stripe); 832 sh->log_io = io; 833 834 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 835 return 0; 836 837 if (sh->log_start == MaxSector) { 838 BUG_ON(!list_empty(&sh->r5c)); 839 sh->log_start = io->log_start; 840 spin_lock_irq(&log->stripe_in_journal_lock); 841 list_add_tail(&sh->r5c, 842 &log->stripe_in_journal_list); 843 spin_unlock_irq(&log->stripe_in_journal_lock); 844 atomic_inc(&log->stripe_in_journal_count); 845 } 846 return 0; 847 } 848 849 /* add stripe to no_space_stripes, and then wake up reclaim */ 850 static inline void r5l_add_no_space_stripe(struct r5l_log *log, 851 struct stripe_head *sh) 852 { 853 spin_lock(&log->no_space_stripes_lock); 854 list_add_tail(&sh->log_list, &log->no_space_stripes); 855 spin_unlock(&log->no_space_stripes_lock); 856 } 857 858 /* 859 * running in raid5d, where reclaim could wait for raid5d too (when it flushes 860 * data from log to raid disks), so we shouldn't wait for reclaim here 861 */ 862 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) 863 { 864 struct r5conf *conf = 
sh->raid_conf; 865 int write_disks = 0; 866 int data_pages, parity_pages; 867 int reserve; 868 int i; 869 int ret = 0; 870 bool wake_reclaim = false; 871 872 if (!log) 873 return -EAGAIN; 874 /* Don't support stripe batch */ 875 if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || 876 test_bit(STRIPE_SYNCING, &sh->state)) { 877 /* the stripe is written to log, we start writing it to raid */ 878 clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 879 return -EAGAIN; 880 } 881 882 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 883 884 for (i = 0; i < sh->disks; i++) { 885 void *addr; 886 887 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 888 test_bit(R5_InJournal, &sh->dev[i].flags)) 889 continue; 890 891 write_disks++; 892 /* checksum is already calculated in last run */ 893 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 894 continue; 895 addr = kmap_atomic(sh->dev[i].page); 896 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 897 addr, PAGE_SIZE); 898 kunmap_atomic(addr); 899 } 900 parity_pages = 1 + !!(sh->qd_idx >= 0); 901 data_pages = write_disks - parity_pages; 902 903 set_bit(STRIPE_LOG_TRAPPED, &sh->state); 904 /* 905 * The stripe must enter state machine again to finish the write, so 906 * don't delay. 
907 */ 908 clear_bit(STRIPE_DELAYED, &sh->state); 909 atomic_inc(&sh->count); 910 911 mutex_lock(&log->io_mutex); 912 /* meta + data */ 913 reserve = (1 + write_disks) << (PAGE_SHIFT - 9); 914 915 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 916 if (!r5l_has_free_space(log, reserve)) { 917 r5l_add_no_space_stripe(log, sh); 918 wake_reclaim = true; 919 } else { 920 ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 921 if (ret) { 922 spin_lock_irq(&log->io_list_lock); 923 list_add_tail(&sh->log_list, 924 &log->no_mem_stripes); 925 spin_unlock_irq(&log->io_list_lock); 926 } 927 } 928 } else { /* R5C_JOURNAL_MODE_WRITE_BACK */ 929 /* 930 * log space critical, do not process stripes that are 931 * not in cache yet (sh->log_start == MaxSector). 932 */ 933 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 934 sh->log_start == MaxSector) { 935 r5l_add_no_space_stripe(log, sh); 936 wake_reclaim = true; 937 reserve = 0; 938 } else if (!r5l_has_free_space(log, reserve)) { 939 if (sh->log_start == log->last_checkpoint) 940 BUG(); 941 else 942 r5l_add_no_space_stripe(log, sh); 943 } else { 944 ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 945 if (ret) { 946 spin_lock_irq(&log->io_list_lock); 947 list_add_tail(&sh->log_list, 948 &log->no_mem_stripes); 949 spin_unlock_irq(&log->io_list_lock); 950 } 951 } 952 } 953 954 mutex_unlock(&log->io_mutex); 955 if (wake_reclaim) 956 r5l_wake_reclaim(log, reserve); 957 return 0; 958 } 959 960 void r5l_write_stripe_run(struct r5l_log *log) 961 { 962 if (!log) 963 return; 964 mutex_lock(&log->io_mutex); 965 r5l_submit_current_io(log); 966 mutex_unlock(&log->io_mutex); 967 } 968 969 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) 970 { 971 if (!log) 972 return -ENODEV; 973 974 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 975 /* 976 * in write through (journal only) 977 * we flush log disk cache first, then write stripe data to 978 * raid disks. 
So if bio is finished, the log disk cache is 979 * flushed already. The recovery guarantees we can recovery 980 * the bio from log disk, so we don't need to flush again 981 */ 982 if (bio->bi_iter.bi_size == 0) { 983 bio_endio(bio); 984 return 0; 985 } 986 bio->bi_opf &= ~REQ_PREFLUSH; 987 } else { 988 /* write back (with cache) */ 989 if (bio->bi_iter.bi_size == 0) { 990 mutex_lock(&log->io_mutex); 991 r5l_get_meta(log, 0); 992 bio_list_add(&log->current_io->flush_barriers, bio); 993 log->current_io->has_flush = 1; 994 log->current_io->has_null_flush = 1; 995 atomic_inc(&log->current_io->pending_stripe); 996 r5l_submit_current_io(log); 997 mutex_unlock(&log->io_mutex); 998 return 0; 999 } 1000 } 1001 return -EAGAIN; 1002 } 1003 1004 /* This will run after log space is reclaimed */ 1005 static void r5l_run_no_space_stripes(struct r5l_log *log) 1006 { 1007 struct stripe_head *sh; 1008 1009 spin_lock(&log->no_space_stripes_lock); 1010 while (!list_empty(&log->no_space_stripes)) { 1011 sh = list_first_entry(&log->no_space_stripes, 1012 struct stripe_head, log_list); 1013 list_del_init(&sh->log_list); 1014 set_bit(STRIPE_HANDLE, &sh->state); 1015 raid5_release_stripe(sh); 1016 } 1017 spin_unlock(&log->no_space_stripes_lock); 1018 } 1019 1020 /* 1021 * calculate new last_checkpoint 1022 * for write through mode, returns log->next_checkpoint 1023 * for write back, returns log_start of first sh in stripe_in_journal_list 1024 */ 1025 static sector_t r5c_calculate_new_cp(struct r5conf *conf) 1026 { 1027 struct stripe_head *sh; 1028 struct r5l_log *log = conf->log; 1029 sector_t new_cp; 1030 unsigned long flags; 1031 1032 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 1033 return log->next_checkpoint; 1034 1035 spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 1036 if (list_empty(&conf->log->stripe_in_journal_list)) { 1037 /* all stripes flushed */ 1038 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1039 return log->next_checkpoint; 1040 } 
1041 sh = list_first_entry(&conf->log->stripe_in_journal_list, 1042 struct stripe_head, r5c); 1043 new_cp = sh->log_start; 1044 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1045 return new_cp; 1046 } 1047 1048 static sector_t r5l_reclaimable_space(struct r5l_log *log) 1049 { 1050 struct r5conf *conf = log->rdev->mddev->private; 1051 1052 return r5l_ring_distance(log, log->last_checkpoint, 1053 r5c_calculate_new_cp(conf)); 1054 } 1055 1056 static void r5l_run_no_mem_stripe(struct r5l_log *log) 1057 { 1058 struct stripe_head *sh; 1059 1060 assert_spin_locked(&log->io_list_lock); 1061 1062 if (!list_empty(&log->no_mem_stripes)) { 1063 sh = list_first_entry(&log->no_mem_stripes, 1064 struct stripe_head, log_list); 1065 list_del_init(&sh->log_list); 1066 set_bit(STRIPE_HANDLE, &sh->state); 1067 raid5_release_stripe(sh); 1068 } 1069 } 1070 1071 static bool r5l_complete_finished_ios(struct r5l_log *log) 1072 { 1073 struct r5l_io_unit *io, *next; 1074 bool found = false; 1075 1076 assert_spin_locked(&log->io_list_lock); 1077 1078 list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { 1079 /* don't change list order */ 1080 if (io->state < IO_UNIT_STRIPE_END) 1081 break; 1082 1083 log->next_checkpoint = io->log_start; 1084 1085 list_del(&io->log_sibling); 1086 mempool_free(io, log->io_pool); 1087 r5l_run_no_mem_stripe(log); 1088 1089 found = true; 1090 } 1091 1092 return found; 1093 } 1094 1095 static void __r5l_stripe_write_finished(struct r5l_io_unit *io) 1096 { 1097 struct r5l_log *log = io->log; 1098 struct r5conf *conf = log->rdev->mddev->private; 1099 unsigned long flags; 1100 1101 spin_lock_irqsave(&log->io_list_lock, flags); 1102 __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END); 1103 1104 if (!r5l_complete_finished_ios(log)) { 1105 spin_unlock_irqrestore(&log->io_list_lock, flags); 1106 return; 1107 } 1108 1109 if (r5l_reclaimable_space(log) > log->max_free_space || 1110 test_bit(R5C_LOG_TIGHT, &conf->cache_state)) 1111 
r5l_wake_reclaim(log, 0); 1112 1113 spin_unlock_irqrestore(&log->io_list_lock, flags); 1114 wake_up(&log->iounit_wait); 1115 } 1116 1117 void r5l_stripe_write_finished(struct stripe_head *sh) 1118 { 1119 struct r5l_io_unit *io; 1120 1121 io = sh->log_io; 1122 sh->log_io = NULL; 1123 1124 if (io && atomic_dec_and_test(&io->pending_stripe)) 1125 __r5l_stripe_write_finished(io); 1126 } 1127 1128 static void r5l_log_flush_endio(struct bio *bio) 1129 { 1130 struct r5l_log *log = container_of(bio, struct r5l_log, 1131 flush_bio); 1132 unsigned long flags; 1133 struct r5l_io_unit *io; 1134 1135 if (bio->bi_error) 1136 md_error(log->rdev->mddev, log->rdev); 1137 1138 spin_lock_irqsave(&log->io_list_lock, flags); 1139 list_for_each_entry(io, &log->flushing_ios, log_sibling) 1140 r5l_io_run_stripes(io); 1141 list_splice_tail_init(&log->flushing_ios, &log->finished_ios); 1142 spin_unlock_irqrestore(&log->io_list_lock, flags); 1143 } 1144 1145 /* 1146 * Starting dispatch IO to raid. 1147 * io_unit(meta) consists of a log. There is one situation we want to avoid. A 1148 * broken meta in the middle of a log causes recovery can't find meta at the 1149 * head of log. If operations require meta at the head persistent in log, we 1150 * must make sure meta before it persistent in log too. A case is: 1151 * 1152 * stripe data/parity is in log, we start write stripe to raid disks. stripe 1153 * data/parity must be persistent in log before we do the write to raid disks. 1154 * 1155 * The solution is we restrictly maintain io_unit list order. In this case, we 1156 * only write stripes of an io_unit to raid disks till the io_unit is the first 1157 * one whose data/parity is in log. 
1158 */ 1159 void r5l_flush_stripe_to_raid(struct r5l_log *log) 1160 { 1161 bool do_flush; 1162 1163 if (!log || !log->need_cache_flush) 1164 return; 1165 1166 spin_lock_irq(&log->io_list_lock); 1167 /* flush bio is running */ 1168 if (!list_empty(&log->flushing_ios)) { 1169 spin_unlock_irq(&log->io_list_lock); 1170 return; 1171 } 1172 list_splice_tail_init(&log->io_end_ios, &log->flushing_ios); 1173 do_flush = !list_empty(&log->flushing_ios); 1174 spin_unlock_irq(&log->io_list_lock); 1175 1176 if (!do_flush) 1177 return; 1178 bio_reset(&log->flush_bio); 1179 log->flush_bio.bi_bdev = log->rdev->bdev; 1180 log->flush_bio.bi_end_io = r5l_log_flush_endio; 1181 log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; 1182 submit_bio(&log->flush_bio); 1183 } 1184 1185 static void r5l_write_super(struct r5l_log *log, sector_t cp); 1186 static void r5l_write_super_and_discard_space(struct r5l_log *log, 1187 sector_t end) 1188 { 1189 struct block_device *bdev = log->rdev->bdev; 1190 struct mddev *mddev; 1191 1192 r5l_write_super(log, end); 1193 1194 if (!blk_queue_discard(bdev_get_queue(bdev))) 1195 return; 1196 1197 mddev = log->rdev->mddev; 1198 /* 1199 * Discard could zero data, so before discard we must make sure 1200 * superblock is updated to new log tail. Updating superblock (either 1201 * directly call md_update_sb() or depend on md thread) must hold 1202 * reconfig mutex. On the other hand, raid5_quiesce is called with 1203 * reconfig_mutex hold. The first step of raid5_quiesce() is waitting 1204 * for all IO finish, hence waitting for reclaim thread, while reclaim 1205 * thread is calling this function and waitting for reconfig mutex. So 1206 * there is a deadlock. We workaround this issue with a trylock. 
/*
 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
 *
 * The function asserts its preconditions with BUG_ON(): the stripe is linked
 * on an lru list, is still in caching phase (STRIPE_R5C_CACHING set), is not
 * already marked STRIPE_HANDLE and is not on the release list.
 *
 * must hold conf->device_lock
 */
static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	BUG_ON(list_empty(&sh->lru));
	BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));

	/*
	 * The stripe is not ON_RELEASE_LIST, so it is safe to call
	 * raid5_release_stripe() while holding conf->device_lock
	 */
	BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
	assert_spin_locked(&conf->device_lock);

	/* unlink from the cached list and take a reference on the stripe */
	list_del_init(&sh->lru);
	atomic_inc(&sh->count);

	/* mark for handling and account it as active before switching phase */
	set_bit(STRIPE_HANDLE, &sh->state);
	atomic_inc(&conf->active_stripes);
	/* leave caching phase: clears STRIPE_R5C_CACHING (see file header) */
	r5c_make_stripe_write_out(sh);

	/*
	 * NOTE(review): presumably drops the reference taken above and queues
	 * the stripe for handling -- confirm against raid5_release_stripe().
	 */
	raid5_release_stripe(sh);
}
1265 */ 1266 void r5c_flush_cache(struct r5conf *conf, int num) 1267 { 1268 int count; 1269 struct stripe_head *sh, *next; 1270 1271 assert_spin_locked(&conf->device_lock); 1272 if (!conf->log) 1273 return; 1274 1275 count = 0; 1276 list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) { 1277 r5c_flush_stripe(conf, sh); 1278 count++; 1279 } 1280 1281 if (count >= num) 1282 return; 1283 list_for_each_entry_safe(sh, next, 1284 &conf->r5c_partial_stripe_list, lru) { 1285 r5c_flush_stripe(conf, sh); 1286 if (++count >= num) 1287 break; 1288 } 1289 } 1290 1291 static void r5c_do_reclaim(struct r5conf *conf) 1292 { 1293 struct r5l_log *log = conf->log; 1294 struct stripe_head *sh; 1295 int count = 0; 1296 unsigned long flags; 1297 int total_cached; 1298 int stripes_to_flush; 1299 1300 if (!r5c_is_writeback(log)) 1301 return; 1302 1303 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 1304 atomic_read(&conf->r5c_cached_full_stripes); 1305 1306 if (total_cached > conf->min_nr_stripes * 3 / 4 || 1307 atomic_read(&conf->empty_inactive_list_nr) > 0) 1308 /* 1309 * if stripe cache pressure high, flush all full stripes and 1310 * some partial stripes 1311 */ 1312 stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; 1313 else if (total_cached > conf->min_nr_stripes * 1 / 2 || 1314 atomic_read(&conf->r5c_cached_full_stripes) > 1315 R5C_FULL_STRIPE_FLUSH_BATCH) 1316 /* 1317 * if stripe cache pressure moderate, or if there is many full 1318 * stripes,flush all full stripes 1319 */ 1320 stripes_to_flush = 0; 1321 else 1322 /* no need to flush */ 1323 stripes_to_flush = -1; 1324 1325 if (stripes_to_flush >= 0) { 1326 spin_lock_irqsave(&conf->device_lock, flags); 1327 r5c_flush_cache(conf, stripes_to_flush); 1328 spin_unlock_irqrestore(&conf->device_lock, flags); 1329 } 1330 1331 /* if log space is tight, flush stripes on stripe_in_journal_list */ 1332 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) { 1333 spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 
/*
 * Reclaim log space: wait until enough space is reclaimable, then move the
 * log tail (log->last_checkpoint) forward to the new checkpoint.
 *
 * The pending reclaim target is consumed atomically from
 * log->reclaim_target. The superblock is only rewritten, and the tail only
 * moved, when there is something to reclaim and at least one of these holds:
 * reclaimable space exceeds log->max_free_space, a non-zero target was
 * requested, or stripes are waiting on log->no_space_stripes.
 */
static void r5l_do_reclaim(struct r5l_log *log)
{
	struct r5conf *conf = log->rdev->mddev->private;
	/* fetch and clear the requested target in one step */
	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
	sector_t reclaimable;
	sector_t next_checkpoint;
	bool write_super;

	spin_lock_irq(&log->io_list_lock);
	write_super = r5l_reclaimable_space(log) > log->max_free_space ||
		reclaim_target != 0 || !list_empty(&log->no_space_stripes);
	/*
	 * move proper io_unit to reclaim list. We should not change the order.
	 * reclaimable/unreclaimable io_unit can be mixed in the list, we
	 * shouldn't reuse space of an unreclaimable io_unit
	 */
	while (1) {
		reclaimable = r5l_reclaimable_space(log);
		/* done once the target is met or no io_unit is outstanding */
		if (reclaimable >= reclaim_target ||
		    (list_empty(&log->running_ios) &&
		     list_empty(&log->io_end_ios) &&
		     list_empty(&log->flushing_ios) &&
		     list_empty(&log->finished_ios)))
			break;

		/* kick the md thread, then sleep until more space is reclaimable */
		md_wakeup_thread(log->rdev->mddev->thread);
		wait_event_lock_irq(log->iounit_wait,
				    r5l_reclaimable_space(log) > reclaimable,
				    log->io_list_lock);
	}

	next_checkpoint = r5c_calculate_new_cp(conf);
	spin_unlock_irq(&log->io_list_lock);

	/*
	 * NOTE(review): reclaimable is a sector_t, which is presumably an
	 * unsigned type, so this check can never fire -- confirm and consider
	 * removing it.
	 */
	BUG_ON(reclaimable < 0);

	if (reclaimable == 0 || !write_super)
		return;

	/*
	 * write_super will flush cache of each raid disk. We must write super
	 * here, because the log area might be reused soon and we don't want to
	 * confuse recovery
	 */
	r5l_write_super_and_discard_space(log, next_checkpoint);

	/* publish the new tail and refresh the log state under io_mutex */
	mutex_lock(&log->io_mutex);
	log->last_checkpoint = next_checkpoint;
	r5c_update_log_state(log);
	mutex_unlock(&log->io_mutex);

	/* space was freed: re-run stripes that were waiting for log space */
	r5l_run_no_space_stripes(log);
}
1445 struct mddev *mddev; 1446 if (!log || state == 2) 1447 return; 1448 if (state == 0) 1449 kthread_unpark(log->reclaim_thread->tsk); 1450 else if (state == 1) { 1451 /* make sure r5l_write_super_and_discard_space exits */ 1452 mddev = log->rdev->mddev; 1453 wake_up(&mddev->sb_wait); 1454 kthread_park(log->reclaim_thread->tsk); 1455 r5l_wake_reclaim(log, MaxSector); 1456 r5l_do_reclaim(log); 1457 } 1458 } 1459 1460 bool r5l_log_disk_error(struct r5conf *conf) 1461 { 1462 struct r5l_log *log; 1463 bool ret; 1464 /* don't allow write if journal disk is missing */ 1465 rcu_read_lock(); 1466 log = rcu_dereference(conf->log); 1467 1468 if (!log) 1469 ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 1470 else 1471 ret = test_bit(Faulty, &log->rdev->flags); 1472 rcu_read_unlock(); 1473 return ret; 1474 } 1475 1476 struct r5l_recovery_ctx { 1477 struct page *meta_page; /* current meta */ 1478 sector_t meta_total_blocks; /* total size of current meta and data */ 1479 sector_t pos; /* recovery position */ 1480 u64 seq; /* recovery position seq */ 1481 int data_parity_stripes; /* number of data_parity stripes */ 1482 int data_only_stripes; /* number of data_only stripes */ 1483 struct list_head cached_list; 1484 }; 1485 1486 static int r5l_recovery_read_meta_block(struct r5l_log *log, 1487 struct r5l_recovery_ctx *ctx) 1488 { 1489 struct page *page = ctx->meta_page; 1490 struct r5l_meta_block *mb; 1491 u32 crc, stored_crc; 1492 1493 if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0, 1494 false)) 1495 return -EIO; 1496 1497 mb = page_address(page); 1498 stored_crc = le32_to_cpu(mb->checksum); 1499 mb->checksum = 0; 1500 1501 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 1502 le64_to_cpu(mb->seq) != ctx->seq || 1503 mb->version != R5LOG_VERSION || 1504 le64_to_cpu(mb->position) != ctx->pos) 1505 return -EINVAL; 1506 1507 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 1508 if (stored_crc != crc) 1509 return -EINVAL; 1510 1511 if 
(le32_to_cpu(mb->meta_size) > PAGE_SIZE) 1512 return -EINVAL; 1513 1514 ctx->meta_total_blocks = BLOCK_SECTORS; 1515 1516 return 0; 1517 } 1518 1519 static void 1520 r5l_recovery_create_empty_meta_block(struct r5l_log *log, 1521 struct page *page, 1522 sector_t pos, u64 seq) 1523 { 1524 struct r5l_meta_block *mb; 1525 1526 mb = page_address(page); 1527 clear_page(mb); 1528 mb->magic = cpu_to_le32(R5LOG_MAGIC); 1529 mb->version = R5LOG_VERSION; 1530 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); 1531 mb->seq = cpu_to_le64(seq); 1532 mb->position = cpu_to_le64(pos); 1533 } 1534 1535 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, 1536 u64 seq) 1537 { 1538 struct page *page; 1539 struct r5l_meta_block *mb; 1540 1541 page = alloc_page(GFP_KERNEL); 1542 if (!page) 1543 return -ENOMEM; 1544 r5l_recovery_create_empty_meta_block(log, page, pos, seq); 1545 mb = page_address(page); 1546 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, 1547 mb, PAGE_SIZE)); 1548 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, 1549 REQ_FUA, false)) { 1550 __free_page(page); 1551 return -EIO; 1552 } 1553 __free_page(page); 1554 return 0; 1555 } 1556 1557 /* 1558 * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite 1559 * to mark valid (potentially not flushed) data in the journal. 1560 * 1561 * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb, 1562 * so there should not be any mismatch here. 
/*
 * Replay one stripe from the journal: every device page flagged
 * R5_Wantwrite is written synchronously to the member disk and, if one is
 * configured, to its replacement. The stripe is reset afterwards in all
 * cases.
 *
 * @conf: array configuration; conf->disks[] supplies rdev and replacement
 * @sh:   stripe to replay
 * @ctx:  recovery context; data_parity_stripes is incremented when the
 *        stripe is actually written out
 */
static void
r5l_recovery_replay_one_stripe(struct r5conf *conf,
			       struct stripe_head *sh,
			       struct r5l_recovery_ctx *ctx)
{
	struct md_rdev *rdev, *rrdev;
	int disk_index;
	int data_count = 0;

	/* count flagged data devices, skipping the P and Q parity slots */
	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
			continue;
		if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
			continue;
		data_count++;
	}

	/*
	 * stripes that only have parity must have been flushed
	 * before the crash that we are now recovering from, so
	 * there is nothing more to recovery.
	 */
	if (data_count == 0)
		goto out;

	/* write out every flagged page, data and parity alike */
	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
			continue;

		/* in case device is broken */
		rcu_read_lock();
		rdev = rcu_dereference(conf->disks[disk_index].rdev);
		if (rdev) {
			/*
			 * nr_pending is raised before leaving the RCU
			 * section -- presumably to keep rdev alive across
			 * the blocking write; confirm against md core.
			 */
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			sync_page_io(rdev, sh->sector, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
				     false);
			rdev_dec_pending(rdev, rdev->mddev);
			rcu_read_lock();
		}
		/* same sequence for the replacement device, if any */
		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
		if (rrdev) {
			atomic_inc(&rrdev->nr_pending);
			rcu_read_unlock();
			sync_page_io(rrdev, sh->sector, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
				     false);
			rdev_dec_pending(rrdev, rrdev->mddev);
			rcu_read_lock();
		}
		rcu_read_unlock();
	}
	ctx->data_parity_stripes++;
out:
	/* clear stripe state, log_start and all per-device flags */
	r5l_recovery_reset_stripe(sh);
}
r5l_recovery_reset_stripe(sh); 1695 sh->log_start = log_start; 1696 1697 return sh; 1698 } 1699 1700 static struct stripe_head * 1701 r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect) 1702 { 1703 struct stripe_head *sh; 1704 1705 list_for_each_entry(sh, list, lru) 1706 if (sh->sector == sect) 1707 return sh; 1708 return NULL; 1709 } 1710 1711 static void 1712 r5c_recovery_drop_stripes(struct list_head *cached_stripe_list, 1713 struct r5l_recovery_ctx *ctx) 1714 { 1715 struct stripe_head *sh, *next; 1716 1717 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) { 1718 r5l_recovery_reset_stripe(sh); 1719 list_del_init(&sh->lru); 1720 raid5_release_stripe(sh); 1721 } 1722 } 1723 1724 static void 1725 r5c_recovery_replay_stripes(struct list_head *cached_stripe_list, 1726 struct r5l_recovery_ctx *ctx) 1727 { 1728 struct stripe_head *sh, *next; 1729 1730 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) 1731 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 1732 r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx); 1733 list_del_init(&sh->lru); 1734 raid5_release_stripe(sh); 1735 } 1736 } 1737 1738 /* if matches return 0; otherwise return -EINVAL */ 1739 static int 1740 r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page, 1741 sector_t log_offset, __le32 log_checksum) 1742 { 1743 void *addr; 1744 u32 checksum; 1745 1746 sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1747 page, REQ_OP_READ, 0, false); 1748 addr = kmap_atomic(page); 1749 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); 1750 kunmap_atomic(addr); 1751 return (le32_to_cpu(log_checksum) == checksum) ? 
0 : -EINVAL; 1752 } 1753 1754 /* 1755 * before loading data to stripe cache, we need verify checksum for all data, 1756 * if there is mismatch for any data page, we drop all data in the mata block 1757 */ 1758 static int 1759 r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, 1760 struct r5l_recovery_ctx *ctx) 1761 { 1762 struct mddev *mddev = log->rdev->mddev; 1763 struct r5conf *conf = mddev->private; 1764 struct r5l_meta_block *mb = page_address(ctx->meta_page); 1765 sector_t mb_offset = sizeof(struct r5l_meta_block); 1766 sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 1767 struct page *page; 1768 struct r5l_payload_data_parity *payload; 1769 1770 page = alloc_page(GFP_KERNEL); 1771 if (!page) 1772 return -ENOMEM; 1773 1774 while (mb_offset < le32_to_cpu(mb->meta_size)) { 1775 payload = (void *)mb + mb_offset; 1776 1777 if (payload->header.type == R5LOG_PAYLOAD_DATA) { 1778 if (r5l_recovery_verify_data_checksum( 1779 log, page, log_offset, 1780 payload->checksum[0]) < 0) 1781 goto mismatch; 1782 } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) { 1783 if (r5l_recovery_verify_data_checksum( 1784 log, page, log_offset, 1785 payload->checksum[0]) < 0) 1786 goto mismatch; 1787 if (conf->max_degraded == 2 && /* q for RAID 6 */ 1788 r5l_recovery_verify_data_checksum( 1789 log, page, 1790 r5l_ring_add(log, log_offset, 1791 BLOCK_SECTORS), 1792 payload->checksum[1]) < 0) 1793 goto mismatch; 1794 } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */ 1795 goto mismatch; 1796 1797 log_offset = r5l_ring_add(log, log_offset, 1798 le32_to_cpu(payload->size)); 1799 1800 mb_offset += sizeof(struct r5l_payload_data_parity) + 1801 sizeof(__le32) * 1802 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); 1803 } 1804 1805 put_page(page); 1806 return 0; 1807 1808 mismatch: 1809 put_page(page); 1810 return -EINVAL; 1811 } 1812 1813 /* 1814 * Analyze all data/parity pages in one meta block 1815 * Returns: 1816 * 0 for success 1817 * -EINVAL for 
unknown playload type 1818 * -EAGAIN for checksum mismatch of data page 1819 * -ENOMEM for run out of memory (alloc_page failed or run out of stripes) 1820 */ 1821 static int 1822 r5c_recovery_analyze_meta_block(struct r5l_log *log, 1823 struct r5l_recovery_ctx *ctx, 1824 struct list_head *cached_stripe_list) 1825 { 1826 struct mddev *mddev = log->rdev->mddev; 1827 struct r5conf *conf = mddev->private; 1828 struct r5l_meta_block *mb; 1829 struct r5l_payload_data_parity *payload; 1830 int mb_offset; 1831 sector_t log_offset; 1832 sector_t stripe_sect; 1833 struct stripe_head *sh; 1834 int ret; 1835 1836 /* 1837 * for mismatch in data blocks, we will drop all data in this mb, but 1838 * we will still read next mb for other data with FLUSH flag, as 1839 * io_unit could finish out of order. 1840 */ 1841 ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx); 1842 if (ret == -EINVAL) 1843 return -EAGAIN; 1844 else if (ret) 1845 return ret; /* -ENOMEM duo to alloc_page() failed */ 1846 1847 mb = page_address(ctx->meta_page); 1848 mb_offset = sizeof(struct r5l_meta_block); 1849 log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 1850 1851 while (mb_offset < le32_to_cpu(mb->meta_size)) { 1852 int dd; 1853 1854 payload = (void *)mb + mb_offset; 1855 stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ? 
1856 raid5_compute_sector( 1857 conf, le64_to_cpu(payload->location), 0, &dd, 1858 NULL) 1859 : le64_to_cpu(payload->location); 1860 1861 sh = r5c_recovery_lookup_stripe(cached_stripe_list, 1862 stripe_sect); 1863 1864 if (!sh) { 1865 sh = r5c_recovery_alloc_stripe(conf, stripe_sect, ctx->pos); 1866 /* 1867 * cannot get stripe from raid5_get_active_stripe 1868 * try replay some stripes 1869 */ 1870 if (!sh) { 1871 r5c_recovery_replay_stripes( 1872 cached_stripe_list, ctx); 1873 sh = r5c_recovery_alloc_stripe( 1874 conf, stripe_sect, ctx->pos); 1875 } 1876 if (!sh) { 1877 pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n", 1878 mdname(mddev), 1879 conf->min_nr_stripes * 2); 1880 raid5_set_cache_size(mddev, 1881 conf->min_nr_stripes * 2); 1882 sh = r5c_recovery_alloc_stripe( 1883 conf, stripe_sect, ctx->pos); 1884 } 1885 if (!sh) { 1886 pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n", 1887 mdname(mddev)); 1888 return -ENOMEM; 1889 } 1890 list_add_tail(&sh->lru, cached_stripe_list); 1891 } 1892 1893 if (payload->header.type == R5LOG_PAYLOAD_DATA) { 1894 if (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 1895 test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) { 1896 r5l_recovery_replay_one_stripe(conf, sh, ctx); 1897 sh->log_start = ctx->pos; 1898 list_move_tail(&sh->lru, cached_stripe_list); 1899 } 1900 r5l_recovery_load_data(log, sh, ctx, payload, 1901 log_offset); 1902 } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) 1903 r5l_recovery_load_parity(log, sh, ctx, payload, 1904 log_offset); 1905 else 1906 return -EINVAL; 1907 1908 log_offset = r5l_ring_add(log, log_offset, 1909 le32_to_cpu(payload->size)); 1910 1911 mb_offset += sizeof(struct r5l_payload_data_parity) + 1912 sizeof(__le32) * 1913 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); 1914 } 1915 1916 return 0; 1917 } 1918 1919 /* 1920 * Load the stripe into cache. 
The stripe will be written out later by 1921 * the stripe cache state machine. 1922 */ 1923 static void r5c_recovery_load_one_stripe(struct r5l_log *log, 1924 struct stripe_head *sh) 1925 { 1926 struct r5dev *dev; 1927 int i; 1928 1929 for (i = sh->disks; i--; ) { 1930 dev = sh->dev + i; 1931 if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) { 1932 set_bit(R5_InJournal, &dev->flags); 1933 set_bit(R5_UPTODATE, &dev->flags); 1934 } 1935 } 1936 list_add_tail(&sh->r5c, &log->stripe_in_journal_list); 1937 atomic_inc(&log->stripe_in_journal_count); 1938 } 1939 1940 /* 1941 * Scan through the log for all to-be-flushed data 1942 * 1943 * For stripes with data and parity, namely Data-Parity stripe 1944 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes. 1945 * 1946 * For stripes with only data, namely Data-Only stripe 1947 * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine. 1948 * 1949 * For a stripe, if we see data after parity, we should discard all previous 1950 * data and parity for this stripe, as these data are already flushed to 1951 * the array. 1952 * 1953 * At the end of the scan, we return the new journal_tail, which points to 1954 * first data-only stripe on the journal device, or next invalid meta block. 
1955 */ 1956 static int r5c_recovery_flush_log(struct r5l_log *log, 1957 struct r5l_recovery_ctx *ctx) 1958 { 1959 struct stripe_head *sh; 1960 int ret = 0; 1961 1962 /* scan through the log */ 1963 while (1) { 1964 if (r5l_recovery_read_meta_block(log, ctx)) 1965 break; 1966 1967 ret = r5c_recovery_analyze_meta_block(log, ctx, 1968 &ctx->cached_list); 1969 /* 1970 * -EAGAIN means mismatch in data block, in this case, we still 1971 * try scan the next metablock 1972 */ 1973 if (ret && ret != -EAGAIN) 1974 break; /* ret == -EINVAL or -ENOMEM */ 1975 ctx->seq++; 1976 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks); 1977 } 1978 1979 if (ret == -ENOMEM) { 1980 r5c_recovery_drop_stripes(&ctx->cached_list, ctx); 1981 return ret; 1982 } 1983 1984 /* replay data-parity stripes */ 1985 r5c_recovery_replay_stripes(&ctx->cached_list, ctx); 1986 1987 /* load data-only stripes to stripe cache */ 1988 list_for_each_entry(sh, &ctx->cached_list, lru) { 1989 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 1990 r5c_recovery_load_one_stripe(log, sh); 1991 ctx->data_only_stripes++; 1992 } 1993 1994 return 0; 1995 } 1996 1997 /* 1998 * we did a recovery. Now ctx.pos points to an invalid meta block. New 1999 * log will start here. but we can't let superblock point to last valid 2000 * meta block. The log might looks like: 2001 * | meta 1| meta 2| meta 3| 2002 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If 2003 * superblock points to meta 1, we write a new valid meta 2n. if crash 2004 * happens again, new recovery will start from meta 1. Since meta 2n is 2005 * valid now, recovery will think meta 3 is valid, which is wrong. 2006 * The solution is we create a new meta in meta2 with its seq == meta 2007 * 1's seq + 10000 and let superblock points to meta2. 
 * The same recovery will not think meta 3 is a valid meta, because its seq
 * doesn't match.
 */

/*
 * Before recovery, the log looks like the following
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^
 *   |- log->last_checkpoint
 *   |- log->last_cp_seq
 *
 * Now we scan through the log until we see an invalid entry
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^                            ^
 *   |- log->last_checkpoint      |- ctx->pos
 *   |- log->last_cp_seq          |- ctx->seq
 *
 * From this point, we need to increase the seq number by 10000 to avoid
 * confusing the next recovery.
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^                              ^
 *   |- log->last_checkpoint        |- ctx->pos+1
 *   |- log->last_cp_seq            |- ctx->seq+10001
 *
 * However, it is not safe to start the state machine yet, because data-only
 * stripes are not yet secured in RAID. To save these data-only stripes, we
 * rewrite them from seq+10001.
 *
 *   -----------------------------------------------------------------
 *   |           valid log        | data only stripes | invalid log  |
 *   -----------------------------------------------------------------
 *   ^                                                ^
 *   |- log->last_checkpoint                          |- ctx->pos+n
 *   |- log->last_cp_seq                              |- ctx->seq+10000+n
 *
 * If failure happens again during this process, the recovery can safely start
 * again from log->last_checkpoint.
2053 * 2054 * Once data only stripes are rewritten to journal, we move log_tail 2055 * 2056 * ----------------------------------------------------------------- 2057 * | old log | data only stripes | invalid log | 2058 * ----------------------------------------------------------------- 2059 * ^ ^ 2060 * |- log->last_checkpoint |- ctx->pos+n 2061 * |- log->last_cp_seq |- ctx->seq+10000+n 2062 * 2063 * Then we can safely start the state machine. If failure happens from this 2064 * point on, the recovery will start from new log->last_checkpoint. 2065 */ 2066 static int 2067 r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, 2068 struct r5l_recovery_ctx *ctx) 2069 { 2070 struct stripe_head *sh, *next; 2071 struct mddev *mddev = log->rdev->mddev; 2072 struct page *page; 2073 2074 page = alloc_page(GFP_KERNEL); 2075 if (!page) { 2076 pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n", 2077 mdname(mddev)); 2078 return -ENOMEM; 2079 } 2080 2081 list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { 2082 struct r5l_meta_block *mb; 2083 int i; 2084 int offset; 2085 sector_t write_pos; 2086 2087 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 2088 r5l_recovery_create_empty_meta_block(log, page, 2089 ctx->pos, ctx->seq); 2090 mb = page_address(page); 2091 offset = le32_to_cpu(mb->meta_size); 2092 write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 2093 2094 for (i = sh->disks; i--; ) { 2095 struct r5dev *dev = &sh->dev[i]; 2096 struct r5l_payload_data_parity *payload; 2097 void *addr; 2098 2099 if (test_bit(R5_InJournal, &dev->flags)) { 2100 payload = (void *)mb + offset; 2101 payload->header.type = cpu_to_le16( 2102 R5LOG_PAYLOAD_DATA); 2103 payload->size = BLOCK_SECTORS; 2104 payload->location = cpu_to_le64( 2105 raid5_compute_blocknr(sh, i, 0)); 2106 addr = kmap_atomic(dev->page); 2107 payload->checksum[0] = cpu_to_le32( 2108 crc32c_le(log->uuid_checksum, addr, 2109 PAGE_SIZE)); 2110 kunmap_atomic(addr); 2111 
sync_page_io(log->rdev, write_pos, PAGE_SIZE, 2112 dev->page, REQ_OP_WRITE, 0, false); 2113 write_pos = r5l_ring_add(log, write_pos, 2114 BLOCK_SECTORS); 2115 offset += sizeof(__le32) + 2116 sizeof(struct r5l_payload_data_parity); 2117 2118 } 2119 } 2120 mb->meta_size = cpu_to_le32(offset); 2121 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, 2122 mb, PAGE_SIZE)); 2123 sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, 2124 REQ_OP_WRITE, REQ_FUA, false); 2125 sh->log_start = ctx->pos; 2126 ctx->pos = write_pos; 2127 ctx->seq += 1; 2128 2129 list_del_init(&sh->lru); 2130 raid5_release_stripe(sh); 2131 } 2132 __free_page(page); 2133 return 0; 2134 } 2135 2136 static int r5l_recovery_log(struct r5l_log *log) 2137 { 2138 struct mddev *mddev = log->rdev->mddev; 2139 struct r5l_recovery_ctx ctx; 2140 int ret; 2141 sector_t pos; 2142 struct stripe_head *sh; 2143 2144 ctx.pos = log->last_checkpoint; 2145 ctx.seq = log->last_cp_seq; 2146 ctx.meta_page = alloc_page(GFP_KERNEL); 2147 ctx.data_only_stripes = 0; 2148 ctx.data_parity_stripes = 0; 2149 INIT_LIST_HEAD(&ctx.cached_list); 2150 2151 if (!ctx.meta_page) 2152 return -ENOMEM; 2153 2154 ret = r5c_recovery_flush_log(log, &ctx); 2155 __free_page(ctx.meta_page); 2156 2157 if (ret) 2158 return ret; 2159 2160 pos = ctx.pos; 2161 ctx.seq += 10000; 2162 2163 if (ctx.data_only_stripes == 0) { 2164 log->next_checkpoint = ctx.pos; 2165 r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); 2166 ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); 2167 } else { 2168 sh = list_last_entry(&ctx.cached_list, struct stripe_head, lru); 2169 log->next_checkpoint = sh->log_start; 2170 } 2171 2172 if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) 2173 pr_debug("md/raid:%s: starting from clean shutdown\n", 2174 mdname(mddev)); 2175 else { 2176 pr_debug("md/raid:%s: recoverying %d data-only stripes and %d data-parity stripes\n", 2177 mdname(mddev), ctx.data_only_stripes, 2178 ctx.data_parity_stripes); 2179 2180 if 
(ctx.data_only_stripes > 0) 2181 if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { 2182 pr_err("md/raid:%s: failed to rewrite stripes to journal\n", 2183 mdname(mddev)); 2184 return -EIO; 2185 } 2186 } 2187 2188 log->log_start = ctx.pos; 2189 log->seq = ctx.seq; 2190 log->last_checkpoint = pos; 2191 r5l_write_super(log, pos); 2192 return 0; 2193 } 2194 2195 static void r5l_write_super(struct r5l_log *log, sector_t cp) 2196 { 2197 struct mddev *mddev = log->rdev->mddev; 2198 2199 log->rdev->journal_tail = cp; 2200 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2201 } 2202 2203 static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) 2204 { 2205 struct r5conf *conf = mddev->private; 2206 int ret; 2207 2208 if (!conf->log) 2209 return 0; 2210 2211 switch (conf->log->r5c_journal_mode) { 2212 case R5C_JOURNAL_MODE_WRITE_THROUGH: 2213 ret = snprintf( 2214 page, PAGE_SIZE, "[%s] %s\n", 2215 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], 2216 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); 2217 break; 2218 case R5C_JOURNAL_MODE_WRITE_BACK: 2219 ret = snprintf( 2220 page, PAGE_SIZE, "%s [%s]\n", 2221 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], 2222 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); 2223 break; 2224 default: 2225 ret = 0; 2226 } 2227 return ret; 2228 } 2229 2230 static ssize_t r5c_journal_mode_store(struct mddev *mddev, 2231 const char *page, size_t length) 2232 { 2233 struct r5conf *conf = mddev->private; 2234 struct r5l_log *log = conf->log; 2235 int val = -1, i; 2236 int len = length; 2237 2238 if (!log) 2239 return -ENODEV; 2240 2241 if (len && page[len - 1] == '\n') 2242 len -= 1; 2243 for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++) 2244 if (strlen(r5c_journal_mode_str[i]) == len && 2245 strncmp(page, r5c_journal_mode_str[i], len) == 0) { 2246 val = i; 2247 break; 2248 } 2249 if (val < R5C_JOURNAL_MODE_WRITE_THROUGH || 2250 val > R5C_JOURNAL_MODE_WRITE_BACK) 2251 return -EINVAL; 2252 2253 
mddev_suspend(mddev); 2254 conf->log->r5c_journal_mode = val; 2255 mddev_resume(mddev); 2256 2257 pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", 2258 mdname(mddev), val, r5c_journal_mode_str[val]); 2259 return length; 2260 } 2261 2262 struct md_sysfs_entry 2263 r5c_journal_mode = __ATTR(journal_mode, 0644, 2264 r5c_journal_mode_show, r5c_journal_mode_store); 2265 2266 /* 2267 * Try handle write operation in caching phase. This function should only 2268 * be called in write-back mode. 2269 * 2270 * If all outstanding writes can be handled in caching phase, returns 0 2271 * If writes requires write-out phase, call r5c_make_stripe_write_out() 2272 * and returns -EAGAIN 2273 */ 2274 int r5c_try_caching_write(struct r5conf *conf, 2275 struct stripe_head *sh, 2276 struct stripe_head_state *s, 2277 int disks) 2278 { 2279 struct r5l_log *log = conf->log; 2280 int i; 2281 struct r5dev *dev; 2282 int to_cache = 0; 2283 2284 BUG_ON(!r5c_is_writeback(log)); 2285 2286 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 2287 /* 2288 * There are two different scenarios here: 2289 * 1. The stripe has some data cached, and it is sent to 2290 * write-out phase for reclaim 2291 * 2. The stripe is clean, and this is the first write 2292 * 2293 * For 1, return -EAGAIN, so we continue with 2294 * handle_stripe_dirtying(). 2295 * 2296 * For 2, set STRIPE_R5C_CACHING and continue with caching 2297 * write. 
2298 */ 2299 2300 /* case 1: anything injournal or anything in written */ 2301 if (s->injournal > 0 || s->written > 0) 2302 return -EAGAIN; 2303 /* case 2 */ 2304 set_bit(STRIPE_R5C_CACHING, &sh->state); 2305 } 2306 2307 for (i = disks; i--; ) { 2308 dev = &sh->dev[i]; 2309 /* if non-overwrite, use writing-out phase */ 2310 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) && 2311 !test_bit(R5_InJournal, &dev->flags)) { 2312 r5c_make_stripe_write_out(sh); 2313 return -EAGAIN; 2314 } 2315 } 2316 2317 for (i = disks; i--; ) { 2318 dev = &sh->dev[i]; 2319 if (dev->towrite) { 2320 set_bit(R5_Wantwrite, &dev->flags); 2321 set_bit(R5_Wantdrain, &dev->flags); 2322 set_bit(R5_LOCKED, &dev->flags); 2323 to_cache++; 2324 } 2325 } 2326 2327 if (to_cache) { 2328 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2329 /* 2330 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data() 2331 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in 2332 * r5c_handle_data_cached() 2333 */ 2334 set_bit(STRIPE_LOG_TRAPPED, &sh->state); 2335 } 2336 2337 return 0; 2338 } 2339 2340 /* 2341 * free extra pages (orig_page) we allocated for prexor 2342 */ 2343 void r5c_release_extra_page(struct stripe_head *sh) 2344 { 2345 struct r5conf *conf = sh->raid_conf; 2346 int i; 2347 bool using_disk_info_extra_page; 2348 2349 using_disk_info_extra_page = 2350 sh->dev[0].orig_page == conf->disks[0].extra_page; 2351 2352 for (i = sh->disks; i--; ) 2353 if (sh->dev[i].page != sh->dev[i].orig_page) { 2354 struct page *p = sh->dev[i].orig_page; 2355 2356 sh->dev[i].orig_page = sh->dev[i].page; 2357 if (!using_disk_info_extra_page) 2358 put_page(p); 2359 } 2360 2361 if (using_disk_info_extra_page) { 2362 clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state); 2363 md_wakeup_thread(conf->mddev->thread); 2364 } 2365 } 2366 2367 void r5c_use_extra_page(struct stripe_head *sh) 2368 { 2369 struct r5conf *conf = sh->raid_conf; 2370 int i; 2371 struct r5dev *dev; 2372 2373 for (i = sh->disks; i--; ) { 2374 dev = 
&sh->dev[i]; 2375 if (dev->orig_page != dev->page) 2376 put_page(dev->orig_page); 2377 dev->orig_page = conf->disks[i].extra_page; 2378 } 2379 } 2380 2381 /* 2382 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the 2383 * stripe is committed to RAID disks. 2384 */ 2385 void r5c_finish_stripe_write_out(struct r5conf *conf, 2386 struct stripe_head *sh, 2387 struct stripe_head_state *s) 2388 { 2389 int i; 2390 int do_wakeup = 0; 2391 2392 if (!conf->log || 2393 !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) 2394 return; 2395 2396 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 2397 clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 2398 2399 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 2400 return; 2401 2402 for (i = sh->disks; i--; ) { 2403 clear_bit(R5_InJournal, &sh->dev[i].flags); 2404 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2405 do_wakeup = 1; 2406 } 2407 2408 /* 2409 * analyse_stripe() runs before r5c_finish_stripe_write_out(), 2410 * We updated R5_InJournal, so we also update s->injournal. 
2411 */ 2412 s->injournal = 0; 2413 2414 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2415 if (atomic_dec_and_test(&conf->pending_full_writes)) 2416 md_wakeup_thread(conf->mddev->thread); 2417 2418 if (do_wakeup) 2419 wake_up(&conf->wait_for_overlap); 2420 2421 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 2422 return; 2423 2424 spin_lock_irq(&conf->log->stripe_in_journal_lock); 2425 list_del_init(&sh->r5c); 2426 spin_unlock_irq(&conf->log->stripe_in_journal_lock); 2427 sh->log_start = MaxSector; 2428 atomic_dec(&conf->log->stripe_in_journal_count); 2429 r5c_update_log_state(conf->log); 2430 } 2431 2432 int 2433 r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, 2434 struct stripe_head_state *s) 2435 { 2436 struct r5conf *conf = sh->raid_conf; 2437 int pages = 0; 2438 int reserve; 2439 int i; 2440 int ret = 0; 2441 2442 BUG_ON(!log); 2443 2444 for (i = 0; i < sh->disks; i++) { 2445 void *addr; 2446 2447 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) 2448 continue; 2449 addr = kmap_atomic(sh->dev[i].page); 2450 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 2451 addr, PAGE_SIZE); 2452 kunmap_atomic(addr); 2453 pages++; 2454 } 2455 WARN_ON(pages == 0); 2456 2457 /* 2458 * The stripe must enter state machine again to call endio, so 2459 * don't delay. 
2460 */ 2461 clear_bit(STRIPE_DELAYED, &sh->state); 2462 atomic_inc(&sh->count); 2463 2464 mutex_lock(&log->io_mutex); 2465 /* meta + data */ 2466 reserve = (1 + pages) << (PAGE_SHIFT - 9); 2467 2468 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 2469 sh->log_start == MaxSector) 2470 r5l_add_no_space_stripe(log, sh); 2471 else if (!r5l_has_free_space(log, reserve)) { 2472 if (sh->log_start == log->last_checkpoint) 2473 BUG(); 2474 else 2475 r5l_add_no_space_stripe(log, sh); 2476 } else { 2477 ret = r5l_log_stripe(log, sh, pages, 0); 2478 if (ret) { 2479 spin_lock_irq(&log->io_list_lock); 2480 list_add_tail(&sh->log_list, &log->no_mem_stripes); 2481 spin_unlock_irq(&log->io_list_lock); 2482 } 2483 } 2484 2485 mutex_unlock(&log->io_mutex); 2486 return 0; 2487 } 2488 2489 static int r5l_load_log(struct r5l_log *log) 2490 { 2491 struct md_rdev *rdev = log->rdev; 2492 struct page *page; 2493 struct r5l_meta_block *mb; 2494 sector_t cp = log->rdev->journal_tail; 2495 u32 stored_crc, expected_crc; 2496 bool create_super = false; 2497 int ret = 0; 2498 2499 /* Make sure it's valid */ 2500 if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) 2501 cp = 0; 2502 page = alloc_page(GFP_KERNEL); 2503 if (!page) 2504 return -ENOMEM; 2505 2506 if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) { 2507 ret = -EIO; 2508 goto ioerr; 2509 } 2510 mb = page_address(page); 2511 2512 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 2513 mb->version != R5LOG_VERSION) { 2514 create_super = true; 2515 goto create; 2516 } 2517 stored_crc = le32_to_cpu(mb->checksum); 2518 mb->checksum = 0; 2519 expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 2520 if (stored_crc != expected_crc) { 2521 create_super = true; 2522 goto create; 2523 } 2524 if (le64_to_cpu(mb->position) != cp) { 2525 create_super = true; 2526 goto create; 2527 } 2528 create: 2529 if (create_super) { 2530 log->last_cp_seq = prandom_u32(); 2531 cp = 0; 2532 r5l_log_write_empty_meta_block(log, 
cp, log->last_cp_seq); 2533 /* 2534 * Make sure super points to correct address. Log might have 2535 * data very soon. If super hasn't correct log tail address, 2536 * recovery can't find the log 2537 */ 2538 r5l_write_super(log, cp); 2539 } else 2540 log->last_cp_seq = le64_to_cpu(mb->seq); 2541 2542 log->device_size = round_down(rdev->sectors, BLOCK_SECTORS); 2543 log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT; 2544 if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) 2545 log->max_free_space = RECLAIM_MAX_FREE_SPACE; 2546 log->last_checkpoint = cp; 2547 2548 __free_page(page); 2549 2550 if (create_super) { 2551 log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS); 2552 log->seq = log->last_cp_seq + 1; 2553 log->next_checkpoint = cp; 2554 } else 2555 ret = r5l_recovery_log(log); 2556 2557 r5c_update_log_state(log); 2558 return ret; 2559 ioerr: 2560 __free_page(page); 2561 return ret; 2562 } 2563 2564 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) 2565 { 2566 struct request_queue *q = bdev_get_queue(rdev->bdev); 2567 struct r5l_log *log; 2568 2569 if (PAGE_SIZE != 4096) 2570 return -EINVAL; 2571 2572 /* 2573 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and 2574 * raid_disks r5l_payload_data_parity. 
2575 * 2576 * Write journal and cache does not work for very big array 2577 * (raid_disks > 203) 2578 */ 2579 if (sizeof(struct r5l_meta_block) + 2580 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) * 2581 conf->raid_disks) > PAGE_SIZE) { 2582 pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n", 2583 mdname(conf->mddev), conf->raid_disks); 2584 return -EINVAL; 2585 } 2586 2587 log = kzalloc(sizeof(*log), GFP_KERNEL); 2588 if (!log) 2589 return -ENOMEM; 2590 log->rdev = rdev; 2591 2592 log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0; 2593 2594 log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, 2595 sizeof(rdev->mddev->uuid)); 2596 2597 mutex_init(&log->io_mutex); 2598 2599 spin_lock_init(&log->io_list_lock); 2600 INIT_LIST_HEAD(&log->running_ios); 2601 INIT_LIST_HEAD(&log->io_end_ios); 2602 INIT_LIST_HEAD(&log->flushing_ios); 2603 INIT_LIST_HEAD(&log->finished_ios); 2604 bio_init(&log->flush_bio, NULL, 0); 2605 2606 log->io_kc = KMEM_CACHE(r5l_io_unit, 0); 2607 if (!log->io_kc) 2608 goto io_kc; 2609 2610 log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc); 2611 if (!log->io_pool) 2612 goto io_pool; 2613 2614 log->bs = bioset_create(R5L_POOL_SIZE, 0); 2615 if (!log->bs) 2616 goto io_bs; 2617 2618 log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0); 2619 if (!log->meta_pool) 2620 goto out_mempool; 2621 2622 log->reclaim_thread = md_register_thread(r5l_reclaim_thread, 2623 log->rdev->mddev, "reclaim"); 2624 if (!log->reclaim_thread) 2625 goto reclaim_thread; 2626 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; 2627 2628 init_waitqueue_head(&log->iounit_wait); 2629 2630 INIT_LIST_HEAD(&log->no_mem_stripes); 2631 2632 INIT_LIST_HEAD(&log->no_space_stripes); 2633 spin_lock_init(&log->no_space_stripes_lock); 2634 2635 INIT_WORK(&log->deferred_io_work, r5l_submit_io_async); 2636 2637 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 2638 
INIT_LIST_HEAD(&log->stripe_in_journal_list); 2639 spin_lock_init(&log->stripe_in_journal_lock); 2640 atomic_set(&log->stripe_in_journal_count, 0); 2641 2642 if (r5l_load_log(log)) 2643 goto error; 2644 2645 rcu_assign_pointer(conf->log, log); 2646 set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 2647 return 0; 2648 2649 error: 2650 md_unregister_thread(&log->reclaim_thread); 2651 reclaim_thread: 2652 mempool_destroy(log->meta_pool); 2653 out_mempool: 2654 bioset_free(log->bs); 2655 io_bs: 2656 mempool_destroy(log->io_pool); 2657 io_pool: 2658 kmem_cache_destroy(log->io_kc); 2659 io_kc: 2660 kfree(log); 2661 return -EINVAL; 2662 } 2663 2664 void r5l_exit_log(struct r5l_log *log) 2665 { 2666 md_unregister_thread(&log->reclaim_thread); 2667 mempool_destroy(log->meta_pool); 2668 bioset_free(log->bs); 2669 mempool_destroy(log->io_pool); 2670 kmem_cache_destroy(log->io_kc); 2671 kfree(log); 2672 } 2673