/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include <linux/kthread.h>
#include "md.h"
#include "raid5.h"
#include "bitmap.h"

/*
 * metadata/data are stored on disk in 4k units (blocks), regardless of the
 * underlying hardware sector size. Only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)

/*
 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
 *
 * In write-through mode, the reclaim runs every log->max_free_space.
 * This keeps recovery scans from taking too long.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)

/* wake up reclaim thread periodically */
#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
/* start flush with these full stripes */
#define R5C_FULL_STRIPE_FLUSH_BATCH 256
/* reclaim stripes in groups */
#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)

/*
 * We only need 2 bios per I/O unit to make progress, but ensure we
 * have a few more available to not get too tight.
 */
#define R5L_POOL_SIZE 4

/*
 * r5c journal modes of the array: write-back or write-through.
 * write-through mode has identical behavior to the existing log-only
 * implementation.
 */
enum r5c_journal_mode {
	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
	R5C_JOURNAL_MODE_WRITE_BACK = 1,
};

static char *r5c_journal_mode_str[] = {"write-through",
				       "write-back"};
/*
 * raid5 cache state machine
 *
 * With the RAID cache, each stripe works in two phases:
 *	- caching phase
 *	- writing-out phase
 *
 * These two phases are controlled by bit STRIPE_R5C_CACHING:
 *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
 *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
 *
 * When there is no journal, or the journal is in write-through mode,
 * the stripe is always in writing-out phase.
 *
 * For write-back journal, the stripe is sent to caching phase on write
 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
 * the write-out phase by clearing STRIPE_R5C_CACHING.
 *
 * Stripes in caching phase do not write the raid disks. Instead, all
 * writes are committed from the log device. Therefore, a stripe in
 * caching phase handles writes as:
 *	- write to log device
 *	- return IO
 *
 * Stripes in writing-out phase handle writes as:
 *	- calculate parity
 *	- write pending data and parity to journal
 *	- write data and parity to raid disks
 *	- return IO for pending writes
 */

struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, round to
					 * BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim run if free space is at
					 * this size */

	sector_t last_checkpoint;	/* log tail. where recovery scan
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head. where new data appends */
	u64 seq;			/* log head sequence */

	sector_t next_checkpoint;
	u64 next_cp_seq;

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet written
					 * to the RAID */
	struct list_head flushing_ios;	/* io_units which are waiting for log
					 * cache flush */
	struct list_head finished_ios;	/* io_units which settle down in log disk */
	struct bio flush_bio;

	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */

	struct kmem_cache *io_kc;
	mempool_t *io_pool;
	struct bio_set *bs;
	mempool_t *meta_pool;

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* amount of space that needs to be
					 * reclaimed. if it's 0, reclaim space
					 * used by io_units which are in
					 * IO_UNIT_STRIPE_END state (i.e. reclaim
					 * doesn't wait for a specific io_unit
					 * to switch to IO_UNIT_STRIPE_END
					 * state) */
	wait_queue_head_t iounit_wait;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;

	bool need_cache_flush;

	/* for r5c_cache */
	enum r5c_journal_mode r5c_journal_mode;

	/* all stripes in r5cache, in the order of seq at sh->log_start */
	struct list_head stripe_in_journal_list;

	spinlock_t stripe_in_journal_lock;
	atomic_t stripe_in_journal_count;

	/* to submit async io_units, to fulfill ordering of flush */
	struct work_struct deferred_io_work;
};

/*
 * an IO range starts at a meta data block and ends at the next meta data
 * block. The io unit's meta data block tracks the data/parity that follows it. An
io 171 * unit is written to log disk with normal write, as we always flush log disk 172 * first and then start move data to raid disks, there is no requirement to 173 * write io unit with FLUSH/FUA 174 */ 175 struct r5l_io_unit { 176 struct r5l_log *log; 177 178 struct page *meta_page; /* store meta block */ 179 int meta_offset; /* current offset in meta_page */ 180 181 struct bio *current_bio;/* current_bio accepting new data */ 182 183 atomic_t pending_stripe;/* how many stripes not flushed to raid */ 184 u64 seq; /* seq number of the metablock */ 185 sector_t log_start; /* where the io_unit starts */ 186 sector_t log_end; /* where the io_unit ends */ 187 struct list_head log_sibling; /* log->running_ios */ 188 struct list_head stripe_list; /* stripes added to the io_unit */ 189 190 int state; 191 bool need_split_bio; 192 struct bio *split_bio; 193 194 unsigned int has_flush:1; /* include flush request */ 195 unsigned int has_fua:1; /* include fua request */ 196 unsigned int has_null_flush:1; /* include empty flush request */ 197 /* 198 * io isn't sent yet, flush/fua request can only be submitted till it's 199 * the first IO in running_ios list 200 */ 201 unsigned int io_deferred:1; 202 203 struct bio_list flush_barriers; /* size == 0 flush bios */ 204 }; 205 206 /* r5l_io_unit state */ 207 enum r5l_io_unit_state { 208 IO_UNIT_RUNNING = 0, /* accepting new IO */ 209 IO_UNIT_IO_START = 1, /* io_unit bio start writing to log, 210 * don't accepting new bio */ 211 IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */ 212 IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ 213 }; 214 215 bool r5c_is_writeback(struct r5l_log *log) 216 { 217 return (log != NULL && 218 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK); 219 } 220 221 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) 222 { 223 start += inc; 224 if (start >= log->device_size) 225 start = start - log->device_size; 226 return start; 227 } 228 229 static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start, 230 sector_t end) 231 { 232 if (end >= start) 233 return end - start; 234 else 235 return end + log->device_size - start; 236 } 237 238 static bool r5l_has_free_space(struct r5l_log *log, sector_t size) 239 { 240 sector_t used_size; 241 242 used_size = r5l_ring_distance(log, log->last_checkpoint, 243 log->log_start); 244 245 return log->device_size > used_size + size; 246 } 247 248 static void __r5l_set_io_unit_state(struct r5l_io_unit *io, 249 enum r5l_io_unit_state state) 250 { 251 if (WARN_ON(io->state >= state)) 252 return; 253 io->state = state; 254 } 255 256 static void 257 r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev, 258 struct bio_list *return_bi) 259 { 260 struct bio *wbi, *wbi2; 261 262 wbi = dev->written; 263 dev->written = NULL; 264 while (wbi && wbi->bi_iter.bi_sector < 265 dev->sector + STRIPE_SECTORS) { 266 wbi2 = r5_next_bio(wbi, dev->sector); 267 if (!raid5_dec_bi_active_stripes(wbi)) { 268 md_write_end(conf->mddev); 269 bio_list_add(return_bi, wbi); 270 } 271 wbi = wbi2; 272 } 273 } 274 275 void r5c_handle_cached_data_endio(struct r5conf *conf, 276 struct stripe_head *sh, int disks, struct bio_list *return_bi) 277 { 278 int i; 279 280 for (i = sh->disks; i--; ) { 281 if (sh->dev[i].written) { 282 set_bit(R5_UPTODATE, &sh->dev[i].flags); 283 r5c_return_dev_pending_writes(conf, &sh->dev[i], 284 return_bi); 285 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 286 STRIPE_SECTORS, 287 !test_bit(STRIPE_DEGRADED, &sh->state), 288 
0); 289 } 290 } 291 } 292 293 /* Check whether we should flush some stripes to free up stripe cache */ 294 void r5c_check_stripe_cache_usage(struct r5conf *conf) 295 { 296 int total_cached; 297 298 if (!r5c_is_writeback(conf->log)) 299 return; 300 301 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 302 atomic_read(&conf->r5c_cached_full_stripes); 303 304 /* 305 * The following condition is true for either of the following: 306 * - stripe cache pressure high: 307 * total_cached > 3/4 min_nr_stripes || 308 * empty_inactive_list_nr > 0 309 * - stripe cache pressure moderate: 310 * total_cached > 1/2 min_nr_stripes 311 */ 312 if (total_cached > conf->min_nr_stripes * 1 / 2 || 313 atomic_read(&conf->empty_inactive_list_nr) > 0) 314 r5l_wake_reclaim(conf->log, 0); 315 } 316 317 /* 318 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full 319 * stripes in the cache 320 */ 321 void r5c_check_cached_full_stripe(struct r5conf *conf) 322 { 323 if (!r5c_is_writeback(conf->log)) 324 return; 325 326 /* 327 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes 328 * or a full stripe (chunk size / 4k stripes). 329 */ 330 if (atomic_read(&conf->r5c_cached_full_stripes) >= 331 min(R5C_FULL_STRIPE_FLUSH_BATCH, 332 conf->chunk_sectors >> STRIPE_SHIFT)) 333 r5l_wake_reclaim(conf->log, 0); 334 } 335 336 /* 337 * Total log space (in sectors) needed to flush all data in cache 338 * 339 * Currently, writing-out phase automatically includes all pending writes 340 * to the same sector. So the reclaim of each stripe takes up to 341 * (conf->raid_disks + 1) pages of log space. 342 * 343 * To totally avoid deadlock due to log space, the code reserves 344 * (conf->raid_disks + 1) pages for each stripe in cache, which is not 345 * necessary in most cases. 346 * 347 * To improve this, we will need writing-out phase to be able to NOT include 348 * pending writes, which will reduce the requirement to 349 * (conf->max_degraded + 1) pages per stripe in cache. 350 */ 351 static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) 352 { 353 struct r5l_log *log = conf->log; 354 355 if (!r5c_is_writeback(log)) 356 return 0; 357 358 return BLOCK_SECTORS * (conf->raid_disks + 1) * 359 atomic_read(&log->stripe_in_journal_count); 360 } 361 362 /* 363 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL 364 * 365 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of 366 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log 367 * device is less than 2x of reclaim_required_space. 368 */ 369 static inline void r5c_update_log_state(struct r5l_log *log) 370 { 371 struct r5conf *conf = log->rdev->mddev->private; 372 sector_t free_space; 373 sector_t reclaim_space; 374 375 if (!r5c_is_writeback(log)) 376 return; 377 378 free_space = r5l_ring_distance(log, log->log_start, 379 log->last_checkpoint); 380 reclaim_space = r5c_log_required_to_flush_cache(conf); 381 if (free_space < 2 * reclaim_space) 382 set_bit(R5C_LOG_CRITICAL, &conf->cache_state); 383 else 384 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state); 385 if (free_space < 3 * reclaim_space) 386 set_bit(R5C_LOG_TIGHT, &conf->cache_state); 387 else 388 clear_bit(R5C_LOG_TIGHT, &conf->cache_state); 389 } 390 391 /* 392 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING. 393 * This function should only be called in write-back mode. 
394 */ 395 void r5c_make_stripe_write_out(struct stripe_head *sh) 396 { 397 struct r5conf *conf = sh->raid_conf; 398 struct r5l_log *log = conf->log; 399 400 BUG_ON(!r5c_is_writeback(log)); 401 402 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 403 clear_bit(STRIPE_R5C_CACHING, &sh->state); 404 405 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 406 atomic_inc(&conf->preread_active_stripes); 407 408 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { 409 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); 410 atomic_dec(&conf->r5c_cached_partial_stripes); 411 } 412 413 if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { 414 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); 415 atomic_dec(&conf->r5c_cached_full_stripes); 416 } 417 } 418 419 static void r5c_handle_data_cached(struct stripe_head *sh) 420 { 421 int i; 422 423 for (i = sh->disks; i--; ) 424 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 425 set_bit(R5_InJournal, &sh->dev[i].flags); 426 clear_bit(R5_LOCKED, &sh->dev[i].flags); 427 } 428 clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 429 } 430 431 /* 432 * this journal write must contain full parity, 433 * it may also contain some data pages 434 */ 435 static void r5c_handle_parity_cached(struct stripe_head *sh) 436 { 437 int i; 438 439 for (i = sh->disks; i--; ) 440 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 441 set_bit(R5_Wantwrite, &sh->dev[i].flags); 442 } 443 444 /* 445 * Setting proper flags after writing (or flushing) data and/or parity to the 446 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio(). 447 */ 448 static void r5c_finish_cache_stripe(struct stripe_head *sh) 449 { 450 struct r5l_log *log = sh->raid_conf->log; 451 452 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 453 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 454 /* 455 * Set R5_InJournal for parity dev[pd_idx]. This means 456 * all data AND parity in the journal. For RAID 6, it is 457 * NOT necessary to set the flag for dev[qd_idx], as the 458 * two parities are written out together. 
459 */ 460 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 461 } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) { 462 r5c_handle_data_cached(sh); 463 } else { 464 r5c_handle_parity_cached(sh); 465 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 466 } 467 } 468 469 static void r5l_io_run_stripes(struct r5l_io_unit *io) 470 { 471 struct stripe_head *sh, *next; 472 473 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { 474 list_del_init(&sh->log_list); 475 476 r5c_finish_cache_stripe(sh); 477 478 set_bit(STRIPE_HANDLE, &sh->state); 479 raid5_release_stripe(sh); 480 } 481 } 482 483 static void r5l_log_run_stripes(struct r5l_log *log) 484 { 485 struct r5l_io_unit *io, *next; 486 487 assert_spin_locked(&log->io_list_lock); 488 489 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 490 /* don't change list order */ 491 if (io->state < IO_UNIT_IO_END) 492 break; 493 494 list_move_tail(&io->log_sibling, &log->finished_ios); 495 r5l_io_run_stripes(io); 496 } 497 } 498 499 static void r5l_move_to_end_ios(struct r5l_log *log) 500 { 501 struct r5l_io_unit *io, *next; 502 503 assert_spin_locked(&log->io_list_lock); 504 505 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 506 /* don't change list order */ 507 if (io->state < IO_UNIT_IO_END) 508 break; 509 list_move_tail(&io->log_sibling, &log->io_end_ios); 510 } 511 } 512 513 static void __r5l_stripe_write_finished(struct r5l_io_unit *io); 514 static void r5l_log_endio(struct bio *bio) 515 { 516 struct r5l_io_unit *io = bio->bi_private; 517 struct r5l_io_unit *io_deferred; 518 struct r5l_log *log = io->log; 519 unsigned long flags; 520 521 if (bio->bi_error) 522 md_error(log->rdev->mddev, log->rdev); 523 524 bio_put(bio); 525 mempool_free(io->meta_page, log->meta_pool); 526 527 spin_lock_irqsave(&log->io_list_lock, flags); 528 __r5l_set_io_unit_state(io, IO_UNIT_IO_END); 529 if (log->need_cache_flush) 530 r5l_move_to_end_ios(log); 531 else 532 r5l_log_run_stripes(log); 533 if (!list_empty(&log->running_ios)) { 534 /* 535 * FLUSH/FUA io_unit is deferred because of ordering, now we 536 * can dispatch it 537 */ 538 io_deferred = list_first_entry(&log->running_ios, 539 struct r5l_io_unit, log_sibling); 540 if (io_deferred->io_deferred) 541 schedule_work(&log->deferred_io_work); 542 } 543 544 spin_unlock_irqrestore(&log->io_list_lock, flags); 545 546 if (log->need_cache_flush) 547 md_wakeup_thread(log->rdev->mddev->thread); 548 549 if (io->has_null_flush) { 550 struct bio *bi; 551 552 WARN_ON(bio_list_empty(&io->flush_barriers)); 553 while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { 554 bio_endio(bi); 555 atomic_dec(&io->pending_stripe); 556 } 557 if (atomic_read(&io->pending_stripe) == 0) 558 __r5l_stripe_write_finished(io); 559 } 560 } 561 562 static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) 563 { 564 unsigned long flags; 565 566 spin_lock_irqsave(&log->io_list_lock, flags); 567 __r5l_set_io_unit_state(io, IO_UNIT_IO_START); 568 spin_unlock_irqrestore(&log->io_list_lock, flags); 569 570 if (io->has_flush) 571 bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FLUSH); 572 if (io->has_fua) 573 bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FUA); 574 submit_bio(io->current_bio); 575 576 if (!io->split_bio) 577 return; 578 579 if (io->has_flush) 580 bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FLUSH); 581 if (io->has_fua) 582 bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FUA); 583 submit_bio(io->split_bio); 584 } 585 586 /* deferred io_unit 
will be dispatched here */ 587 static void r5l_submit_io_async(struct work_struct *work) 588 { 589 struct r5l_log *log = container_of(work, struct r5l_log, 590 deferred_io_work); 591 struct r5l_io_unit *io = NULL; 592 unsigned long flags; 593 594 spin_lock_irqsave(&log->io_list_lock, flags); 595 if (!list_empty(&log->running_ios)) { 596 io = list_first_entry(&log->running_ios, struct r5l_io_unit, 597 log_sibling); 598 if (!io->io_deferred) 599 io = NULL; 600 else 601 io->io_deferred = 0; 602 } 603 spin_unlock_irqrestore(&log->io_list_lock, flags); 604 if (io) 605 r5l_do_submit_io(log, io); 606 } 607 608 static void r5l_submit_current_io(struct r5l_log *log) 609 { 610 struct r5l_io_unit *io = log->current_io; 611 struct bio *bio; 612 struct r5l_meta_block *block; 613 unsigned long flags; 614 u32 crc; 615 bool do_submit = true; 616 617 if (!io) 618 return; 619 620 block = page_address(io->meta_page); 621 block->meta_size = cpu_to_le32(io->meta_offset); 622 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); 623 block->checksum = cpu_to_le32(crc); 624 bio = io->current_bio; 625 626 log->current_io = NULL; 627 spin_lock_irqsave(&log->io_list_lock, flags); 628 if (io->has_flush || io->has_fua) { 629 if (io != list_first_entry(&log->running_ios, 630 struct r5l_io_unit, log_sibling)) { 631 io->io_deferred = 1; 632 do_submit = false; 633 } 634 } 635 spin_unlock_irqrestore(&log->io_list_lock, flags); 636 if (do_submit) 637 r5l_do_submit_io(log, io); 638 } 639 640 static struct bio *r5l_bio_alloc(struct r5l_log *log) 641 { 642 struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); 643 644 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 645 bio->bi_bdev = log->rdev->bdev; 646 bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; 647 648 return bio; 649 } 650 651 static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io) 652 { 653 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); 654 655 r5c_update_log_state(log); 656 /* 657 * If we filled up the log device start from the beginning again, 658 * which will require a new bio. 659 * 660 * Note: for this to work properly the log size needs to me a multiple 661 * of BLOCK_SECTORS. 
662 */ 663 if (log->log_start == 0) 664 io->need_split_bio = true; 665 666 io->log_end = log->log_start; 667 } 668 669 static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) 670 { 671 struct r5l_io_unit *io; 672 struct r5l_meta_block *block; 673 674 io = mempool_alloc(log->io_pool, GFP_ATOMIC); 675 if (!io) 676 return NULL; 677 memset(io, 0, sizeof(*io)); 678 679 io->log = log; 680 INIT_LIST_HEAD(&io->log_sibling); 681 INIT_LIST_HEAD(&io->stripe_list); 682 bio_list_init(&io->flush_barriers); 683 io->state = IO_UNIT_RUNNING; 684 685 io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); 686 block = page_address(io->meta_page); 687 clear_page(block); 688 block->magic = cpu_to_le32(R5LOG_MAGIC); 689 block->version = R5LOG_VERSION; 690 block->seq = cpu_to_le64(log->seq); 691 block->position = cpu_to_le64(log->log_start); 692 693 io->log_start = log->log_start; 694 io->meta_offset = sizeof(struct r5l_meta_block); 695 io->seq = log->seq++; 696 697 io->current_bio = r5l_bio_alloc(log); 698 io->current_bio->bi_end_io = r5l_log_endio; 699 io->current_bio->bi_private = io; 700 bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); 701 702 r5_reserve_log_entry(log, io); 703 704 spin_lock_irq(&log->io_list_lock); 705 list_add_tail(&io->log_sibling, &log->running_ios); 706 spin_unlock_irq(&log->io_list_lock); 707 708 return io; 709 } 710 711 static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) 712 { 713 if (log->current_io && 714 log->current_io->meta_offset + payload_size > PAGE_SIZE) 715 r5l_submit_current_io(log); 716 717 if (!log->current_io) { 718 log->current_io = r5l_new_meta(log); 719 if (!log->current_io) 720 return -ENOMEM; 721 } 722 723 return 0; 724 } 725 726 static void r5l_append_payload_meta(struct r5l_log *log, u16 type, 727 sector_t location, 728 u32 checksum1, u32 checksum2, 729 bool checksum2_valid) 730 { 731 struct r5l_io_unit *io = log->current_io; 732 struct r5l_payload_data_parity *payload; 733 734 payload = page_address(io->meta_page) + io->meta_offset; 735 payload->header.type = cpu_to_le16(type); 736 payload->header.flags = cpu_to_le16(0); 737 payload->size = cpu_to_le32((1 + !!checksum2_valid) << 738 (PAGE_SHIFT - 9)); 739 payload->location = cpu_to_le64(location); 740 payload->checksum[0] = cpu_to_le32(checksum1); 741 if (checksum2_valid) 742 payload->checksum[1] = cpu_to_le32(checksum2); 743 744 io->meta_offset += sizeof(struct r5l_payload_data_parity) + 745 sizeof(__le32) * (1 + !!checksum2_valid); 746 } 747 748 static void r5l_append_payload_page(struct r5l_log *log, struct page *page) 749 { 750 struct r5l_io_unit *io = log->current_io; 751 752 if (io->need_split_bio) { 753 BUG_ON(io->split_bio); 754 io->split_bio = io->current_bio; 755 io->current_bio = r5l_bio_alloc(log); 756 bio_chain(io->current_bio, io->split_bio); 757 io->need_split_bio = false; 758 } 759 760 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) 761 BUG(); 762 763 r5_reserve_log_entry(log, io); 764 } 765 766 static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, 767 int data_pages, int parity_pages) 768 { 769 int i; 770 int meta_size; 771 int ret; 772 struct r5l_io_unit *io; 773 774 meta_size = 775 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) 776 * data_pages) + 777 sizeof(struct r5l_payload_data_parity) + 778 sizeof(__le32) * parity_pages; 779 780 ret = r5l_get_meta(log, meta_size); 781 if (ret) 782 return ret; 783 784 io = log->current_io; 785 786 if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state)) 787 io->has_flush = 1; 788 789 
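	/*
	 * Append a payload descriptor and the page itself for each data
	 * block that still has to reach the journal; parity for pd_idx
	 * (and qd_idx on RAID6) is appended after this loop.
	 */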
for (i = 0; i < sh->disks; i++) { 790 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 791 test_bit(R5_InJournal, &sh->dev[i].flags)) 792 continue; 793 if (i == sh->pd_idx || i == sh->qd_idx) 794 continue; 795 if (test_bit(R5_WantFUA, &sh->dev[i].flags) && 796 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) { 797 io->has_fua = 1; 798 /* 799 * we need to flush journal to make sure recovery can 800 * reach the data with fua flag 801 */ 802 io->has_flush = 1; 803 } 804 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, 805 raid5_compute_blocknr(sh, i, 0), 806 sh->dev[i].log_checksum, 0, false); 807 r5l_append_payload_page(log, sh->dev[i].page); 808 } 809 810 if (parity_pages == 2) { 811 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 812 sh->sector, sh->dev[sh->pd_idx].log_checksum, 813 sh->dev[sh->qd_idx].log_checksum, true); 814 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 815 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); 816 } else if (parity_pages == 1) { 817 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 818 sh->sector, sh->dev[sh->pd_idx].log_checksum, 819 0, false); 820 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 821 } else /* Just writing data, not parity, in caching phase */ 822 BUG_ON(parity_pages != 0); 823 824 list_add_tail(&sh->log_list, &io->stripe_list); 825 atomic_inc(&io->pending_stripe); 826 sh->log_io = io; 827 828 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 829 return 0; 830 831 if (sh->log_start == MaxSector) { 832 BUG_ON(!list_empty(&sh->r5c)); 833 sh->log_start = io->log_start; 834 spin_lock_irq(&log->stripe_in_journal_lock); 835 list_add_tail(&sh->r5c, 836 &log->stripe_in_journal_list); 837 spin_unlock_irq(&log->stripe_in_journal_lock); 838 atomic_inc(&log->stripe_in_journal_count); 839 } 840 return 0; 841 } 842 843 /* add stripe to no_space_stripes, and then wake up reclaim */ 844 static inline void r5l_add_no_space_stripe(struct r5l_log *log, 845 struct stripe_head *sh) 846 { 847 spin_lock(&log->no_space_stripes_lock); 848 list_add_tail(&sh->log_list, &log->no_space_stripes); 849 spin_unlock(&log->no_space_stripes_lock); 850 } 851 852 /* 853 * running in raid5d, where reclaim could wait for raid5d too (when it flushes 854 * data from log to raid disks), so we shouldn't wait for reclaim here 855 */ 856 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) 857 { 858 struct r5conf *conf = sh->raid_conf; 859 int write_disks = 0; 860 int data_pages, parity_pages; 861 int reserve; 862 int i; 863 int ret = 0; 864 bool wake_reclaim = false; 865 866 if (!log) 867 return -EAGAIN; 868 /* Don't support stripe batch */ 869 if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || 870 test_bit(STRIPE_SYNCING, &sh->state)) { 871 /* the stripe is written to log, we start writing it to raid */ 872 clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 873 return -EAGAIN; 874 } 875 876 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 877 878 for (i = 0; i < sh->disks; i++) { 879 void *addr; 880 881 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 882 test_bit(R5_InJournal, &sh->dev[i].flags)) 883 continue; 884 885 write_disks++; 886 /* checksum is already calculated in last run */ 887 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 888 continue; 889 addr = kmap_atomic(sh->dev[i].page); 890 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 891 addr, PAGE_SIZE); 892 kunmap_atomic(addr); 893 } 894 parity_pages = 1 + !!(sh->qd_idx >= 0); 895 data_pages = write_disks - parity_pages; 896 897 
set_bit(STRIPE_LOG_TRAPPED, &sh->state); 898 /* 899 * The stripe must enter state machine again to finish the write, so 900 * don't delay. 901 */ 902 clear_bit(STRIPE_DELAYED, &sh->state); 903 atomic_inc(&sh->count); 904 905 mutex_lock(&log->io_mutex); 906 /* meta + data */ 907 reserve = (1 + write_disks) << (PAGE_SHIFT - 9); 908 909 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 910 if (!r5l_has_free_space(log, reserve)) { 911 r5l_add_no_space_stripe(log, sh); 912 wake_reclaim = true; 913 } else { 914 ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 915 if (ret) { 916 spin_lock_irq(&log->io_list_lock); 917 list_add_tail(&sh->log_list, 918 &log->no_mem_stripes); 919 spin_unlock_irq(&log->io_list_lock); 920 } 921 } 922 } else { /* R5C_JOURNAL_MODE_WRITE_BACK */ 923 /* 924 * log space critical, do not process stripes that are 925 * not in cache yet (sh->log_start == MaxSector). 926 */ 927 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 928 sh->log_start == MaxSector) { 929 r5l_add_no_space_stripe(log, sh); 930 wake_reclaim = true; 931 reserve = 0; 932 } else if (!r5l_has_free_space(log, reserve)) { 933 if (sh->log_start == log->last_checkpoint) 934 BUG(); 935 else 936 r5l_add_no_space_stripe(log, sh); 937 } else { 938 ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 939 if (ret) { 940 spin_lock_irq(&log->io_list_lock); 941 list_add_tail(&sh->log_list, 942 &log->no_mem_stripes); 943 spin_unlock_irq(&log->io_list_lock); 944 } 945 } 946 } 947 948 mutex_unlock(&log->io_mutex); 949 if (wake_reclaim) 950 r5l_wake_reclaim(log, reserve); 951 return 0; 952 } 953 954 void r5l_write_stripe_run(struct r5l_log *log) 955 { 956 if (!log) 957 return; 958 mutex_lock(&log->io_mutex); 959 r5l_submit_current_io(log); 960 mutex_unlock(&log->io_mutex); 961 } 962 963 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) 964 { 965 if (!log) 966 return -ENODEV; 967 968 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 969 /* 970 * in write through (journal only) 971 * we flush log disk cache first, then write stripe data to 972 * raid disks. So if bio is finished, the log disk cache is 973 * flushed already. 
The recovery guarantees we can recovery 974 * the bio from log disk, so we don't need to flush again 975 */ 976 if (bio->bi_iter.bi_size == 0) { 977 bio_endio(bio); 978 return 0; 979 } 980 bio->bi_opf &= ~REQ_PREFLUSH; 981 } else { 982 /* write back (with cache) */ 983 if (bio->bi_iter.bi_size == 0) { 984 mutex_lock(&log->io_mutex); 985 r5l_get_meta(log, 0); 986 bio_list_add(&log->current_io->flush_barriers, bio); 987 log->current_io->has_flush = 1; 988 log->current_io->has_null_flush = 1; 989 atomic_inc(&log->current_io->pending_stripe); 990 r5l_submit_current_io(log); 991 mutex_unlock(&log->io_mutex); 992 return 0; 993 } 994 } 995 return -EAGAIN; 996 } 997 998 /* This will run after log space is reclaimed */ 999 static void r5l_run_no_space_stripes(struct r5l_log *log) 1000 { 1001 struct stripe_head *sh; 1002 1003 spin_lock(&log->no_space_stripes_lock); 1004 while (!list_empty(&log->no_space_stripes)) { 1005 sh = list_first_entry(&log->no_space_stripes, 1006 struct stripe_head, log_list); 1007 list_del_init(&sh->log_list); 1008 set_bit(STRIPE_HANDLE, &sh->state); 1009 raid5_release_stripe(sh); 1010 } 1011 spin_unlock(&log->no_space_stripes_lock); 1012 } 1013 1014 /* 1015 * calculate new last_checkpoint 1016 * for write through mode, returns log->next_checkpoint 1017 * for write back, returns log_start of first sh in stripe_in_journal_list 1018 */ 1019 static sector_t r5c_calculate_new_cp(struct r5conf *conf) 1020 { 1021 struct stripe_head *sh; 1022 struct r5l_log *log = conf->log; 1023 sector_t new_cp; 1024 unsigned long flags; 1025 1026 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 1027 return log->next_checkpoint; 1028 1029 spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 1030 if (list_empty(&conf->log->stripe_in_journal_list)) { 1031 /* all stripes flushed */ 1032 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1033 return log->next_checkpoint; 1034 } 1035 sh = list_first_entry(&conf->log->stripe_in_journal_list, 1036 struct stripe_head, r5c); 1037 new_cp = sh->log_start; 1038 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1039 return new_cp; 1040 } 1041 1042 static sector_t r5l_reclaimable_space(struct r5l_log *log) 1043 { 1044 struct r5conf *conf = log->rdev->mddev->private; 1045 1046 return r5l_ring_distance(log, log->last_checkpoint, 1047 r5c_calculate_new_cp(conf)); 1048 } 1049 1050 static void r5l_run_no_mem_stripe(struct r5l_log *log) 1051 { 1052 struct stripe_head *sh; 1053 1054 assert_spin_locked(&log->io_list_lock); 1055 1056 if (!list_empty(&log->no_mem_stripes)) { 1057 sh = list_first_entry(&log->no_mem_stripes, 1058 struct stripe_head, log_list); 1059 list_del_init(&sh->log_list); 1060 set_bit(STRIPE_HANDLE, &sh->state); 1061 raid5_release_stripe(sh); 1062 } 1063 } 1064 1065 static bool r5l_complete_finished_ios(struct r5l_log *log) 1066 { 1067 struct r5l_io_unit *io, *next; 1068 bool found = false; 1069 1070 assert_spin_locked(&log->io_list_lock); 1071 1072 list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { 1073 /* don't change list order */ 1074 if (io->state < IO_UNIT_STRIPE_END) 1075 break; 1076 1077 log->next_checkpoint = io->log_start; 1078 log->next_cp_seq = io->seq; 1079 1080 list_del(&io->log_sibling); 1081 mempool_free(io, log->io_pool); 1082 r5l_run_no_mem_stripe(log); 1083 1084 found = true; 1085 } 1086 1087 return found; 1088 } 1089 1090 static void __r5l_stripe_write_finished(struct r5l_io_unit *io) 1091 { 1092 struct r5l_log *log = io->log; 1093 struct r5conf *conf = 
log->rdev->mddev->private;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);

	if (!r5l_complete_finished_ios(log)) {
		spin_unlock_irqrestore(&log->io_list_lock, flags);
		return;
	}

	if (r5l_reclaimable_space(log) > log->max_free_space ||
	    test_bit(R5C_LOG_TIGHT, &conf->cache_state))
		r5l_wake_reclaim(log, 0);

	spin_unlock_irqrestore(&log->io_list_lock, flags);
	wake_up(&log->iounit_wait);
}

void r5l_stripe_write_finished(struct stripe_head *sh)
{
	struct r5l_io_unit *io;

	io = sh->log_io;
	sh->log_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripe))
		__r5l_stripe_write_finished(io);
}

static void r5l_log_flush_endio(struct bio *bio)
{
	struct r5l_log *log = container_of(bio, struct r5l_log,
		flush_bio);
	unsigned long flags;
	struct r5l_io_unit *io;

	if (bio->bi_error)
		md_error(log->rdev->mddev, log->rdev);

	spin_lock_irqsave(&log->io_list_lock, flags);
	list_for_each_entry(io, &log->flushing_ios, log_sibling)
		r5l_io_run_stripes(io);
	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

/*
 * Start dispatching IO to the raid.
 * The log consists of io_units, each led by a meta block. There is one
 * situation we want to avoid: a broken meta block in the middle of the log
 * prevents recovery from finding the meta blocks after it. So if an operation
 * requires a meta block near the head to be persistent in the log, we must
 * make sure the meta blocks before it are persistent in the log too. A case
 * is:
 *
 * stripe data/parity is in the log, and we start writing the stripe to the
 * raid disks. The stripe's data/parity must be persistent in the log before
 * we do the write to the raid disks.
 *
 * The solution is that we strictly maintain io_unit list order. In this case,
 * we only write stripes of an io_unit to the raid disks once that io_unit is
 * the first one whose data/parity is in the log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
	bool do_flush;

	if (!log || !log->need_cache_flush)
		return;

	spin_lock_irq(&log->io_list_lock);
	/* flush bio is running */
	if (!list_empty(&log->flushing_ios)) {
		spin_unlock_irq(&log->io_list_lock);
		return;
	}
	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
	do_flush = !list_empty(&log->flushing_ios);
	spin_unlock_irq(&log->io_list_lock);

	if (!do_flush)
		return;
	bio_reset(&log->flush_bio);
	log->flush_bio.bi_bdev = log->rdev->bdev;
	log->flush_bio.bi_end_io = r5l_log_flush_endio;
	bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
	submit_bio(&log->flush_bio);
}

static void r5l_write_super(struct r5l_log *log, sector_t cp);
static void r5l_write_super_and_discard_space(struct r5l_log *log,
	sector_t end)
{
	struct block_device *bdev = log->rdev->bdev;
	struct mddev *mddev;

	r5l_write_super(log, end);

	if (!blk_queue_discard(bdev_get_queue(bdev)))
		return;

	mddev = log->rdev->mddev;
	/*
	 * Discard could zero data, so before discarding we must make sure the
	 * superblock is updated to the new log tail. Updating the superblock
	 * (either by directly calling md_update_sb() or depending on the md
	 * thread) must hold the reconfig mutex. On the other hand,
	 * raid5_quiesce() is called with the reconfig_mutex held. The first
	 * step of raid5_quiesce() is waiting for all IO to finish, hence
	 * waiting for the reclaim thread, while the reclaim thread is calling
	 * this function and waiting for the reconfig mutex. So there is a
	 * deadlock. We work around this issue with a trylock.
	 * FIXME: we could miss a discard if we can't take the reconfig mutex
	 */
	set_mask_bits(&mddev->flags, 0,
		BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
	if (!mddev_trylock(mddev))
		return;
	md_update_sb(mddev, 1);
	mddev_unlock(mddev);

	/* discard IO error really doesn't matter, ignore it */
	if (log->last_checkpoint < end) {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				end - log->last_checkpoint, GFP_NOIO, 0);
	} else {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				log->device_size - log->last_checkpoint,
				GFP_NOIO, 0);
		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
				GFP_NOIO, 0);
	}
}

/*
 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
 *
 * must hold conf->device_lock
 */
static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	BUG_ON(list_empty(&sh->lru));
	BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));

	/*
	 * The stripe is not ON_RELEASE_LIST, so it is safe to call
	 * raid5_release_stripe() while holding conf->device_lock
	 */
	BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
	assert_spin_locked(&conf->device_lock);

	list_del_init(&sh->lru);
	atomic_inc(&sh->count);

	set_bit(STRIPE_HANDLE, &sh->state);
	atomic_inc(&conf->active_stripes);
	r5c_make_stripe_write_out(sh);

	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		atomic_inc(&conf->preread_active_stripes);
	raid5_release_stripe(sh);
}

/*
 * if num == 0, flush all full stripes
 * if num > 0, flush all full stripes. If fewer than num full stripes are
 * flushed, flush some partial stripes until a total of num stripes are
 * flushed or there are no more cached stripes.
1262 */ 1263 void r5c_flush_cache(struct r5conf *conf, int num) 1264 { 1265 int count; 1266 struct stripe_head *sh, *next; 1267 1268 assert_spin_locked(&conf->device_lock); 1269 if (!conf->log) 1270 return; 1271 1272 count = 0; 1273 list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) { 1274 r5c_flush_stripe(conf, sh); 1275 count++; 1276 } 1277 1278 if (count >= num) 1279 return; 1280 list_for_each_entry_safe(sh, next, 1281 &conf->r5c_partial_stripe_list, lru) { 1282 r5c_flush_stripe(conf, sh); 1283 if (++count >= num) 1284 break; 1285 } 1286 } 1287 1288 static void r5c_do_reclaim(struct r5conf *conf) 1289 { 1290 struct r5l_log *log = conf->log; 1291 struct stripe_head *sh; 1292 int count = 0; 1293 unsigned long flags; 1294 int total_cached; 1295 int stripes_to_flush; 1296 1297 if (!r5c_is_writeback(log)) 1298 return; 1299 1300 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 1301 atomic_read(&conf->r5c_cached_full_stripes); 1302 1303 if (total_cached > conf->min_nr_stripes * 3 / 4 || 1304 atomic_read(&conf->empty_inactive_list_nr) > 0) 1305 /* 1306 * if stripe cache pressure high, flush all full stripes and 1307 * some partial stripes 1308 */ 1309 stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; 1310 else if (total_cached > conf->min_nr_stripes * 1 / 2 || 1311 atomic_read(&conf->r5c_cached_full_stripes) > 1312 R5C_FULL_STRIPE_FLUSH_BATCH) 1313 /* 1314 * if stripe cache pressure moderate, or if there is many full 1315 * stripes,flush all full stripes 1316 */ 1317 stripes_to_flush = 0; 1318 else 1319 /* no need to flush */ 1320 stripes_to_flush = -1; 1321 1322 if (stripes_to_flush >= 0) { 1323 spin_lock_irqsave(&conf->device_lock, flags); 1324 r5c_flush_cache(conf, stripes_to_flush); 1325 spin_unlock_irqrestore(&conf->device_lock, flags); 1326 } 1327 1328 /* if log space is tight, flush stripes on stripe_in_journal_list */ 1329 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) { 1330 spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 1331 spin_lock(&conf->device_lock); 1332 list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) { 1333 /* 1334 * stripes on stripe_in_journal_list could be in any 1335 * state of the stripe_cache state machine. In this 1336 * case, we only want to flush stripe on 1337 * r5c_cached_full/partial_stripes. The following 1338 * condition makes sure the stripe is on one of the 1339 * two lists. 1340 */ 1341 if (!list_empty(&sh->lru) && 1342 !test_bit(STRIPE_HANDLE, &sh->state) && 1343 atomic_read(&sh->count) == 0) { 1344 r5c_flush_stripe(conf, sh); 1345 } 1346 if (count++ >= R5C_RECLAIM_STRIPE_GROUP) 1347 break; 1348 } 1349 spin_unlock(&conf->device_lock); 1350 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1351 } 1352 md_wakeup_thread(conf->mddev->thread); 1353 } 1354 1355 static void r5l_do_reclaim(struct r5l_log *log) 1356 { 1357 struct r5conf *conf = log->rdev->mddev->private; 1358 sector_t reclaim_target = xchg(&log->reclaim_target, 0); 1359 sector_t reclaimable; 1360 sector_t next_checkpoint; 1361 bool write_super; 1362 1363 spin_lock_irq(&log->io_list_lock); 1364 write_super = r5l_reclaimable_space(log) > log->max_free_space || 1365 reclaim_target != 0 || !list_empty(&log->no_space_stripes); 1366 /* 1367 * move proper io_unit to reclaim list. We should not change the order. 
1368 * reclaimable/unreclaimable io_unit can be mixed in the list, we 1369 * shouldn't reuse space of an unreclaimable io_unit 1370 */ 1371 while (1) { 1372 reclaimable = r5l_reclaimable_space(log); 1373 if (reclaimable >= reclaim_target || 1374 (list_empty(&log->running_ios) && 1375 list_empty(&log->io_end_ios) && 1376 list_empty(&log->flushing_ios) && 1377 list_empty(&log->finished_ios))) 1378 break; 1379 1380 md_wakeup_thread(log->rdev->mddev->thread); 1381 wait_event_lock_irq(log->iounit_wait, 1382 r5l_reclaimable_space(log) > reclaimable, 1383 log->io_list_lock); 1384 } 1385 1386 next_checkpoint = r5c_calculate_new_cp(conf); 1387 spin_unlock_irq(&log->io_list_lock); 1388 1389 BUG_ON(reclaimable < 0); 1390 1391 if (reclaimable == 0 || !write_super) 1392 return; 1393 1394 /* 1395 * write_super will flush cache of each raid disk. We must write super 1396 * here, because the log area might be reused soon and we don't want to 1397 * confuse recovery 1398 */ 1399 r5l_write_super_and_discard_space(log, next_checkpoint); 1400 1401 mutex_lock(&log->io_mutex); 1402 log->last_checkpoint = next_checkpoint; 1403 r5c_update_log_state(log); 1404 mutex_unlock(&log->io_mutex); 1405 1406 r5l_run_no_space_stripes(log); 1407 } 1408 1409 static void r5l_reclaim_thread(struct md_thread *thread) 1410 { 1411 struct mddev *mddev = thread->mddev; 1412 struct r5conf *conf = mddev->private; 1413 struct r5l_log *log = conf->log; 1414 1415 if (!log) 1416 return; 1417 r5c_do_reclaim(conf); 1418 r5l_do_reclaim(log); 1419 } 1420 1421 void r5l_wake_reclaim(struct r5l_log *log, sector_t space) 1422 { 1423 unsigned long target; 1424 unsigned long new = (unsigned long)space; /* overflow in theory */ 1425 1426 if (!log) 1427 return; 1428 do { 1429 target = log->reclaim_target; 1430 if (new < target) 1431 return; 1432 } while (cmpxchg(&log->reclaim_target, target, new) != target); 1433 md_wakeup_thread(log->reclaim_thread); 1434 } 1435 1436 void r5l_quiesce(struct r5l_log *log, int state) 1437 { 1438 struct mddev *mddev; 1439 if (!log || state == 2) 1440 return; 1441 if (state == 0) 1442 kthread_unpark(log->reclaim_thread->tsk); 1443 else if (state == 1) { 1444 /* make sure r5l_write_super_and_discard_space exits */ 1445 mddev = log->rdev->mddev; 1446 wake_up(&mddev->sb_wait); 1447 kthread_park(log->reclaim_thread->tsk); 1448 r5l_wake_reclaim(log, MaxSector); 1449 r5l_do_reclaim(log); 1450 } 1451 } 1452 1453 bool r5l_log_disk_error(struct r5conf *conf) 1454 { 1455 struct r5l_log *log; 1456 bool ret; 1457 /* don't allow write if journal disk is missing */ 1458 rcu_read_lock(); 1459 log = rcu_dereference(conf->log); 1460 1461 if (!log) 1462 ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 1463 else 1464 ret = test_bit(Faulty, &log->rdev->flags); 1465 rcu_read_unlock(); 1466 return ret; 1467 } 1468 1469 struct r5l_recovery_ctx { 1470 struct page *meta_page; /* current meta */ 1471 sector_t meta_total_blocks; /* total size of current meta and data */ 1472 sector_t pos; /* recovery position */ 1473 u64 seq; /* recovery position seq */ 1474 int data_parity_stripes; /* number of data_parity stripes */ 1475 int data_only_stripes; /* number of data_only stripes */ 1476 struct list_head cached_list; 1477 }; 1478 1479 static int r5l_recovery_read_meta_block(struct r5l_log *log, 1480 struct r5l_recovery_ctx *ctx) 1481 { 1482 struct page *page = ctx->meta_page; 1483 struct r5l_meta_block *mb; 1484 u32 crc, stored_crc; 1485 1486 if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0, 1487 false)) 1488 return -EIO; 
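	/*
	 * Sanity check the meta block we just read: magic, sequence number,
	 * version and on-disk position must match what the scan expects, and
	 * the stored checksum must agree with a crc32c computed over the
	 * whole block.
	 */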
1489 1490 mb = page_address(page); 1491 stored_crc = le32_to_cpu(mb->checksum); 1492 mb->checksum = 0; 1493 1494 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 1495 le64_to_cpu(mb->seq) != ctx->seq || 1496 mb->version != R5LOG_VERSION || 1497 le64_to_cpu(mb->position) != ctx->pos) 1498 return -EINVAL; 1499 1500 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 1501 if (stored_crc != crc) 1502 return -EINVAL; 1503 1504 if (le32_to_cpu(mb->meta_size) > PAGE_SIZE) 1505 return -EINVAL; 1506 1507 ctx->meta_total_blocks = BLOCK_SECTORS; 1508 1509 return 0; 1510 } 1511 1512 static void 1513 r5l_recovery_create_empty_meta_block(struct r5l_log *log, 1514 struct page *page, 1515 sector_t pos, u64 seq) 1516 { 1517 struct r5l_meta_block *mb; 1518 u32 crc; 1519 1520 mb = page_address(page); 1521 clear_page(mb); 1522 mb->magic = cpu_to_le32(R5LOG_MAGIC); 1523 mb->version = R5LOG_VERSION; 1524 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); 1525 mb->seq = cpu_to_le64(seq); 1526 mb->position = cpu_to_le64(pos); 1527 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 1528 mb->checksum = cpu_to_le32(crc); 1529 } 1530 1531 static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, 1532 u64 seq) 1533 { 1534 struct page *page; 1535 1536 page = alloc_page(GFP_KERNEL); 1537 if (!page) 1538 return -ENOMEM; 1539 r5l_recovery_create_empty_meta_block(log, page, pos, seq); 1540 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, 1541 WRITE_FUA, false)) { 1542 __free_page(page); 1543 return -EIO; 1544 } 1545 __free_page(page); 1546 return 0; 1547 } 1548 1549 /* 1550 * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite 1551 * to mark valid (potentially not flushed) data in the journal. 1552 * 1553 * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb, 1554 * so there should not be any mismatch here. 
1555 */ 1556 static void r5l_recovery_load_data(struct r5l_log *log, 1557 struct stripe_head *sh, 1558 struct r5l_recovery_ctx *ctx, 1559 struct r5l_payload_data_parity *payload, 1560 sector_t log_offset) 1561 { 1562 struct mddev *mddev = log->rdev->mddev; 1563 struct r5conf *conf = mddev->private; 1564 int dd_idx; 1565 1566 raid5_compute_sector(conf, 1567 le64_to_cpu(payload->location), 0, 1568 &dd_idx, sh); 1569 sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1570 sh->dev[dd_idx].page, REQ_OP_READ, 0, false); 1571 sh->dev[dd_idx].log_checksum = 1572 le32_to_cpu(payload->checksum[0]); 1573 ctx->meta_total_blocks += BLOCK_SECTORS; 1574 1575 set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags); 1576 set_bit(STRIPE_R5C_CACHING, &sh->state); 1577 } 1578 1579 static void r5l_recovery_load_parity(struct r5l_log *log, 1580 struct stripe_head *sh, 1581 struct r5l_recovery_ctx *ctx, 1582 struct r5l_payload_data_parity *payload, 1583 sector_t log_offset) 1584 { 1585 struct mddev *mddev = log->rdev->mddev; 1586 struct r5conf *conf = mddev->private; 1587 1588 ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; 1589 sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1590 sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false); 1591 sh->dev[sh->pd_idx].log_checksum = 1592 le32_to_cpu(payload->checksum[0]); 1593 set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags); 1594 1595 if (sh->qd_idx >= 0) { 1596 sync_page_io(log->rdev, 1597 r5l_ring_add(log, log_offset, BLOCK_SECTORS), 1598 PAGE_SIZE, sh->dev[sh->qd_idx].page, 1599 REQ_OP_READ, 0, false); 1600 sh->dev[sh->qd_idx].log_checksum = 1601 le32_to_cpu(payload->checksum[1]); 1602 set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags); 1603 } 1604 clear_bit(STRIPE_R5C_CACHING, &sh->state); 1605 } 1606 1607 static void r5l_recovery_reset_stripe(struct stripe_head *sh) 1608 { 1609 int i; 1610 1611 sh->state = 0; 1612 sh->log_start = MaxSector; 1613 for (i = sh->disks; i--; ) 1614 sh->dev[i].flags = 0; 1615 } 1616 1617 static void 1618 r5l_recovery_replay_one_stripe(struct r5conf *conf, 1619 struct stripe_head *sh, 1620 struct r5l_recovery_ctx *ctx) 1621 { 1622 struct md_rdev *rdev, *rrdev; 1623 int disk_index; 1624 int data_count = 0; 1625 1626 for (disk_index = 0; disk_index < sh->disks; disk_index++) { 1627 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 1628 continue; 1629 if (disk_index == sh->qd_idx || disk_index == sh->pd_idx) 1630 continue; 1631 data_count++; 1632 } 1633 1634 /* 1635 * stripes that only have parity must have been flushed 1636 * before the crash that we are now recovering from, so 1637 * there is nothing more to recovery. 
1638 */ 1639 if (data_count == 0) 1640 goto out; 1641 1642 for (disk_index = 0; disk_index < sh->disks; disk_index++) { 1643 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 1644 continue; 1645 1646 /* in case device is broken */ 1647 rcu_read_lock(); 1648 rdev = rcu_dereference(conf->disks[disk_index].rdev); 1649 if (rdev) { 1650 atomic_inc(&rdev->nr_pending); 1651 rcu_read_unlock(); 1652 sync_page_io(rdev, sh->sector, PAGE_SIZE, 1653 sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1654 false); 1655 rdev_dec_pending(rdev, rdev->mddev); 1656 rcu_read_lock(); 1657 } 1658 rrdev = rcu_dereference(conf->disks[disk_index].replacement); 1659 if (rrdev) { 1660 atomic_inc(&rrdev->nr_pending); 1661 rcu_read_unlock(); 1662 sync_page_io(rrdev, sh->sector, PAGE_SIZE, 1663 sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1664 false); 1665 rdev_dec_pending(rrdev, rrdev->mddev); 1666 rcu_read_lock(); 1667 } 1668 rcu_read_unlock(); 1669 } 1670 ctx->data_parity_stripes++; 1671 out: 1672 r5l_recovery_reset_stripe(sh); 1673 } 1674 1675 static struct stripe_head * 1676 r5c_recovery_alloc_stripe(struct r5conf *conf, 1677 struct list_head *recovery_list, 1678 sector_t stripe_sect, 1679 sector_t log_start) 1680 { 1681 struct stripe_head *sh; 1682 1683 sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0); 1684 if (!sh) 1685 return NULL; /* no more stripe available */ 1686 1687 r5l_recovery_reset_stripe(sh); 1688 sh->log_start = log_start; 1689 1690 return sh; 1691 } 1692 1693 static struct stripe_head * 1694 r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect) 1695 { 1696 struct stripe_head *sh; 1697 1698 list_for_each_entry(sh, list, lru) 1699 if (sh->sector == sect) 1700 return sh; 1701 return NULL; 1702 } 1703 1704 static void 1705 r5c_recovery_drop_stripes(struct list_head *cached_stripe_list, 1706 struct r5l_recovery_ctx *ctx) 1707 { 1708 struct stripe_head *sh, *next; 1709 1710 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) { 1711 r5l_recovery_reset_stripe(sh); 1712 list_del_init(&sh->lru); 1713 raid5_release_stripe(sh); 1714 } 1715 } 1716 1717 static void 1718 r5c_recovery_replay_stripes(struct list_head *cached_stripe_list, 1719 struct r5l_recovery_ctx *ctx) 1720 { 1721 struct stripe_head *sh, *next; 1722 1723 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) 1724 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 1725 r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx); 1726 list_del_init(&sh->lru); 1727 raid5_release_stripe(sh); 1728 } 1729 } 1730 1731 /* if matches return 0; otherwise return -EINVAL */ 1732 static int 1733 r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page, 1734 sector_t log_offset, __le32 log_checksum) 1735 { 1736 void *addr; 1737 u32 checksum; 1738 1739 sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1740 page, REQ_OP_READ, 0, false); 1741 addr = kmap_atomic(page); 1742 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); 1743 kunmap_atomic(addr); 1744 return (le32_to_cpu(log_checksum) == checksum) ? 
		0 : -EINVAL;
}

/*
 * Before loading data into the stripe cache, we need to verify the checksum
 * for all data; if there is a mismatch for any data page, we drop all data in
 * the meta block.
 */
static int
r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
					 struct r5l_recovery_ctx *ctx)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_meta_block *mb = page_address(ctx->meta_page);
	sector_t mb_offset = sizeof(struct r5l_meta_block);
	sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
	struct page *page;
	struct r5l_payload_data_parity *payload;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	while (mb_offset < le32_to_cpu(mb->meta_size)) {
		payload = (void *)mb + mb_offset;

		if (payload->header.type == R5LOG_PAYLOAD_DATA) {
			if (r5l_recovery_verify_data_checksum(
				    log, page, log_offset,
				    payload->checksum[0]) < 0)
				goto mismatch;
		} else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
			if (r5l_recovery_verify_data_checksum(
				    log, page, log_offset,
				    payload->checksum[0]) < 0)
				goto mismatch;
			if (conf->max_degraded == 2 && /* q for RAID 6 */
			    r5l_recovery_verify_data_checksum(
				    log, page,
				    r5l_ring_add(log, log_offset,
						 BLOCK_SECTORS),
				    payload->checksum[1]) < 0)
				goto mismatch;
		} else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */
			goto mismatch;

		log_offset = r5l_ring_add(log, log_offset,
					  le32_to_cpu(payload->size));

		mb_offset += sizeof(struct r5l_payload_data_parity) +
			sizeof(__le32) *
			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
	}

	put_page(page);
	return 0;

mismatch:
	put_page(page);
	return -EINVAL;
}

/*
 * Analyze all data/parity pages in one meta block
 * Returns:
 * 0 for success
 * -EINVAL for unknown payload type
 * -EAGAIN for checksum mismatch of data page
 * -ENOMEM for running out of memory (alloc_page failed or ran out of stripes)
 */
static int
r5c_recovery_analyze_meta_block(struct r5l_log *log,
				struct r5l_recovery_ctx *ctx,
				struct list_head *cached_stripe_list)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_meta_block *mb;
	struct r5l_payload_data_parity *payload;
	int mb_offset;
	sector_t log_offset;
	sector_t stripe_sect;
	struct stripe_head *sh;
	int ret;

	/*
	 * For a mismatch in data blocks, we will drop all data in this mb, but
	 * we will still read the next mb for other data with the FLUSH flag,
	 * as io_units could finish out of order.
	 */
	ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
	if (ret == -EINVAL)
		return -EAGAIN;
	else if (ret)
		return ret; /* -ENOMEM due to alloc_page() failure */

	mb = page_address(ctx->meta_page);
	mb_offset = sizeof(struct r5l_meta_block);
	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

	while (mb_offset < le32_to_cpu(mb->meta_size)) {
		int dd;

		payload = (void *)mb + mb_offset;
		stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
1849 raid5_compute_sector(
1850 conf, le64_to_cpu(payload->location), 0, &dd,
1851 NULL)
1852 : le64_to_cpu(payload->location);
1853
1854 sh = r5c_recovery_lookup_stripe(cached_stripe_list,
1855 stripe_sect);
1856
1857 if (!sh) {
1858 sh = r5c_recovery_alloc_stripe(conf, cached_stripe_list,
1859 stripe_sect, ctx->pos);
1860 /*
1861 * cannot get a stripe from raid5_get_active_stripe();
1862 * try to replay some stripes
1863 */
1864 if (!sh) {
1865 r5c_recovery_replay_stripes(
1866 cached_stripe_list, ctx);
1867 sh = r5c_recovery_alloc_stripe(
1868 conf, cached_stripe_list,
1869 stripe_sect, ctx->pos);
1870 }
1871 if (!sh) {
1872 pr_debug("md/raid:%s: Increasing stripe cache size to %d to recover data from journal.\n",
1873 mdname(mddev),
1874 conf->min_nr_stripes * 2);
1875 raid5_set_cache_size(mddev,
1876 conf->min_nr_stripes * 2);
1877 sh = r5c_recovery_alloc_stripe(
1878 conf, cached_stripe_list, stripe_sect,
1879 ctx->pos);
1880 }
1881 if (!sh) {
1882 pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
1883 mdname(mddev));
1884 return -ENOMEM;
1885 }
1886 list_add_tail(&sh->lru, cached_stripe_list);
1887 }
1888
1889 if (payload->header.type == R5LOG_PAYLOAD_DATA) {
1890 if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
1891 test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
1892 r5l_recovery_replay_one_stripe(conf, sh, ctx);
1893 sh->log_start = ctx->pos;
1894 list_move_tail(&sh->lru, cached_stripe_list);
1895 }
1896 r5l_recovery_load_data(log, sh, ctx, payload,
1897 log_offset);
1898 } else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
1899 r5l_recovery_load_parity(log, sh, ctx, payload,
1900 log_offset);
1901 else
1902 return -EINVAL;
1903
1904 log_offset = r5l_ring_add(log, log_offset,
1905 le32_to_cpu(payload->size));
1906
1907 mb_offset += sizeof(struct r5l_payload_data_parity) +
1908 sizeof(__le32) *
1909 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1910 }
1911
1912 return 0;
1913 }
1914
1915 /*
1916 * Load the stripe into cache. The stripe will be written out later by
1917 * the stripe cache state machine.
1918 */
1919 static void r5c_recovery_load_one_stripe(struct r5l_log *log,
1920 struct stripe_head *sh)
1921 {
1922 struct r5conf *conf = sh->raid_conf;
1923 struct r5dev *dev;
1924 int i;
1925
1926 for (i = sh->disks; i--; ) {
1927 dev = sh->dev + i;
1928 if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
1929 set_bit(R5_InJournal, &dev->flags);
1930 set_bit(R5_UPTODATE, &dev->flags);
1931 }
1932 }
1933 set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
1934 atomic_inc(&conf->r5c_cached_partial_stripes);
1935 list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
1936 }
1937
1938 /*
1939 * Scan through the log for all to-be-flushed data
1940 *
1941 * For stripes with data and parity, namely Data-Parity stripes
1942 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
1943 *
1944 * For stripes with only data, namely Data-Only stripes
1945 * (STRIPE_R5C_CACHING == 1), we load them into the stripe cache state machine.
1946 *
1947 * For a stripe, if we see data after parity, we should discard all previous
1948 * data and parity for this stripe, as that data has already been flushed to
1949 * the array.
1950 *
1951 * At the end of the scan, we return the new journal_tail, which points to the
1952 * first data-only stripe on the journal device, or the next invalid meta block.
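*
* The scan stops at the first meta block for which
* r5l_recovery_read_meta_block() fails. A data page checksum mismatch
* (-EAGAIN from r5c_recovery_analyze_meta_block()) only drops the data
* described by that meta block; the scan continues with the next one.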
1953 */
1954 static int r5c_recovery_flush_log(struct r5l_log *log,
1955 struct r5l_recovery_ctx *ctx)
1956 {
1957 struct stripe_head *sh, *next;
1958 int ret = 0;
1959
1960 /* scan through the log */
1961 while (1) {
1962 if (r5l_recovery_read_meta_block(log, ctx))
1963 break;
1964
1965 ret = r5c_recovery_analyze_meta_block(log, ctx,
1966 &ctx->cached_list);
1967 /*
1968 * -EAGAIN means a mismatch in a data block; in this case, we
1969 * still try to scan the next meta block
1970 */
1971 if (ret && ret != -EAGAIN)
1972 break; /* ret == -EINVAL or -ENOMEM */
1973 ctx->seq++;
1974 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
1975 }
1976
1977 if (ret == -ENOMEM) {
1978 r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
1979 return ret;
1980 }
1981
1982 /* replay data-parity stripes */
1983 r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
1984
1985 /* load data-only stripes to stripe cache */
1986 list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
1987 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1988 r5c_recovery_load_one_stripe(log, sh);
1989 list_del_init(&sh->lru);
1990 raid5_release_stripe(sh);
1991 ctx->data_only_stripes++;
1992 }
1993
1994 return 0;
1995 }
1996
1997 /*
1998 * We did a recovery. Now ctx.pos points to an invalid meta block. The new
1999 * log will start here, but we can't let the superblock point to the last valid
2000 * meta block. The log might look like:
2001 * | meta 1| meta 2| meta 3|
2002 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the
2003 * superblock points to meta 1, we write a new valid meta 2n. If a crash
2004 * happens again, the new recovery will start from meta 1. Since meta 2n is
2005 * valid now, recovery will think meta 3 is valid, which is wrong.
2006 * The solution is to create a new meta block at meta 2's position with its
2007 * seq == meta 1's seq + 10 and let the superblock point to it. That recovery
2008 * will not treat meta 3 as a valid meta block, because its seq doesn't match.
2009 */
2010
2011 /*
2012 * Before recovery, the log looks like the following
2013 *
2014 * ---------------------------------------------
2015 * | valid log | invalid log |
2016 * ---------------------------------------------
2017 * ^
2018 * |- log->last_checkpoint
2019 * |- log->last_cp_seq
2020 *
2021 * Now we scan through the log until we see an invalid entry
2022 *
2023 * ---------------------------------------------
2024 * | valid log | invalid log |
2025 * ---------------------------------------------
2026 * ^ ^
2027 * |- log->last_checkpoint |- ctx->pos
2028 * |- log->last_cp_seq |- ctx->seq
2029 *
2030 * From this point, we need to increase the seq number by 10 to avoid
2031 * confusing the next recovery.
2032 *
2033 * ---------------------------------------------
2034 * | valid log | invalid log |
2035 * ---------------------------------------------
2036 * ^ ^
2037 * |- log->last_checkpoint |- ctx->pos+1
2038 * |- log->last_cp_seq |- ctx->seq+11
2039 *
2040 * However, it is not safe to start the state machine yet, because data only
2041 * stripes are not yet secured in RAID. To secure these data only stripes, we
2042 * rewrite them from seq+11.
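* (r5c_recovery_rewrite_data_only_stripes() below does exactly this: for each
* data only stripe it lays down a meta block followed by the stripe's
* in-journal data pages, writing the meta block last with FUA.)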
2043 *
2044 * -----------------------------------------------------------------
2045 * | valid log | data only stripes | invalid log |
2046 * -----------------------------------------------------------------
2047 * ^ ^
2048 * |- log->last_checkpoint |- ctx->pos+n
2049 * |- log->last_cp_seq |- ctx->seq+10+n
2050 *
2051 * If failure happens again during this process, the recovery can safely start
2052 * again from log->last_checkpoint.
2053 *
2054 * Once the data only stripes are rewritten to the journal, we move the log tail
2055 *
2056 * -----------------------------------------------------------------
2057 * | old log | data only stripes | invalid log |
2058 * -----------------------------------------------------------------
2059 * ^ ^
2060 * |- log->last_checkpoint |- ctx->pos+n
2061 * |- log->last_cp_seq |- ctx->seq+10+n
2062 *
2063 * Then we can safely start the state machine. If failure happens from this
2064 * point on, the recovery will start from the new log->last_checkpoint.
2065 */
2066 static int
2067 r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
2068 struct r5l_recovery_ctx *ctx)
2069 {
2070 struct stripe_head *sh;
2071 struct mddev *mddev = log->rdev->mddev;
2072 struct page *page;
2073
2074 page = alloc_page(GFP_KERNEL);
2075 if (!page) {
2076 pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
2077 mdname(mddev));
2078 return -ENOMEM;
2079 }
2080
2081 ctx->seq += 10;
2082 list_for_each_entry(sh, &ctx->cached_list, lru) {
2083 struct r5l_meta_block *mb;
2084 int i;
2085 int offset;
2086 sector_t write_pos;
2087
2088 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2089 r5l_recovery_create_empty_meta_block(log, page,
2090 ctx->pos, ctx->seq);
2091 mb = page_address(page);
2092 offset = le32_to_cpu(mb->meta_size);
2093 write_pos = ctx->pos + BLOCK_SECTORS;
2094
2095 for (i = sh->disks; i--; ) {
2096 struct r5dev *dev = &sh->dev[i];
2097 struct r5l_payload_data_parity *payload;
2098 void *addr;
2099
2100 if (test_bit(R5_InJournal, &dev->flags)) {
2101 payload = (void *)mb + offset;
2102 payload->header.type = cpu_to_le16(
2103 R5LOG_PAYLOAD_DATA);
2104 payload->size = cpu_to_le32(BLOCK_SECTORS);
2105 payload->location = cpu_to_le64(
2106 raid5_compute_blocknr(sh, i, 0));
2107 addr = kmap_atomic(dev->page);
2108 payload->checksum[0] = cpu_to_le32(
2109 crc32c_le(log->uuid_checksum, addr,
2110 PAGE_SIZE));
2111 kunmap_atomic(addr);
2112 sync_page_io(log->rdev, write_pos, PAGE_SIZE,
2113 dev->page, REQ_OP_WRITE, 0, false);
2114 write_pos = r5l_ring_add(log, write_pos,
2115 BLOCK_SECTORS);
2116 offset += sizeof(__le32) +
2117 sizeof(struct r5l_payload_data_parity);
2118
2119 }
2120 }
2121 mb->meta_size = cpu_to_le32(offset);
2122 mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, mb, PAGE_SIZE));
2123 sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
2124 REQ_OP_WRITE, WRITE_FUA, false);
2125 sh->log_start = ctx->pos;
2126 ctx->pos = write_pos;
2127 ctx->seq += 1;
2128 }
2129 __free_page(page);
2130 return 0;
2131 }
2132
2133 static int r5l_recovery_log(struct r5l_log *log)
2134 {
2135 struct mddev *mddev = log->rdev->mddev;
2136 struct r5l_recovery_ctx ctx;
2137 int ret;
2138
2139 ctx.pos = log->last_checkpoint;
2140 ctx.seq = log->last_cp_seq;
2141 ctx.meta_page = alloc_page(GFP_KERNEL);
2142 ctx.data_only_stripes = 0;
2143 ctx.data_parity_stripes = 0;
2144 INIT_LIST_HEAD(&ctx.cached_list);
2145
2146 if (!ctx.meta_page)
2147 return -ENOMEM;
2148
2149 ret = r5c_recovery_flush_log(log, &ctx);
2150 __free_page(ctx.meta_page);
2151
2152 if (ret)
2153 return ret;
2154
2155 if
((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
2156 pr_debug("md/raid:%s: starting from clean shutdown\n",
2157 mdname(mddev));
2158 else {
2159 pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
2160 mdname(mddev), ctx.data_only_stripes,
2161 ctx.data_parity_stripes);
2162
2163 if (ctx.data_only_stripes > 0)
2164 if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
2165 pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
2166 mdname(mddev));
2167 return -EIO;
2168 }
2169 }
2170
2171 log->log_start = ctx.pos;
2172 log->next_checkpoint = ctx.pos;
2173 log->seq = ctx.seq;
2174 r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq);
2175 r5l_write_super(log, ctx.pos);
2176 return 0;
2177 }
2178
2179 static void r5l_write_super(struct r5l_log *log, sector_t cp)
2180 {
2181 struct mddev *mddev = log->rdev->mddev;
2182
2183 log->rdev->journal_tail = cp;
2184 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2185 }
2186
2187 static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
2188 {
2189 struct r5conf *conf = mddev->private;
2190 int ret;
2191
2192 if (!conf->log)
2193 return 0;
2194
2195 switch (conf->log->r5c_journal_mode) {
2196 case R5C_JOURNAL_MODE_WRITE_THROUGH:
2197 ret = snprintf(
2198 page, PAGE_SIZE, "[%s] %s\n",
2199 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2200 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2201 break;
2202 case R5C_JOURNAL_MODE_WRITE_BACK:
2203 ret = snprintf(
2204 page, PAGE_SIZE, "%s [%s]\n",
2205 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
2206 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
2207 break;
2208 default:
2209 ret = 0;
2210 }
2211 return ret;
2212 }
2213
2214 static ssize_t r5c_journal_mode_store(struct mddev *mddev,
2215 const char *page, size_t length)
2216 {
2217 struct r5conf *conf = mddev->private;
2218 struct r5l_log *log = conf->log;
2219 int val = -1, i;
2220 int len = length;
2221
2222 if (!log)
2223 return -ENODEV;
2224
2225 if (len && page[len - 1] == '\n')
2226 len -= 1;
2227 for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++)
2228 if (strlen(r5c_journal_mode_str[i]) == len &&
2229 strncmp(page, r5c_journal_mode_str[i], len) == 0) {
2230 val = i;
2231 break;
2232 }
2233 if (val < R5C_JOURNAL_MODE_WRITE_THROUGH ||
2234 val > R5C_JOURNAL_MODE_WRITE_BACK)
2235 return -EINVAL;
2236
2237 mddev_suspend(mddev);
2238 conf->log->r5c_journal_mode = val;
2239 mddev_resume(mddev);
2240
2241 pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
2242 mdname(mddev), val, r5c_journal_mode_str[val]);
2243 return length;
2244 }
2245
2246 struct md_sysfs_entry
2247 r5c_journal_mode = __ATTR(journal_mode, 0644,
2248 r5c_journal_mode_show, r5c_journal_mode_store);
2249
2250 /*
2251 * Try to handle a write operation in caching phase. This function should only
2252 * be called in write-back mode.
2253 *
2254 * If all outstanding writes can be handled in caching phase, returns 0.
2255 * If the writes require write-out phase, calls r5c_make_stripe_write_out()
2256 * and returns -EAGAIN.
2257 */
2258 int r5c_try_caching_write(struct r5conf *conf,
2259 struct stripe_head *sh,
2260 struct stripe_head_state *s,
2261 int disks)
2262 {
2263 struct r5l_log *log = conf->log;
2264 int i;
2265 struct r5dev *dev;
2266 int to_cache = 0;
2267
2268 BUG_ON(!r5c_is_writeback(log));
2269
2270 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
2271 /*
2272 * There are two different scenarios here:
2273 * 1.
The stripe has some data cached, and it is sent to 2274 * write-out phase for reclaim 2275 * 2. The stripe is clean, and this is the first write 2276 * 2277 * For 1, return -EAGAIN, so we continue with 2278 * handle_stripe_dirtying(). 2279 * 2280 * For 2, set STRIPE_R5C_CACHING and continue with caching 2281 * write. 2282 */ 2283 2284 /* case 1: anything injournal or anything in written */ 2285 if (s->injournal > 0 || s->written > 0) 2286 return -EAGAIN; 2287 /* case 2 */ 2288 set_bit(STRIPE_R5C_CACHING, &sh->state); 2289 } 2290 2291 for (i = disks; i--; ) { 2292 dev = &sh->dev[i]; 2293 /* if non-overwrite, use writing-out phase */ 2294 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) && 2295 !test_bit(R5_InJournal, &dev->flags)) { 2296 r5c_make_stripe_write_out(sh); 2297 return -EAGAIN; 2298 } 2299 } 2300 2301 for (i = disks; i--; ) { 2302 dev = &sh->dev[i]; 2303 if (dev->towrite) { 2304 set_bit(R5_Wantwrite, &dev->flags); 2305 set_bit(R5_Wantdrain, &dev->flags); 2306 set_bit(R5_LOCKED, &dev->flags); 2307 to_cache++; 2308 } 2309 } 2310 2311 if (to_cache) { 2312 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2313 /* 2314 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data() 2315 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in 2316 * r5c_handle_data_cached() 2317 */ 2318 set_bit(STRIPE_LOG_TRAPPED, &sh->state); 2319 } 2320 2321 return 0; 2322 } 2323 2324 /* 2325 * free extra pages (orig_page) we allocated for prexor 2326 */ 2327 void r5c_release_extra_page(struct stripe_head *sh) 2328 { 2329 struct r5conf *conf = sh->raid_conf; 2330 int i; 2331 bool using_disk_info_extra_page; 2332 2333 using_disk_info_extra_page = 2334 sh->dev[0].orig_page == conf->disks[0].extra_page; 2335 2336 for (i = sh->disks; i--; ) 2337 if (sh->dev[i].page != sh->dev[i].orig_page) { 2338 struct page *p = sh->dev[i].orig_page; 2339 2340 sh->dev[i].orig_page = sh->dev[i].page; 2341 if (!using_disk_info_extra_page) 2342 put_page(p); 2343 } 2344 2345 if (using_disk_info_extra_page) { 2346 clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state); 2347 md_wakeup_thread(conf->mddev->thread); 2348 } 2349 } 2350 2351 void r5c_use_extra_page(struct stripe_head *sh) 2352 { 2353 struct r5conf *conf = sh->raid_conf; 2354 int i; 2355 struct r5dev *dev; 2356 2357 for (i = sh->disks; i--; ) { 2358 dev = &sh->dev[i]; 2359 if (dev->orig_page != dev->page) 2360 put_page(dev->orig_page); 2361 dev->orig_page = conf->disks[i].extra_page; 2362 } 2363 } 2364 2365 /* 2366 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the 2367 * stripe is committed to RAID disks. 2368 */ 2369 void r5c_finish_stripe_write_out(struct r5conf *conf, 2370 struct stripe_head *sh, 2371 struct stripe_head_state *s) 2372 { 2373 int i; 2374 int do_wakeup = 0; 2375 2376 if (!conf->log || 2377 !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) 2378 return; 2379 2380 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 2381 clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 2382 2383 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 2384 return; 2385 2386 for (i = sh->disks; i--; ) { 2387 clear_bit(R5_InJournal, &sh->dev[i].flags); 2388 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2389 do_wakeup = 1; 2390 } 2391 2392 /* 2393 * analyse_stripe() runs before r5c_finish_stripe_write_out(), 2394 * We updated R5_InJournal, so we also update s->injournal. 
2395 */ 2396 s->injournal = 0; 2397 2398 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2399 if (atomic_dec_and_test(&conf->pending_full_writes)) 2400 md_wakeup_thread(conf->mddev->thread); 2401 2402 if (do_wakeup) 2403 wake_up(&conf->wait_for_overlap); 2404 2405 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 2406 return; 2407 2408 spin_lock_irq(&conf->log->stripe_in_journal_lock); 2409 list_del_init(&sh->r5c); 2410 spin_unlock_irq(&conf->log->stripe_in_journal_lock); 2411 sh->log_start = MaxSector; 2412 atomic_dec(&conf->log->stripe_in_journal_count); 2413 } 2414 2415 int 2416 r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, 2417 struct stripe_head_state *s) 2418 { 2419 struct r5conf *conf = sh->raid_conf; 2420 int pages = 0; 2421 int reserve; 2422 int i; 2423 int ret = 0; 2424 2425 BUG_ON(!log); 2426 2427 for (i = 0; i < sh->disks; i++) { 2428 void *addr; 2429 2430 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) 2431 continue; 2432 addr = kmap_atomic(sh->dev[i].page); 2433 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 2434 addr, PAGE_SIZE); 2435 kunmap_atomic(addr); 2436 pages++; 2437 } 2438 WARN_ON(pages == 0); 2439 2440 /* 2441 * The stripe must enter state machine again to call endio, so 2442 * don't delay. 2443 */ 2444 clear_bit(STRIPE_DELAYED, &sh->state); 2445 atomic_inc(&sh->count); 2446 2447 mutex_lock(&log->io_mutex); 2448 /* meta + data */ 2449 reserve = (1 + pages) << (PAGE_SHIFT - 9); 2450 2451 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 2452 sh->log_start == MaxSector) 2453 r5l_add_no_space_stripe(log, sh); 2454 else if (!r5l_has_free_space(log, reserve)) { 2455 if (sh->log_start == log->last_checkpoint) 2456 BUG(); 2457 else 2458 r5l_add_no_space_stripe(log, sh); 2459 } else { 2460 ret = r5l_log_stripe(log, sh, pages, 0); 2461 if (ret) { 2462 spin_lock_irq(&log->io_list_lock); 2463 list_add_tail(&sh->log_list, &log->no_mem_stripes); 2464 spin_unlock_irq(&log->io_list_lock); 2465 } 2466 } 2467 2468 mutex_unlock(&log->io_mutex); 2469 return 0; 2470 } 2471 2472 static int r5l_load_log(struct r5l_log *log) 2473 { 2474 struct md_rdev *rdev = log->rdev; 2475 struct page *page; 2476 struct r5l_meta_block *mb; 2477 sector_t cp = log->rdev->journal_tail; 2478 u32 stored_crc, expected_crc; 2479 bool create_super = false; 2480 int ret; 2481 2482 /* Make sure it's valid */ 2483 if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) 2484 cp = 0; 2485 page = alloc_page(GFP_KERNEL); 2486 if (!page) 2487 return -ENOMEM; 2488 2489 if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) { 2490 ret = -EIO; 2491 goto ioerr; 2492 } 2493 mb = page_address(page); 2494 2495 if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 2496 mb->version != R5LOG_VERSION) { 2497 create_super = true; 2498 goto create; 2499 } 2500 stored_crc = le32_to_cpu(mb->checksum); 2501 mb->checksum = 0; 2502 expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 2503 if (stored_crc != expected_crc) { 2504 create_super = true; 2505 goto create; 2506 } 2507 if (le64_to_cpu(mb->position) != cp) { 2508 create_super = true; 2509 goto create; 2510 } 2511 create: 2512 if (create_super) { 2513 log->last_cp_seq = prandom_u32(); 2514 cp = 0; 2515 r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq); 2516 /* 2517 * Make sure super points to correct address. Log might have 2518 * data very soon. 
If the super block doesn't have the correct log tail address,
2519 * recovery can't find the log
2520 */
2521 r5l_write_super(log, cp);
2522 } else
2523 log->last_cp_seq = le64_to_cpu(mb->seq);
2524
2525 log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
2526 log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
2527 if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
2528 log->max_free_space = RECLAIM_MAX_FREE_SPACE;
2529 log->last_checkpoint = cp;
2530 log->next_checkpoint = cp;
2531 mutex_lock(&log->io_mutex);
2532 r5c_update_log_state(log);
2533 mutex_unlock(&log->io_mutex);
2534
2535 __free_page(page);
2536
2537 return r5l_recovery_log(log);
2538 ioerr:
2539 __free_page(page);
2540 return ret;
2541 }
2542
2543 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
2544 {
2545 struct request_queue *q = bdev_get_queue(rdev->bdev);
2546 struct r5l_log *log;
2547
2548 if (PAGE_SIZE != 4096)
2549 return -EINVAL;
2550
2551 /*
2552 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
2553 * raid_disks r5l_payload_data_parity.
2554 *
2555 * The write journal and cache do not work for very big arrays
2556 * (raid_disks > 203)
2557 */
2558 if (sizeof(struct r5l_meta_block) +
2559 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
2560 conf->raid_disks) > PAGE_SIZE) {
2561 pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
2562 mdname(conf->mddev), conf->raid_disks);
2563 return -EINVAL;
2564 }
2565
2566 log = kzalloc(sizeof(*log), GFP_KERNEL);
2567 if (!log)
2568 return -ENOMEM;
2569 log->rdev = rdev;
2570
2571 log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
2572
2573 log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
2574 sizeof(rdev->mddev->uuid));
2575
2576 mutex_init(&log->io_mutex);
2577
2578 spin_lock_init(&log->io_list_lock);
2579 INIT_LIST_HEAD(&log->running_ios);
2580 INIT_LIST_HEAD(&log->io_end_ios);
2581 INIT_LIST_HEAD(&log->flushing_ios);
2582 INIT_LIST_HEAD(&log->finished_ios);
2583 bio_init(&log->flush_bio);
2584
2585 log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
2586 if (!log->io_kc)
2587 goto io_kc;
2588
2589 log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
2590 if (!log->io_pool)
2591 goto io_pool;
2592
2593 log->bs = bioset_create(R5L_POOL_SIZE, 0);
2594 if (!log->bs)
2595 goto io_bs;
2596
2597 log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
2598 if (!log->meta_pool)
2599 goto out_mempool;
2600
2601 log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
2602 log->rdev->mddev, "reclaim");
2603 if (!log->reclaim_thread)
2604 goto reclaim_thread;
2605 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
2606
2607 init_waitqueue_head(&log->iounit_wait);
2608
2609 INIT_LIST_HEAD(&log->no_mem_stripes);
2610
2611 INIT_LIST_HEAD(&log->no_space_stripes);
2612 spin_lock_init(&log->no_space_stripes_lock);
2613
2614 INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
2615
2616 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
2617 INIT_LIST_HEAD(&log->stripe_in_journal_list);
2618 spin_lock_init(&log->stripe_in_journal_lock);
2619 atomic_set(&log->stripe_in_journal_count, 0);
2620
2621 if (r5l_load_log(log))
2622 goto error;
2623
2624 rcu_assign_pointer(conf->log, log);
2625 set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
2626 return 0;
2627
2628 error:
2629 md_unregister_thread(&log->reclaim_thread);
2630 reclaim_thread:
2631 mempool_destroy(log->meta_pool);
2632 out_mempool:
2633 bioset_free(log->bs);
2634 io_bs:
2635 mempool_destroy(log->io_pool);
2636
io_pool: 2637 kmem_cache_destroy(log->io_kc); 2638 io_kc: 2639 kfree(log); 2640 return -EINVAL; 2641 } 2642 2643 void r5l_exit_log(struct r5l_log *log) 2644 { 2645 md_unregister_thread(&log->reclaim_thread); 2646 mempool_destroy(log->meta_pool); 2647 bioset_free(log->bs); 2648 mempool_destroy(log->io_pool); 2649 kmem_cache_destroy(log->io_kc); 2650 kfree(log); 2651 } 2652
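/*
 * Example usage of the journal_mode sysfs attribute defined above (a sketch,
 * assuming the array is md0 and the usual md sysfs layout; the exact path may
 * differ on a given system):
 *
 *   # cat /sys/block/md0/md/journal_mode
 *   [write-through] write-back
 *   # echo write-back > /sys/block/md0/md/journal_mode
 *
 * r5c_journal_mode_store() suspends the array, switches
 * conf->log->r5c_journal_mode, and resumes the array.
 */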