raid5-cache.c - OpenGrok cross reference for /openbmc/linux/drivers/md/raid5-cache.c

Deleted Added

sdiffudifftextold (3bddb7f8..)new (3a83f467..)

raid5-cache.c (3bddb7f8f264ec58dc86e11ca97341c24f9d38f6)	raid5-cache.c (3a83f4677539bce8eaa2bca9ee9c20e172d7ab04)
1/* 2 * Copyright (C) 2015 Shaohua Li <shli@fb.com>	1/* 2 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
3 * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
4 * 5 * This program is free software; you can redistribute it and/or modify it 6 * under the terms and conditions of the GNU General Public License, 7 * version 2, as published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for --- 4 unchanged lines hidden (view full) --- 16#include <linux/wait.h> 17#include <linux/blkdev.h> 18#include <linux/slab.h> 19#include <linux/raid/md_p.h> 20#include <linux/crc32c.h> 21#include <linux/random.h> 22#include "md.h" 23#include "raid5.h"	3 * 4 * This program is free software; you can redistribute it and/or modify it 5 * under the terms and conditions of the GNU General Public License, 6 * version 2, as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope it will be useful, but WITHOUT 9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for --- 4 unchanged lines hidden (view full) --- 15#include <linux/wait.h> 16#include <linux/blkdev.h> 17#include <linux/slab.h> 18#include <linux/raid/md_p.h> 19#include <linux/crc32c.h> 20#include <linux/random.h> 21#include "md.h" 22#include "raid5.h"
24#include "bitmap.h"
25 26/* 27 * metadata/data stored in disk with 4k size unit (a block) regardless 28 * underneath hardware sector size. only works with PAGE_SIZE == 4096 29 / 30#define BLOCK_SECTORS (8) 31 32/	23 24/* 25 * metadata/data stored in disk with 4k size unit (a block) regardless 26 * underneath hardware sector size. only works with PAGE_SIZE == 4096 27 / 28#define BLOCK_SECTORS (8) 29 30/
33 * log->max_free_space is min(1/4 disk size, 10G reclaimable space). 34 * 35 * In write through mode, the reclaim runs every log->max_free_space. 36 * This can prevent the recovery scans for too long	31 * reclaim runs every 1/4 disk size or 10G reclaimable space. This can prevent 32 * recovery scans a very long log
37 / 38#define RECLAIM_MAX_FREE_SPACE (10 1024 * 1024 * 2) /* sector */ 39#define RECLAIM_MAX_FREE_SPACE_SHIFT (2) 40	33 / 34#define RECLAIM_MAX_FREE_SPACE (10 1024 * 1024 * 2) /* sector */ 35#define RECLAIM_MAX_FREE_SPACE_SHIFT (2) 36
41/* wake up reclaim thread periodically / 42#define R5C_RECLAIM_WAKEUP_INTERVAL (30 HZ) 43/* start flush with these full stripes / 44#define R5C_FULL_STRIPE_FLUSH_BATCH 256 45/ reclaim stripes in groups / 46#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS 2) 47
48/* 49 * We only need 2 bios per I/O unit to make progress, but ensure we 50 * have a few more available to not get too tight. 51 */ 52#define R5L_POOL_SIZE 4 53	37/* 38 * We only need 2 bios per I/O unit to make progress, but ensure we 39 * have a few more available to not get too tight. 40 */ 41#define R5L_POOL_SIZE 4 42
54/* 55 * r5c journal modes of the array: write-back or write-through. 56 * write-through mode has identical behavior as existing log only 57 * implementation. 58 / 59enum r5c_journal_mode { 60 R5C_JOURNAL_MODE_WRITE_THROUGH = 0, 61 R5C_JOURNAL_MODE_WRITE_BACK = 1, 62}; 63 64static char r5c_journal_mode_str[] = {"write-through", 65 "write-back"}; 66/* 67 * raid5 cache state machine 68 * 69 * With rhe RAID cache, each stripe works in two phases: 70 * - caching phase 71 * - writing-out phase 72 * 73 * These two phases are controlled by bit STRIPE_R5C_CACHING: 74 * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase 75 * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase 76 * 77 * When there is no journal, or the journal is in write-through mode, 78 * the stripe is always in writing-out phase. 79 * 80 * For write-back journal, the stripe is sent to caching phase on write 81 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off 82 * the write-out phase by clearing STRIPE_R5C_CACHING. 83 * 84 * Stripes in caching phase do not write the raid disks. Instead, all 85 * writes are committed from the log device. Therefore, a stripe in 86 * caching phase handles writes as: 87 * - write to log device 88 * - return IO 89 * 90 * Stripes in writing-out phase handle writes as: 91 * - calculate parity 92 * - write pending data and parity to journal 93 * - write data and parity to raid disks 94 * - return IO for pending writes 95 */ 96
97struct r5l_log { 98 struct md_rdev rdev; 99 100 u32 uuid_checksum; 101 102 sector_t device_size; / log device size, round to 103 * BLOCK_SECTORS / 104 sector_t max_free_space; / reclaim run if free space is at --- 40 unchanged lines hidden (view full) --- 145 * switching to IO_UNIT_STRIPE_END 146 * state) / 147 wait_queue_head_t iounit_wait; 148 149 struct list_head no_space_stripes; / pending stripes, log has no space */ 150 spinlock_t no_space_stripes_lock; 151 152 bool need_cache_flush;	43struct r5l_log { 44 struct md_rdev rdev; 45 46 u32 uuid_checksum; 47 48 sector_t device_size; / log device size, round to 49 * BLOCK_SECTORS / 50 sector_t max_free_space; / reclaim run if free space is at --- 40 unchanged lines hidden (view full) --- 91 * switching to IO_UNIT_STRIPE_END 92 * state) / 93 wait_queue_head_t iounit_wait; 94 95 struct list_head no_space_stripes; / pending stripes, log has no space */ 96 spinlock_t no_space_stripes_lock; 97 98 bool need_cache_flush;
153 154 /* for r5c_cache / 155 enum r5c_journal_mode r5c_journal_mode; 156 157 / all stripes in r5cache, in the order of seq at sh->log_start / 158 struct list_head stripe_in_journal_list; 159 160 spinlock_t stripe_in_journal_lock; 161 atomic_t stripe_in_journal_count; 162 163 / to submit async io_units, to fulfill ordering of flush */ 164 struct work_struct deferred_io_work;
165}; 166 167/* 168 * an IO range starts from a meta data block and end at the next meta data 169 * block. The io unit's the meta data block tracks data/parity followed it. io 170 * unit is written to log disk with normal write, as we always flush log disk 171 * first and then start move data to raid disks, there is no requirement to 172 * write io unit with FLUSH/FUA --- 10 unchanged lines hidden (view full) --- 183 u64 seq; /* seq number of the metablock / 184 sector_t log_start; / where the io_unit starts / 185 sector_t log_end; / where the io_unit ends / 186 struct list_head log_sibling; / log->running_ios / 187 struct list_head stripe_list; / stripes added to the io_unit */ 188 189 int state; 190 bool need_split_bio;	99}; 100 101/* 102 * an IO range starts from a meta data block and end at the next meta data 103 * block. The io unit's the meta data block tracks data/parity followed it. io 104 * unit is written to log disk with normal write, as we always flush log disk 105 * first and then start move data to raid disks, there is no requirement to 106 * write io unit with FLUSH/FUA --- 10 unchanged lines hidden (view full) --- 117 u64 seq; /* seq number of the metablock / 118 sector_t log_start; / where the io_unit starts / 119 sector_t log_end; / where the io_unit ends / 120 struct list_head log_sibling; / log->running_ios / 121 struct list_head stripe_list; / stripes added to the io_unit */ 122 123 int state; 124 bool need_split_bio;
191 struct bio split_bio; 192 193 unsigned int has_flush:1; / include flush request / 194 unsigned int has_fua:1; / include fua request / 195 unsigned int has_null_flush:1; / include empty flush request / 196 / 197 * io isn't sent yet, flush/fua request can only be submitted till it's 198 * the first IO in running_ios list 199 / 200 unsigned int io_deferred:1; 201 202 struct bio_list flush_barriers; / size == 0 flush bios */
203}; 204 205/* r5l_io_unit state / 206enum r5l_io_unit_state { 207 IO_UNIT_RUNNING = 0, / accepting new IO / 208 IO_UNIT_IO_START = 1, / io_unit bio start writing to log, 209 * don't accepting new bio / 210 IO_UNIT_IO_END = 2, / io_unit bio finish writing to log / 211 IO_UNIT_STRIPE_END = 3, / stripes data finished writing to raid */ 212}; 213	125}; 126 127/* r5l_io_unit state / 128enum r5l_io_unit_state { 129 IO_UNIT_RUNNING = 0, / accepting new IO / 130 IO_UNIT_IO_START = 1, / io_unit bio start writing to log, 131 * don't accepting new bio / 132 IO_UNIT_IO_END = 2, / io_unit bio finish writing to log / 133 IO_UNIT_STRIPE_END = 3, / stripes data finished writing to raid */ 134}; 135
214bool r5c_is_writeback(struct r5l_log *log) 215{ 216 return (log != NULL && 217 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK); 218} 219
220static sector_t r5l_ring_add(struct r5l_log log, sector_t start, sector_t inc) 221{ 222 start += inc; 223 if (start >= log->device_size) 224 start = start - log->device_size; 225 return start; 226} 227 --- 19 unchanged lines hidden* (view full) --- 247static void __r5l_set_io_unit_state(struct r5l_io_unit *io, 248 enum r5l_io_unit_state state) 249{ 250 if (WARN_ON(io->state >= state)) 251 return; 252 io->state = state; 253} 254	136static sector_t r5l_ring_add(struct r5l_log log, sector_t start, sector_t inc) 137{ 138 start += inc; 139 if (start >= log->device_size) 140 start = start - log->device_size; 141 return start; 142} 143 --- 19 unchanged lines hidden* (view full) --- 163static void __r5l_set_io_unit_state(struct r5l_io_unit *io, 164 enum r5l_io_unit_state state) 165{ 166 if (WARN_ON(io->state >= state)) 167 return; 168 io->state = state; 169} 170
255static void 256r5c_return_dev_pending_writes(struct r5conf conf, struct r5dev dev, 257 struct bio_list return_bi) 258{ 259 struct bio wbi, wbi2; 260 261 wbi = dev->written; 262 dev->written = NULL; 263 while (wbi && wbi->bi_iter.bi_sector < 264 dev->sector + STRIPE_SECTORS) { 265 wbi2 = r5_next_bio(wbi, dev->sector); 266 if (!raid5_dec_bi_active_stripes(wbi)) { 267 md_write_end(conf->mddev); 268 bio_list_add(return_bi, wbi); 269 } 270 wbi = wbi2; 271 } 272} 273 274void r5c_handle_cached_data_endio(struct r5conf conf, 275 struct stripe_head sh, int disks, struct bio_list return_bi) 276{ 277 int i; 278 279 for (i = sh->disks; i--; ) { 280 if (sh->dev[i].written) { 281 set_bit(R5_UPTODATE, &sh->dev[i].flags); 282 r5c_return_dev_pending_writes(conf, &sh->dev[i], 283 return_bi); 284 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 285 STRIPE_SECTORS, 286 !test_bit(STRIPE_DEGRADED, &sh->state), 287 0); 288 } 289 } 290} 291 292/* Check whether we should flush some stripes to free up stripe cache / 293void r5c_check_stripe_cache_usage(struct r5conf conf) 294{ 295 int total_cached; 296 297 if (!r5c_is_writeback(conf->log)) 298 return; 299 300 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 301 atomic_read(&conf->r5c_cached_full_stripes); 302 303 /* 304 * The following condition is true for either of the following: 305 * - stripe cache pressure high: 306 * total_cached > 3/4 min_nr_stripes \|\| 307 * empty_inactive_list_nr > 0 308 * - stripe cache pressure moderate: 309 * total_cached > 1/2 min_nr_stripes 310 / 311 if (total_cached > conf->min_nr_stripes 1 / 2 \|\| 312 atomic_read(&conf->empty_inactive_list_nr) > 0) 313 r5l_wake_reclaim(conf->log, 0); 314} 315 316/* 317 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full 318 * stripes in the cache 319 / 320void r5c_check_cached_full_stripe(struct r5conf conf) 321{ 322 if (!r5c_is_writeback(conf->log)) 323 return; 324 325 /* 326 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes 327 * or a full stripe (chunk size / 4k stripes). 328 / 329 if (atomic_read(&conf->r5c_cached_full_stripes) >= 330 min(R5C_FULL_STRIPE_FLUSH_BATCH, 331 conf->chunk_sectors >> STRIPE_SHIFT)) 332 r5l_wake_reclaim(conf->log, 0); 333} 334 335/ 336 * Total log space (in sectors) needed to flush all data in cache 337 * 338 * Currently, writing-out phase automatically includes all pending writes 339 * to the same sector. So the reclaim of each stripe takes up to 340 * (conf->raid_disks + 1) pages of log space. 341 * 342 * To totally avoid deadlock due to log space, the code reserves 343 * (conf->raid_disks + 1) pages for each stripe in cache, which is not 344 * necessary in most cases. 345 * 346 * To improve this, we will need writing-out phase to be able to NOT include 347 * pending writes, which will reduce the requirement to 348 * (conf->max_degraded + 1) pages per stripe in cache. 349 / 350static sector_t r5c_log_required_to_flush_cache(struct r5conf conf) 351{ 352 struct r5l_log log = conf->log; 353 354 if (!r5c_is_writeback(log)) 355 return 0; 356 357 return BLOCK_SECTORS (conf->raid_disks + 1) * 358 atomic_read(&log->stripe_in_journal_count); 359} 360 361/* 362 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL 363 * 364 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of 365 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log 366 * device is less than 2x of reclaim_required_space. 367 / 368static inline void r5c_update_log_state(struct r5l_log log) 369{ 370 struct r5conf conf = log->rdev->mddev->private; 371 sector_t free_space; 372 sector_t reclaim_space; 373 374 if (!r5c_is_writeback(log)) 375 return; 376 377 free_space = r5l_ring_distance(log, log->log_start, 378 log->last_checkpoint); 379 reclaim_space = r5c_log_required_to_flush_cache(conf); 380 if (free_space < 2 reclaim_space) 381 set_bit(R5C_LOG_CRITICAL, &conf->cache_state); 382 else 383 clear_bit(R5C_LOG_CRITICAL, &conf->cache_state); 384 if (free_space < 3 * reclaim_space) 385 set_bit(R5C_LOG_TIGHT, &conf->cache_state); 386 else 387 clear_bit(R5C_LOG_TIGHT, &conf->cache_state); 388} 389 390/* 391 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING. 392 * This function should only be called in write-back mode. 393 / 394void r5c_make_stripe_write_out(struct stripe_head sh) 395{ 396 struct r5conf conf = sh->raid_conf; 397 struct r5l_log log = conf->log; 398 399 BUG_ON(!r5c_is_writeback(log)); 400 401 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 402 clear_bit(STRIPE_R5C_CACHING, &sh->state); 403 404 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 405 atomic_inc(&conf->preread_active_stripes); 406 407 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { 408 BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); 409 atomic_dec(&conf->r5c_cached_partial_stripes); 410 } 411 412 if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { 413 BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); 414 atomic_dec(&conf->r5c_cached_full_stripes); 415 } 416} 417 418static void r5c_handle_data_cached(struct stripe_head sh) 419{ 420 int i; 421 422 for (i = sh->disks; i--; ) 423 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 424 set_bit(R5_InJournal, &sh->dev[i].flags); 425 clear_bit(R5_LOCKED, &sh->dev[i].flags); 426 } 427 clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 428} 429 430/ 431 * this journal write must contain full parity, 432 * it may also contain some data pages 433 / 434static void r5c_handle_parity_cached(struct stripe_head sh) 435{ 436 int i; 437 438 for (i = sh->disks; i--; ) 439 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 440 set_bit(R5_Wantwrite, &sh->dev[i].flags); 441} 442 443/* 444 * Setting proper flags after writing (or flushing) data and/or parity to the 445 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio(). 446 / 447static void r5c_finish_cache_stripe(struct stripe_head sh) 448{ 449 struct r5l_log log = sh->raid_conf->log; 450 451 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 452 BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 453 / 454 * Set R5_InJournal for parity dev[pd_idx]. This means 455 * all data AND parity in the journal. For RAID 6, it is 456 * NOT necessary to set the flag for dev[qd_idx], as the 457 * two parities are written out together. 458 */ 459 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 460 } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) { 461 r5c_handle_data_cached(sh); 462 } else { 463 r5c_handle_parity_cached(sh); 464 set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 465 } 466} 467
468static void r5l_io_run_stripes(struct r5l_io_unit io) 469{ 470 struct stripe_head sh, *next; 471 472 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { 473 list_del_init(&sh->log_list);	171static void r5l_io_run_stripes(struct r5l_io_unit io) 172{ 173 struct stripe_head sh, *next; 174 175 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { 176 list_del_init(&sh->log_list);
474 475 r5c_finish_cache_stripe(sh); 476
477 set_bit(STRIPE_HANDLE, &sh->state); 478 raid5_release_stripe(sh); 479 } 480} 481 482static void r5l_log_run_stripes(struct r5l_log log) 483{ 484 struct r5l_io_unit io, next; --- 19 unchanged lines hidden* (view full) --- 504 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 505 /* don't change list order */ 506 if (io->state < IO_UNIT_IO_END) 507 break; 508 list_move_tail(&io->log_sibling, &log->io_end_ios); 509 } 510} 511	177 set_bit(STRIPE_HANDLE, &sh->state); 178 raid5_release_stripe(sh); 179 } 180} 181 182static void r5l_log_run_stripes(struct r5l_log log) 183{ 184 struct r5l_io_unit io, next; --- 19 unchanged lines hidden* (view full) --- 204 list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 205 /* don't change list order */ 206 if (io->state < IO_UNIT_IO_END) 207 break; 208 list_move_tail(&io->log_sibling, &log->io_end_ios); 209 } 210} 211
512static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
513static void r5l_log_endio(struct bio bio) 514{ 515 struct r5l_io_unit io = bio->bi_private;	212static void r5l_log_endio(struct bio bio) 213{ 214 struct r5l_io_unit io = bio->bi_private;
516 struct r5l_io_unit *io_deferred;
517 struct r5l_log *log = io->log; 518 unsigned long flags; 519 520 if (bio->bi_error) 521 md_error(log->rdev->mddev, log->rdev); 522 523 bio_put(bio); 524 mempool_free(io->meta_page, log->meta_pool); 525 526 spin_lock_irqsave(&log->io_list_lock, flags); 527 __r5l_set_io_unit_state(io, IO_UNIT_IO_END); 528 if (log->need_cache_flush) 529 r5l_move_to_end_ios(log); 530 else 531 r5l_log_run_stripes(log);	215 struct r5l_log *log = io->log; 216 unsigned long flags; 217 218 if (bio->bi_error) 219 md_error(log->rdev->mddev, log->rdev); 220 221 bio_put(bio); 222 mempool_free(io->meta_page, log->meta_pool); 223 224 spin_lock_irqsave(&log->io_list_lock, flags); 225 __r5l_set_io_unit_state(io, IO_UNIT_IO_END); 226 if (log->need_cache_flush) 227 r5l_move_to_end_ios(log); 228 else 229 r5l_log_run_stripes(log);
532 if (!list_empty(&log->running_ios)) { 533 /* 534 * FLUSH/FUA io_unit is deferred because of ordering, now we 535 * can dispatch it 536 */ 537 io_deferred = list_first_entry(&log->running_ios, 538 struct r5l_io_unit, log_sibling); 539 if (io_deferred->io_deferred) 540 schedule_work(&log->deferred_io_work); 541 } 542
543 spin_unlock_irqrestore(&log->io_list_lock, flags); 544 545 if (log->need_cache_flush) 546 md_wakeup_thread(log->rdev->mddev->thread);	230 spin_unlock_irqrestore(&log->io_list_lock, flags); 231 232 if (log->need_cache_flush) 233 md_wakeup_thread(log->rdev->mddev->thread);
547 548 if (io->has_null_flush) { 549 struct bio *bi; 550 551 WARN_ON(bio_list_empty(&io->flush_barriers)); 552 while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { 553 bio_endio(bi); 554 atomic_dec(&io->pending_stripe); 555 } 556 if (atomic_read(&io->pending_stripe) == 0) 557 __r5l_stripe_write_finished(io); 558 }
559} 560	234} 235
561static void r5l_do_submit_io(struct r5l_log log, struct r5l_io_unit io) 562{ 563 unsigned long flags; 564 565 spin_lock_irqsave(&log->io_list_lock, flags); 566 __r5l_set_io_unit_state(io, IO_UNIT_IO_START); 567 spin_unlock_irqrestore(&log->io_list_lock, flags); 568 569 if (io->has_flush) 570 bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FLUSH); 571 if (io->has_fua) 572 bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FUA); 573 submit_bio(io->current_bio); 574 575 if (!io->split_bio) 576 return; 577 578 if (io->has_flush) 579 bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FLUSH); 580 if (io->has_fua) 581 bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FUA); 582 submit_bio(io->split_bio); 583} 584 585/* deferred io_unit will be dispatched here / 586static void r5l_submit_io_async(struct work_struct work) 587{ 588 struct r5l_log log = container_of(work, struct r5l_log, 589 deferred_io_work); 590 struct r5l_io_unit io = NULL; 591 unsigned long flags; 592 593 spin_lock_irqsave(&log->io_list_lock, flags); 594 if (!list_empty(&log->running_ios)) { 595 io = list_first_entry(&log->running_ios, struct r5l_io_unit, 596 log_sibling); 597 if (!io->io_deferred) 598 io = NULL; 599 else 600 io->io_deferred = 0; 601 } 602 spin_unlock_irqrestore(&log->io_list_lock, flags); 603 if (io) 604 r5l_do_submit_io(log, io); 605} 606
607static void r5l_submit_current_io(struct r5l_log log) 608{ 609 struct r5l_io_unit io = log->current_io;	236static void r5l_submit_current_io(struct r5l_log log) 237{ 238 struct r5l_io_unit io = log->current_io;
610 struct bio *bio;
611 struct r5l_meta_block *block; 612 unsigned long flags; 613 u32 crc;	239 struct r5l_meta_block *block; 240 unsigned long flags; 241 u32 crc;
614 bool do_submit = true;
615 616 if (!io) 617 return; 618 619 block = page_address(io->meta_page); 620 block->meta_size = cpu_to_le32(io->meta_offset); 621 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); 622 block->checksum = cpu_to_le32(crc);	242 243 if (!io) 244 return; 245 246 block = page_address(io->meta_page); 247 block->meta_size = cpu_to_le32(io->meta_offset); 248 crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); 249 block->checksum = cpu_to_le32(crc);
623 bio = io->current_bio;
624 625 log->current_io = NULL; 626 spin_lock_irqsave(&log->io_list_lock, flags);	250 251 log->current_io = NULL; 252 spin_lock_irqsave(&log->io_list_lock, flags);
627 if (io->has_flush \|\| io->has_fua) { 628 if (io != list_first_entry(&log->running_ios, 629 struct r5l_io_unit, log_sibling)) { 630 io->io_deferred = 1; 631 do_submit = false; 632 } 633 }	253 __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
634 spin_unlock_irqrestore(&log->io_list_lock, flags);	254 spin_unlock_irqrestore(&log->io_list_lock, flags);
635 if (do_submit) 636 r5l_do_submit_io(log, io);	255 256 submit_bio(io->current_bio);
637} 638 639static struct bio r5l_bio_alloc(struct r5l_log log) 640{ 641 struct bio bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); 642 643 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 644 bio->bi_bdev = log->rdev->bdev; 645 bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; 646 647 return bio; 648} 649 650static void r5_reserve_log_entry(struct r5l_log log, struct r5l_io_unit *io) 651{ 652 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); 653	257} 258 259static struct bio r5l_bio_alloc(struct r5l_log log) 260{ 261 struct bio bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); 262 263 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 264 bio->bi_bdev = log->rdev->bdev; 265 bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; 266 267 return bio; 268} 269 270static void r5_reserve_log_entry(struct r5l_log log, struct r5l_io_unit *io) 271{ 272 log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); 273
654 r5c_update_log_state(log);
655 /* 656 * If we filled up the log device start from the beginning again, 657 * which will require a new bio. 658 * 659 * Note: for this to work properly the log size needs to me a multiple 660 * of BLOCK_SECTORS. 661 / 662 if (log->log_start == 0) --- 10 unchanged lines hidden* (view full) --- 673 io = mempool_alloc(log->io_pool, GFP_ATOMIC); 674 if (!io) 675 return NULL; 676 memset(io, 0, sizeof(*io)); 677 678 io->log = log; 679 INIT_LIST_HEAD(&io->log_sibling); 680 INIT_LIST_HEAD(&io->stripe_list);	274 /* 275 * If we filled up the log device start from the beginning again, 276 * which will require a new bio. 277 * 278 * Note: for this to work properly the log size needs to me a multiple 279 * of BLOCK_SECTORS. 280 / 281 if (log->log_start == 0) --- 10 unchanged lines hidden* (view full) --- 292 io = mempool_alloc(log->io_pool, GFP_ATOMIC); 293 if (!io) 294 return NULL; 295 memset(io, 0, sizeof(*io)); 296 297 io->log = log; 298 INIT_LIST_HEAD(&io->log_sibling); 299 INIT_LIST_HEAD(&io->stripe_list);
681 bio_list_init(&io->flush_barriers);
682 io->state = IO_UNIT_RUNNING; 683 684 io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); 685 block = page_address(io->meta_page); 686 clear_page(block); 687 block->magic = cpu_to_le32(R5LOG_MAGIC); 688 block->version = R5LOG_VERSION; 689 block->seq = cpu_to_le64(log->seq); --- 54 unchanged lines hidden (view full) --- 744 sizeof(__le32) * (1 + !!checksum2_valid); 745} 746 747static void r5l_append_payload_page(struct r5l_log log, struct page page) 748{ 749 struct r5l_io_unit *io = log->current_io; 750 751 if (io->need_split_bio) {	300 io->state = IO_UNIT_RUNNING; 301 302 io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); 303 block = page_address(io->meta_page); 304 clear_page(block); 305 block->magic = cpu_to_le32(R5LOG_MAGIC); 306 block->version = R5LOG_VERSION; 307 block->seq = cpu_to_le64(log->seq); --- 54 unchanged lines hidden (view full) --- 362 sizeof(__le32) * (1 + !!checksum2_valid); 363} 364 365static void r5l_append_payload_page(struct r5l_log log, struct page page) 366{ 367 struct r5l_io_unit *io = log->current_io; 368 369 if (io->need_split_bio) {
752 BUG_ON(io->split_bio); 753 io->split_bio = io->current_bio;	370 struct bio *prev = io->current_bio; 371
754 io->current_bio = r5l_bio_alloc(log);	372 io->current_bio = r5l_bio_alloc(log);
755 bio_chain(io->current_bio, io->split_bio); 756 io->need_split_bio = false;	373 bio_chain(io->current_bio, prev); 374 375 submit_bio(prev);
757 } 758 759 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) 760 BUG(); 761 762 r5_reserve_log_entry(log, io); 763} 764 --- 12 unchanged lines hidden (view full) --- 777 sizeof(__le32) * parity_pages; 778 779 ret = r5l_get_meta(log, meta_size); 780 if (ret) 781 return ret; 782 783 io = log->current_io; 784	376 } 377 378 if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) 379 BUG(); 380 381 r5_reserve_log_entry(log, io); 382} 383 --- 12 unchanged lines hidden (view full) --- 396 sizeof(__le32) * parity_pages; 397 398 ret = r5l_get_meta(log, meta_size); 399 if (ret) 400 return ret; 401 402 io = log->current_io; 403
785 if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state)) 786 io->has_flush = 1; 787
788 for (i = 0; i < sh->disks; i++) {	404 for (i = 0; i < sh->disks; i++) {
789 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) \|\| 790 test_bit(R5_InJournal, &sh->dev[i].flags))	405 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
791 continue; 792 if (i == sh->pd_idx \|\| i == sh->qd_idx) 793 continue;	406 continue; 407 if (i == sh->pd_idx \|\| i == sh->qd_idx) 408 continue;
794 if (test_bit(R5_WantFUA, &sh->dev[i].flags) && 795 log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) { 796 io->has_fua = 1; 797 /* 798 * we need to flush journal to make sure recovery can 799 * reach the data with fua flag 800 */ 801 io->has_flush = 1; 802 }
803 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, 804 raid5_compute_blocknr(sh, i, 0), 805 sh->dev[i].log_checksum, 0, false); 806 r5l_append_payload_page(log, sh->dev[i].page); 807 } 808	409 r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, 410 raid5_compute_blocknr(sh, i, 0), 411 sh->dev[i].log_checksum, 0, false); 412 r5l_append_payload_page(log, sh->dev[i].page); 413 } 414
809 if (parity_pages == 2) {	415 if (sh->qd_idx >= 0) {
810 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 811 sh->sector, sh->dev[sh->pd_idx].log_checksum, 812 sh->dev[sh->qd_idx].log_checksum, true); 813 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 814 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);	416 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 417 sh->sector, sh->dev[sh->pd_idx].log_checksum, 418 sh->dev[sh->qd_idx].log_checksum, true); 419 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 420 r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
815 } else if (parity_pages == 1) {	421 } else {
816 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 817 sh->sector, sh->dev[sh->pd_idx].log_checksum, 818 0, false); 819 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);	422 r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 423 sh->sector, sh->dev[sh->pd_idx].log_checksum, 424 0, false); 425 r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
820 } else /* Just writing data, not parity, in caching phase */ 821 BUG_ON(parity_pages != 0);	426 }
822 823 list_add_tail(&sh->log_list, &io->stripe_list); 824 atomic_inc(&io->pending_stripe); 825 sh->log_io = io; 826	427 428 list_add_tail(&sh->log_list, &io->stripe_list); 429 atomic_inc(&io->pending_stripe); 430 sh->log_io = io; 431
827 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 828 return 0; 829 830 if (sh->log_start == MaxSector) { 831 BUG_ON(!list_empty(&sh->r5c)); 832 sh->log_start = io->log_start; 833 spin_lock_irq(&log->stripe_in_journal_lock); 834 list_add_tail(&sh->r5c, 835 &log->stripe_in_journal_list); 836 spin_unlock_irq(&log->stripe_in_journal_lock); 837 atomic_inc(&log->stripe_in_journal_count); 838 }
839 return 0; 840} 841	432 return 0; 433} 434
842/* add stripe to no_space_stripes, and then wake up reclaim / 843static inline void r5l_add_no_space_stripe(struct r5l_log log, 844 struct stripe_head *sh) 845{ 846 spin_lock(&log->no_space_stripes_lock); 847 list_add_tail(&sh->log_list, &log->no_space_stripes); 848 spin_unlock(&log->no_space_stripes_lock); 849} 850	435static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
851/* 852 * running in raid5d, where reclaim could wait for raid5d too (when it flushes 853 * data from log to raid disks), so we shouldn't wait for reclaim here 854 / 855int r5l_write_stripe(struct r5l_log log, struct stripe_head *sh) 856{	436/* 437 * running in raid5d, where reclaim could wait for raid5d too (when it flushes 438 * data from log to raid disks), so we shouldn't wait for reclaim here 439 / 440int r5l_write_stripe(struct r5l_log log, struct stripe_head *sh) 441{
857 struct r5conf *conf = sh->raid_conf;
858 int write_disks = 0; 859 int data_pages, parity_pages;	442 int write_disks = 0; 443 int data_pages, parity_pages;
	444 int meta_size;
860 int reserve; 861 int i; 862 int ret = 0;	445 int reserve; 446 int i; 447 int ret = 0;
863 bool wake_reclaim = false;
864 865 if (!log) 866 return -EAGAIN; 867 /* Don't support stripe batch / 868 if (sh->log_io \|\| !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) \|\| 869 test_bit(STRIPE_SYNCING, &sh->state)) { 870 / the stripe is written to log, we start writing it to raid */ 871 clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 872 return -EAGAIN; 873 } 874	448 449 if (!log) 450 return -EAGAIN; 451 /* Don't support stripe batch / 452 if (sh->log_io \|\| !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) \|\| 453 test_bit(STRIPE_SYNCING, &sh->state)) { 454 / the stripe is written to log, we start writing it to raid */ 455 clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 456 return -EAGAIN; 457 } 458
875 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 876
877 for (i = 0; i < sh->disks; i++) { 878 void *addr; 879	459 for (i = 0; i < sh->disks; i++) { 460 void *addr; 461
880 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) \|\| 881 test_bit(R5_InJournal, &sh->dev[i].flags))	462 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
882 continue;	463 continue;
883
884 write_disks++; 885 /* checksum is already calculated in last run */ 886 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 887 continue; 888 addr = kmap_atomic(sh->dev[i].page); 889 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 890 addr, PAGE_SIZE); 891 kunmap_atomic(addr); 892 } 893 parity_pages = 1 + !!(sh->qd_idx >= 0); 894 data_pages = write_disks - parity_pages; 895	464 write_disks++; 465 /* checksum is already calculated in last run */ 466 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 467 continue; 468 addr = kmap_atomic(sh->dev[i].page); 469 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 470 addr, PAGE_SIZE); 471 kunmap_atomic(addr); 472 } 473 parity_pages = 1 + !!(sh->qd_idx >= 0); 474 data_pages = write_disks - parity_pages; 475
	476 meta_size = 477 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) 478 * data_pages) + 479 sizeof(struct r5l_payload_data_parity) + 480 sizeof(__le32) * parity_pages; 481 /* Doesn't work with very big raid array */ 482 if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE) 483 return -EINVAL; 484
896 set_bit(STRIPE_LOG_TRAPPED, &sh->state); 897 /* 898 * The stripe must enter state machine again to finish the write, so 899 * don't delay. 900 / 901 clear_bit(STRIPE_DELAYED, &sh->state); 902 atomic_inc(&sh->count); 903 904 mutex_lock(&log->io_mutex); 905 / meta + data */ 906 reserve = (1 + write_disks) << (PAGE_SHIFT - 9);	485 set_bit(STRIPE_LOG_TRAPPED, &sh->state); 486 /* 487 * The stripe must enter state machine again to finish the write, so 488 * don't delay. 489 / 490 clear_bit(STRIPE_DELAYED, &sh->state); 491 atomic_inc(&sh->count); 492 493 mutex_lock(&log->io_mutex); 494 / meta + data */ 495 reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
	496 if (!r5l_has_free_space(log, reserve)) { 497 spin_lock(&log->no_space_stripes_lock); 498 list_add_tail(&sh->log_list, &log->no_space_stripes); 499 spin_unlock(&log->no_space_stripes_lock);
907	500
908 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 909 if (!r5l_has_free_space(log, reserve)) { 910 r5l_add_no_space_stripe(log, sh); 911 wake_reclaim = true; 912 } else { 913 ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 914 if (ret) { 915 spin_lock_irq(&log->io_list_lock); 916 list_add_tail(&sh->log_list, 917 &log->no_mem_stripes); 918 spin_unlock_irq(&log->io_list_lock); 919 }	501 r5l_wake_reclaim(log, reserve); 502 } else { 503 ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 504 if (ret) { 505 spin_lock_irq(&log->io_list_lock); 506 list_add_tail(&sh->log_list, &log->no_mem_stripes); 507 spin_unlock_irq(&log->io_list_lock);
920 }	508 }
921 } else { /* R5C_JOURNAL_MODE_WRITE_BACK / 922 / 923 * log space critical, do not process stripes that are 924 * not in cache yet (sh->log_start == MaxSector). 925 */ 926 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 927 sh->log_start == MaxSector) { 928 r5l_add_no_space_stripe(log, sh); 929 wake_reclaim = true; 930 reserve = 0; 931 } else if (!r5l_has_free_space(log, reserve)) { 932 if (sh->log_start == log->last_checkpoint) 933 BUG(); 934 else 935 r5l_add_no_space_stripe(log, sh); 936 } else { 937 ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 938 if (ret) { 939 spin_lock_irq(&log->io_list_lock); 940 list_add_tail(&sh->log_list, 941 &log->no_mem_stripes); 942 spin_unlock_irq(&log->io_list_lock); 943 } 944 }
945 } 946 947 mutex_unlock(&log->io_mutex);	509 } 510 511 mutex_unlock(&log->io_mutex);
948 if (wake_reclaim) 949 r5l_wake_reclaim(log, reserve);
950 return 0; 951} 952 953void r5l_write_stripe_run(struct r5l_log log) 954{ 955 if (!log) 956 return; 957 mutex_lock(&log->io_mutex); 958 r5l_submit_current_io(log); 959 mutex_unlock(&log->io_mutex); 960} 961 962int r5l_handle_flush_request(struct r5l_log log, struct bio *bio) 963{ 964 if (!log) 965 return -ENODEV;	512 return 0; 513} 514 515void r5l_write_stripe_run(struct r5l_log log) 516{ 517 if (!log) 518 return; 519 mutex_lock(&log->io_mutex); 520 r5l_submit_current_io(log); 521 mutex_unlock(&log->io_mutex); 522} 523 524int r5l_handle_flush_request(struct r5l_log log, struct bio *bio) 525{ 526 if (!log) 527 return -ENODEV;
966 967 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 968 /* 969 * in write through (journal only) 970 * we flush log disk cache first, then write stripe data to 971 * raid disks. So if bio is finished, the log disk cache is 972 * flushed already. The recovery guarantees we can recovery 973 * the bio from log disk, so we don't need to flush again 974 / 975 if (bio->bi_iter.bi_size == 0) { 976 bio_endio(bio); 977 return 0; 978 } 979 bio->bi_opf &= ~REQ_PREFLUSH; 980 } else { 981 / write back (with cache) */ 982 if (bio->bi_iter.bi_size == 0) { 983 mutex_lock(&log->io_mutex); 984 r5l_get_meta(log, 0); 985 bio_list_add(&log->current_io->flush_barriers, bio); 986 log->current_io->has_flush = 1; 987 log->current_io->has_null_flush = 1; 988 atomic_inc(&log->current_io->pending_stripe); 989 r5l_submit_current_io(log); 990 mutex_unlock(&log->io_mutex); 991 return 0; 992 }	528 /* 529 * we flush log disk cache first, then write stripe data to raid disks. 530 * So if bio is finished, the log disk cache is flushed already. The 531 * recovery guarantees we can recovery the bio from log disk, so we 532 * don't need to flush again 533 */ 534 if (bio->bi_iter.bi_size == 0) { 535 bio_endio(bio); 536 return 0;
993 }	537 }
	538 bio->bi_opf &= ~REQ_PREFLUSH;
994 return -EAGAIN; 995} 996 997/* This will run after log space is reclaimed / 998static void r5l_run_no_space_stripes(struct r5l_log log) 999{ 1000 struct stripe_head *sh; 1001 1002 spin_lock(&log->no_space_stripes_lock); 1003 while (!list_empty(&log->no_space_stripes)) { 1004 sh = list_first_entry(&log->no_space_stripes, 1005 struct stripe_head, log_list); 1006 list_del_init(&sh->log_list); 1007 set_bit(STRIPE_HANDLE, &sh->state); 1008 raid5_release_stripe(sh); 1009 } 1010 spin_unlock(&log->no_space_stripes_lock); 1011} 1012	539 return -EAGAIN; 540} 541 542/* This will run after log space is reclaimed / 543static void r5l_run_no_space_stripes(struct r5l_log log) 544{ 545 struct stripe_head *sh; 546 547 spin_lock(&log->no_space_stripes_lock); 548 while (!list_empty(&log->no_space_stripes)) { 549 sh = list_first_entry(&log->no_space_stripes, 550 struct stripe_head, log_list); 551 list_del_init(&sh->log_list); 552 set_bit(STRIPE_HANDLE, &sh->state); 553 raid5_release_stripe(sh); 554 } 555 spin_unlock(&log->no_space_stripes_lock); 556} 557
1013/* 1014 * calculate new last_checkpoint 1015 * for write through mode, returns log->next_checkpoint 1016 * for write back, returns log_start of first sh in stripe_in_journal_list 1017 / 1018static sector_t r5c_calculate_new_cp(struct r5conf conf) 1019{ 1020 struct stripe_head sh; 1021 struct r5l_log log = conf->log; 1022 sector_t new_cp; 1023 unsigned long flags; 1024 1025 if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 1026 return log->next_checkpoint; 1027 1028 spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 1029 if (list_empty(&conf->log->stripe_in_journal_list)) { 1030 /* all stripes flushed */ 1031 spin_unlock(&log->stripe_in_journal_lock); 1032 return log->next_checkpoint; 1033 } 1034 sh = list_first_entry(&conf->log->stripe_in_journal_list, 1035 struct stripe_head, r5c); 1036 new_cp = sh->log_start; 1037 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1038 return new_cp; 1039} 1040
1041static sector_t r5l_reclaimable_space(struct r5l_log *log) 1042{	558static sector_t r5l_reclaimable_space(struct r5l_log *log) 559{
1043 struct r5conf *conf = log->rdev->mddev->private; 1044
1045 return r5l_ring_distance(log, log->last_checkpoint,	560 return r5l_ring_distance(log, log->last_checkpoint,
1046 r5c_calculate_new_cp(conf));	561 log->next_checkpoint);
1047} 1048 1049static void r5l_run_no_mem_stripe(struct r5l_log log) 1050{ 1051 struct stripe_head sh; 1052 1053 assert_spin_locked(&log->io_list_lock); 1054 --- 29 unchanged lines hidden (view full) --- 1084 } 1085 1086 return found; 1087} 1088 1089static void __r5l_stripe_write_finished(struct r5l_io_unit io) 1090{ 1091 struct r5l_log log = io->log;	562} 563 564static void r5l_run_no_mem_stripe(struct r5l_log log) 565{ 566 struct stripe_head sh; 567 568 assert_spin_locked(&log->io_list_lock); 569 --- 29 unchanged lines hidden (view full) --- 599 } 600 601 return found; 602} 603 604static void __r5l_stripe_write_finished(struct r5l_io_unit io) 605{ 606 struct r5l_log log = io->log;
1092 struct r5conf *conf = log->rdev->mddev->private;
1093 unsigned long flags; 1094 1095 spin_lock_irqsave(&log->io_list_lock, flags); 1096 __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END); 1097 1098 if (!r5l_complete_finished_ios(log)) { 1099 spin_unlock_irqrestore(&log->io_list_lock, flags); 1100 return; 1101 } 1102	607 unsigned long flags; 608 609 spin_lock_irqsave(&log->io_list_lock, flags); 610 __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END); 611 612 if (!r5l_complete_finished_ios(log)) { 613 spin_unlock_irqrestore(&log->io_list_lock, flags); 614 return; 615 } 616
1103 if (r5l_reclaimable_space(log) > log->max_free_space \|\| 1104 test_bit(R5C_LOG_TIGHT, &conf->cache_state))	617 if (r5l_reclaimable_space(log) > log->max_free_space)
1105 r5l_wake_reclaim(log, 0); 1106 1107 spin_unlock_irqrestore(&log->io_list_lock, flags); 1108 wake_up(&log->iounit_wait); 1109} 1110 1111void r5l_stripe_write_finished(struct stripe_head sh) 1112{ --- 54 unchanged lines hidden* (view full) --- 1167 do_flush = !list_empty(&log->flushing_ios); 1168 spin_unlock_irq(&log->io_list_lock); 1169 1170 if (!do_flush) 1171 return; 1172 bio_reset(&log->flush_bio); 1173 log->flush_bio.bi_bdev = log->rdev->bdev; 1174 log->flush_bio.bi_end_io = r5l_log_flush_endio;	618 r5l_wake_reclaim(log, 0); 619 620 spin_unlock_irqrestore(&log->io_list_lock, flags); 621 wake_up(&log->iounit_wait); 622} 623 624void r5l_stripe_write_finished(struct stripe_head sh) 625{ --- 54 unchanged lines hidden* (view full) --- 680 do_flush = !list_empty(&log->flushing_ios); 681 spin_unlock_irq(&log->io_list_lock); 682 683 if (!do_flush) 684 return; 685 bio_reset(&log->flush_bio); 686 log->flush_bio.bi_bdev = log->rdev->bdev; 687 log->flush_bio.bi_end_io = r5l_log_flush_endio;
1175 bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);	688 log->flush_bio.bi_opf = REQ_OP_WRITE \| REQ_PREFLUSH;
1176 submit_bio(&log->flush_bio); 1177} 1178 1179static void r5l_write_super(struct r5l_log log, sector_t cp); 1180static void r5l_write_super_and_discard_space(struct r5l_log log, 1181 sector_t end) 1182{ 1183 struct block_device bdev = log->rdev->bdev; --- 33 unchanged lines hidden* (view full) --- 1217 log->last_checkpoint + log->rdev->data_offset, 1218 log->device_size - log->last_checkpoint, 1219 GFP_NOIO, 0); 1220 blkdev_issue_discard(bdev, log->rdev->data_offset, end, 1221 GFP_NOIO, 0); 1222 } 1223} 1224	689 submit_bio(&log->flush_bio); 690} 691 692static void r5l_write_super(struct r5l_log log, sector_t cp); 693static void r5l_write_super_and_discard_space(struct r5l_log log, 694 sector_t end) 695{ 696 struct block_device bdev = log->rdev->bdev; --- 33 unchanged lines hidden* (view full) --- 730 log->last_checkpoint + log->rdev->data_offset, 731 log->device_size - log->last_checkpoint, 732 GFP_NOIO, 0); 733 blkdev_issue_discard(bdev, log->rdev->data_offset, end, 734 GFP_NOIO, 0); 735 } 736} 737
1225/* 1226 * r5c_flush_stripe moves stripe from cached list to handle_list. When called, 1227 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes. 1228 * 1229 * must hold conf->device_lock 1230 / 1231static void r5c_flush_stripe(struct r5conf conf, struct stripe_head *sh) 1232{ 1233 BUG_ON(list_empty(&sh->lru)); 1234 BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 1235 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
1236	738
1237 /* 1238 * The stripe is not ON_RELEASE_LIST, so it is safe to call 1239 * raid5_release_stripe() while holding conf->device_lock 1240 / 1241 BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); 1242 assert_spin_locked(&conf->device_lock); 1243 1244 list_del_init(&sh->lru); 1245 atomic_inc(&sh->count); 1246 1247 set_bit(STRIPE_HANDLE, &sh->state); 1248 atomic_inc(&conf->active_stripes); 1249 r5c_make_stripe_write_out(sh); 1250 1251 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 1252 atomic_inc(&conf->preread_active_stripes); 1253 raid5_release_stripe(sh); 1254} 1255 1256/ 1257 * if num == 0, flush all full stripes 1258 * if num > 0, flush all full stripes. If less than num full stripes are 1259 * flushed, flush some partial stripes until totally num stripes are 1260 * flushed or there is no more cached stripes. 1261 / 1262void r5c_flush_cache(struct r5conf conf, int num) 1263{ 1264 int count; 1265 struct stripe_head sh, next; 1266 1267 assert_spin_locked(&conf->device_lock); 1268 if (!conf->log) 1269 return; 1270 1271 count = 0; 1272 list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) { 1273 r5c_flush_stripe(conf, sh); 1274 count++; 1275 } 1276 1277 if (count >= num) 1278 return; 1279 list_for_each_entry_safe(sh, next, 1280 &conf->r5c_partial_stripe_list, lru) { 1281 r5c_flush_stripe(conf, sh); 1282 if (++count >= num) 1283 break; 1284 } 1285} 1286 1287static void r5c_do_reclaim(struct r5conf conf) 1288{ 1289 struct r5l_log log = conf->log; 1290 struct stripe_head sh; 1291 int count = 0; 1292 unsigned long flags; 1293 int total_cached; 1294 int stripes_to_flush; 1295 1296 if (!r5c_is_writeback(log)) 1297 return; 1298 1299 total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 1300 atomic_read(&conf->r5c_cached_full_stripes); 1301 1302 if (total_cached > conf->min_nr_stripes 3 / 4 \|\| 1303 atomic_read(&conf->empty_inactive_list_nr) > 0) 1304 /* 1305 * if stripe cache pressure high, flush all full stripes and 1306 * some partial stripes 1307 / 1308 stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; 1309 else if (total_cached > conf->min_nr_stripes 1 / 2 \|\| 1310 atomic_read(&conf->r5c_cached_full_stripes) > 1311 R5C_FULL_STRIPE_FLUSH_BATCH) 1312 /* 1313 * if stripe cache pressure moderate, or if there is many full 1314 * stripes,flush all full stripes 1315 / 1316 stripes_to_flush = 0; 1317 else 1318 / no need to flush / 1319 stripes_to_flush = -1; 1320 1321 if (stripes_to_flush >= 0) { 1322 spin_lock_irqsave(&conf->device_lock, flags); 1323 r5c_flush_cache(conf, stripes_to_flush); 1324 spin_unlock_irqrestore(&conf->device_lock, flags); 1325 } 1326 1327 / if log space is tight, flush stripes on stripe_in_journal_list / 1328 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) { 1329 spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 1330 spin_lock(&conf->device_lock); 1331 list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) { 1332 / 1333 * stripes on stripe_in_journal_list could be in any 1334 * state of the stripe_cache state machine. In this 1335 * case, we only want to flush stripe on 1336 * r5c_cached_full/partial_stripes. The following 1337 * condition makes sure the stripe is on one of the 1338 * two lists. 1339 */ 1340 if (!list_empty(&sh->lru) && 1341 !test_bit(STRIPE_HANDLE, &sh->state) && 1342 atomic_read(&sh->count) == 0) { 1343 r5c_flush_stripe(conf, sh); 1344 } 1345 if (count++ >= R5C_RECLAIM_STRIPE_GROUP) 1346 break; 1347 } 1348 spin_unlock(&conf->device_lock); 1349 spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1350 } 1351 md_wakeup_thread(conf->mddev->thread); 1352} 1353
1354static void r5l_do_reclaim(struct r5l_log *log) 1355{	739static void r5l_do_reclaim(struct r5l_log *log) 740{
1356 struct r5conf *conf = log->rdev->mddev->private;
1357 sector_t reclaim_target = xchg(&log->reclaim_target, 0); 1358 sector_t reclaimable; 1359 sector_t next_checkpoint;	741 sector_t reclaim_target = xchg(&log->reclaim_target, 0); 742 sector_t reclaimable; 743 sector_t next_checkpoint;
1360 bool write_super;	744 u64 next_cp_seq;
1361 1362 spin_lock_irq(&log->io_list_lock);	745 746 spin_lock_irq(&log->io_list_lock);
1363 write_super = r5l_reclaimable_space(log) > log->max_free_space \|\| 1364 reclaim_target != 0 \|\| !list_empty(&log->no_space_stripes);
1365 /* 1366 * move proper io_unit to reclaim list. We should not change the order. 1367 * reclaimable/unreclaimable io_unit can be mixed in the list, we 1368 * shouldn't reuse space of an unreclaimable io_unit 1369 / 1370 while (1) { 1371 reclaimable = r5l_reclaimable_space(log); 1372 if (reclaimable >= reclaim_target \|\| --- 4 unchanged lines hidden* (view full) --- 1377 break; 1378 1379 md_wakeup_thread(log->rdev->mddev->thread); 1380 wait_event_lock_irq(log->iounit_wait, 1381 r5l_reclaimable_space(log) > reclaimable, 1382 log->io_list_lock); 1383 } 1384	747 /* 748 * move proper io_unit to reclaim list. We should not change the order. 749 * reclaimable/unreclaimable io_unit can be mixed in the list, we 750 * shouldn't reuse space of an unreclaimable io_unit 751 / 752 while (1) { 753 reclaimable = r5l_reclaimable_space(log); 754 if (reclaimable >= reclaim_target \|\| --- 4 unchanged lines hidden* (view full) --- 759 break; 760 761 md_wakeup_thread(log->rdev->mddev->thread); 762 wait_event_lock_irq(log->iounit_wait, 763 r5l_reclaimable_space(log) > reclaimable, 764 log->io_list_lock); 765 } 766
1385 next_checkpoint = r5c_calculate_new_cp(conf);	767 next_checkpoint = log->next_checkpoint; 768 next_cp_seq = log->next_cp_seq;
1386 spin_unlock_irq(&log->io_list_lock); 1387 1388 BUG_ON(reclaimable < 0);	769 spin_unlock_irq(&log->io_list_lock); 770 771 BUG_ON(reclaimable < 0);
1389 1390 if (reclaimable == 0 \|\| !write_super)	772 if (reclaimable == 0)
1391 return; 1392 1393 /* 1394 * write_super will flush cache of each raid disk. We must write super 1395 * here, because the log area might be reused soon and we don't want to 1396 * confuse recovery 1397 */ 1398 r5l_write_super_and_discard_space(log, next_checkpoint); 1399 1400 mutex_lock(&log->io_mutex); 1401 log->last_checkpoint = next_checkpoint;	773 return; 774 775 /* 776 * write_super will flush cache of each raid disk. We must write super 777 * here, because the log area might be reused soon and we don't want to 778 * confuse recovery 779 */ 780 r5l_write_super_and_discard_space(log, next_checkpoint); 781 782 mutex_lock(&log->io_mutex); 783 log->last_checkpoint = next_checkpoint;
1402 r5c_update_log_state(log);	784 log->last_cp_seq = next_cp_seq;
1403 mutex_unlock(&log->io_mutex); 1404 1405 r5l_run_no_space_stripes(log); 1406} 1407 1408static void r5l_reclaim_thread(struct md_thread thread) 1409{ 1410 struct mddev mddev = thread->mddev; 1411 struct r5conf conf = mddev->private; 1412 struct r5l_log log = conf->log; 1413 1414 if (!log) 1415 return;	785 mutex_unlock(&log->io_mutex); 786 787 r5l_run_no_space_stripes(log); 788} 789 790static void r5l_reclaim_thread(struct md_thread thread) 791{ 792 struct mddev mddev = thread->mddev; 793 struct r5conf conf = mddev->private; 794 struct r5l_log log = conf->log; 795 796 if (!log) 797 return;
1416 r5c_do_reclaim(conf);
1417 r5l_do_reclaim(log); 1418} 1419	798 r5l_do_reclaim(log); 799} 800
1420void r5l_wake_reclaim(struct r5l_log *log, sector_t space)	801static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
1421{ 1422 unsigned long target; 1423 unsigned long new = (unsigned long)space; /* overflow in theory */ 1424	802{ 803 unsigned long target; 804 unsigned long new = (unsigned long)space; /* overflow in theory */ 805
1425 if (!log) 1426 return;
1427 do { 1428 target = log->reclaim_target; 1429 if (new < target) 1430 return; 1431 } while (cmpxchg(&log->reclaim_target, target, new) != target); 1432 md_wakeup_thread(log->reclaim_thread); 1433} 1434 --- 7 unchanged lines hidden (view full) --- 1442 * This is a special case for hotadd. In suspend, the array has 1443 * no journal. In resume, journal is initialized as well as the 1444 * reclaim thread. 1445 */ 1446 if (log->reclaim_thread) 1447 return; 1448 log->reclaim_thread = md_register_thread(r5l_reclaim_thread, 1449 log->rdev->mddev, "reclaim");	806 do { 807 target = log->reclaim_target; 808 if (new < target) 809 return; 810 } while (cmpxchg(&log->reclaim_target, target, new) != target); 811 md_wakeup_thread(log->reclaim_thread); 812} 813 --- 7 unchanged lines hidden (view full) --- 821 * This is a special case for hotadd. In suspend, the array has 822 * no journal. In resume, journal is initialized as well as the 823 * reclaim thread. 824 */ 825 if (log->reclaim_thread) 826 return; 827 log->reclaim_thread = md_register_thread(r5l_reclaim_thread, 828 log->rdev->mddev, "reclaim");
1450 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
1451 } else if (state == 1) { 1452 /* make sure r5l_write_super_and_discard_space exits */ 1453 mddev = log->rdev->mddev; 1454 wake_up(&mddev->sb_wait);	829 } else if (state == 1) { 830 /* make sure r5l_write_super_and_discard_space exits */ 831 mddev = log->rdev->mddev; 832 wake_up(&mddev->sb_wait);
1455 r5l_wake_reclaim(log, MaxSector);	833 r5l_wake_reclaim(log, -1L);
1456 md_unregister_thread(&log->reclaim_thread); 1457 r5l_do_reclaim(log); 1458 } 1459} 1460 1461bool r5l_log_disk_error(struct r5conf conf) 1462{ 1463 struct r5l_log log; --- 10 unchanged lines hidden (view full) --- 1474 return ret; 1475} 1476 1477struct r5l_recovery_ctx { 1478 struct page meta_page; / current meta / 1479 sector_t meta_total_blocks; / total size of current meta and data / 1480 sector_t pos; / recovery position / 1481 u64 seq; / recovery position seq */	834 md_unregister_thread(&log->reclaim_thread); 835 r5l_do_reclaim(log); 836 } 837} 838 839bool r5l_log_disk_error(struct r5conf conf) 840{ 841 struct r5l_log log; --- 10 unchanged lines hidden (view full) --- 852 return ret; 853} 854 855struct r5l_recovery_ctx { 856 struct page meta_page; / current meta / 857 sector_t meta_total_blocks; / total size of current meta and data / 858 sector_t pos; / recovery position / 859 u64 seq; / recovery position seq */
1482 int data_parity_stripes; /* number of data_parity stripes / 1483 int data_only_stripes; / number of data_only stripes */ 1484 struct list_head cached_list;
1485}; 1486	860}; 861
1487static int r5l_recovery_read_meta_block(struct r5l_log log, 1488 struct r5l_recovery_ctx ctx)	862static int r5l_read_meta_block(struct r5l_log log, 863 struct r5l_recovery_ctx ctx)
1489{ 1490 struct page page = ctx->meta_page; 1491 struct r5l_meta_block mb; 1492 u32 crc, stored_crc; 1493 1494 if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0, 1495 false)) 1496 return -EIO; --- 15 unchanged lines hidden (view full) --- 1512 if (le32_to_cpu(mb->meta_size) > PAGE_SIZE) 1513 return -EINVAL; 1514 1515 ctx->meta_total_blocks = BLOCK_SECTORS; 1516 1517 return 0; 1518} 1519	864{ 865 struct page page = ctx->meta_page; 866 struct r5l_meta_block mb; 867 u32 crc, stored_crc; 868 869 if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0, 870 false)) 871 return -EIO; --- 15 unchanged lines hidden (view full) --- 887 if (le32_to_cpu(mb->meta_size) > PAGE_SIZE) 888 return -EINVAL; 889 890 ctx->meta_total_blocks = BLOCK_SECTORS; 891 892 return 0; 893} 894
1520static void 1521r5l_recovery_create_empty_meta_block(struct r5l_log log, 1522 struct page page, 1523 sector_t pos, u64 seq)	895static int r5l_recovery_flush_one_stripe(struct r5l_log log, 896 struct r5l_recovery_ctx ctx, 897 sector_t stripe_sect, 898 int offset, sector_t log_offset)
1524{	899{
1525 struct r5l_meta_block *mb; 1526 u32 crc;	900 struct r5conf conf = log->rdev->mddev->private; 901 struct stripe_head sh; 902 struct r5l_payload_data_parity *payload; 903 int disk_index;
1527	904
1528 mb = page_address(page); 1529 clear_page(mb); 1530 mb->magic = cpu_to_le32(R5LOG_MAGIC); 1531 mb->version = R5LOG_VERSION; 1532 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); 1533 mb->seq = cpu_to_le64(seq); 1534 mb->position = cpu_to_le64(pos); 1535 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 1536 mb->checksum = cpu_to_le32(crc); 1537}	905 sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0); 906 while (1) { 907 payload = page_address(ctx->meta_page) + *offset;
1538	908
1539static int r5l_log_write_empty_meta_block(struct r5l_log log, sector_t pos, 1540 u64 seq) 1541{ 1542 struct page page;	909 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { 910 raid5_compute_sector(conf, 911 le64_to_cpu(payload->location), 0, 912 &disk_index, sh);
1543	913
1544 page = alloc_page(GFP_KERNEL); 1545 if (!page) 1546 return -ENOMEM; 1547 r5l_recovery_create_empty_meta_block(log, page, pos, seq); 1548 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, 1549 WRITE_FUA, false)) { 1550 __free_page(page); 1551 return -EIO; 1552 } 1553 __free_page(page); 1554 return 0; 1555}	914 sync_page_io(log->rdev, log_offset, PAGE_SIZE, 915 sh->dev[disk_index].page, REQ_OP_READ, 0, 916 false); 917 sh->dev[disk_index].log_checksum = 918 le32_to_cpu(payload->checksum[0]); 919 set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); 920 ctx->meta_total_blocks += BLOCK_SECTORS; 921 } else { 922 disk_index = sh->pd_idx; 923 sync_page_io(log->rdev, log_offset, PAGE_SIZE, 924 sh->dev[disk_index].page, REQ_OP_READ, 0, 925 false); 926 sh->dev[disk_index].log_checksum = 927 le32_to_cpu(payload->checksum[0]); 928 set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
1556	929
1557/* 1558 * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite 1559 * to mark valid (potentially not flushed) data in the journal. 1560 * 1561 * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb, 1562 * so there should not be any mismatch here. 1563 / 1564static void r5l_recovery_load_data(struct r5l_log log, 1565 struct stripe_head sh, 1566 struct r5l_recovery_ctx ctx, 1567 struct r5l_payload_data_parity payload, 1568 sector_t log_offset) 1569{ 1570 struct mddev mddev = log->rdev->mddev; 1571 struct r5conf *conf = mddev->private; 1572 int dd_idx;	930 if (sh->qd_idx >= 0) { 931 disk_index = sh->qd_idx; 932 sync_page_io(log->rdev, 933 r5l_ring_add(log, log_offset, BLOCK_SECTORS), 934 PAGE_SIZE, sh->dev[disk_index].page, 935 REQ_OP_READ, 0, false); 936 sh->dev[disk_index].log_checksum = 937 le32_to_cpu(payload->checksum[1]); 938 set_bit(R5_Wantwrite, 939 &sh->dev[disk_index].flags); 940 } 941 ctx->meta_total_blocks += BLOCK_SECTORS conf->max_degraded; 942 }
1573	943
1574 raid5_compute_sector(conf, 1575 le64_to_cpu(payload->location), 0, 1576 &dd_idx, sh); 1577 sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1578 sh->dev[dd_idx].page, REQ_OP_READ, 0, false); 1579 sh->dev[dd_idx].log_checksum = 1580 le32_to_cpu(payload->checksum[0]); 1581 ctx->meta_total_blocks += BLOCK_SECTORS; 1582 1583 set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags); 1584 set_bit(STRIPE_R5C_CACHING, &sh->state); 1585} 1586 1587static void r5l_recovery_load_parity(struct r5l_log log, 1588 struct stripe_head sh, 1589 struct r5l_recovery_ctx ctx, 1590 struct r5l_payload_data_parity payload, 1591 sector_t log_offset) 1592{ 1593 struct mddev mddev = log->rdev->mddev; 1594 struct r5conf conf = mddev->private; 1595 1596 ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; 1597 sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1598 sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false); 1599 sh->dev[sh->pd_idx].log_checksum = 1600 le32_to_cpu(payload->checksum[0]); 1601 set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags); 1602 1603 if (sh->qd_idx >= 0) { 1604 sync_page_io(log->rdev, 1605 r5l_ring_add(log, log_offset, BLOCK_SECTORS), 1606 PAGE_SIZE, sh->dev[sh->qd_idx].page, 1607 REQ_OP_READ, 0, false); 1608 sh->dev[sh->qd_idx].log_checksum = 1609 le32_to_cpu(payload->checksum[1]); 1610 set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);	944 log_offset = r5l_ring_add(log, log_offset, 945 le32_to_cpu(payload->size)); 946 offset += sizeof(struct r5l_payload_data_parity) + 947 sizeof(__le32) 948 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); 949 if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) 950 break;
1611 }	951 }
1612 clear_bit(STRIPE_R5C_CACHING, &sh->state); 1613}
1614	952
1615static void r5l_recovery_reset_stripe(struct stripe_head sh) 1616{ 1617 int i; 1618 1619 sh->state = 0; 1620 sh->log_start = MaxSector; 1621 for (i = sh->disks; i--; ) 1622 sh->dev[i].flags = 0; 1623} 1624 1625static void 1626r5l_recovery_replay_one_stripe(struct r5conf conf, 1627 struct stripe_head sh, 1628 struct r5l_recovery_ctx ctx) 1629{ 1630 struct md_rdev rdev, rrdev; 1631 int disk_index; 1632 int data_count = 0; 1633
1634 for (disk_index = 0; disk_index < sh->disks; disk_index++) {	953 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
	954 void *addr; 955 u32 checksum; 956
1635 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 1636 continue;	957 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 958 continue;
1637 if (disk_index == sh->qd_idx \|\| disk_index == sh->pd_idx) 1638 continue; 1639 data_count++;	959 addr = kmap_atomic(sh->dev[disk_index].page); 960 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); 961 kunmap_atomic(addr); 962 if (checksum != sh->dev[disk_index].log_checksum) 963 goto error;
1640 } 1641	964 } 965
1642 /* 1643 * stripes that only have parity must have been flushed 1644 * before the crash that we are now recovering from, so 1645 * there is nothing more to recovery. 1646 */ 1647 if (data_count == 0) 1648 goto out; 1649
1650 for (disk_index = 0; disk_index < sh->disks; disk_index++) {	966 for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1651 if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))	967 struct md_rdev rdev, rrdev; 968 969 if (!test_and_clear_bit(R5_Wantwrite, 970 &sh->dev[disk_index].flags))
1652 continue; 1653 1654 /* in case device is broken */	971 continue; 972 973 /* in case device is broken */
1655 rcu_read_lock();
1656 rdev = rcu_dereference(conf->disks[disk_index].rdev);	974 rdev = rcu_dereference(conf->disks[disk_index].rdev);
1657 if (rdev) { 1658 atomic_inc(&rdev->nr_pending); 1659 rcu_read_unlock(); 1660 sync_page_io(rdev, sh->sector, PAGE_SIZE,	975 if (rdev) 976 sync_page_io(rdev, stripe_sect, PAGE_SIZE,
1661 sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1662 false);	977 sh->dev[disk_index].page, REQ_OP_WRITE, 0, 978 false);
1663 rdev_dec_pending(rdev, rdev->mddev); 1664 rcu_read_lock(); 1665 }
1666 rrdev = rcu_dereference(conf->disks[disk_index].replacement);	979 rrdev = rcu_dereference(conf->disks[disk_index].replacement);
1667 if (rrdev) { 1668 atomic_inc(&rrdev->nr_pending); 1669 rcu_read_unlock(); 1670 sync_page_io(rrdev, sh->sector, PAGE_SIZE,	980 if (rrdev) 981 sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
1671 sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1672 false);	982 sh->dev[disk_index].page, REQ_OP_WRITE, 0, 983 false);
1673 rdev_dec_pending(rrdev, rrdev->mddev); 1674 rcu_read_lock(); 1675 } 1676 rcu_read_unlock();
1677 }	984 }
1678 ctx->data_parity_stripes++; 1679out: 1680 r5l_recovery_reset_stripe(sh); 1681} 1682 1683static struct stripe_head * 1684r5c_recovery_alloc_stripe(struct r5conf conf, 1685 struct list_head recovery_list, 1686 sector_t stripe_sect, 1687 sector_t log_start) 1688{ 1689 struct stripe_head sh; 1690 1691 sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0); 1692 if (!sh) 1693 return NULL; / no more stripe available / 1694 1695 r5l_recovery_reset_stripe(sh); 1696 sh->log_start = log_start; 1697 1698 return sh; 1699} 1700 1701static struct stripe_head 1702r5c_recovery_lookup_stripe(struct list_head list, sector_t sect) 1703{ 1704 struct stripe_head sh; 1705 1706 list_for_each_entry(sh, list, lru) 1707 if (sh->sector == sect) 1708 return sh; 1709 return NULL; 1710} 1711 1712static void 1713r5c_recovery_drop_stripes(struct list_head cached_stripe_list, 1714 struct r5l_recovery_ctx ctx) 1715{ 1716 struct stripe_head sh, next; 1717 1718 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) { 1719 r5l_recovery_reset_stripe(sh); 1720 list_del_init(&sh->lru); 1721 raid5_release_stripe(sh); 1722 } 1723} 1724 1725static void 1726r5c_recovery_replay_stripes(struct list_head cached_stripe_list, 1727 struct r5l_recovery_ctx ctx) 1728{ 1729 struct stripe_head sh, next; 1730 1731 list_for_each_entry_safe(sh, next, cached_stripe_list, lru) 1732 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 1733 r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx); 1734 list_del_init(&sh->lru); 1735 raid5_release_stripe(sh); 1736 } 1737} 1738 1739/* if matches return 0; otherwise return -EINVAL / 1740static int 1741r5l_recovery_verify_data_checksum(struct r5l_log log, struct page page, 1742 sector_t log_offset, __le32 log_checksum) 1743{ 1744 void addr; 1745 u32 checksum; 1746 1747 sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1748 page, REQ_OP_READ, 0, false); 1749 addr = kmap_atomic(page); 1750 checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); 1751 kunmap_atomic(addr); 1752 return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL; 1753} 1754 1755/* 1756 * before loading data to stripe cache, we need verify checksum for all data, 1757 * if there is mismatch for any data page, we drop all data in the mata block 1758 / 1759static int 1760r5l_recovery_verify_data_checksum_for_mb(struct r5l_log log, 1761 struct r5l_recovery_ctx ctx) 1762{ 1763 struct mddev mddev = log->rdev->mddev; 1764 struct r5conf conf = mddev->private; 1765 struct r5l_meta_block mb = page_address(ctx->meta_page); 1766 sector_t mb_offset = sizeof(struct r5l_meta_block); 1767 sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 1768 struct page page; 1769 struct r5l_payload_data_parity payload; 1770 1771 page = alloc_page(GFP_KERNEL); 1772 if (!page) 1773 return -ENOMEM; 1774 1775 while (mb_offset < le32_to_cpu(mb->meta_size)) { 1776 payload = (void )mb + mb_offset; 1777 1778 if (payload->header.type == R5LOG_PAYLOAD_DATA) { 1779 if (r5l_recovery_verify_data_checksum( 1780 log, page, log_offset, 1781 payload->checksum[0]) < 0) 1782 goto mismatch; 1783 } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) { 1784 if (r5l_recovery_verify_data_checksum( 1785 log, page, log_offset, 1786 payload->checksum[0]) < 0) 1787 goto mismatch; 1788 if (conf->max_degraded == 2 && / q for RAID 6 / 1789 r5l_recovery_verify_data_checksum( 1790 log, page, 1791 r5l_ring_add(log, log_offset, 1792 BLOCK_SECTORS), 1793 payload->checksum[1]) < 0) 1794 goto mismatch; 1795 } else / not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY / 1796 goto mismatch; 1797 1798 log_offset = r5l_ring_add(log, log_offset, 1799 le32_to_cpu(payload->size)); 1800 1801 mb_offset += sizeof(struct r5l_payload_data_parity) + 1802 sizeof(__le32) 1803 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); 1804 } 1805 1806 put_page(page);	985 raid5_release_stripe(sh);
1807 return 0; 1808	986 return 0; 987
1809mismatch: 1810 put_page(page);	988error: 989 for (disk_index = 0; disk_index < sh->disks; disk_index++) 990 sh->dev[disk_index].flags = 0; 991 raid5_release_stripe(sh);
1811 return -EINVAL; 1812} 1813	992 return -EINVAL; 993} 994
1814/* 1815 * Analyze all data/parity pages in one meta block 1816 * Returns: 1817 * 0 for success 1818 * -EINVAL for unknown playload type 1819 * -EAGAIN for checksum mismatch of data page 1820 * -ENOMEM for run out of memory (alloc_page failed or run out of stripes) 1821 / 1822static int 1823r5c_recovery_analyze_meta_block(struct r5l_log log, 1824 struct r5l_recovery_ctx ctx, 1825 struct list_head cached_stripe_list)	995static int r5l_recovery_flush_one_meta(struct r5l_log log, 996 struct r5l_recovery_ctx ctx)
1826{	997{
1827 struct mddev mddev = log->rdev->mddev; 1828 struct r5conf conf = mddev->private; 1829 struct r5l_meta_block *mb;	998 struct r5conf *conf = log->rdev->mddev->private;
1830 struct r5l_payload_data_parity *payload;	999 struct r5l_payload_data_parity *payload;
1831 int mb_offset;	1000 struct r5l_meta_block *mb; 1001 int offset;
1832 sector_t log_offset;	1002 sector_t log_offset;
1833 sector_t stripe_sect; 1834 struct stripe_head *sh; 1835 int ret;	1003 sector_t stripe_sector;
1836	1004
1837 /* 1838 * for mismatch in data blocks, we will drop all data in this mb, but 1839 * we will still read next mb for other data with FLUSH flag, as 1840 * io_unit could finish out of order. 1841 / 1842 ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx); 1843 if (ret == -EINVAL) 1844 return -EAGAIN; 1845 else if (ret) 1846 return ret; / -ENOMEM duo to alloc_page() failed */ 1847
1848 mb = page_address(ctx->meta_page);	1005 mb = page_address(ctx->meta_page);
1849 mb_offset = sizeof(struct r5l_meta_block);	1006 offset = sizeof(struct r5l_meta_block);
1850 log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 1851	1007 log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 1008
1852 while (mb_offset < le32_to_cpu(mb->meta_size)) {	1009 while (offset < le32_to_cpu(mb->meta_size)) {
1853 int dd; 1854	1010 int dd; 1011
1855 payload = (void )mb + mb_offset; 1856 stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ? 1857 raid5_compute_sector( 1858 conf, le64_to_cpu(payload->location), 0, &dd, 1859 NULL) 1860 : le64_to_cpu(payload->location); 1861 1862 sh = r5c_recovery_lookup_stripe(cached_stripe_list, 1863 stripe_sect); 1864 1865 if (!sh) { 1866 sh = r5c_recovery_alloc_stripe(conf, cached_stripe_list, 1867 stripe_sect, ctx->pos); 1868 / 1869 * cannot get stripe from raid5_get_active_stripe 1870 * try replay some stripes 1871 / 1872 if (!sh) { 1873 r5c_recovery_replay_stripes( 1874 cached_stripe_list, ctx); 1875 sh = r5c_recovery_alloc_stripe( 1876 conf, cached_stripe_list, 1877 stripe_sect, ctx->pos); 1878 } 1879 if (!sh) { 1880 pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n", 1881 mdname(mddev), 1882 conf->min_nr_stripes 2); 1883 raid5_set_cache_size(mddev, 1884 conf->min_nr_stripes * 2); 1885 sh = r5c_recovery_alloc_stripe( 1886 conf, cached_stripe_list, stripe_sect, 1887 ctx->pos); 1888 } 1889 if (!sh) { 1890 pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n", 1891 mdname(mddev)); 1892 return -ENOMEM; 1893 } 1894 list_add_tail(&sh->lru, cached_stripe_list); 1895 } 1896 1897 if (payload->header.type == R5LOG_PAYLOAD_DATA) { 1898 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 1899 r5l_recovery_replay_one_stripe(conf, sh, ctx); 1900 r5l_recovery_reset_stripe(sh); 1901 sh->log_start = ctx->pos; 1902 list_move_tail(&sh->lru, cached_stripe_list); 1903 } 1904 r5l_recovery_load_data(log, sh, ctx, payload, 1905 log_offset); 1906 } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) 1907 r5l_recovery_load_parity(log, sh, ctx, payload, 1908 log_offset); 1909 else	1012 payload = (void *)mb + offset; 1013 stripe_sector = raid5_compute_sector(conf, 1014 le64_to_cpu(payload->location), 0, &dd, NULL); 1015 if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector, 1016 &offset, &log_offset))
1910 return -EINVAL;	1017 return -EINVAL;
1911 1912 log_offset = r5l_ring_add(log, log_offset, 1913 le32_to_cpu(payload->size)); 1914 1915 mb_offset += sizeof(struct r5l_payload_data_parity) + 1916 sizeof(__le32) * 1917 (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1918 }	1018 }
1919
1920 return 0; 1921} 1922	1019 return 0; 1020} 1021
1923/* 1924 * Load the stripe into cache. The stripe will be written out later by 1925 * the stripe cache state machine. 1926 / 1927static void r5c_recovery_load_one_stripe(struct r5l_log log, 1928 struct stripe_head *sh)	1022/* copy data/parity from log to raid disks / 1023static void r5l_recovery_flush_log(struct r5l_log log, 1024 struct r5l_recovery_ctx *ctx)
1929{	1025{
1930 struct r5conf conf = sh->raid_conf; 1931 struct r5dev dev; 1932 int i; 1933 1934 for (i = sh->disks; i--; ) { 1935 dev = sh->dev + i; 1936 if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) { 1937 set_bit(R5_InJournal, &dev->flags); 1938 set_bit(R5_UPTODATE, &dev->flags); 1939 } 1940 } 1941 set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state); 1942 atomic_inc(&conf->r5c_cached_partial_stripes); 1943 list_add_tail(&sh->r5c, &log->stripe_in_journal_list); 1944} 1945 1946/* 1947 * Scan through the log for all to-be-flushed data 1948 * 1949 * For stripes with data and parity, namely Data-Parity stripe 1950 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes. 1951 * 1952 * For stripes with only data, namely Data-Only stripe 1953 * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine. 1954 * 1955 * For a stripe, if we see data after parity, we should discard all previous 1956 * data and parity for this stripe, as these data are already flushed to 1957 * the array. 1958 * 1959 * At the end of the scan, we return the new journal_tail, which points to 1960 * first data-only stripe on the journal device, or next invalid meta block. 1961 / 1962static int r5c_recovery_flush_log(struct r5l_log log, 1963 struct r5l_recovery_ctx ctx) 1964{ 1965 struct stripe_head sh, next; 1966 int ret = 0; 1967 1968 / scan through the log */
1969 while (1) {	1026 while (1) {
1970 if (r5l_recovery_read_meta_block(log, ctx)) 1971 break; 1972 1973 ret = r5c_recovery_analyze_meta_block(log, ctx, 1974 &ctx->cached_list); 1975 /* 1976 * -EAGAIN means mismatch in data block, in this case, we still 1977 * try scan the next metablock 1978 / 1979 if (ret && ret != -EAGAIN) 1980 break; / ret == -EINVAL or -ENOMEM */	1027 if (r5l_read_meta_block(log, ctx)) 1028 return; 1029 if (r5l_recovery_flush_one_meta(log, ctx)) 1030 return;
1981 ctx->seq++; 1982 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks); 1983 }	1031 ctx->seq++; 1032 ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks); 1033 }
1984 1985 if (ret == -ENOMEM) { 1986 r5c_recovery_drop_stripes(&ctx->cached_list, ctx); 1987 return ret; 1988 } 1989 1990 /* replay data-parity stripes / 1991 r5c_recovery_replay_stripes(&ctx->cached_list, ctx); 1992 1993 / load data-only stripes to stripe cache */ 1994 list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { 1995 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 1996 r5c_recovery_load_one_stripe(log, sh); 1997 list_del_init(&sh->lru); 1998 raid5_release_stripe(sh); 1999 ctx->data_only_stripes++; 2000 } 2001 2002 return 0;
2003} 2004	1034} 1035
2005/* 2006 * we did a recovery. Now ctx.pos points to an invalid meta block. New 2007 * log will start here. but we can't let superblock point to last valid 2008 * meta block. The log might looks like: 2009 * \| meta 1\| meta 2\| meta 3\| 2010 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If 2011 * superblock points to meta 1, we write a new valid meta 2n. if crash 2012 * happens again, new recovery will start from meta 1. Since meta 2n is 2013 * valid now, recovery will think meta 3 is valid, which is wrong. 2014 * The solution is we create a new meta in meta2 with its seq == meta 2015 * 1's seq + 10 and let superblock points to meta2. The same recovery will 2016 * not think meta 3 is a valid meta, because its seq doesn't match 2017 / 2018 2019/ 2020 * Before recovery, the log looks like the following 2021 * 2022 * --------------------------------------------- 2023 * \| valid log \| invalid log \| 2024 * --------------------------------------------- 2025 * ^ 2026 * \|- log->last_checkpoint 2027 * \|- log->last_cp_seq 2028 * 2029 * Now we scan through the log until we see invalid entry 2030 * 2031 * --------------------------------------------- 2032 * \| valid log \| invalid log \| 2033 * --------------------------------------------- 2034 * ^ ^ 2035 * \|- log->last_checkpoint \|- ctx->pos 2036 * \|- log->last_cp_seq \|- ctx->seq 2037 * 2038 * From this point, we need to increase seq number by 10 to avoid 2039 * confusing next recovery. 2040 * 2041 * --------------------------------------------- 2042 * \| valid log \| invalid log \| 2043 * --------------------------------------------- 2044 * ^ ^ 2045 * \|- log->last_checkpoint \|- ctx->pos+1 2046 * \|- log->last_cp_seq \|- ctx->seq+11 2047 * 2048 * However, it is not safe to start the state machine yet, because data only 2049 * parities are not yet secured in RAID. To save these data only parities, we 2050 * rewrite them from seq+11. 2051 * 2052 * ----------------------------------------------------------------- 2053 * \| valid log \| data only stripes \| invalid log \| 2054 * ----------------------------------------------------------------- 2055 * ^ ^ 2056 * \|- log->last_checkpoint \|- ctx->pos+n 2057 * \|- log->last_cp_seq \|- ctx->seq+10+n 2058 * 2059 * If failure happens again during this process, the recovery can safe start 2060 * again from log->last_checkpoint. 2061 * 2062 * Once data only stripes are rewritten to journal, we move log_tail 2063 * 2064 * ----------------------------------------------------------------- 2065 * \| old log \| data only stripes \| invalid log \| 2066 * ----------------------------------------------------------------- 2067 * ^ ^ 2068 * \|- log->last_checkpoint \|- ctx->pos+n 2069 * \|- log->last_cp_seq \|- ctx->seq+10+n 2070 * 2071 * Then we can safely start the state machine. If failure happens from this 2072 * point on, the recovery will start from new log->last_checkpoint. 2073 / 2074static int 2075r5c_recovery_rewrite_data_only_stripes(struct r5l_log log, 2076 struct r5l_recovery_ctx *ctx)	1036static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, 1037 u64 seq)
2077{	1038{
2078 struct stripe_head sh; 2079 struct mddev mddev = log->rdev->mddev;
2080 struct page *page;	1039 struct page *page;
	1040 struct r5l_meta_block *mb; 1041 u32 crc;
2081	1042
2082 page = alloc_page(GFP_KERNEL); 2083 if (!page) { 2084 pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n", 2085 mdname(mddev));	1043 page = alloc_page(GFP_KERNEL \| __GFP_ZERO); 1044 if (!page)
2086 return -ENOMEM;	1045 return -ENOMEM;
2087 }	1046 mb = page_address(page); 1047 mb->magic = cpu_to_le32(R5LOG_MAGIC); 1048 mb->version = R5LOG_VERSION; 1049 mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); 1050 mb->seq = cpu_to_le64(seq); 1051 mb->position = cpu_to_le64(pos); 1052 crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 1053 mb->checksum = cpu_to_le32(crc);
2088	1054
2089 ctx->seq += 10; 2090 list_for_each_entry(sh, &ctx->cached_list, lru) { 2091 struct r5l_meta_block mb; 2092 int i; 2093 int offset; 2094 sector_t write_pos; 2095 2096 WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 2097 r5l_recovery_create_empty_meta_block(log, page, 2098 ctx->pos, ctx->seq); 2099 mb = page_address(page); 2100 offset = le32_to_cpu(mb->meta_size); 2101 write_pos = ctx->pos + BLOCK_SECTORS; 2102 2103 for (i = sh->disks; i--; ) { 2104 struct r5dev dev = &sh->dev[i]; 2105 struct r5l_payload_data_parity payload; 2106 void addr; 2107 2108 if (test_bit(R5_InJournal, &dev->flags)) { 2109 payload = (void *)mb + offset; 2110 payload->header.type = cpu_to_le16( 2111 R5LOG_PAYLOAD_DATA); 2112 payload->size = BLOCK_SECTORS; 2113 payload->location = cpu_to_le64( 2114 raid5_compute_blocknr(sh, i, 0)); 2115 addr = kmap_atomic(dev->page); 2116 payload->checksum[0] = cpu_to_le32( 2117 crc32c_le(log->uuid_checksum, addr, 2118 PAGE_SIZE)); 2119 kunmap_atomic(addr); 2120 sync_page_io(log->rdev, write_pos, PAGE_SIZE, 2121 dev->page, REQ_OP_WRITE, 0, false); 2122 write_pos = r5l_ring_add(log, write_pos, 2123 BLOCK_SECTORS); 2124 offset += sizeof(__le32) + 2125 sizeof(struct r5l_payload_data_parity); 2126 2127 } 2128 } 2129 mb->meta_size = cpu_to_le32(offset); 2130 mb->checksum = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 2131 sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, 2132 REQ_OP_WRITE, WRITE_FUA, false); 2133 sh->log_start = ctx->pos; 2134 ctx->pos = write_pos; 2135 ctx->seq += 1;	1055 if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, 1056 REQ_FUA, false)) { 1057 __free_page(page); 1058 return -EIO;
2136 } 2137 __free_page(page); 2138 return 0; 2139} 2140 2141static int r5l_recovery_log(struct r5l_log *log) 2142{	1059 } 1060 __free_page(page); 1061 return 0; 1062} 1063 1064static int r5l_recovery_log(struct r5l_log *log) 1065{
2143 struct mddev *mddev = log->rdev->mddev;
2144 struct r5l_recovery_ctx ctx;	1066 struct r5l_recovery_ctx ctx;
2145 int ret;
2146 2147 ctx.pos = log->last_checkpoint; 2148 ctx.seq = log->last_cp_seq; 2149 ctx.meta_page = alloc_page(GFP_KERNEL);	1067 1068 ctx.pos = log->last_checkpoint; 1069 ctx.seq = log->last_cp_seq; 1070 ctx.meta_page = alloc_page(GFP_KERNEL);
2150 ctx.data_only_stripes = 0; 2151 ctx.data_parity_stripes = 0; 2152 INIT_LIST_HEAD(&ctx.cached_list); 2153
2154 if (!ctx.meta_page) 2155 return -ENOMEM; 2156	1071 if (!ctx.meta_page) 1072 return -ENOMEM; 1073
2157 ret = r5c_recovery_flush_log(log, &ctx);	1074 r5l_recovery_flush_log(log, &ctx);
2158 __free_page(ctx.meta_page); 2159	1075 __free_page(ctx.meta_page); 1076
2160 if (ret) 2161 return ret;	1077 /* 1078 * we did a recovery. Now ctx.pos points to an invalid meta block. New 1079 * log will start here. but we can't let superblock point to last valid 1080 * meta block. The log might looks like: 1081 * \| meta 1\| meta 2\| meta 3\| 1082 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If 1083 * superblock points to meta 1, we write a new valid meta 2n. if crash 1084 * happens again, new recovery will start from meta 1. Since meta 2n is 1085 * valid now, recovery will think meta 3 is valid, which is wrong. 1086 * The solution is we create a new meta in meta2 with its seq == meta 1087 * 1's seq + 10 and let superblock points to meta2. The same recovery will 1088 * not think meta 3 is a valid meta, because its seq doesn't match 1089 */ 1090 if (ctx.seq > log->last_cp_seq + 1) { 1091 int ret;
2162	1092
2163 if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) 2164 pr_debug("md/raid:%s: starting from clean shutdown\n", 2165 mdname(mddev)); 2166 else { 2167 pr_debug("md/raid:%s: recoverying %d data-only stripes and %d data-parity stripes\n", 2168 mdname(mddev), ctx.data_only_stripes, 2169 ctx.data_parity_stripes); 2170 2171 if (ctx.data_only_stripes > 0) 2172 if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { 2173 pr_err("md/raid:%s: failed to rewrite stripes to journal\n", 2174 mdname(mddev)); 2175 return -EIO; 2176 }	1093 ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10); 1094 if (ret) 1095 return ret; 1096 log->seq = ctx.seq + 11; 1097 log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); 1098 r5l_write_super(log, ctx.pos); 1099 } else { 1100 log->log_start = ctx.pos; 1101 log->seq = ctx.seq;
2177 }	1102 }
2178 2179 log->log_start = ctx.pos; 2180 log->next_checkpoint = ctx.pos; 2181 log->seq = ctx.seq; 2182 r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq); 2183 r5l_write_super(log, ctx.pos);
2184 return 0; 2185} 2186 2187static void r5l_write_super(struct r5l_log log, sector_t cp) 2188{ 2189 struct mddev mddev = log->rdev->mddev; 2190 2191 log->rdev->journal_tail = cp; 2192 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2193} 2194	1103 return 0; 1104} 1105 1106static void r5l_write_super(struct r5l_log log, sector_t cp) 1107{ 1108 struct mddev mddev = log->rdev->mddev; 1109 1110 log->rdev->journal_tail = cp; 1111 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1112} 1113
2195static ssize_t r5c_journal_mode_show(struct mddev mddev, char page) 2196{ 2197 struct r5conf conf = mddev->private; 2198 int ret; 2199 2200 if (!conf->log) 2201 return 0; 2202 2203 switch (conf->log->r5c_journal_mode) { 2204 case R5C_JOURNAL_MODE_WRITE_THROUGH: 2205 ret = snprintf( 2206 page, PAGE_SIZE, "[%s] %s\n", 2207 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], 2208 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); 2209 break; 2210 case R5C_JOURNAL_MODE_WRITE_BACK: 2211 ret = snprintf( 2212 page, PAGE_SIZE, "%s [%s]\n", 2213 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], 2214 r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); 2215 break; 2216 default: 2217 ret = 0; 2218 } 2219 return ret; 2220} 2221 2222static ssize_t r5c_journal_mode_store(struct mddev mddev, 2223 const char page, size_t length) 2224{ 2225 struct r5conf conf = mddev->private; 2226 struct r5l_log log = conf->log; 2227 int val = -1, i; 2228 int len = length; 2229 2230 if (!log) 2231 return -ENODEV; 2232 2233 if (len && page[len - 1] == '\n') 2234 len -= 1; 2235 for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++) 2236 if (strlen(r5c_journal_mode_str[i]) == len && 2237 strncmp(page, r5c_journal_mode_str[i], len) == 0) { 2238 val = i; 2239 break; 2240 } 2241 if (val < R5C_JOURNAL_MODE_WRITE_THROUGH \|\| 2242 val > R5C_JOURNAL_MODE_WRITE_BACK) 2243 return -EINVAL; 2244 2245 mddev_suspend(mddev); 2246 conf->log->r5c_journal_mode = val; 2247 mddev_resume(mddev); 2248 2249 pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", 2250 mdname(mddev), val, r5c_journal_mode_str[val]); 2251 return length; 2252} 2253 2254struct md_sysfs_entry 2255r5c_journal_mode = __ATTR(journal_mode, 0644, 2256 r5c_journal_mode_show, r5c_journal_mode_store); 2257 2258/ 2259 * Try handle write operation in caching phase. This function should only 2260 * be called in write-back mode. 2261 * 2262 * If all outstanding writes can be handled in caching phase, returns 0 2263 * If writes requires write-out phase, call r5c_make_stripe_write_out() 2264 * and returns -EAGAIN 2265 / 2266int r5c_try_caching_write(struct r5conf conf, 2267 struct stripe_head sh, 2268 struct stripe_head_state s, 2269 int disks) 2270{ 2271 struct r5l_log log = conf->log; 2272 int i; 2273 struct r5dev dev; 2274 int to_cache = 0; 2275 2276 BUG_ON(!r5c_is_writeback(log)); 2277 2278 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 2279 /* 2280 * There are two different scenarios here: 2281 * 1. The stripe has some data cached, and it is sent to 2282 * write-out phase for reclaim 2283 * 2. The stripe is clean, and this is the first write 2284 * 2285 * For 1, return -EAGAIN, so we continue with 2286 * handle_stripe_dirtying(). 2287 * 2288 * For 2, set STRIPE_R5C_CACHING and continue with caching 2289 * write. 2290 / 2291 2292 / case 1: anything injournal or anything in written / 2293 if (s->injournal > 0 \|\| s->written > 0) 2294 return -EAGAIN; 2295 / case 2 / 2296 set_bit(STRIPE_R5C_CACHING, &sh->state); 2297 } 2298 2299 for (i = disks; i--; ) { 2300 dev = &sh->dev[i]; 2301 / if non-overwrite, use writing-out phase / 2302 if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) && 2303 !test_bit(R5_InJournal, &dev->flags)) { 2304 r5c_make_stripe_write_out(sh); 2305 return -EAGAIN; 2306 } 2307 } 2308 2309 for (i = disks; i--; ) { 2310 dev = &sh->dev[i]; 2311 if (dev->towrite) { 2312 set_bit(R5_Wantwrite, &dev->flags); 2313 set_bit(R5_Wantdrain, &dev->flags); 2314 set_bit(R5_LOCKED, &dev->flags); 2315 to_cache++; 2316 } 2317 } 2318 2319 if (to_cache) { 2320 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2321 / 2322 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data() 2323 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in 2324 * r5c_handle_data_cached() 2325 / 2326 set_bit(STRIPE_LOG_TRAPPED, &sh->state); 2327 } 2328 2329 return 0; 2330} 2331 2332/ 2333 * free extra pages (orig_page) we allocated for prexor 2334 / 2335void r5c_release_extra_page(struct stripe_head sh) 2336{ 2337 int i; 2338 2339 for (i = sh->disks; i--; ) 2340 if (sh->dev[i].page != sh->dev[i].orig_page) { 2341 struct page p = sh->dev[i].orig_page; 2342 2343 sh->dev[i].orig_page = sh->dev[i].page; 2344 put_page(p); 2345 } 2346} 2347 2348/ 2349 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the 2350 * stripe is committed to RAID disks. 2351 / 2352void r5c_finish_stripe_write_out(struct r5conf conf, 2353 struct stripe_head sh, 2354 struct stripe_head_state s) 2355{ 2356 int i; 2357 int do_wakeup = 0; 2358 2359 if (!conf->log \|\| 2360 !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) 2361 return; 2362 2363 WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 2364 clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 2365 2366 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 2367 return; 2368 2369 for (i = sh->disks; i--; ) { 2370 clear_bit(R5_InJournal, &sh->dev[i].flags); 2371 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2372 do_wakeup = 1; 2373 } 2374 2375 /* 2376 * analyse_stripe() runs before r5c_finish_stripe_write_out(), 2377 * We updated R5_InJournal, so we also update s->injournal. 2378 / 2379 s->injournal = 0; 2380 2381 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2382 if (atomic_dec_and_test(&conf->pending_full_writes)) 2383 md_wakeup_thread(conf->mddev->thread); 2384 2385 if (do_wakeup) 2386 wake_up(&conf->wait_for_overlap); 2387 2388 if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 2389 return; 2390 2391 spin_lock_irq(&conf->log->stripe_in_journal_lock); 2392 list_del_init(&sh->r5c); 2393 spin_unlock_irq(&conf->log->stripe_in_journal_lock); 2394 sh->log_start = MaxSector; 2395 atomic_dec(&conf->log->stripe_in_journal_count); 2396} 2397 2398int 2399r5c_cache_data(struct r5l_log log, struct stripe_head sh, 2400 struct stripe_head_state s) 2401{ 2402 struct r5conf conf = sh->raid_conf; 2403 int pages = 0; 2404 int reserve; 2405 int i; 2406 int ret = 0; 2407 2408 BUG_ON(!log); 2409 2410 for (i = 0; i < sh->disks; i++) { 2411 void addr; 2412 2413 if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) 2414 continue; 2415 addr = kmap_atomic(sh->dev[i].page); 2416 sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 2417 addr, PAGE_SIZE); 2418 kunmap_atomic(addr); 2419 pages++; 2420 } 2421 WARN_ON(pages == 0); 2422 2423 /* 2424 * The stripe must enter state machine again to call endio, so 2425 * don't delay. 2426 / 2427 clear_bit(STRIPE_DELAYED, &sh->state); 2428 atomic_inc(&sh->count); 2429 2430 mutex_lock(&log->io_mutex); 2431 / meta + data */ 2432 reserve = (1 + pages) << (PAGE_SHIFT - 9); 2433 2434 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 2435 sh->log_start == MaxSector) 2436 r5l_add_no_space_stripe(log, sh); 2437 else if (!r5l_has_free_space(log, reserve)) { 2438 if (sh->log_start == log->last_checkpoint) 2439 BUG(); 2440 else 2441 r5l_add_no_space_stripe(log, sh); 2442 } else { 2443 ret = r5l_log_stripe(log, sh, pages, 0); 2444 if (ret) { 2445 spin_lock_irq(&log->io_list_lock); 2446 list_add_tail(&sh->log_list, &log->no_mem_stripes); 2447 spin_unlock_irq(&log->io_list_lock); 2448 } 2449 } 2450 2451 mutex_unlock(&log->io_mutex); 2452 return 0; 2453} 2454
2455static int r5l_load_log(struct r5l_log log) 2456{ 2457 struct md_rdev rdev = log->rdev; 2458 struct page page; 2459 struct r5l_meta_block mb; 2460 sector_t cp = log->rdev->journal_tail; 2461 u32 stored_crc, expected_crc; 2462 bool create_super = false; --- 27 unchanged lines hidden (view full) --- 2490 if (le64_to_cpu(mb->position) != cp) { 2491 create_super = true; 2492 goto create; 2493 } 2494create: 2495 if (create_super) { 2496 log->last_cp_seq = prandom_u32(); 2497 cp = 0;	1114static int r5l_load_log(struct r5l_log log) 1115{ 1116 struct md_rdev rdev = log->rdev; 1117 struct page page; 1118 struct r5l_meta_block mb; 1119 sector_t cp = log->rdev->journal_tail; 1120 u32 stored_crc, expected_crc; 1121 bool create_super = false; --- 27 unchanged lines hidden (view full) --- 1149 if (le64_to_cpu(mb->position) != cp) { 1150 create_super = true; 1151 goto create; 1152 } 1153create: 1154 if (create_super) { 1155 log->last_cp_seq = prandom_u32(); 1156 cp = 0;
2498 r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
2499 /* 2500 * Make sure super points to correct address. Log might have 2501 * data very soon. If super hasn't correct log tail address, 2502 * recovery can't find the log 2503 */ 2504 r5l_write_super(log, cp); 2505 } else 2506 log->last_cp_seq = le64_to_cpu(mb->seq); 2507 2508 log->device_size = round_down(rdev->sectors, BLOCK_SECTORS); 2509 log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT; 2510 if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) 2511 log->max_free_space = RECLAIM_MAX_FREE_SPACE; 2512 log->last_checkpoint = cp;	1157 /* 1158 * Make sure super points to correct address. Log might have 1159 * data very soon. If super hasn't correct log tail address, 1160 * recovery can't find the log 1161 */ 1162 r5l_write_super(log, cp); 1163 } else 1164 log->last_cp_seq = le64_to_cpu(mb->seq); 1165 1166 log->device_size = round_down(rdev->sectors, BLOCK_SECTORS); 1167 log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT; 1168 if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) 1169 log->max_free_space = RECLAIM_MAX_FREE_SPACE; 1170 log->last_checkpoint = cp;
2513 log->next_checkpoint = cp; 2514 mutex_lock(&log->io_mutex); 2515 r5c_update_log_state(log); 2516 mutex_unlock(&log->io_mutex);
2517 2518 __free_page(page); 2519 2520 return r5l_recovery_log(log); 2521ioerr: 2522 __free_page(page); 2523 return ret; 2524} 2525 2526int r5l_init_log(struct r5conf conf, struct md_rdev rdev) 2527{ 2528 struct request_queue q = bdev_get_queue(rdev->bdev); 2529 struct r5l_log log; 2530 2531 if (PAGE_SIZE != 4096) 2532 return -EINVAL;	1171 1172 __free_page(page); 1173 1174 return r5l_recovery_log(log); 1175ioerr: 1176 __free_page(page); 1177 return ret; 1178} 1179 1180int r5l_init_log(struct r5conf conf, struct md_rdev rdev) 1181{ 1182 struct request_queue q = bdev_get_queue(rdev->bdev); 1183 struct r5l_log log; 1184 1185 if (PAGE_SIZE != 4096) 1186 return -EINVAL;
2533 2534 /* 2535 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and 2536 * raid_disks r5l_payload_data_parity. 2537 * 2538 * Write journal and cache does not work for very big array 2539 * (raid_disks > 203) 2540 / 2541 if (sizeof(struct r5l_meta_block) + 2542 ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) 2543 conf->raid_disks) > PAGE_SIZE) { 2544 pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n", 2545 mdname(conf->mddev), conf->raid_disks); 2546 return -EINVAL; 2547 } 2548
2549 log = kzalloc(sizeof(*log), GFP_KERNEL); 2550 if (!log) 2551 return -ENOMEM; 2552 log->rdev = rdev; 2553 2554 log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0; 2555 2556 log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, 2557 sizeof(rdev->mddev->uuid)); 2558 2559 mutex_init(&log->io_mutex); 2560 2561 spin_lock_init(&log->io_list_lock); 2562 INIT_LIST_HEAD(&log->running_ios); 2563 INIT_LIST_HEAD(&log->io_end_ios); 2564 INIT_LIST_HEAD(&log->flushing_ios); 2565 INIT_LIST_HEAD(&log->finished_ios);	1187 log = kzalloc(sizeof(*log), GFP_KERNEL); 1188 if (!log) 1189 return -ENOMEM; 1190 log->rdev = rdev; 1191 1192 log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0; 1193 1194 log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, 1195 sizeof(rdev->mddev->uuid)); 1196 1197 mutex_init(&log->io_mutex); 1198 1199 spin_lock_init(&log->io_list_lock); 1200 INIT_LIST_HEAD(&log->running_ios); 1201 INIT_LIST_HEAD(&log->io_end_ios); 1202 INIT_LIST_HEAD(&log->flushing_ios); 1203 INIT_LIST_HEAD(&log->finished_ios);
2566 bio_init(&log->flush_bio);	1204 bio_init(&log->flush_bio, NULL, 0);
2567 2568 log->io_kc = KMEM_CACHE(r5l_io_unit, 0); 2569 if (!log->io_kc) 2570 goto io_kc; 2571 2572 log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc); 2573 if (!log->io_pool) 2574 goto io_pool; --- 5 unchanged lines hidden (view full) --- 2580 log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0); 2581 if (!log->meta_pool) 2582 goto out_mempool; 2583 2584 log->reclaim_thread = md_register_thread(r5l_reclaim_thread, 2585 log->rdev->mddev, "reclaim"); 2586 if (!log->reclaim_thread) 2587 goto reclaim_thread;	1205 1206 log->io_kc = KMEM_CACHE(r5l_io_unit, 0); 1207 if (!log->io_kc) 1208 goto io_kc; 1209 1210 log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc); 1211 if (!log->io_pool) 1212 goto io_pool; --- 5 unchanged lines hidden (view full) --- 1218 log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0); 1219 if (!log->meta_pool) 1220 goto out_mempool; 1221 1222 log->reclaim_thread = md_register_thread(r5l_reclaim_thread, 1223 log->rdev->mddev, "reclaim"); 1224 if (!log->reclaim_thread) 1225 goto reclaim_thread;
2588 log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; 2589
2590 init_waitqueue_head(&log->iounit_wait); 2591 2592 INIT_LIST_HEAD(&log->no_mem_stripes); 2593 2594 INIT_LIST_HEAD(&log->no_space_stripes); 2595 spin_lock_init(&log->no_space_stripes_lock); 2596	1226 init_waitqueue_head(&log->iounit_wait); 1227 1228 INIT_LIST_HEAD(&log->no_mem_stripes); 1229 1230 INIT_LIST_HEAD(&log->no_space_stripes); 1231 spin_lock_init(&log->no_space_stripes_lock); 1232
2597 INIT_WORK(&log->deferred_io_work, r5l_submit_io_async); 2598 2599 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 2600 INIT_LIST_HEAD(&log->stripe_in_journal_list); 2601 spin_lock_init(&log->stripe_in_journal_lock); 2602 atomic_set(&log->stripe_in_journal_count, 0); 2603
2604 if (r5l_load_log(log)) 2605 goto error; 2606 2607 rcu_assign_pointer(conf->log, log); 2608 set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 2609 return 0; 2610 2611error: --- 23 unchanged lines hidden ---	1233 if (r5l_load_log(log)) 1234 goto error; 1235 1236 rcu_assign_pointer(conf->log, log); 1237 set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 1238 return 0; 1239 1240error: --- 23 unchanged lines hidden ---