/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"

/*
 * metadata/data are stored on disk in 4k units (blocks) regardless of the
 * underlying hardware sector size. This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)

/*
 * reclaim runs every 1/4 of the disk size or 10G of reclaimable space,
 * whichever is smaller. This prevents recovery from having to scan a very
 * long log.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
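
/*
 * Illustrative sizing example (numbers are not from the original source):
 * r5l_load_log() below sets max_free_space to
 * min(device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT, RECLAIM_MAX_FREE_SPACE).
 * A 16GiB journal device therefore starts reclaim at 4GiB of reclaimable
 * space, while a 100GiB device is capped at the 10GiB (20971520-sector)
 * limit.
 */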

struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, rounded down to a
					 * multiple of BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim is woken once reclaimable
					 * space reaches this size */

	sector_t last_checkpoint;	/* log tail: where recovery starts
					 * scanning */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head: where new data is appended */
	u64 seq;			/* log head sequence */

	sector_t next_checkpoint;
	u64 next_cp_seq;

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet
					 * written to the RAID */
	struct list_head flushing_ios;	/* io_units which are waiting for the
					 * log cache flush */
	struct list_head finished_ios;	/* io_units which have settled down in
					 * the log disk */
	struct bio flush_bio;

	struct kmem_cache *io_kc;

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* amount of space that needs to be
					 * reclaimed. If it's 0, reclaim only
					 * the space used by io_units which
					 * are already in IO_UNIT_STRIPE_END
					 * state (i.e. reclaim doesn't wait
					 * for a specific io_unit to switch to
					 * IO_UNIT_STRIPE_END state) */
	wait_queue_head_t iounit_wait;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;

	bool need_cache_flush;
	bool in_teardown;
};

/*
 * an IO range starts from a meta data block and ends at the next meta data
 * block. The meta data block of an io unit tracks the data/parity that
 * follows it. The io unit is written to the log disk with a normal write;
 * since we always flush the log disk before moving data to the raid disks,
 * there is no need to write the io unit with FLUSH/FUA.
 */
struct r5l_io_unit {
	struct r5l_log *log;

	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio *current_bio;/* current_bio accepting new data */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of the metablock */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */

	int state;
	bool need_split_bio;
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bio has started writing to the log;
				 * no new bios are accepted */
	IO_UNIT_IO_END = 2,	/* io_unit bio has finished writing to the log */
	IO_UNIT_STRIPE_END = 3,	/* stripe data has finished writing to the raid */
};

static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
	start += inc;
	if (start >= log->device_size)
		start = start - log->device_size;
	return start;
}

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
				  sector_t end)
{
	if (end >= start)
		return end - start;
	else
		return end + log->device_size - start;
}
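
/*
 * Example of the ring arithmetic above, with illustrative numbers: for a
 * device_size of 1024 sectors, r5l_ring_add(log, 1016, 16) wraps around to
 * 8, and r5l_ring_distance(log, 1016, 8) is 8 + 1024 - 1016 = 16, i.e. the
 * distance measured across the wrap point.
 */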

static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
	sector_t used_size;

	used_size = r5l_ring_distance(log, log->last_checkpoint,
				      log->log_start);

	return log->device_size > used_size + size;
}

static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
				    enum r5l_io_unit_state state)
{
	if (WARN_ON(io->state >= state))
		return;
	io->state = state;
}

static void r5l_io_run_stripes(struct r5l_io_unit *io)
{
	struct stripe_head *sh, *next;

	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static void r5l_log_run_stripes(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;

		list_move_tail(&io->log_sibling, &log->finished_ios);
		r5l_io_run_stripes(io);
	}
}

static void r5l_move_to_end_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;
		list_move_tail(&io->log_sibling, &log->io_end_ios);
	}
}

static void r5l_log_endio(struct bio *bio)
{
	struct r5l_io_unit *io = bio->bi_private;
	struct r5l_log *log = io->log;
	unsigned long flags;

	if (bio->bi_error)
		md_error(log->rdev->mddev, log->rdev);

	bio_put(bio);
	__free_page(io->meta_page);

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
	if (log->need_cache_flush)
		r5l_move_to_end_ios(log);
	else
		r5l_log_run_stripes(log);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	if (log->need_cache_flush)
		md_wakeup_thread(log->rdev->mddev->thread);
}

static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_meta_block *block;
	unsigned long flags;
	u32 crc;

	if (!io)
		return;

	block = page_address(io->meta_page);
	block->meta_size = cpu_to_le32(io->meta_offset);
	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
	block->checksum = cpu_to_le32(crc);

	log->current_io = NULL;
	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	submit_bio(WRITE, io->current_bio);
}

static struct bio *r5l_bio_alloc(struct r5l_log *log)
{
	struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);

	bio->bi_rw = WRITE;
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;

	return bio;
}

static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
{
	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);

	/*
	 * If we filled up the log device, start from the beginning again,
	 * which will require a new bio.
	 *
	 * Note: for this to work properly the log size needs to be a multiple
	 * of BLOCK_SECTORS.
	 */
	if (log->log_start == 0)
		io->need_split_bio = true;

	io->log_end = log->log_start;
}

static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	struct r5l_meta_block *block;

	/* We can't handle memory allocation failures here yet */
	io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL);
	io->log = log;
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	io->state = IO_UNIT_RUNNING;

	io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO);
	block = page_address(io->meta_page);
	block->magic = cpu_to_le32(R5LOG_MAGIC);
	block->version = R5LOG_VERSION;
	block->seq = cpu_to_le64(log->seq);
	block->position = cpu_to_le64(log->log_start);

	io->log_start = log->log_start;
	io->meta_offset = sizeof(struct r5l_meta_block);
	io->seq = log->seq++;

	io->current_bio = r5l_bio_alloc(log);
	io->current_bio->bi_end_io = r5l_log_endio;
	io->current_bio->bi_private = io;
	bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);

	r5_reserve_log_entry(log, io);

	spin_lock_irq(&log->io_list_lock);
	list_add_tail(&io->log_sibling, &log->running_ios);
	spin_unlock_irq(&log->io_list_lock);

	return io;
}

static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
	if (log->current_io &&
	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
		r5l_submit_current_io(log);

	if (!log->current_io)
		log->current_io = r5l_new_meta(log);
	return 0;
}

static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
				    sector_t location,
				    u32 checksum1, u32 checksum2,
				    bool checksum2_valid)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_payload_data_parity *payload;

	payload = page_address(io->meta_page) + io->meta_offset;
	payload->header.type = cpu_to_le16(type);
	payload->header.flags = cpu_to_le16(0);
	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
				    (PAGE_SHIFT - 9));
	payload->location = cpu_to_le64(location);
	payload->checksum[0] = cpu_to_le32(checksum1);
	if (checksum2_valid)
		payload->checksum[1] = cpu_to_le32(checksum2);

	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * (1 + !!checksum2_valid);
}

static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
	struct r5l_io_unit *io = log->current_io;

	if (io->need_split_bio) {
		struct bio *prev = io->current_bio;

		io->current_bio = r5l_bio_alloc(log);
		bio_chain(io->current_bio, prev);

		submit_bio(WRITE, prev);
	}

	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
		BUG();

	r5_reserve_log_entry(log, io);
}
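
/*
 * Illustrative on-log layout of one io_unit (a RAID5 stripe with two
 * modified data blocks), assuming 4k pages:
 *
 *	block 0: r5l_meta_block header,
 *		 payload (DATA, block number of dev A, checksum),
 *		 payload (DATA, block number of dev B, checksum),
 *		 payload (PARITY, stripe sector, checksum)
 *	block 1: data page of dev A
 *	block 2: data page of dev B
 *	block 3: parity page
 *
 * All payload descriptors live in the meta block; the data/parity pages
 * follow it in the order they were appended.
 */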

static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
			   int data_pages, int parity_pages)
{
	int i;
	int meta_size;
	struct r5l_io_unit *io;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;

	r5l_get_meta(log, meta_size);
	io = log->current_io;

	for (i = 0; i < sh->disks; i++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		if (i == sh->pd_idx || i == sh->qd_idx)
			continue;
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
					raid5_compute_blocknr(sh, i, 0),
					sh->dev[i].log_checksum, 0, false);
		r5l_append_payload_page(log, sh->dev[i].page);
	}

	if (sh->qd_idx >= 0) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					sh->dev[sh->qd_idx].log_checksum, true);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
	} else {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					0, false);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
	}

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripe);
	sh->log_io = io;
}
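
/*
 * Illustrative space accounting for r5l_write_stripe() below: every logged
 * page occupies BLOCK_SECTORS (8) sectors, so a RAID6 stripe with three
 * dirty data blocks reserves (1 meta + 3 data + P + Q) * 8 = 48 sectors
 * before it is appended to the log.
 */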

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
/*
 * This runs in raid5d, and reclaim could itself wait for raid5d (when it
 * flushes data from the log to the raid disks), so we must not wait for
 * reclaim here.
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
	int write_disks = 0;
	int data_pages, parity_pages;
	int meta_size;
	int reserve;
	int i;

	if (!log)
		return -EAGAIN;
	/* Don't support stripe batch */
	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    test_bit(STRIPE_SYNCING, &sh->state)) {
		/* the stripe is already written to the log; start writing it to the raid */
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		write_disks++;
		/* checksum was already calculated in the last run */
		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
	}
	parity_pages = 1 + !!(sh->qd_idx >= 0);
	data_pages = write_disks - parity_pages;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;
	/* Doesn't work with a very big raid array */
	if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
		return -EINVAL;

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	/*
	 * The stripe must enter the state machine again to finish the write,
	 * so don't delay.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data */
	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
	if (r5l_has_free_space(log, reserve))
		r5l_log_stripe(log, sh, data_pages, parity_pages);
	else {
		spin_lock(&log->no_space_stripes_lock);
		list_add_tail(&sh->log_list, &log->no_space_stripes);
		spin_unlock(&log->no_space_stripes_lock);

		r5l_wake_reclaim(log, reserve);
	}
	mutex_unlock(&log->io_mutex);

	return 0;
}

void r5l_write_stripe_run(struct r5l_log *log)
{
	if (!log)
		return;
	mutex_lock(&log->io_mutex);
	r5l_submit_current_io(log);
	mutex_unlock(&log->io_mutex);
}

int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
	if (!log)
		return -ENODEV;
	/*
	 * we flush the log disk cache first, then write stripe data to the
	 * raid disks. So if the bio is finished, the log disk cache is
	 * already flushed. Recovery guarantees we can recover the bio from
	 * the log disk, so we don't need to flush again.
	 */
	if (bio->bi_iter.bi_size == 0) {
		bio_endio(bio);
		return 0;
	}
	bio->bi_rw &= ~REQ_FLUSH;
	return -EAGAIN;
}

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
	struct stripe_head *sh;

	spin_lock(&log->no_space_stripes_lock);
	while (!list_empty(&log->no_space_stripes)) {
		sh = list_first_entry(&log->no_space_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&log->no_space_stripes_lock);
}

static sector_t r5l_reclaimable_space(struct r5l_log *log)
{
	return r5l_ring_distance(log, log->last_checkpoint,
				 log->next_checkpoint);
}

static bool r5l_complete_finished_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;
	bool found = false;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_STRIPE_END)
			break;

		log->next_checkpoint = io->log_start;
		log->next_cp_seq = io->seq;

		list_del(&io->log_sibling);
		kmem_cache_free(log->io_kc, io);

		found = true;
	}

	return found;
}

static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
{
	struct r5l_log *log = io->log;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);

	if (!r5l_complete_finished_ios(log)) {
		spin_unlock_irqrestore(&log->io_list_lock, flags);
		return;
	}

	if (r5l_reclaimable_space(log) > log->max_free_space)
		r5l_wake_reclaim(log, 0);

	spin_unlock_irqrestore(&log->io_list_lock, flags);
	wake_up(&log->iounit_wait);
}

void r5l_stripe_write_finished(struct stripe_head *sh)
{
	struct r5l_io_unit *io;

	io = sh->log_io;
	sh->log_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripe))
		__r5l_stripe_write_finished(io);
}

static void r5l_log_flush_endio(struct bio *bio)
{
	struct r5l_log *log = container_of(bio, struct r5l_log,
		flush_bio);
	unsigned long flags;
	struct r5l_io_unit *io;

	if (bio->bi_error)
		md_error(log->rdev->mddev, log->rdev);

	spin_lock_irqsave(&log->io_list_lock, flags);
	list_for_each_entry(io, &log->flushing_ios, log_sibling)
		r5l_io_run_stripes(io);
	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

/*
 * Starting to dispatch IO to the raid.
 * A log consists of io_units, each starting with a meta block. There is one
 * situation we want to avoid: a broken meta in the middle of the log makes
 * recovery unable to find the meta blocks after it. So if an operation
 * requires a meta at the head to be persistent in the log, we must make sure
 * the meta blocks before it are persistent in the log too. A case is:
 *
 * stripe data/parity is in the log and we start writing the stripe to the
 * raid disks. The stripe's data/parity must be persistent in the log before
 * we do the write to the raid disks.
 *
 * The solution is that we strictly maintain io_unit list order. We only
 * write the stripes of an io_unit to the raid disks when every io_unit up to
 * and including it has its data/parity in the log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
	bool do_flush;

	if (!log || !log->need_cache_flush)
		return;

	spin_lock_irq(&log->io_list_lock);
	/* flush bio is running */
	if (!list_empty(&log->flushing_ios)) {
		spin_unlock_irq(&log->io_list_lock);
		return;
	}
	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
	do_flush = !list_empty(&log->flushing_ios);
	spin_unlock_irq(&log->io_list_lock);

	if (!do_flush)
		return;
	bio_reset(&log->flush_bio);
	log->flush_bio.bi_bdev = log->rdev->bdev;
	log->flush_bio.bi_end_io = r5l_log_flush_endio;
	submit_bio(WRITE_FLUSH, &log->flush_bio);
}

static void r5l_write_super(struct r5l_log *log, sector_t cp);
static void r5l_write_super_and_discard_space(struct r5l_log *log,
	sector_t end)
{
	struct block_device *bdev = log->rdev->bdev;
	struct mddev *mddev;

	r5l_write_super(log, end);

	if (!blk_queue_discard(bdev_get_queue(bdev)))
		return;

	mddev = log->rdev->mddev;
	/*
	 * This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and
	 * waits for this thread to finish. This thread waits for
	 * MD_CHANGE_PENDING to clear, which is supposed to be done in
	 * md_check_recovery(). md_check_recovery() tries to get
	 * reconfig_mutex. Since r5l_quiesce already holds the mutex,
	 * md_check_recovery() fails, so MD_CHANGE_PENDING never gets cleared.
	 * The in_teardown check works around this issue.
	 */
	if (!log->in_teardown) {
		set_bit(MD_CHANGE_DEVS, &mddev->flags);
		set_bit(MD_CHANGE_PENDING, &mddev->flags);
		md_wakeup_thread(mddev->thread);
		wait_event(mddev->sb_wait,
			!test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
			log->in_teardown);
		/*
		 * r5l_quiesce could run after the in_teardown check and take
		 * the mutex first. The superblock might get updated twice.
		 */
		if (log->in_teardown)
			md_update_sb(mddev, 1);
	} else {
		WARN_ON(!mddev_is_locked(mddev));
		md_update_sb(mddev, 1);
	}

	/* discard IO error really doesn't matter, ignore it */
	if (log->last_checkpoint < end) {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				end - log->last_checkpoint, GFP_NOIO, 0);
	} else {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				log->device_size - log->last_checkpoint,
				GFP_NOIO, 0);
		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
				GFP_NOIO, 0);
	}
}
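
/*
 * Reclaim: wait until enough io_units have reached IO_UNIT_STRIPE_END, then
 * persist the superblock, discard the freed log range (if supported), and
 * advance last_checkpoint so the space can be reused.
 */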
static void r5l_do_reclaim(struct r5l_log *log)
{
	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
	sector_t reclaimable;
	sector_t next_checkpoint;
	u64 next_cp_seq;

	spin_lock_irq(&log->io_list_lock);
	/*
	 * move the proper io_units to the reclaim list. We should not change
	 * the order: reclaimable and unreclaimable io_units can be mixed in
	 * the list, and we shouldn't reuse the space of an unreclaimable
	 * io_unit.
	 */
	while (1) {
		reclaimable = r5l_reclaimable_space(log);
		if (reclaimable >= reclaim_target ||
		    (list_empty(&log->running_ios) &&
		     list_empty(&log->io_end_ios) &&
		     list_empty(&log->flushing_ios) &&
		     list_empty(&log->finished_ios)))
			break;

		md_wakeup_thread(log->rdev->mddev->thread);
		wait_event_lock_irq(log->iounit_wait,
				    r5l_reclaimable_space(log) > reclaimable,
				    log->io_list_lock);
	}

	next_checkpoint = log->next_checkpoint;
	next_cp_seq = log->next_cp_seq;
	spin_unlock_irq(&log->io_list_lock);

	BUG_ON(reclaimable < 0);
	if (reclaimable == 0)
		return;

	/*
	 * write_super will flush the cache of each raid disk. We must write
	 * the super here, because the log area might be reused soon and we
	 * don't want to confuse recovery.
	 */
	r5l_write_super_and_discard_space(log, next_checkpoint);

	mutex_lock(&log->io_mutex);
	log->last_checkpoint = next_checkpoint;
	log->last_cp_seq = next_cp_seq;
	mutex_unlock(&log->io_mutex);

	r5l_run_no_space_stripes(log);
}

static void r5l_reclaim_thread(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;

	if (!log)
		return;
	r5l_do_reclaim(log);
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
	unsigned long target;
	unsigned long new = (unsigned long)space; /* overflow in theory */

	do {
		target = log->reclaim_target;
		if (new < target)
			return;
	} while (cmpxchg(&log->reclaim_target, target, new) != target);
	md_wakeup_thread(log->reclaim_thread);
}

void r5l_quiesce(struct r5l_log *log, int state)
{
	struct mddev *mddev;
	if (!log || state == 2)
		return;
	if (state == 0) {
		log->in_teardown = 0;
		log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
			log->rdev->mddev, "reclaim");
	} else if (state == 1) {
		/*
		 * at this point all stripes are finished, so every io_unit is
		 * at least in IO_UNIT_STRIPE_END state
		 */
		log->in_teardown = 1;
		/* make sure r5l_write_super_and_discard_space exits */
		mddev = log->rdev->mddev;
		wake_up(&mddev->sb_wait);
		r5l_wake_reclaim(log, -1L);
		md_unregister_thread(&log->reclaim_thread);
		r5l_do_reclaim(log);
	}
}

bool r5l_log_disk_error(struct r5conf *conf)
{
	struct r5l_log *log;
	bool ret;
	/* don't allow write if journal disk is missing */
	rcu_read_lock();
	log = rcu_dereference(conf->log);

	if (!log)
		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	else
		ret = test_bit(Faulty, &log->rdev->flags);
	rcu_read_unlock();
	return ret;
}

struct r5l_recovery_ctx {
	struct page *meta_page;		/* current meta */
	sector_t meta_total_blocks;	/* total size of current meta and data */
	sector_t pos;			/* recovery position */
	u64 seq;			/* recovery position seq */
};

static int r5l_read_meta_block(struct r5l_log *log,
			       struct r5l_recovery_ctx *ctx)
{
	struct page *page = ctx->meta_page;
	struct r5l_meta_block *mb;
	u32 crc, stored_crc;

	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
		return -EIO;

	mb = page_address(page);
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    le64_to_cpu(mb->seq) != ctx->seq ||
	    mb->version != R5LOG_VERSION ||
	    le64_to_cpu(mb->position) != ctx->pos)
		return -EINVAL;

	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != crc)
		return -EINVAL;

	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
		return -EINVAL;

	ctx->meta_total_blocks = BLOCK_SECTORS;

	return 0;
}
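
/*
 * Replay one stripe during recovery: walk this stripe's payloads in the
 * current meta block, read the logged data/parity pages back from the log,
 * verify their checksums, and then write them to the raid disks (and any
 * replacements).
 */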
static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
					 struct r5l_recovery_ctx *ctx,
					 sector_t stripe_sect,
					 int *offset, sector_t *log_offset)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct stripe_head *sh;
	struct r5l_payload_data_parity *payload;
	int disk_index;

	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
	while (1) {
		payload = page_address(ctx->meta_page) + *offset;

		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
			raid5_compute_sector(conf,
					     le64_to_cpu(payload->location), 0,
					     &disk_index, sh);

			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, READ, false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
			ctx->meta_total_blocks += BLOCK_SECTORS;
		} else {
			disk_index = sh->pd_idx;
			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, READ, false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);

			if (sh->qd_idx >= 0) {
				disk_index = sh->qd_idx;
				sync_page_io(log->rdev,
					     r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
					     PAGE_SIZE, sh->dev[disk_index].page,
					     READ, false);
				sh->dev[disk_index].log_checksum =
					le32_to_cpu(payload->checksum[1]);
				set_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags);
			}
			ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
		}

		*log_offset = r5l_ring_add(log, *log_offset,
					   le32_to_cpu(payload->size));
		*offset += sizeof(struct r5l_payload_data_parity) +
			sizeof(__le32) *
			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
			break;
	}

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		void *addr;
		u32 checksum;

		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
			continue;
		addr = kmap_atomic(sh->dev[disk_index].page);
		checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
		kunmap_atomic(addr);
		if (checksum != sh->dev[disk_index].log_checksum)
			goto error;
	}

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		struct md_rdev *rdev, *rrdev;

		if (!test_and_clear_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags))
			continue;

		/* in case device is broken */
		rdev = rcu_dereference(conf->disks[disk_index].rdev);
		if (rdev)
			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, WRITE, false);
		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
		if (rrdev)
			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, WRITE, false);
	}
	raid5_release_stripe(sh);
	return 0;

error:
	for (disk_index = 0; disk_index < sh->disks; disk_index++)
		sh->dev[disk_index].flags = 0;
	raid5_release_stripe(sh);
	return -EINVAL;
}

static int r5l_recovery_flush_one_meta(struct r5l_log *log,
				       struct r5l_recovery_ctx *ctx)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct r5l_payload_data_parity *payload;
	struct r5l_meta_block *mb;
	int offset;
	sector_t log_offset;
	sector_t stripe_sector;

	mb = page_address(ctx->meta_page);
	offset = sizeof(struct r5l_meta_block);
	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

	while (offset < le32_to_cpu(mb->meta_size)) {
		int dd;

		payload = (void *)mb + offset;
		stripe_sector = raid5_compute_sector(conf,
				le64_to_cpu(payload->location), 0, &dd, NULL);
		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
						  &offset, &log_offset))
			return -EINVAL;
	}
	return 0;
}

/* copy data/parity from the log to the raid disks */
static void r5l_recovery_flush_log(struct r5l_log *log,
				   struct r5l_recovery_ctx *ctx)
{
	while (1) {
		if (r5l_read_meta_block(log, ctx))
			return;
		if (r5l_recovery_flush_one_meta(log, ctx))
			return;
		ctx->seq++;
		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
	}
}

static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
					  u64 seq)
{
	struct page *page;
	struct r5l_meta_block *mb;
	u32 crc;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return -ENOMEM;
	mb = page_address(page);
	mb->magic = cpu_to_le32(R5LOG_MAGIC);
	mb->version = R5LOG_VERSION;
	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
	mb->seq = cpu_to_le64(seq);
	mb->position = cpu_to_le64(pos);
	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	mb->checksum = cpu_to_le32(crc);

	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
		__free_page(page);
		return -EIO;
	}
	__free_page(page);
	return 0;
}

static int r5l_recovery_log(struct r5l_log *log)
{
	struct r5l_recovery_ctx ctx;

	ctx.pos = log->last_checkpoint;
	ctx.seq = log->last_cp_seq;
	ctx.meta_page = alloc_page(GFP_KERNEL);
	if (!ctx.meta_page)
		return -ENOMEM;

	r5l_recovery_flush_log(log, &ctx);
	__free_page(ctx.meta_page);

	/*
	 * we did a recovery. Now ctx.pos points to an invalid meta block. The
	 * new log will start here, but we can't let the superblock point to
	 * the last valid meta block. The log might look like:
	 * | meta 1| meta 2| meta 3|
	 * meta 1 is valid, meta 2 is invalid, meta 3 could be valid. If the
	 * superblock points to meta 1 and we write a new valid meta 2n, then
	 * after another crash recovery will start from meta 1 again. Since
	 * meta 2n is valid now, recovery will think meta 3 is valid too,
	 * which is wrong.
	 * The solution is to create a new meta at meta 2's position with its
	 * seq == meta 1's seq + 10 and let the superblock point to it. Then
	 * recovery will not treat meta 3 as valid, because its seq doesn't
	 * match.
	 */
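	/*
	 * Worked example with illustrative numbers: if last_cp_seq was 5 and
	 * three valid metas (seq 5, 6, 7) were replayed, ctx.seq ends up at
	 * 8. We then write an empty meta with seq 18 (ctx.seq + 10) at
	 * ctx.pos and continue the log with seq 19, so a stale meta with seq
	 * 9 that may still sit at the following position can no longer chain
	 * onto the new log.
	 */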
	if (ctx.seq > log->last_cp_seq + 1) {
		int ret;

		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
		if (ret)
			return ret;
		log->seq = ctx.seq + 11;
		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
		r5l_write_super(log, ctx.pos);
	} else {
		log->log_start = ctx.pos;
		log->seq = ctx.seq;
	}
	return 0;
}

static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
	struct mddev *mddev = log->rdev->mddev;

	log->rdev->journal_tail = cp;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
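
/*
 * Read the journal tail pointer from the rdev (rdev->journal_tail), validate
 * the meta block it points at, and either adopt it as the log tail or start
 * a fresh log with a random sequence number. Recovery is then run from the
 * resulting checkpoint.
 */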
static int r5l_load_log(struct r5l_log *log)
{
	struct md_rdev *rdev = log->rdev;
	struct page *page;
	struct r5l_meta_block *mb;
	sector_t cp = log->rdev->journal_tail;
	u32 stored_crc, expected_crc;
	bool create_super = false;
	int ret;

	/* Make sure it's valid */
	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
		cp = 0;
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
		ret = -EIO;
		goto ioerr;
	}
	mb = page_address(page);

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    mb->version != R5LOG_VERSION) {
		create_super = true;
		goto create;
	}
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;
	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != expected_crc) {
		create_super = true;
		goto create;
	}
	if (le64_to_cpu(mb->position) != cp) {
		create_super = true;
		goto create;
	}
create:
	if (create_super) {
		log->last_cp_seq = prandom_u32();
		cp = 0;
		/*
		 * Make sure the super points to the correct address. The log
		 * might have data very soon. If the super doesn't have the
		 * correct log tail address, recovery can't find the log.
		 */
		r5l_write_super(log, cp);
	} else
		log->last_cp_seq = le64_to_cpu(mb->seq);

	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
	log->last_checkpoint = cp;

	__free_page(page);

	return r5l_recovery_log(log);
ioerr:
	__free_page(page);
	return ret;
}

int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct r5l_log *log;

	if (PAGE_SIZE != 4096)
		return -EINVAL;
	log = kzalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return -ENOMEM;
	log->rdev = rdev;

	log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0);

	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
				       sizeof(rdev->mddev->uuid));

	mutex_init(&log->io_mutex);

	spin_lock_init(&log->io_list_lock);
	INIT_LIST_HEAD(&log->running_ios);
	INIT_LIST_HEAD(&log->io_end_ios);
	INIT_LIST_HEAD(&log->flushing_ios);
	INIT_LIST_HEAD(&log->finished_ios);
	bio_init(&log->flush_bio);

	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
	if (!log->io_kc)
		goto io_kc;

	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
						 log->rdev->mddev, "reclaim");
	if (!log->reclaim_thread)
		goto reclaim_thread;
	init_waitqueue_head(&log->iounit_wait);

	INIT_LIST_HEAD(&log->no_space_stripes);
	spin_lock_init(&log->no_space_stripes_lock);

	if (r5l_load_log(log))
		goto error;

	rcu_assign_pointer(conf->log, log);
	return 0;
error:
	md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
	kmem_cache_destroy(log->io_kc);
io_kc:
	kfree(log);
	return -EINVAL;
}

void r5l_exit_log(struct r5l_log *log)
{
	md_unregister_thread(&log->reclaim_thread);
	kmem_cache_destroy(log->io_kc);
	kfree(log);
}