/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"

/*
 * Metadata/data are stored on disk in 4k units (blocks) regardless of the
 * underlying hardware sector size. This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)

/*
 * Reclaim runs once 1/4 of the disk size, or 10G, of reclaimable space has
 * accumulated, whichever is smaller. This prevents recovery from having to
 * scan a very long log.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sectors */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
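
/*
 * The log is laid out as a circular sequence of io_units: each io_unit is a
 * 4k meta block (a struct r5l_meta_block followed by payload records) and
 * then the data/parity pages those records describe.
 */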
struct r5l_log {
        struct md_rdev *rdev;

        u32 uuid_checksum;

        sector_t device_size;           /* log device size, rounded to
                                         * BLOCK_SECTORS */
        sector_t max_free_space;        /* reclaim runs if free space reaches
                                         * this size */

        sector_t last_checkpoint;       /* log tail. where recovery scan
                                         * starts from */
        u64 last_cp_seq;                /* log tail sequence */

        sector_t log_start;             /* log head. where new data appends */
        u64 seq;                        /* log head sequence */

        sector_t next_checkpoint;
        u64 next_cp_seq;

        struct mutex io_mutex;
        struct r5l_io_unit *current_io; /* current io_unit accepting new data */

        spinlock_t io_list_lock;
        struct list_head running_ios;   /* io_units which are still running,
                                         * and have not yet been completely
                                         * written to the log */
        struct list_head io_end_ios;    /* io_units which have been completely
                                         * written to the log but not yet
                                         * written to the RAID */
        struct list_head flushing_ios;  /* io_units which are waiting for log
                                         * cache flush */
        struct list_head finished_ios;  /* io_units which have settled down in
                                         * the log disk */
        struct bio flush_bio;

        struct kmem_cache *io_kc;

        struct md_thread *reclaim_thread;
        unsigned long reclaim_target;   /* number of sectors that need to be
                                         * reclaimed. if it's 0, reclaim space
                                         * used by io_units which are in
                                         * IO_UNIT_STRIPE_END state (i.e.
                                         * reclaim doesn't wait for a specific
                                         * io_unit to switch to
                                         * IO_UNIT_STRIPE_END state) */
        wait_queue_head_t iounit_wait;

        struct list_head no_space_stripes; /* pending stripes, log has no space */
        spinlock_t no_space_stripes_lock;

        bool need_cache_flush;
        bool in_teardown;
};

/*
 * An IO range starts at a meta data block and ends at the next meta data
 * block. The io_unit's meta data block tracks the data/parity that follows
 * it. An io_unit is written to the log disk with a normal write; since we
 * always flush the log disk before moving data to the raid disks, there is
 * no need to write the io_unit with FLUSH/FUA.
 */
struct r5l_io_unit {
        struct r5l_log *log;

        struct page *meta_page; /* store meta block */
        int meta_offset;        /* current offset in meta_page */

        struct bio *current_bio;/* current_bio accepting new data */

        atomic_t pending_stripe;/* how many stripes not flushed to raid */
        u64 seq;                /* seq number of the metablock */
        sector_t log_start;     /* where the io_unit starts */
        sector_t log_end;       /* where the io_unit ends */
        struct list_head log_sibling; /* log->running_ios */
        struct list_head stripe_list; /* stripes added to the io_unit */

        int state;
        bool need_split_bio;
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
        IO_UNIT_RUNNING = 0,    /* accepting new IO */
        IO_UNIT_IO_START = 1,   /* io_unit bio started writing to log,
                                 * no longer accepting new bios */
        IO_UNIT_IO_END = 2,     /* io_unit bio finished writing to log */
        IO_UNIT_STRIPE_END = 3, /* stripe data finished writing to raid */
};

static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
        start += inc;
        if (start >= log->device_size)
                start = start - log->device_size;
        return start;
}

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
                                  sector_t end)
{
        if (end >= start)
                return end - start;
        else
                return end + log->device_size - start;
}

static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
        sector_t used_size;

        used_size = r5l_ring_distance(log, log->last_checkpoint,
                                      log->log_start);

        return log->device_size > used_size + size;
}
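
/*
 * Illustrative example of the ring arithmetic above: with
 * device_size == 1000 sectors, last_checkpoint == 900 and log_start == 100,
 * the used space is 100 + 1000 - 900 == 200 sectors, so
 * r5l_has_free_space(log, 300) returns true because 1000 > 200 + 300.
 */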

static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
{
        __free_page(io->meta_page);
        kmem_cache_free(log->io_kc, io);
}

static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
                                  enum r5l_io_unit_state state)
{
        struct r5l_io_unit *io;

        while (!list_empty(from)) {
                io = list_first_entry(from, struct r5l_io_unit, log_sibling);
                /* don't change list order */
                if (io->state >= state)
                        list_move_tail(&io->log_sibling, to);
                else
                        break;
        }
}

static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
                                    enum r5l_io_unit_state state)
{
        if (WARN_ON(io->state >= state))
                return;
        io->state = state;
}

static void r5l_io_run_stripes(struct r5l_io_unit *io)
{
        struct stripe_head *sh, *next;

        list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
                list_del_init(&sh->log_list);
                set_bit(STRIPE_HANDLE, &sh->state);
                raid5_release_stripe(sh);
        }
}

static void r5l_log_run_stripes(struct r5l_log *log)
{
        struct r5l_io_unit *io, *next;

        assert_spin_locked(&log->io_list_lock);

        list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
                /* don't change list order */
                if (io->state < IO_UNIT_IO_END)
                        break;

                list_move_tail(&io->log_sibling, &log->finished_ios);
                r5l_io_run_stripes(io);
        }
}

static void r5l_log_endio(struct bio *bio)
{
        struct r5l_io_unit *io = bio->bi_private;
        struct r5l_log *log = io->log;
        unsigned long flags;

        if (bio->bi_error)
                md_error(log->rdev->mddev, log->rdev);

        bio_put(bio);

        spin_lock_irqsave(&log->io_list_lock, flags);
        __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
        if (log->need_cache_flush)
                r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
                                      IO_UNIT_IO_END);
        else
                r5l_log_run_stripes(log);
        spin_unlock_irqrestore(&log->io_list_lock, flags);

        if (log->need_cache_flush)
                md_wakeup_thread(log->rdev->mddev->thread);
}

static void r5l_submit_current_io(struct r5l_log *log)
{
        struct r5l_io_unit *io = log->current_io;
        struct r5l_meta_block *block;
        unsigned long flags;
        u32 crc;

        if (!io)
                return;

        block = page_address(io->meta_page);
        block->meta_size = cpu_to_le32(io->meta_offset);
        crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
        block->checksum = cpu_to_le32(crc);

        log->current_io = NULL;
        spin_lock_irqsave(&log->io_list_lock, flags);
        __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
        spin_unlock_irqrestore(&log->io_list_lock, flags);

        submit_bio(WRITE, io->current_bio);
}

static struct bio *r5l_bio_alloc(struct r5l_log *log)
{
        struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);

        bio->bi_rw = WRITE;
        bio->bi_bdev = log->rdev->bdev;
        bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;

        return bio;
}

static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
{
        log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);

        /*
         * If we filled up the log device, start from the beginning again,
         * which will require a new bio.
         *
         * Note: for this to work properly the log size needs to be a multiple
         * of BLOCK_SECTORS.
         */
        if (log->log_start == 0)
                io->need_split_bio = true;

        io->log_end = log->log_start;
}

static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
        struct r5l_io_unit *io;
        struct r5l_meta_block *block;

        /* We can't handle memory allocation failures so far */
        io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL);
        io->log = log;
        INIT_LIST_HEAD(&io->log_sibling);
        INIT_LIST_HEAD(&io->stripe_list);
        io->state = IO_UNIT_RUNNING;

        io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO);
        block = page_address(io->meta_page);
        block->magic = cpu_to_le32(R5LOG_MAGIC);
        block->version = R5LOG_VERSION;
        block->seq = cpu_to_le64(log->seq);
        block->position = cpu_to_le64(log->log_start);

        io->log_start = log->log_start;
        io->meta_offset = sizeof(struct r5l_meta_block);
        io->seq = log->seq++;

        io->current_bio = r5l_bio_alloc(log);
        io->current_bio->bi_end_io = r5l_log_endio;
        io->current_bio->bi_private = io;
        bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);

        r5_reserve_log_entry(log, io);

        spin_lock_irq(&log->io_list_lock);
        list_add_tail(&io->log_sibling, &log->running_ios);
        spin_unlock_irq(&log->io_list_lock);

        return io;
}

static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
        if (log->current_io &&
            log->current_io->meta_offset + payload_size > PAGE_SIZE)
                r5l_submit_current_io(log);

        if (!log->current_io)
                log->current_io = r5l_new_meta(log);
        return 0;
}
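
/*
 * A payload record in the meta block is a struct r5l_payload_data_parity
 * followed by one __le32 checksum per logged page: one page for a data
 * payload, one or two pages (P, and Q for RAID6) for a parity payload.
 * payload->size is recorded in sectors, so each 4k page accounts for
 * PAGE_SIZE >> 9 == 8 sectors.
 */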
static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
                                    sector_t location,
                                    u32 checksum1, u32 checksum2,
                                    bool checksum2_valid)
{
        struct r5l_io_unit *io = log->current_io;
        struct r5l_payload_data_parity *payload;

        payload = page_address(io->meta_page) + io->meta_offset;
        payload->header.type = cpu_to_le16(type);
        payload->header.flags = cpu_to_le16(0);
        payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
                                    (PAGE_SHIFT - 9));
        payload->location = cpu_to_le64(location);
        payload->checksum[0] = cpu_to_le32(checksum1);
        if (checksum2_valid)
                payload->checksum[1] = cpu_to_le32(checksum2);

        io->meta_offset += sizeof(struct r5l_payload_data_parity) +
                sizeof(__le32) * (1 + !!checksum2_valid);
}

static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
        struct r5l_io_unit *io = log->current_io;

        if (io->need_split_bio) {
                struct bio *prev = io->current_bio;

                io->current_bio = r5l_bio_alloc(log);
                bio_chain(io->current_bio, prev);

                submit_bio(WRITE, prev);
        }

        if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
                BUG();

        r5_reserve_log_entry(log, io);
}
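
/*
 * Append one stripe to the current io_unit: one data payload plus page per
 * data disk that is being written, then a single parity payload covering P
 * (and Q for RAID6). The stripe is attached to the io_unit's stripe_list so
 * raid5d can resume writing it to the raid disks once the io_unit is safely
 * in the log.
 */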
static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
                           int data_pages, int parity_pages)
{
        int i;
        int meta_size;
        struct r5l_io_unit *io;

        meta_size =
                ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
                 * data_pages) +
                sizeof(struct r5l_payload_data_parity) +
                sizeof(__le32) * parity_pages;

        r5l_get_meta(log, meta_size);
        io = log->current_io;

        for (i = 0; i < sh->disks; i++) {
                if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
                        continue;
                if (i == sh->pd_idx || i == sh->qd_idx)
                        continue;
                r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
                                        raid5_compute_blocknr(sh, i, 0),
                                        sh->dev[i].log_checksum, 0, false);
                r5l_append_payload_page(log, sh->dev[i].page);
        }

        if (sh->qd_idx >= 0) {
                r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
                                        sh->sector, sh->dev[sh->pd_idx].log_checksum,
                                        sh->dev[sh->qd_idx].log_checksum, true);
                r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
                r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
        } else {
                r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
                                        sh->sector, sh->dev[sh->pd_idx].log_checksum,
                                        0, false);
                r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
        }

        list_add_tail(&sh->log_list, &io->stripe_list);
        atomic_inc(&io->pending_stripe);
        sh->log_io = io;
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
/*
 * This runs in raid5d, and reclaim could in turn wait for raid5d (when it
 * flushes data from the log to the raid disks), so we must not wait for
 * reclaim here.
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
        int write_disks = 0;
        int data_pages, parity_pages;
        int meta_size;
        int reserve;
        int i;

        if (!log)
                return -EAGAIN;
        /* Don't support stripe batch */
        if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
            test_bit(STRIPE_SYNCING, &sh->state)) {
                /* the stripe has been written to the log; start writing it to raid */
                clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
                return -EAGAIN;
        }

        for (i = 0; i < sh->disks; i++) {
                void *addr;

                if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
                        continue;
                write_disks++;
                /* checksums were already calculated in the last run */
                if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
                        continue;
                addr = kmap_atomic(sh->dev[i].page);
                sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
                                                    addr, PAGE_SIZE);
                kunmap_atomic(addr);
        }
        parity_pages = 1 + !!(sh->qd_idx >= 0);
        data_pages = write_disks - parity_pages;

        meta_size =
                ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
                 * data_pages) +
                sizeof(struct r5l_payload_data_parity) +
                sizeof(__le32) * parity_pages;
        /* Doesn't work with very big raid arrays */
        if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
                return -EINVAL;

        set_bit(STRIPE_LOG_TRAPPED, &sh->state);
        /*
         * The stripe must enter the state machine again to finish the write,
         * so don't delay.
         */
        clear_bit(STRIPE_DELAYED, &sh->state);
        atomic_inc(&sh->count);

        mutex_lock(&log->io_mutex);
        /* meta + data */
        reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
        if (r5l_has_free_space(log, reserve))
                r5l_log_stripe(log, sh, data_pages, parity_pages);
        else {
                spin_lock(&log->no_space_stripes_lock);
                list_add_tail(&sh->log_list, &log->no_space_stripes);
                spin_unlock(&log->no_space_stripes_lock);

                r5l_wake_reclaim(log, reserve);
        }
        mutex_unlock(&log->io_mutex);

        return 0;
}

void r5l_write_stripe_run(struct r5l_log *log)
{
        if (!log)
                return;
        mutex_lock(&log->io_mutex);
        r5l_submit_current_io(log);
        mutex_unlock(&log->io_mutex);
}

int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
        if (!log)
                return -ENODEV;
        /*
         * We flush the log disk cache first, then write stripe data to the
         * raid disks, so if this bio has finished, the log disk cache has
         * already been flushed. Recovery guarantees we can recover the bio's
         * data from the log disk, so we don't need to flush again.
         */
        if (bio->bi_iter.bi_size == 0) {
                bio_endio(bio);
                return 0;
        }
        bio->bi_rw &= ~REQ_FLUSH;
        return -EAGAIN;
}

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
        struct stripe_head *sh;

        spin_lock(&log->no_space_stripes_lock);
        while (!list_empty(&log->no_space_stripes)) {
                sh = list_first_entry(&log->no_space_stripes,
                                      struct stripe_head, log_list);
                list_del_init(&sh->log_list);
                set_bit(STRIPE_HANDLE, &sh->state);
                raid5_release_stripe(sh);
        }
        spin_unlock(&log->no_space_stripes_lock);
}
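
/*
 * r5l_complete_finished_ios() frees io_units whose stripes have all reached
 * the raid disks, in log order, and advances log->next_checkpoint/next_cp_seq
 * to the start of the most recently completed one. The ring distance from
 * last_checkpoint to next_checkpoint is the space that reclaim can hand back.
 */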
static sector_t r5l_reclaimable_space(struct r5l_log *log)
{
        return r5l_ring_distance(log, log->last_checkpoint,
                                 log->next_checkpoint);
}

static bool r5l_complete_finished_ios(struct r5l_log *log)
{
        struct r5l_io_unit *io, *next;
        bool found = false;

        assert_spin_locked(&log->io_list_lock);

        list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
                /* don't change list order */
                if (io->state < IO_UNIT_STRIPE_END)
                        break;

                log->next_checkpoint = io->log_start;
                log->next_cp_seq = io->seq;

                list_del(&io->log_sibling);
                r5l_free_io_unit(log, io);

                found = true;
        }

        return found;
}

static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
{
        struct r5l_log *log = io->log;
        unsigned long flags;

        spin_lock_irqsave(&log->io_list_lock, flags);
        __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);

        if (!r5l_complete_finished_ios(log)) {
                spin_unlock_irqrestore(&log->io_list_lock, flags);
                return;
        }

        if (r5l_reclaimable_space(log) > log->max_free_space)
                r5l_wake_reclaim(log, 0);

        spin_unlock_irqrestore(&log->io_list_lock, flags);
        wake_up(&log->iounit_wait);
}

void r5l_stripe_write_finished(struct stripe_head *sh)
{
        struct r5l_io_unit *io;

        io = sh->log_io;
        sh->log_io = NULL;

        if (io && atomic_dec_and_test(&io->pending_stripe))
                __r5l_stripe_write_finished(io);
}
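
/*
 * Endio for the log device cache flush: every io_unit on flushing_ios is now
 * known to be stable on the log media, so their stripes may start being
 * written to the raid disks.
 */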
static void r5l_log_flush_endio(struct bio *bio)
{
        struct r5l_log *log = container_of(bio, struct r5l_log,
                flush_bio);
        unsigned long flags;
        struct r5l_io_unit *io;

        if (bio->bi_error)
                md_error(log->rdev->mddev, log->rdev);

        spin_lock_irqsave(&log->io_list_lock, flags);
        list_for_each_entry(io, &log->flushing_ios, log_sibling)
                r5l_io_run_stripes(io);
        list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
        spin_unlock_irqrestore(&log->io_list_lock, flags);
}

/*
 * Starting to dispatch IO to the raid disks.
 *
 * The log consists of a sequence of io_units, each headed by a meta block.
 * There is one situation we want to avoid: a broken meta block in the middle
 * of the log stops recovery from finding the meta blocks after it. So if an
 * operation requires a given meta block to be persistent in the log, the
 * meta blocks before it must be persistent too. A case is:
 *
 * stripe data/parity is in the log and we start writing the stripe to the
 * raid disks. The stripe's data/parity must be persistent in the log before
 * we do the write to the raid disks.
 *
 * The solution is to strictly maintain io_unit list order. We only write the
 * stripes of an io_unit to the raid disks once it, and every io_unit before
 * it, has its data/parity safely in the log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
        bool do_flush;

        if (!log || !log->need_cache_flush)
                return;

        spin_lock_irq(&log->io_list_lock);
        /* flush bio is running */
        if (!list_empty(&log->flushing_ios)) {
                spin_unlock_irq(&log->io_list_lock);
                return;
        }
        list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
        do_flush = !list_empty(&log->flushing_ios);
        spin_unlock_irq(&log->io_list_lock);

        if (!do_flush)
                return;
        bio_reset(&log->flush_bio);
        log->flush_bio.bi_bdev = log->rdev->bdev;
        log->flush_bio.bi_end_io = r5l_log_flush_endio;
        submit_bio(WRITE_FLUSH, &log->flush_bio);
}

static void r5l_write_super(struct r5l_log *log, sector_t cp);
static void r5l_write_super_and_discard_space(struct r5l_log *log,
        sector_t end)
{
        struct block_device *bdev = log->rdev->bdev;
        struct mddev *mddev;

        r5l_write_super(log, end);

        if (!blk_queue_discard(bdev_get_queue(bdev)))
                return;

        mddev = log->rdev->mddev;
        /*
         * This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and
         * waits for this thread to finish. This thread waits for
         * MD_CHANGE_PENDING to be cleared, which is supposed to be done in
         * md_check_recovery(). md_check_recovery() tries to get
         * reconfig_mutex. Since r5l_quiesce already holds the mutex,
         * md_check_recovery() fails, so PENDING never gets cleared. The
         * in_teardown check works around this issue.
         */
        if (!log->in_teardown) {
                set_bit(MD_CHANGE_DEVS, &mddev->flags);
                set_bit(MD_CHANGE_PENDING, &mddev->flags);
                md_wakeup_thread(mddev->thread);
                wait_event(mddev->sb_wait,
                        !test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
                        log->in_teardown);
                /*
                 * r5l_quiesce could run after the in_teardown check and grab
                 * the mutex first. The superblock might get updated twice.
                 */
                if (log->in_teardown)
                        md_update_sb(mddev, 1);
        } else {
                WARN_ON(!mddev_is_locked(mddev));
                md_update_sb(mddev, 1);
        }

        /* discard IO errors really don't matter, ignore them */
        if (log->last_checkpoint < end) {
                blkdev_issue_discard(bdev,
                                log->last_checkpoint + log->rdev->data_offset,
                                end - log->last_checkpoint, GFP_NOIO, 0);
        } else {
                blkdev_issue_discard(bdev,
                                log->last_checkpoint + log->rdev->data_offset,
                                log->device_size - log->last_checkpoint,
                                GFP_NOIO, 0);
                blkdev_issue_discard(bdev, log->rdev->data_offset, end,
                                GFP_NOIO, 0);
        }
}
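
/*
 * Reclaim: wait until enough finished io_units have had all their stripes
 * reach the raid disks, then persist the new log tail in the superblock,
 * discard the freed log range and re-run stripes that were blocked waiting
 * for log space.
 */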
static void r5l_do_reclaim(struct r5l_log *log)
{
        sector_t reclaim_target = xchg(&log->reclaim_target, 0);
        sector_t reclaimable;
        sector_t next_checkpoint;
        u64 next_cp_seq;

        spin_lock_irq(&log->io_list_lock);
        /*
         * Move the proper io_units to the reclaim list. We should not change
         * the order. reclaimable/unreclaimable io_units can be mixed in the
         * list, and we shouldn't reuse the space of an unreclaimable io_unit.
         */
        while (1) {
                reclaimable = r5l_reclaimable_space(log);
                if (reclaimable >= reclaim_target ||
                    (list_empty(&log->running_ios) &&
                     list_empty(&log->io_end_ios) &&
                     list_empty(&log->flushing_ios) &&
                     list_empty(&log->finished_ios)))
                        break;

                md_wakeup_thread(log->rdev->mddev->thread);
                wait_event_lock_irq(log->iounit_wait,
                                    r5l_reclaimable_space(log) > reclaimable,
                                    log->io_list_lock);
        }

        next_checkpoint = log->next_checkpoint;
        next_cp_seq = log->next_cp_seq;
        spin_unlock_irq(&log->io_list_lock);

        BUG_ON(reclaimable < 0);
        if (reclaimable == 0)
                return;

        /*
         * write_super will flush the cache of each raid disk. We must write
         * the super here, because the log area might be reused soon and we
         * don't want to confuse recovery.
         */
        r5l_write_super_and_discard_space(log, next_checkpoint);

        mutex_lock(&log->io_mutex);
        log->last_checkpoint = next_checkpoint;
        log->last_cp_seq = next_cp_seq;
        mutex_unlock(&log->io_mutex);

        r5l_run_no_space_stripes(log);
}

static void r5l_reclaim_thread(struct md_thread *thread)
{
        struct mddev *mddev = thread->mddev;
        struct r5conf *conf = mddev->private;
        struct r5l_log *log = conf->log;

        if (!log)
                return;
        r5l_do_reclaim(log);
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
        unsigned long target;
        unsigned long new = (unsigned long)space; /* overflow in theory */

        do {
                target = log->reclaim_target;
                if (new < target)
                        return;
        } while (cmpxchg(&log->reclaim_target, target, new) != target);
        md_wakeup_thread(log->reclaim_thread);
}

void r5l_quiesce(struct r5l_log *log, int state)
{
        struct mddev *mddev;
        if (!log || state == 2)
                return;
        if (state == 0) {
                log->in_teardown = 0;
                log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
                                        log->rdev->mddev, "reclaim");
        } else if (state == 1) {
                /*
                 * at this point all stripes are finished, so the io_units are
                 * at least in STRIPE_END state
                 */
                log->in_teardown = 1;
                /* make sure r5l_write_super_and_discard_space exits */
                mddev = log->rdev->mddev;
                wake_up(&mddev->sb_wait);
                r5l_wake_reclaim(log, -1L);
                md_unregister_thread(&log->reclaim_thread);
                r5l_do_reclaim(log);
        }
}

bool r5l_log_disk_error(struct r5conf *conf)
{
        /* don't allow write if journal disk is missing */
        if (!conf->log)
                return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
        return test_bit(Faulty, &conf->log->rdev->flags);
}
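
/*
 * Recovery context: pos/seq identify the meta block currently being replayed;
 * meta_total_blocks is the size of that meta block plus the data/parity
 * blocks it describes, i.e. how far pos advances to reach the next meta
 * block. Replay reads each logged page back into a stripe_head, verifies its
 * checksum and writes it to the raid disks.
 */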
struct r5l_recovery_ctx {
        struct page *meta_page;         /* current meta */
        sector_t meta_total_blocks;     /* total size of current meta and data */
        sector_t pos;                   /* recovery position */
        u64 seq;                        /* recovery position seq */
};

static int r5l_read_meta_block(struct r5l_log *log,
                               struct r5l_recovery_ctx *ctx)
{
        struct page *page = ctx->meta_page;
        struct r5l_meta_block *mb;
        u32 crc, stored_crc;

        if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
                return -EIO;

        mb = page_address(page);
        stored_crc = le32_to_cpu(mb->checksum);
        mb->checksum = 0;

        if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
            le64_to_cpu(mb->seq) != ctx->seq ||
            mb->version != R5LOG_VERSION ||
            le64_to_cpu(mb->position) != ctx->pos)
                return -EINVAL;

        crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
        if (stored_crc != crc)
                return -EINVAL;

        if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
                return -EINVAL;

        ctx->meta_total_blocks = BLOCK_SECTORS;

        return 0;
}

static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
                                         struct r5l_recovery_ctx *ctx,
                                         sector_t stripe_sect,
                                         int *offset, sector_t *log_offset)
{
        struct r5conf *conf = log->rdev->mddev->private;
        struct stripe_head *sh;
        struct r5l_payload_data_parity *payload;
        int disk_index;

        sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
        while (1) {
                payload = page_address(ctx->meta_page) + *offset;

                if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
                        raid5_compute_sector(conf,
                                             le64_to_cpu(payload->location), 0,
                                             &disk_index, sh);

                        sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
                                     sh->dev[disk_index].page, READ, false);
                        sh->dev[disk_index].log_checksum =
                                le32_to_cpu(payload->checksum[0]);
                        set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
                        ctx->meta_total_blocks += BLOCK_SECTORS;
                } else {
                        disk_index = sh->pd_idx;
                        sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
                                     sh->dev[disk_index].page, READ, false);
                        sh->dev[disk_index].log_checksum =
                                le32_to_cpu(payload->checksum[0]);
                        set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);

                        if (sh->qd_idx >= 0) {
                                disk_index = sh->qd_idx;
                                sync_page_io(log->rdev,
                                             r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
                                             PAGE_SIZE, sh->dev[disk_index].page,
                                             READ, false);
                                sh->dev[disk_index].log_checksum =
                                        le32_to_cpu(payload->checksum[1]);
                                set_bit(R5_Wantwrite,
                                        &sh->dev[disk_index].flags);
                        }
                        ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
                }

                *log_offset = r5l_ring_add(log, *log_offset,
                                           le32_to_cpu(payload->size));
                *offset += sizeof(struct r5l_payload_data_parity) +
                        sizeof(__le32) *
                        (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
                if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
                        break;
        }

        for (disk_index = 0; disk_index < sh->disks; disk_index++) {
                void *addr;
                u32 checksum;

                if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
                        continue;
                addr = kmap_atomic(sh->dev[disk_index].page);
                checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
                kunmap_atomic(addr);
                if (checksum != sh->dev[disk_index].log_checksum)
                        goto error;
        }

        for (disk_index = 0; disk_index < sh->disks; disk_index++) {
                struct md_rdev *rdev, *rrdev;

                if (!test_and_clear_bit(R5_Wantwrite,
                                        &sh->dev[disk_index].flags))
                        continue;

                /* in case device is broken */
                rdev = rcu_dereference(conf->disks[disk_index].rdev);
                if (rdev)
                        sync_page_io(rdev, stripe_sect, PAGE_SIZE,
                                     sh->dev[disk_index].page, WRITE, false);
                rrdev = rcu_dereference(conf->disks[disk_index].replacement);
                if (rrdev)
                        sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
                                     sh->dev[disk_index].page, WRITE, false);
        }
        raid5_release_stripe(sh);
        return 0;

error:
        for (disk_index = 0; disk_index < sh->disks; disk_index++)
                sh->dev[disk_index].flags = 0;
        raid5_release_stripe(sh);
        return -EINVAL;
}
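
/*
 * Walk all payload records of one meta block and replay each referenced
 * stripe via r5l_recovery_flush_one_stripe().
 */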
static int r5l_recovery_flush_one_meta(struct r5l_log *log,
                                       struct r5l_recovery_ctx *ctx)
{
        struct r5conf *conf = log->rdev->mddev->private;
        struct r5l_payload_data_parity *payload;
        struct r5l_meta_block *mb;
        int offset;
        sector_t log_offset;
        sector_t stripe_sector;

        mb = page_address(ctx->meta_page);
        offset = sizeof(struct r5l_meta_block);
        log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

        while (offset < le32_to_cpu(mb->meta_size)) {
                int dd;

                payload = (void *)mb + offset;
                stripe_sector = raid5_compute_sector(conf,
                                                     le64_to_cpu(payload->location), 0, &dd, NULL);
                if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
                                                  &offset, &log_offset))
                        return -EINVAL;
        }
        return 0;
}

/* copy data/parity from log to raid disks */
static void r5l_recovery_flush_log(struct r5l_log *log,
                                   struct r5l_recovery_ctx *ctx)
{
        while (1) {
                if (r5l_read_meta_block(log, ctx))
                        return;
                if (r5l_recovery_flush_one_meta(log, ctx))
                        return;
                ctx->seq++;
                ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
        }
}
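
/*
 * Write a valid, empty meta block (with FUA) at @pos carrying sequence
 * number @seq. Used after recovery to mark the new head of the log.
 */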
static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
                                          u64 seq)
{
        struct page *page;
        struct r5l_meta_block *mb;
        u32 crc;

        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
        if (!page)
                return -ENOMEM;
        mb = page_address(page);
        mb->magic = cpu_to_le32(R5LOG_MAGIC);
        mb->version = R5LOG_VERSION;
        mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
        mb->seq = cpu_to_le64(seq);
        mb->position = cpu_to_le64(pos);
        crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
        mb->checksum = cpu_to_le32(crc);

        if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
                __free_page(page);
                return -EIO;
        }
        __free_page(page);
        return 0;
}

static int r5l_recovery_log(struct r5l_log *log)
{
        struct r5l_recovery_ctx ctx;

        ctx.pos = log->last_checkpoint;
        ctx.seq = log->last_cp_seq;
        ctx.meta_page = alloc_page(GFP_KERNEL);
        if (!ctx.meta_page)
                return -ENOMEM;

        r5l_recovery_flush_log(log, &ctx);
        __free_page(ctx.meta_page);

        /*
         * We did a recovery. Now ctx.pos points to an invalid meta block, and
         * the new log will start there. But we can't simply let the superblock
         * keep pointing at the last valid meta block. The log might look like:
         * | meta 1 | meta 2 | meta 3 |
         * where meta 1 is valid, meta 2 is invalid and meta 3 could be valid.
         * If the superblock points to meta 1 and we later write a new valid
         * meta 2n at meta 2's position, then after another crash recovery
         * would again start from meta 1, find meta 2n valid and wrongly treat
         * meta 3 as valid too. The solution is to write a new meta block at
         * meta 2's position with seq == meta 1's seq + 10 and point the
         * superblock at it. Recovery will then not treat meta 3 as a valid
         * meta block, because its seq doesn't match.
         */
        if (ctx.seq > log->last_cp_seq + 1) {
                int ret;

                ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
                if (ret)
                        return ret;
                log->seq = ctx.seq + 11;
                log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
                r5l_write_super(log, ctx.pos);
        } else {
                log->log_start = ctx.pos;
                log->seq = ctx.seq;
        }
        return 0;
}

static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
        struct mddev *mddev = log->rdev->mddev;

        log->rdev->journal_tail = cp;
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
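
/*
 * Read the meta block that the superblock's journal_tail points at. If it is
 * missing or invalid, start a fresh log (random sequence number, checkpoint
 * at sector 0); otherwise use it as the starting point for recovery.
 */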
static int r5l_load_log(struct r5l_log *log)
{
        struct md_rdev *rdev = log->rdev;
        struct page *page;
        struct r5l_meta_block *mb;
        sector_t cp = log->rdev->journal_tail;
        u32 stored_crc, expected_crc;
        bool create_super = false;
        int ret;

        /* Make sure it's valid */
        if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
                cp = 0;
        page = alloc_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
                ret = -EIO;
                goto ioerr;
        }
        mb = page_address(page);

        if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
            mb->version != R5LOG_VERSION) {
                create_super = true;
                goto create;
        }
        stored_crc = le32_to_cpu(mb->checksum);
        mb->checksum = 0;
        expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
        if (stored_crc != expected_crc) {
                create_super = true;
                goto create;
        }
        if (le64_to_cpu(mb->position) != cp) {
                create_super = true;
                goto create;
        }
create:
        if (create_super) {
                log->last_cp_seq = prandom_u32();
                cp = 0;
                /*
                 * Make sure the superblock points to the correct address. The
                 * log might have data very soon; if the superblock doesn't
                 * have the correct log tail address, recovery can't find the
                 * log.
                 */
                r5l_write_super(log, cp);
        } else
                log->last_cp_seq = le64_to_cpu(mb->seq);

        log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
        log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
        if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
                log->max_free_space = RECLAIM_MAX_FREE_SPACE;
        log->last_checkpoint = cp;

        __free_page(page);

        return r5l_recovery_log(log);
ioerr:
        __free_page(page);
        return ret;
}

int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
        struct r5l_log *log;

        if (PAGE_SIZE != 4096)
                return -EINVAL;
        log = kzalloc(sizeof(*log), GFP_KERNEL);
        if (!log)
                return -ENOMEM;
        log->rdev = rdev;

        log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0);

        log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
                                       sizeof(rdev->mddev->uuid));

        mutex_init(&log->io_mutex);

        spin_lock_init(&log->io_list_lock);
        INIT_LIST_HEAD(&log->running_ios);
        INIT_LIST_HEAD(&log->io_end_ios);
        INIT_LIST_HEAD(&log->flushing_ios);
        INIT_LIST_HEAD(&log->finished_ios);
        bio_init(&log->flush_bio);

        log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
        if (!log->io_kc)
                goto io_kc;

        log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
                                                 log->rdev->mddev, "reclaim");
        if (!log->reclaim_thread)
                goto reclaim_thread;
        init_waitqueue_head(&log->iounit_wait);

        INIT_LIST_HEAD(&log->no_space_stripes);
        spin_lock_init(&log->no_space_stripes_lock);

        if (r5l_load_log(log))
                goto error;

        conf->log = log;
        return 0;
error:
        md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
        kmem_cache_destroy(log->io_kc);
io_kc:
        kfree(log);
        return -EINVAL;
}

void r5l_exit_log(struct r5l_log *log)
{
        md_unregister_thread(&log->reclaim_thread);
        kmem_cache_destroy(log->io_kc);
        kfree(log);
}