1 /* 2 * Partial Parity Log for closing the RAID5 write hole 3 * Copyright (c) 2017, Intel Corporation. 4 * 5 * This program is free software; you can redistribute it and/or modify it 6 * under the terms and conditions of the GNU General Public License, 7 * version 2, as published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 * more details. 13 */ 14 15 #include <linux/kernel.h> 16 #include <linux/blkdev.h> 17 #include <linux/slab.h> 18 #include <linux/crc32c.h> 19 #include <linux/flex_array.h> 20 #include <linux/async_tx.h> 21 #include <linux/raid/md_p.h> 22 #include "md.h" 23 #include "raid5.h" 24 25 /* 26 * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for 27 * partial parity data. The header contains an array of entries 28 * (struct ppl_header_entry) which describe the logged write requests. 29 * Partial parity for the entries comes after the header, written in the same 30 * sequence as the entries: 31 * 32 * Header 33 * entry0 34 * ... 35 * entryN 36 * PP data 37 * PP for entry0 38 * ... 39 * PP for entryN 40 * 41 * An entry describes one or more consecutive stripe_heads, up to a full 42 * stripe. The modifed raid data chunks form an m-by-n matrix, where m is the 43 * number of stripe_heads in the entry and n is the number of modified data 44 * disks. Every stripe_head in the entry must write to the same data disks. 45 * An example of a valid case described by a single entry (writes to the first 46 * stripe of a 4 disk array, 16k chunk size): 47 * 48 * sh->sector dd0 dd1 dd2 ppl 49 * +-----+-----+-----+ 50 * 0 | --- | --- | --- | +----+ 51 * 8 | -W- | -W- | --- | | pp | data_sector = 8 52 * 16 | -W- | -W- | --- | | pp | data_size = 3 * 2 * 4k 53 * 24 | -W- | -W- | --- | | pp | pp_size = 3 * 4k 54 * +-----+-----+-----+ +----+ 55 * 56 * data_sector is the first raid sector of the modified data, data_size is the 57 * total size of modified data and pp_size is the size of partial parity for 58 * this entry. Entries for full stripe writes contain no partial parity 59 * (pp_size = 0), they only mark the stripes for which parity should be 60 * recalculated after an unclean shutdown. Every entry holds a checksum of its 61 * partial parity, the header also has a checksum of the header itself. 62 * 63 * A write request is always logged to the PPL instance stored on the parity 64 * disk of the corresponding stripe. For each member disk there is one ppl_log 65 * used to handle logging for this disk, independently from others. They are 66 * grouped in child_logs array in struct ppl_conf, which is assigned to 67 * r5conf->log_private. 68 * 69 * ppl_io_unit represents a full PPL write, header_page contains the ppl_header. 70 * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head 71 * can be appended to the last entry if it meets the conditions for a valid 72 * entry described above, otherwise a new entry is added. Checksums of entries 73 * are calculated incrementally as stripes containing partial parity are being 74 * added. ppl_submit_iounit() calculates the checksum of the header and submits 75 * a bio containing the header page and partial parity pages (sh->ppl_page) for 76 * all stripes of the io_unit. When the PPL write completes, the stripes 77 * associated with the io_unit are released and raid5d starts writing their data 78 * and parity. When all stripes are written, the io_unit is freed and the next 79 * can be submitted. 80 * 81 * An io_unit is used to gather stripes until it is submitted or becomes full 82 * (if the maximum number of entries or size of PPL is reached). Another io_unit 83 * can't be submitted until the previous has completed (PPL and stripe 84 * data+parity is written). The log->io_list tracks all io_units of a log 85 * (for a single member disk). New io_units are added to the end of the list 86 * and the first io_unit is submitted, if it is not submitted already. 87 * The current io_unit accepting new stripes is always at the end of the list. 88 */ 89 90 struct ppl_conf { 91 struct mddev *mddev; 92 93 /* array of child logs, one for each raid disk */ 94 struct ppl_log *child_logs; 95 int count; 96 97 int block_size; /* the logical block size used for data_sector 98 * in ppl_header_entry */ 99 u32 signature; /* raid array identifier */ 100 atomic64_t seq; /* current log write sequence number */ 101 102 struct kmem_cache *io_kc; 103 mempool_t *io_pool; 104 struct bio_set *bs; 105 mempool_t *meta_pool; 106 107 /* used only for recovery */ 108 int recovered_entries; 109 int mismatch_count; 110 }; 111 112 struct ppl_log { 113 struct ppl_conf *ppl_conf; /* shared between all log instances */ 114 115 struct md_rdev *rdev; /* array member disk associated with 116 * this log instance */ 117 struct mutex io_mutex; 118 struct ppl_io_unit *current_io; /* current io_unit accepting new data 119 * always at the end of io_list */ 120 spinlock_t io_list_lock; 121 struct list_head io_list; /* all io_units of this log */ 122 struct list_head no_mem_stripes;/* stripes to retry if failed to 123 * allocate io_unit */ 124 }; 125 126 #define PPL_IO_INLINE_BVECS 32 127 128 struct ppl_io_unit { 129 struct ppl_log *log; 130 131 struct page *header_page; /* for ppl_header */ 132 133 unsigned int entries_count; /* number of entries in ppl_header */ 134 unsigned int pp_size; /* total size current of partial parity */ 135 136 u64 seq; /* sequence number of this log write */ 137 struct list_head log_sibling; /* log->io_list */ 138 139 struct list_head stripe_list; /* stripes added to the io_unit */ 140 atomic_t pending_stripes; /* how many stripes not written to raid */ 141 142 bool submitted; /* true if write to log started */ 143 144 /* inline bio and its biovec for submitting the iounit */ 145 struct bio bio; 146 struct bio_vec biovec[PPL_IO_INLINE_BVECS]; 147 }; 148 149 struct dma_async_tx_descriptor * 150 ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, 151 struct dma_async_tx_descriptor *tx) 152 { 153 int disks = sh->disks; 154 struct page **xor_srcs = flex_array_get(percpu->scribble, 0); 155 int count = 0, pd_idx = sh->pd_idx, i; 156 struct async_submit_ctl submit; 157 158 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 159 160 /* 161 * Partial parity is the XOR of stripe data chunks that are not changed 162 * during the write request. Depending on available data 163 * (read-modify-write vs. reconstruct-write case) we calculate it 164 * differently. 165 */ 166 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 167 /* rmw: xor old data and parity from updated disks */ 168 for (i = disks; i--;) { 169 struct r5dev *dev = &sh->dev[i]; 170 if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx) 171 xor_srcs[count++] = dev->page; 172 } 173 } else if (sh->reconstruct_state == reconstruct_state_drain_run) { 174 /* rcw: xor data from all not updated disks */ 175 for (i = disks; i--;) { 176 struct r5dev *dev = &sh->dev[i]; 177 if (test_bit(R5_UPTODATE, &dev->flags)) 178 xor_srcs[count++] = dev->page; 179 } 180 } else { 181 return tx; 182 } 183 184 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx, 185 NULL, sh, flex_array_get(percpu->scribble, 0) 186 + sizeof(struct page *) * (sh->disks + 2)); 187 188 if (count == 1) 189 tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE, 190 &submit); 191 else 192 tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE, 193 &submit); 194 195 return tx; 196 } 197 198 static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log, 199 struct stripe_head *sh) 200 { 201 struct ppl_conf *ppl_conf = log->ppl_conf; 202 struct ppl_io_unit *io; 203 struct ppl_header *pplhdr; 204 205 io = mempool_alloc(ppl_conf->io_pool, GFP_ATOMIC); 206 if (!io) 207 return NULL; 208 209 memset(io, 0, sizeof(*io)); 210 io->log = log; 211 INIT_LIST_HEAD(&io->log_sibling); 212 INIT_LIST_HEAD(&io->stripe_list); 213 atomic_set(&io->pending_stripes, 0); 214 bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS); 215 216 io->header_page = mempool_alloc(ppl_conf->meta_pool, GFP_NOIO); 217 pplhdr = page_address(io->header_page); 218 clear_page(pplhdr); 219 memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); 220 pplhdr->signature = cpu_to_le32(ppl_conf->signature); 221 222 io->seq = atomic64_add_return(1, &ppl_conf->seq); 223 pplhdr->generation = cpu_to_le64(io->seq); 224 225 return io; 226 } 227 228 static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh) 229 { 230 struct ppl_io_unit *io = log->current_io; 231 struct ppl_header_entry *e = NULL; 232 struct ppl_header *pplhdr; 233 int i; 234 sector_t data_sector = 0; 235 int data_disks = 0; 236 unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE; 237 struct r5conf *conf = sh->raid_conf; 238 239 pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector); 240 241 /* check if current io_unit is full */ 242 if (io && (io->pp_size == entry_space || 243 io->entries_count == PPL_HDR_MAX_ENTRIES)) { 244 pr_debug("%s: add io_unit blocked by seq: %llu\n", 245 __func__, io->seq); 246 io = NULL; 247 } 248 249 /* add a new unit if there is none or the current is full */ 250 if (!io) { 251 io = ppl_new_iounit(log, sh); 252 if (!io) 253 return -ENOMEM; 254 spin_lock_irq(&log->io_list_lock); 255 list_add_tail(&io->log_sibling, &log->io_list); 256 spin_unlock_irq(&log->io_list_lock); 257 258 log->current_io = io; 259 } 260 261 for (i = 0; i < sh->disks; i++) { 262 struct r5dev *dev = &sh->dev[i]; 263 264 if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) { 265 if (!data_disks || dev->sector < data_sector) 266 data_sector = dev->sector; 267 data_disks++; 268 } 269 } 270 BUG_ON(!data_disks); 271 272 pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__, 273 io->seq, (unsigned long long)data_sector, data_disks); 274 275 pplhdr = page_address(io->header_page); 276 277 if (io->entries_count > 0) { 278 struct ppl_header_entry *last = 279 &pplhdr->entries[io->entries_count - 1]; 280 struct stripe_head *sh_last = list_last_entry( 281 &io->stripe_list, struct stripe_head, log_list); 282 u64 data_sector_last = le64_to_cpu(last->data_sector); 283 u32 data_size_last = le32_to_cpu(last->data_size); 284 285 /* 286 * Check if we can append the stripe to the last entry. It must 287 * be just after the last logged stripe and write to the same 288 * disks. Use bit shift and logarithm to avoid 64-bit division. 289 */ 290 if ((sh->sector == sh_last->sector + STRIPE_SECTORS) && 291 (data_sector >> ilog2(conf->chunk_sectors) == 292 data_sector_last >> ilog2(conf->chunk_sectors)) && 293 ((data_sector - data_sector_last) * data_disks == 294 data_size_last >> 9)) 295 e = last; 296 } 297 298 if (!e) { 299 e = &pplhdr->entries[io->entries_count++]; 300 e->data_sector = cpu_to_le64(data_sector); 301 e->parity_disk = cpu_to_le32(sh->pd_idx); 302 e->checksum = cpu_to_le32(~0); 303 } 304 305 le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT); 306 307 /* don't write any PP if full stripe write */ 308 if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) { 309 le32_add_cpu(&e->pp_size, PAGE_SIZE); 310 io->pp_size += PAGE_SIZE; 311 e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum), 312 page_address(sh->ppl_page), 313 PAGE_SIZE)); 314 } 315 316 list_add_tail(&sh->log_list, &io->stripe_list); 317 atomic_inc(&io->pending_stripes); 318 sh->ppl_io = io; 319 320 return 0; 321 } 322 323 int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh) 324 { 325 struct ppl_conf *ppl_conf = conf->log_private; 326 struct ppl_io_unit *io = sh->ppl_io; 327 struct ppl_log *log; 328 329 if (io || test_bit(STRIPE_SYNCING, &sh->state) || 330 !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || 331 !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) { 332 clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 333 return -EAGAIN; 334 } 335 336 log = &ppl_conf->child_logs[sh->pd_idx]; 337 338 mutex_lock(&log->io_mutex); 339 340 if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) { 341 mutex_unlock(&log->io_mutex); 342 return -EAGAIN; 343 } 344 345 set_bit(STRIPE_LOG_TRAPPED, &sh->state); 346 clear_bit(STRIPE_DELAYED, &sh->state); 347 atomic_inc(&sh->count); 348 349 if (ppl_log_stripe(log, sh)) { 350 spin_lock_irq(&log->io_list_lock); 351 list_add_tail(&sh->log_list, &log->no_mem_stripes); 352 spin_unlock_irq(&log->io_list_lock); 353 } 354 355 mutex_unlock(&log->io_mutex); 356 357 return 0; 358 } 359 360 static void ppl_log_endio(struct bio *bio) 361 { 362 struct ppl_io_unit *io = bio->bi_private; 363 struct ppl_log *log = io->log; 364 struct ppl_conf *ppl_conf = log->ppl_conf; 365 struct stripe_head *sh, *next; 366 367 pr_debug("%s: seq: %llu\n", __func__, io->seq); 368 369 if (bio->bi_error) 370 md_error(ppl_conf->mddev, log->rdev); 371 372 mempool_free(io->header_page, ppl_conf->meta_pool); 373 374 list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { 375 list_del_init(&sh->log_list); 376 377 set_bit(STRIPE_HANDLE, &sh->state); 378 raid5_release_stripe(sh); 379 } 380 } 381 382 static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio) 383 { 384 char b[BDEVNAME_SIZE]; 385 386 pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n", 387 __func__, io->seq, bio->bi_iter.bi_size, 388 (unsigned long long)bio->bi_iter.bi_sector, 389 bdevname(bio->bi_bdev, b)); 390 391 submit_bio(bio); 392 } 393 394 static void ppl_submit_iounit(struct ppl_io_unit *io) 395 { 396 struct ppl_log *log = io->log; 397 struct ppl_conf *ppl_conf = log->ppl_conf; 398 struct ppl_header *pplhdr = page_address(io->header_page); 399 struct bio *bio = &io->bio; 400 struct stripe_head *sh; 401 int i; 402 403 bio->bi_private = io; 404 405 if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) { 406 ppl_log_endio(bio); 407 return; 408 } 409 410 for (i = 0; i < io->entries_count; i++) { 411 struct ppl_header_entry *e = &pplhdr->entries[i]; 412 413 pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n", 414 __func__, io->seq, i, le64_to_cpu(e->data_sector), 415 le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size)); 416 417 e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >> 418 ilog2(ppl_conf->block_size >> 9)); 419 e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum)); 420 } 421 422 pplhdr->entries_count = cpu_to_le32(io->entries_count); 423 pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE)); 424 425 bio->bi_end_io = ppl_log_endio; 426 bio->bi_opf = REQ_OP_WRITE | REQ_FUA; 427 bio->bi_bdev = log->rdev->bdev; 428 bio->bi_iter.bi_sector = log->rdev->ppl.sector; 429 bio_add_page(bio, io->header_page, PAGE_SIZE, 0); 430 431 list_for_each_entry(sh, &io->stripe_list, log_list) { 432 /* entries for full stripe writes have no partial parity */ 433 if (test_bit(STRIPE_FULL_WRITE, &sh->state)) 434 continue; 435 436 if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) { 437 struct bio *prev = bio; 438 439 bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, 440 ppl_conf->bs); 441 bio->bi_opf = prev->bi_opf; 442 bio->bi_bdev = prev->bi_bdev; 443 bio->bi_iter.bi_sector = bio_end_sector(prev); 444 bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0); 445 446 bio_chain(bio, prev); 447 ppl_submit_iounit_bio(io, prev); 448 } 449 } 450 451 ppl_submit_iounit_bio(io, bio); 452 } 453 454 static void ppl_submit_current_io(struct ppl_log *log) 455 { 456 struct ppl_io_unit *io; 457 458 spin_lock_irq(&log->io_list_lock); 459 460 io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit, 461 log_sibling); 462 if (io && io->submitted) 463 io = NULL; 464 465 spin_unlock_irq(&log->io_list_lock); 466 467 if (io) { 468 io->submitted = true; 469 470 if (io == log->current_io) 471 log->current_io = NULL; 472 473 ppl_submit_iounit(io); 474 } 475 } 476 477 void ppl_write_stripe_run(struct r5conf *conf) 478 { 479 struct ppl_conf *ppl_conf = conf->log_private; 480 struct ppl_log *log; 481 int i; 482 483 for (i = 0; i < ppl_conf->count; i++) { 484 log = &ppl_conf->child_logs[i]; 485 486 mutex_lock(&log->io_mutex); 487 ppl_submit_current_io(log); 488 mutex_unlock(&log->io_mutex); 489 } 490 } 491 492 static void ppl_io_unit_finished(struct ppl_io_unit *io) 493 { 494 struct ppl_log *log = io->log; 495 unsigned long flags; 496 497 pr_debug("%s: seq: %llu\n", __func__, io->seq); 498 499 spin_lock_irqsave(&log->io_list_lock, flags); 500 501 list_del(&io->log_sibling); 502 mempool_free(io, log->ppl_conf->io_pool); 503 504 if (!list_empty(&log->no_mem_stripes)) { 505 struct stripe_head *sh = list_first_entry(&log->no_mem_stripes, 506 struct stripe_head, 507 log_list); 508 list_del_init(&sh->log_list); 509 set_bit(STRIPE_HANDLE, &sh->state); 510 raid5_release_stripe(sh); 511 } 512 513 spin_unlock_irqrestore(&log->io_list_lock, flags); 514 } 515 516 void ppl_stripe_write_finished(struct stripe_head *sh) 517 { 518 struct ppl_io_unit *io; 519 520 io = sh->ppl_io; 521 sh->ppl_io = NULL; 522 523 if (io && atomic_dec_and_test(&io->pending_stripes)) 524 ppl_io_unit_finished(io); 525 } 526 527 static void ppl_xor(int size, struct page *page1, struct page *page2) 528 { 529 struct async_submit_ctl submit; 530 struct dma_async_tx_descriptor *tx; 531 struct page *xor_srcs[] = { page1, page2 }; 532 533 init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST, 534 NULL, NULL, NULL, NULL); 535 tx = async_xor(page1, xor_srcs, 0, 2, size, &submit); 536 537 async_tx_quiesce(&tx); 538 } 539 540 /* 541 * PPL recovery strategy: xor partial parity and data from all modified data 542 * disks within a stripe and write the result as the new stripe parity. If all 543 * stripe data disks are modified (full stripe write), no partial parity is 544 * available, so just xor the data disks. 545 * 546 * Recovery of a PPL entry shall occur only if all modified data disks are 547 * available and read from all of them succeeds. 548 * 549 * A PPL entry applies to a stripe, partial parity size for an entry is at most 550 * the size of the chunk. Examples of possible cases for a single entry: 551 * 552 * case 0: single data disk write: 553 * data0 data1 data2 ppl parity 554 * +--------+--------+--------+ +--------------------+ 555 * | ------ | ------ | ------ | +----+ | (no change) | 556 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp | 557 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp | 558 * | ------ | ------ | ------ | +----+ | (no change) | 559 * +--------+--------+--------+ +--------------------+ 560 * pp_size = data_size 561 * 562 * case 1: more than one data disk write: 563 * data0 data1 data2 ppl parity 564 * +--------+--------+--------+ +--------------------+ 565 * | ------ | ------ | ------ | +----+ | (no change) | 566 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp | 567 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp | 568 * | ------ | ------ | ------ | +----+ | (no change) | 569 * +--------+--------+--------+ +--------------------+ 570 * pp_size = data_size / modified_data_disks 571 * 572 * case 2: write to all data disks (also full stripe write): 573 * data0 data1 data2 parity 574 * +--------+--------+--------+ +--------------------+ 575 * | ------ | ------ | ------ | | (no change) | 576 * | -data- | -data- | -data- | --------> | xor all data | 577 * | ------ | ------ | ------ | --------> | (no change) | 578 * | ------ | ------ | ------ | | (no change) | 579 * +--------+--------+--------+ +--------------------+ 580 * pp_size = 0 581 * 582 * The following cases are possible only in other implementations. The recovery 583 * code can handle them, but they are not generated at runtime because they can 584 * be reduced to cases 0, 1 and 2: 585 * 586 * case 3: 587 * data0 data1 data2 ppl parity 588 * +--------+--------+--------+ +----+ +--------------------+ 589 * | ------ | -data- | -data- | | pp | | data1 ^ data2 ^ pp | 590 * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp | 591 * | -data- | -data- | -data- | | -- | -> | xor all data | 592 * | -data- | -data- | ------ | | pp | | data0 ^ data1 ^ pp | 593 * +--------+--------+--------+ +----+ +--------------------+ 594 * pp_size = chunk_size 595 * 596 * case 4: 597 * data0 data1 data2 ppl parity 598 * +--------+--------+--------+ +----+ +--------------------+ 599 * | ------ | -data- | ------ | | pp | | data1 ^ pp | 600 * | ------ | ------ | ------ | | -- | -> | (no change) | 601 * | ------ | ------ | ------ | | -- | -> | (no change) | 602 * | -data- | ------ | ------ | | pp | | data0 ^ pp | 603 * +--------+--------+--------+ +----+ +--------------------+ 604 * pp_size = chunk_size 605 */ 606 static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, 607 sector_t ppl_sector) 608 { 609 struct ppl_conf *ppl_conf = log->ppl_conf; 610 struct mddev *mddev = ppl_conf->mddev; 611 struct r5conf *conf = mddev->private; 612 int block_size = ppl_conf->block_size; 613 struct page *page1; 614 struct page *page2; 615 sector_t r_sector_first; 616 sector_t r_sector_last; 617 int strip_sectors; 618 int data_disks; 619 int i; 620 int ret = 0; 621 char b[BDEVNAME_SIZE]; 622 unsigned int pp_size = le32_to_cpu(e->pp_size); 623 unsigned int data_size = le32_to_cpu(e->data_size); 624 625 page1 = alloc_page(GFP_KERNEL); 626 page2 = alloc_page(GFP_KERNEL); 627 628 if (!page1 || !page2) { 629 ret = -ENOMEM; 630 goto out; 631 } 632 633 r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9); 634 635 if ((pp_size >> 9) < conf->chunk_sectors) { 636 if (pp_size > 0) { 637 data_disks = data_size / pp_size; 638 strip_sectors = pp_size >> 9; 639 } else { 640 data_disks = conf->raid_disks - conf->max_degraded; 641 strip_sectors = (data_size >> 9) / data_disks; 642 } 643 r_sector_last = r_sector_first + 644 (data_disks - 1) * conf->chunk_sectors + 645 strip_sectors; 646 } else { 647 data_disks = conf->raid_disks - conf->max_degraded; 648 strip_sectors = conf->chunk_sectors; 649 r_sector_last = r_sector_first + (data_size >> 9); 650 } 651 652 pr_debug("%s: array sector first: %llu last: %llu\n", __func__, 653 (unsigned long long)r_sector_first, 654 (unsigned long long)r_sector_last); 655 656 /* if start and end is 4k aligned, use a 4k block */ 657 if (block_size == 512 && 658 (r_sector_first & (STRIPE_SECTORS - 1)) == 0 && 659 (r_sector_last & (STRIPE_SECTORS - 1)) == 0) 660 block_size = STRIPE_SIZE; 661 662 /* iterate through blocks in strip */ 663 for (i = 0; i < strip_sectors; i += (block_size >> 9)) { 664 bool update_parity = false; 665 sector_t parity_sector; 666 struct md_rdev *parity_rdev; 667 struct stripe_head sh; 668 int disk; 669 int indent = 0; 670 671 pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i); 672 indent += 2; 673 674 memset(page_address(page1), 0, PAGE_SIZE); 675 676 /* iterate through data member disks */ 677 for (disk = 0; disk < data_disks; disk++) { 678 int dd_idx; 679 struct md_rdev *rdev; 680 sector_t sector; 681 sector_t r_sector = r_sector_first + i + 682 (disk * conf->chunk_sectors); 683 684 pr_debug("%s:%*s data member disk %d start\n", 685 __func__, indent, "", disk); 686 indent += 2; 687 688 if (r_sector >= r_sector_last) { 689 pr_debug("%s:%*s array sector %llu doesn't need parity update\n", 690 __func__, indent, "", 691 (unsigned long long)r_sector); 692 indent -= 2; 693 continue; 694 } 695 696 update_parity = true; 697 698 /* map raid sector to member disk */ 699 sector = raid5_compute_sector(conf, r_sector, 0, 700 &dd_idx, NULL); 701 pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n", 702 __func__, indent, "", 703 (unsigned long long)r_sector, dd_idx, 704 (unsigned long long)sector); 705 706 rdev = conf->disks[dd_idx].rdev; 707 if (!rdev) { 708 pr_debug("%s:%*s data member disk %d missing\n", 709 __func__, indent, "", dd_idx); 710 update_parity = false; 711 break; 712 } 713 714 pr_debug("%s:%*s reading data member disk %s sector %llu\n", 715 __func__, indent, "", bdevname(rdev->bdev, b), 716 (unsigned long long)sector); 717 if (!sync_page_io(rdev, sector, block_size, page2, 718 REQ_OP_READ, 0, false)) { 719 md_error(mddev, rdev); 720 pr_debug("%s:%*s read failed!\n", __func__, 721 indent, ""); 722 ret = -EIO; 723 goto out; 724 } 725 726 ppl_xor(block_size, page1, page2); 727 728 indent -= 2; 729 } 730 731 if (!update_parity) 732 continue; 733 734 if (pp_size > 0) { 735 pr_debug("%s:%*s reading pp disk sector %llu\n", 736 __func__, indent, "", 737 (unsigned long long)(ppl_sector + i)); 738 if (!sync_page_io(log->rdev, 739 ppl_sector - log->rdev->data_offset + i, 740 block_size, page2, REQ_OP_READ, 0, 741 false)) { 742 pr_debug("%s:%*s read failed!\n", __func__, 743 indent, ""); 744 md_error(mddev, log->rdev); 745 ret = -EIO; 746 goto out; 747 } 748 749 ppl_xor(block_size, page1, page2); 750 } 751 752 /* map raid sector to parity disk */ 753 parity_sector = raid5_compute_sector(conf, r_sector_first + i, 754 0, &disk, &sh); 755 BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk)); 756 parity_rdev = conf->disks[sh.pd_idx].rdev; 757 758 BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev); 759 pr_debug("%s:%*s write parity at sector %llu, disk %s\n", 760 __func__, indent, "", 761 (unsigned long long)parity_sector, 762 bdevname(parity_rdev->bdev, b)); 763 if (!sync_page_io(parity_rdev, parity_sector, block_size, 764 page1, REQ_OP_WRITE, 0, false)) { 765 pr_debug("%s:%*s parity write error!\n", __func__, 766 indent, ""); 767 md_error(mddev, parity_rdev); 768 ret = -EIO; 769 goto out; 770 } 771 } 772 out: 773 if (page1) 774 __free_page(page1); 775 if (page2) 776 __free_page(page2); 777 return ret; 778 } 779 780 static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr) 781 { 782 struct ppl_conf *ppl_conf = log->ppl_conf; 783 struct md_rdev *rdev = log->rdev; 784 struct mddev *mddev = rdev->mddev; 785 sector_t ppl_sector = rdev->ppl.sector + (PPL_HEADER_SIZE >> 9); 786 struct page *page; 787 int i; 788 int ret = 0; 789 790 page = alloc_page(GFP_KERNEL); 791 if (!page) 792 return -ENOMEM; 793 794 /* iterate through all PPL entries saved */ 795 for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) { 796 struct ppl_header_entry *e = &pplhdr->entries[i]; 797 u32 pp_size = le32_to_cpu(e->pp_size); 798 sector_t sector = ppl_sector; 799 int ppl_entry_sectors = pp_size >> 9; 800 u32 crc, crc_stored; 801 802 pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n", 803 __func__, rdev->raid_disk, i, 804 (unsigned long long)ppl_sector, pp_size); 805 806 crc = ~0; 807 crc_stored = le32_to_cpu(e->checksum); 808 809 /* read parial parity for this entry and calculate its checksum */ 810 while (pp_size) { 811 int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size; 812 813 if (!sync_page_io(rdev, sector - rdev->data_offset, 814 s, page, REQ_OP_READ, 0, false)) { 815 md_error(mddev, rdev); 816 ret = -EIO; 817 goto out; 818 } 819 820 crc = crc32c_le(crc, page_address(page), s); 821 822 pp_size -= s; 823 sector += s >> 9; 824 } 825 826 crc = ~crc; 827 828 if (crc != crc_stored) { 829 /* 830 * Don't recover this entry if the checksum does not 831 * match, but keep going and try to recover other 832 * entries. 833 */ 834 pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n", 835 __func__, crc_stored, crc); 836 ppl_conf->mismatch_count++; 837 } else { 838 ret = ppl_recover_entry(log, e, ppl_sector); 839 if (ret) 840 goto out; 841 ppl_conf->recovered_entries++; 842 } 843 844 ppl_sector += ppl_entry_sectors; 845 } 846 847 /* flush the disk cache after recovery if necessary */ 848 ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL); 849 out: 850 __free_page(page); 851 return ret; 852 } 853 854 static int ppl_write_empty_header(struct ppl_log *log) 855 { 856 struct page *page; 857 struct ppl_header *pplhdr; 858 struct md_rdev *rdev = log->rdev; 859 int ret = 0; 860 861 pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__, 862 rdev->raid_disk, (unsigned long long)rdev->ppl.sector); 863 864 page = alloc_page(GFP_NOIO | __GFP_ZERO); 865 if (!page) 866 return -ENOMEM; 867 868 pplhdr = page_address(page); 869 memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); 870 pplhdr->signature = cpu_to_le32(log->ppl_conf->signature); 871 pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE)); 872 873 if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset, 874 PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_FUA, 0, 875 false)) { 876 md_error(rdev->mddev, rdev); 877 ret = -EIO; 878 } 879 880 __free_page(page); 881 return ret; 882 } 883 884 static int ppl_load_distributed(struct ppl_log *log) 885 { 886 struct ppl_conf *ppl_conf = log->ppl_conf; 887 struct md_rdev *rdev = log->rdev; 888 struct mddev *mddev = rdev->mddev; 889 struct page *page; 890 struct ppl_header *pplhdr; 891 u32 crc, crc_stored; 892 u32 signature; 893 int ret = 0; 894 895 pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk); 896 897 /* read PPL header */ 898 page = alloc_page(GFP_KERNEL); 899 if (!page) 900 return -ENOMEM; 901 902 if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset, 903 PAGE_SIZE, page, REQ_OP_READ, 0, false)) { 904 md_error(mddev, rdev); 905 ret = -EIO; 906 goto out; 907 } 908 pplhdr = page_address(page); 909 910 /* check header validity */ 911 crc_stored = le32_to_cpu(pplhdr->checksum); 912 pplhdr->checksum = 0; 913 crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE); 914 915 if (crc_stored != crc) { 916 pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x\n", 917 __func__, crc_stored, crc); 918 ppl_conf->mismatch_count++; 919 goto out; 920 } 921 922 signature = le32_to_cpu(pplhdr->signature); 923 924 if (mddev->external) { 925 /* 926 * For external metadata the header signature is set and 927 * validated in userspace. 928 */ 929 ppl_conf->signature = signature; 930 } else if (ppl_conf->signature != signature) { 931 pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x\n", 932 __func__, signature, ppl_conf->signature); 933 ppl_conf->mismatch_count++; 934 goto out; 935 } 936 937 /* attempt to recover from log if we are starting a dirty array */ 938 if (!mddev->pers && mddev->recovery_cp != MaxSector) 939 ret = ppl_recover(log, pplhdr); 940 out: 941 /* write empty header if we are starting the array */ 942 if (!ret && !mddev->pers) 943 ret = ppl_write_empty_header(log); 944 945 __free_page(page); 946 947 pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n", 948 __func__, ret, ppl_conf->mismatch_count, 949 ppl_conf->recovered_entries); 950 return ret; 951 } 952 953 static int ppl_load(struct ppl_conf *ppl_conf) 954 { 955 int ret = 0; 956 u32 signature = 0; 957 bool signature_set = false; 958 int i; 959 960 for (i = 0; i < ppl_conf->count; i++) { 961 struct ppl_log *log = &ppl_conf->child_logs[i]; 962 963 /* skip missing drive */ 964 if (!log->rdev) 965 continue; 966 967 ret = ppl_load_distributed(log); 968 if (ret) 969 break; 970 971 /* 972 * For external metadata we can't check if the signature is 973 * correct on a single drive, but we can check if it is the same 974 * on all drives. 975 */ 976 if (ppl_conf->mddev->external) { 977 if (!signature_set) { 978 signature = ppl_conf->signature; 979 signature_set = true; 980 } else if (signature != ppl_conf->signature) { 981 pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n", 982 mdname(ppl_conf->mddev)); 983 ret = -EINVAL; 984 break; 985 } 986 } 987 } 988 989 pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n", 990 __func__, ret, ppl_conf->mismatch_count, 991 ppl_conf->recovered_entries); 992 return ret; 993 } 994 995 static void __ppl_exit_log(struct ppl_conf *ppl_conf) 996 { 997 clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags); 998 999 kfree(ppl_conf->child_logs); 1000 1001 mempool_destroy(ppl_conf->meta_pool); 1002 if (ppl_conf->bs) 1003 bioset_free(ppl_conf->bs); 1004 mempool_destroy(ppl_conf->io_pool); 1005 kmem_cache_destroy(ppl_conf->io_kc); 1006 1007 kfree(ppl_conf); 1008 } 1009 1010 void ppl_exit_log(struct r5conf *conf) 1011 { 1012 struct ppl_conf *ppl_conf = conf->log_private; 1013 1014 if (ppl_conf) { 1015 __ppl_exit_log(ppl_conf); 1016 conf->log_private = NULL; 1017 } 1018 } 1019 1020 static int ppl_validate_rdev(struct md_rdev *rdev) 1021 { 1022 char b[BDEVNAME_SIZE]; 1023 int ppl_data_sectors; 1024 int ppl_size_new; 1025 1026 /* 1027 * The configured PPL size must be enough to store 1028 * the header and (at the very least) partial parity 1029 * for one stripe. Round it down to ensure the data 1030 * space is cleanly divisible by stripe size. 1031 */ 1032 ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9); 1033 1034 if (ppl_data_sectors > 0) 1035 ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS); 1036 1037 if (ppl_data_sectors <= 0) { 1038 pr_warn("md/raid:%s: PPL space too small on %s\n", 1039 mdname(rdev->mddev), bdevname(rdev->bdev, b)); 1040 return -ENOSPC; 1041 } 1042 1043 ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9); 1044 1045 if ((rdev->ppl.sector < rdev->data_offset && 1046 rdev->ppl.sector + ppl_size_new > rdev->data_offset) || 1047 (rdev->ppl.sector >= rdev->data_offset && 1048 rdev->data_offset + rdev->sectors > rdev->ppl.sector)) { 1049 pr_warn("md/raid:%s: PPL space overlaps with data on %s\n", 1050 mdname(rdev->mddev), bdevname(rdev->bdev, b)); 1051 return -EINVAL; 1052 } 1053 1054 if (!rdev->mddev->external && 1055 ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) || 1056 (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) { 1057 pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n", 1058 mdname(rdev->mddev), bdevname(rdev->bdev, b)); 1059 return -EINVAL; 1060 } 1061 1062 rdev->ppl.size = ppl_size_new; 1063 1064 return 0; 1065 } 1066 1067 int ppl_init_log(struct r5conf *conf) 1068 { 1069 struct ppl_conf *ppl_conf; 1070 struct mddev *mddev = conf->mddev; 1071 int ret = 0; 1072 int i; 1073 bool need_cache_flush; 1074 1075 pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n", 1076 mdname(conf->mddev)); 1077 1078 if (PAGE_SIZE != 4096) 1079 return -EINVAL; 1080 1081 if (mddev->level != 5) { 1082 pr_warn("md/raid:%s PPL is not compatible with raid level %d\n", 1083 mdname(mddev), mddev->level); 1084 return -EINVAL; 1085 } 1086 1087 if (mddev->bitmap_info.file || mddev->bitmap_info.offset) { 1088 pr_warn("md/raid:%s PPL is not compatible with bitmap\n", 1089 mdname(mddev)); 1090 return -EINVAL; 1091 } 1092 1093 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 1094 pr_warn("md/raid:%s PPL is not compatible with journal\n", 1095 mdname(mddev)); 1096 return -EINVAL; 1097 } 1098 1099 ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL); 1100 if (!ppl_conf) 1101 return -ENOMEM; 1102 1103 ppl_conf->mddev = mddev; 1104 1105 ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0); 1106 if (!ppl_conf->io_kc) { 1107 ret = -EINVAL; 1108 goto err; 1109 } 1110 1111 ppl_conf->io_pool = mempool_create_slab_pool(conf->raid_disks, ppl_conf->io_kc); 1112 if (!ppl_conf->io_pool) { 1113 ret = -EINVAL; 1114 goto err; 1115 } 1116 1117 ppl_conf->bs = bioset_create(conf->raid_disks, 0); 1118 if (!ppl_conf->bs) { 1119 ret = -EINVAL; 1120 goto err; 1121 } 1122 1123 ppl_conf->meta_pool = mempool_create_page_pool(conf->raid_disks, 0); 1124 if (!ppl_conf->meta_pool) { 1125 ret = -EINVAL; 1126 goto err; 1127 } 1128 1129 ppl_conf->count = conf->raid_disks; 1130 ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log), 1131 GFP_KERNEL); 1132 if (!ppl_conf->child_logs) { 1133 ret = -ENOMEM; 1134 goto err; 1135 } 1136 1137 atomic64_set(&ppl_conf->seq, 0); 1138 1139 if (!mddev->external) { 1140 ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); 1141 ppl_conf->block_size = 512; 1142 } else { 1143 ppl_conf->block_size = queue_logical_block_size(mddev->queue); 1144 } 1145 1146 for (i = 0; i < ppl_conf->count; i++) { 1147 struct ppl_log *log = &ppl_conf->child_logs[i]; 1148 struct md_rdev *rdev = conf->disks[i].rdev; 1149 1150 mutex_init(&log->io_mutex); 1151 spin_lock_init(&log->io_list_lock); 1152 INIT_LIST_HEAD(&log->io_list); 1153 INIT_LIST_HEAD(&log->no_mem_stripes); 1154 1155 log->ppl_conf = ppl_conf; 1156 log->rdev = rdev; 1157 1158 if (rdev) { 1159 struct request_queue *q; 1160 1161 ret = ppl_validate_rdev(rdev); 1162 if (ret) 1163 goto err; 1164 1165 q = bdev_get_queue(rdev->bdev); 1166 if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) 1167 need_cache_flush = true; 1168 } 1169 } 1170 1171 if (need_cache_flush) 1172 pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n", 1173 mdname(mddev)); 1174 1175 /* load and possibly recover the logs from the member disks */ 1176 ret = ppl_load(ppl_conf); 1177 1178 if (ret) { 1179 goto err; 1180 } else if (!mddev->pers && 1181 mddev->recovery_cp == 0 && !mddev->degraded && 1182 ppl_conf->recovered_entries > 0 && 1183 ppl_conf->mismatch_count == 0) { 1184 /* 1185 * If we are starting a dirty array and the recovery succeeds 1186 * without any issues, set the array as clean. 1187 */ 1188 mddev->recovery_cp = MaxSector; 1189 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 1190 } 1191 1192 conf->log_private = ppl_conf; 1193 1194 return 0; 1195 err: 1196 __ppl_exit_log(ppl_conf); 1197 return ret; 1198 } 1199 1200 int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add) 1201 { 1202 struct ppl_conf *ppl_conf = conf->log_private; 1203 struct ppl_log *log; 1204 int ret = 0; 1205 char b[BDEVNAME_SIZE]; 1206 1207 if (!rdev) 1208 return -EINVAL; 1209 1210 pr_debug("%s: disk: %d operation: %s dev: %s\n", 1211 __func__, rdev->raid_disk, add ? "add" : "remove", 1212 bdevname(rdev->bdev, b)); 1213 1214 if (rdev->raid_disk < 0) 1215 return 0; 1216 1217 if (rdev->raid_disk >= ppl_conf->count) 1218 return -ENODEV; 1219 1220 log = &ppl_conf->child_logs[rdev->raid_disk]; 1221 1222 mutex_lock(&log->io_mutex); 1223 if (add) { 1224 ret = ppl_validate_rdev(rdev); 1225 if (!ret) { 1226 log->rdev = rdev; 1227 ret = ppl_write_empty_header(log); 1228 } 1229 } else { 1230 log->rdev = NULL; 1231 } 1232 mutex_unlock(&log->io_mutex); 1233 1234 return ret; 1235 } 1236