/*
 * Partial Parity Log for closing the RAID5 write hole
 * Copyright (c) 2017, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/flex_array.h>
#include <linux/async_tx.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "raid5.h"

/*
 * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
 * partial parity data. The header contains an array of entries
 * (struct ppl_header_entry) which describe the logged write requests.
 * Partial parity for the entries comes after the header, written in the same
 * sequence as the entries:
 *
 * Header
 *   entry0
 *   ...
 *   entryN
 * PP data
 *   PP for entry0
 *   ...
 *   PP for entryN
 *
 * An entry describes one or more consecutive stripe_heads, up to a full
 * stripe. The modified raid data chunks form an m-by-n matrix, where m is the
 * number of stripe_heads in the entry and n is the number of modified data
 * disks. Every stripe_head in the entry must write to the same data disks.
 * An example of a valid case described by a single entry (writes to the first
 * stripe of a 4 disk array, 16k chunk size):
 *
 * sh->sector   dd0   dd1   dd2    ppl
 *            +-----+-----+-----+
 *           0| --- | --- | --- | +----+
 *           8| -W- | -W- | --- | | pp |   data_sector = 8
 *          16| -W- | -W- | --- | | pp |   data_size = 3 * 2 * 4k
 *          24| -W- | -W- | --- | | pp |   pp_size = 3 * 4k
 *            +-----+-----+-----+ +----+
 *
 * data_sector is the first raid sector of the modified data, data_size is the
 * total size of modified data and pp_size is the size of partial parity for
 * this entry. Entries for full stripe writes contain no partial parity
 * (pp_size = 0); they only mark the stripes for which parity should be
 * recalculated after an unclean shutdown. Every entry holds a checksum of its
 * partial parity, and the header also has a checksum of the header itself.
 *
 * A write request is always logged to the PPL instance stored on the parity
 * disk of the corresponding stripe. For each member disk there is one ppl_log
 * used to handle logging for this disk, independently from others. They are
 * grouped in the child_logs array in struct ppl_conf, which is assigned to
 * r5conf->log_private.
 *
 * ppl_io_unit represents a full PPL write; its header_page contains the
 * ppl_header. PPL entries for logged stripes are added in ppl_log_stripe(). A
 * stripe_head can be appended to the last entry if it meets the conditions for
 * a valid entry described above, otherwise a new entry is added. Checksums of
 * entries are calculated incrementally as stripes containing partial parity
 * are being added. ppl_submit_iounit() calculates the checksum of the header
 * and submits a bio containing the header page and partial parity pages
 * (sh->ppl_page) for all stripes of the io_unit. When the PPL write completes,
 * the stripes associated with the io_unit are released and raid5d starts
 * writing their data and parity. When all stripes are written, the io_unit is
 * freed and the next one can be submitted.
 *
 * An io_unit is used to gather stripes until it is submitted or becomes full
 * (if the maximum number of entries or size of PPL is reached). Another
 * io_unit can't be submitted until the previous one has completed (PPL and
 * stripe data+parity is written). The log->io_list tracks all io_units of a
 * log (for a single member disk). New io_units are added to the end of the
 * list and the first io_unit is submitted, if it is not submitted already.
 * The current io_unit accepting new stripes is always at the end of the list.
 */
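
/*
 * For reference, a rough sketch of the on-disk format manipulated throughout
 * this file. Only the fields this file actually touches are listed; see
 * linux/raid/md_p.h (included above) for the authoritative layout, exact
 * field order and padding:
 *
 *	struct ppl_header_entry {
 *		__le64 data_sector;   - first raid sector of the modified data
 *		__le32 pp_size;       - length of partial parity for the entry
 *		__le32 data_size;     - total length of the modified data
 *		__le32 parity_disk;   - member disk holding the stripe parity
 *		__le32 checksum;      - ~crc32c of the partial parity data
 *	};
 *
 *	struct ppl_header {
 *		__u8   reserved[PPL_HDR_RESERVED];   - filled with 0xff
 *		__le32 signature;     - raid array identifier
 *		__le64 generation;    - log write sequence number
 *		__le32 entries_count;
 *		__le32 checksum;      - ~crc32c of the 4KB header
 *		struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
 *	};
 */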

struct ppl_conf {
	struct mddev *mddev;

	/* array of child logs, one for each raid disk */
	struct ppl_log *child_logs;
	int count;

	int block_size;		/* the logical block size used for data_sector
				 * in ppl_header_entry */
	u32 signature;		/* raid array identifier */
	atomic64_t seq;		/* current log write sequence number */

	struct kmem_cache *io_kc;
	mempool_t *io_pool;
	struct bio_set *bs;
	mempool_t *meta_pool;

	/* used only for recovery */
	int recovered_entries;
	int mismatch_count;
};

struct ppl_log {
	struct ppl_conf *ppl_conf;	/* shared between all log instances */

	struct md_rdev *rdev;		/* array member disk associated with
					 * this log instance */
	struct mutex io_mutex;
	struct ppl_io_unit *current_io;	/* current io_unit accepting new data
					 * always at the end of io_list */
	spinlock_t io_list_lock;
	struct list_head io_list;	/* all io_units of this log */
	struct list_head no_mem_stripes;/* stripes to retry if failed to
					 * allocate io_unit */
};

#define PPL_IO_INLINE_BVECS 32

struct ppl_io_unit {
	struct ppl_log *log;

	struct page *header_page;	/* for ppl_header */

	unsigned int entries_count;	/* number of entries in ppl_header */
	unsigned int pp_size;		/* current total size of partial parity */

	u64 seq;			/* sequence number of this log write */
	struct list_head log_sibling;	/* log->io_list */

	struct list_head stripe_list;	/* stripes added to the io_unit */
	atomic_t pending_stripes;	/* how many stripes not written to raid */

	bool submitted;			/* true if write to log started */

	/* inline bio and its biovec for submitting the iounit */
	struct bio bio;
	struct bio_vec biovec[PPL_IO_INLINE_BVECS];
};

struct dma_async_tx_descriptor *
ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
		       struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	/*
	 * Partial parity is the XOR of stripe data chunks that are not changed
	 * during the write request. Depending on available data
	 * (read-modify-write vs. reconstruct-write case) we calculate it
	 * differently.
	 */
	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		/* rmw: xor old data and parity from updated disks */
		for (i = disks; i--;) {
			struct r5dev *dev = &sh->dev[i];
			if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx)
				xor_srcs[count++] = dev->page;
		}
	} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
		/* rcw: xor data from all not updated disks */
		for (i = disks; i--;) {
			struct r5dev *dev = &sh->dev[i];
			if (test_bit(R5_UPTODATE, &dev->flags))
				xor_srcs[count++] = dev->page;
		}
	} else {
		return tx;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
			  NULL, sh, flex_array_get(percpu->scribble, 0)
			  + sizeof(struct page *) * (sh->disks + 2));

	if (count == 1)
		tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
				  &submit);
	else
		tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE,
			       &submit);

	return tx;
}
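
/*
 * A note on why the two branches in ops_run_partial_parity() produce the
 * same result: RAID5 parity is the XOR of all data chunks, so for a
 * read-modify-write the old parity xored with the old data of the disks
 * being rewritten cancels those terms and leaves the XOR of the chunks that
 * are not modified - exactly what the reconstruct-write branch reads
 * directly from the not-updated (R5_UPTODATE) disks. For example, in a 3+1
 * array where d0 and d1 are rewritten:
 *
 *	rmw: parity ^ d0 ^ d1 = (d0 ^ d1 ^ d2) ^ d0 ^ d1 = d2
 *	rcw: d2 (read from the unmodified disk)          = d2
 *
 * With a single source the XOR degenerates to a copy, hence the
 * async_memcpy() path when count == 1.
 */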

static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
					  struct stripe_head *sh)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct ppl_io_unit *io;
	struct ppl_header *pplhdr;

	io = mempool_alloc(ppl_conf->io_pool, GFP_ATOMIC);
	if (!io)
		return NULL;

	memset(io, 0, sizeof(*io));
	io->log = log;
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	atomic_set(&io->pending_stripes, 0);
	bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);

	io->header_page = mempool_alloc(ppl_conf->meta_pool, GFP_NOIO);
	pplhdr = page_address(io->header_page);
	clear_page(pplhdr);
	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
	pplhdr->signature = cpu_to_le32(ppl_conf->signature);

	io->seq = atomic64_add_return(1, &ppl_conf->seq);
	pplhdr->generation = cpu_to_le64(io->seq);

	return io;
}

static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
{
	struct ppl_io_unit *io = log->current_io;
	struct ppl_header_entry *e = NULL;
	struct ppl_header *pplhdr;
	int i;
	sector_t data_sector = 0;
	int data_disks = 0;
	unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE;
	struct r5conf *conf = sh->raid_conf;

	pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);

	/* check if current io_unit is full */
	if (io && (io->pp_size == entry_space ||
		   io->entries_count == PPL_HDR_MAX_ENTRIES)) {
		pr_debug("%s: add io_unit blocked by seq: %llu\n",
			 __func__, io->seq);
		io = NULL;
	}

	/* add a new unit if there is none or the current is full */
	if (!io) {
		io = ppl_new_iounit(log, sh);
		if (!io)
			return -ENOMEM;
		spin_lock_irq(&log->io_list_lock);
		list_add_tail(&io->log_sibling, &log->io_list);
		spin_unlock_irq(&log->io_list_lock);

		log->current_io = io;
	}

	for (i = 0; i < sh->disks; i++) {
		struct r5dev *dev = &sh->dev[i];

		if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) {
			if (!data_disks || dev->sector < data_sector)
				data_sector = dev->sector;
			data_disks++;
		}
	}
	BUG_ON(!data_disks);

	pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__,
		 io->seq, (unsigned long long)data_sector, data_disks);

	pplhdr = page_address(io->header_page);

	if (io->entries_count > 0) {
		struct ppl_header_entry *last =
				&pplhdr->entries[io->entries_count - 1];
		struct stripe_head *sh_last = list_last_entry(
				&io->stripe_list, struct stripe_head, log_list);
		u64 data_sector_last = le64_to_cpu(last->data_sector);
		u32 data_size_last = le32_to_cpu(last->data_size);

		/*
		 * Check if we can append the stripe to the last entry. It must
		 * be just after the last logged stripe and write to the same
		 * disks. Use bit shift and logarithm to avoid 64-bit division.
		 */
		if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
		    (data_sector >> ilog2(conf->chunk_sectors) ==
		     data_sector_last >> ilog2(conf->chunk_sectors)) &&
		    ((data_sector - data_sector_last) * data_disks ==
		     data_size_last >> 9))
			e = last;
	}

	if (!e) {
		e = &pplhdr->entries[io->entries_count++];
		e->data_sector = cpu_to_le64(data_sector);
		e->parity_disk = cpu_to_le32(sh->pd_idx);
		e->checksum = cpu_to_le32(~0);
	}

	le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT);

	/* don't write any PP if full stripe write */
	if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
		le32_add_cpu(&e->pp_size, PAGE_SIZE);
		io->pp_size += PAGE_SIZE;
		e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
						    page_address(sh->ppl_page),
						    PAGE_SIZE));
	}

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripes);
	sh->ppl_io = io;

	return 0;
}
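
/*
 * A worked example of the append check in ppl_log_stripe(), matching the
 * diagram at the top of this file (4 disk array, 16k chunk, so
 * chunk_sectors = 32 and STRIPE_SECTORS = 8): the last entry logged one
 * stripe_head at data_sector_last = 8 writing two data disks, so
 * data_size_last = 2 * 4k (16 sectors). A new stripe_head at
 * sh_last->sector + 8 with data_sector = 16 and data_disks = 2 is appended
 * because 16 >> ilog2(32) == 8 >> ilog2(32) (same chunk) and
 * (16 - 8) * 2 == 16 == data_size_last >> 9.
 *
 * The entry checksum is seeded with ~0 and chained with crc32c_le() for
 * every partial parity page added; ppl_submit_iounit() flips the bits at
 * the end, which yields the usual ~crc32c over the concatenated pages.
 */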

int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_io_unit *io = sh->ppl_io;
	struct ppl_log *log;

	if (io || test_bit(STRIPE_SYNCING, &sh->state) ||
	    !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	log = &ppl_conf->child_logs[sh->pd_idx];

	mutex_lock(&log->io_mutex);

	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
		mutex_unlock(&log->io_mutex);
		return -EAGAIN;
	}

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	if (ppl_log_stripe(log, sh)) {
		spin_lock_irq(&log->io_list_lock);
		list_add_tail(&sh->log_list, &log->no_mem_stripes);
		spin_unlock_irq(&log->io_list_lock);
	}

	mutex_unlock(&log->io_mutex);

	return 0;
}

static void ppl_log_endio(struct bio *bio)
{
	struct ppl_io_unit *io = bio->bi_private;
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct stripe_head *sh, *next;

	pr_debug("%s: seq: %llu\n", __func__, io->seq);

	if (bio->bi_error)
		md_error(ppl_conf->mddev, log->rdev);

	mempool_free(io->header_page, ppl_conf->meta_pool);

	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);

		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
{
	char b[BDEVNAME_SIZE];

	pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n",
		 __func__, io->seq, bio->bi_iter.bi_size,
		 (unsigned long long)bio->bi_iter.bi_sector,
		 bdevname(bio->bi_bdev, b));

	submit_bio(bio);
}
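
/*
 * Finalize and submit an io_unit: convert each entry's data_sector to the
 * configured logical block size, flip the entry checksums into their final
 * ~crc32c form, fill in entries_count and the header checksum, then write
 * the header page followed by the partial parity pages as a single FUA
 * write at log->rdev->ppl.sector. If the inline biovec table fills up, the
 * remaining pages are sent in chained bios.
 */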

static void ppl_submit_iounit(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct ppl_header *pplhdr = page_address(io->header_page);
	struct bio *bio = &io->bio;
	struct stripe_head *sh;
	int i;

	for (i = 0; i < io->entries_count; i++) {
		struct ppl_header_entry *e = &pplhdr->entries[i];

		pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n",
			 __func__, io->seq, i, le64_to_cpu(e->data_sector),
			 le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size));

		e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >>
					     ilog2(ppl_conf->block_size >> 9));
		e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum));
	}

	pplhdr->entries_count = cpu_to_le32(io->entries_count);
	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));

	bio->bi_private = io;
	bio->bi_end_io = ppl_log_endio;
	bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->rdev->ppl.sector;
	bio_add_page(bio, io->header_page, PAGE_SIZE, 0);

	list_for_each_entry(sh, &io->stripe_list, log_list) {
		/* entries for full stripe writes have no partial parity */
		if (test_bit(STRIPE_FULL_WRITE, &sh->state))
			continue;

		if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
			struct bio *prev = bio;

			bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
					       ppl_conf->bs);
			bio->bi_opf = prev->bi_opf;
			bio->bi_bdev = prev->bi_bdev;
			bio->bi_iter.bi_sector = bio_end_sector(prev);
			bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);

			bio_chain(bio, prev);
			ppl_submit_iounit_bio(io, prev);
		}
	}

	ppl_submit_iounit_bio(io, bio);
}

static void ppl_submit_current_io(struct ppl_log *log)
{
	struct ppl_io_unit *io;

	spin_lock_irq(&log->io_list_lock);

	io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
				      log_sibling);
	if (io && io->submitted)
		io = NULL;

	spin_unlock_irq(&log->io_list_lock);

	if (io) {
		io->submitted = true;

		if (io == log->current_io)
			log->current_io = NULL;

		ppl_submit_iounit(io);
	}
}

void ppl_write_stripe_run(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_log *log;
	int i;

	for (i = 0; i < ppl_conf->count; i++) {
		log = &ppl_conf->child_logs[i];

		mutex_lock(&log->io_mutex);
		ppl_submit_current_io(log);
		mutex_unlock(&log->io_mutex);
	}
}

static void ppl_io_unit_finished(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	unsigned long flags;

	pr_debug("%s: seq: %llu\n", __func__, io->seq);

	spin_lock_irqsave(&log->io_list_lock, flags);

	list_del(&io->log_sibling);
	mempool_free(io, log->ppl_conf->io_pool);

	if (!list_empty(&log->no_mem_stripes)) {
		struct stripe_head *sh = list_first_entry(&log->no_mem_stripes,
							  struct stripe_head,
							  log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}

	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

void ppl_stripe_write_finished(struct stripe_head *sh)
{
	struct ppl_io_unit *io;

	io = sh->ppl_io;
	sh->ppl_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripes))
		ppl_io_unit_finished(io);
}
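
/*
 * Synchronous in-place XOR helper used by the recovery code below: after
 * this call page1 contains page1 ^ page2 for the first 'size' bytes.
 * async_tx is used so a DMA/XOR offload engine can do the work if one is
 * available, and async_tx_quiesce() waits for the result before returning.
 */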

static void ppl_xor(int size, struct page *page1, struct page *page2)
{
	struct async_submit_ctl submit;
	struct dma_async_tx_descriptor *tx;
	struct page *xor_srcs[] = { page1, page2 };

	init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
			  NULL, NULL, NULL, NULL);
	tx = async_xor(page1, xor_srcs, 0, 2, size, &submit);

	async_tx_quiesce(&tx);
}

/*
 * PPL recovery strategy: xor partial parity and data from all modified data
 * disks within a stripe and write the result as the new stripe parity. If all
 * stripe data disks are modified (full stripe write), no partial parity is
 * available, so just xor the data disks.
 *
 * Recovery of a PPL entry shall occur only if all modified data disks are
 * available and read from all of them succeeds.
 *
 * A PPL entry applies to a stripe; the partial parity size for an entry is at
 * most the size of the chunk. Examples of possible cases for a single entry:
 *
 * case 0: single data disk write:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = data_size
 *
 * case 1: more than one data disk write:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = data_size / modified_data_disks
 *
 * case 2: write to all data disks (also full stripe write):
 *   data0    data1    data2                parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ |           | (no change)        |
 * | -data- | -data- | -data- | --------> | xor all data       |
 * | ------ | ------ | ------ | --------> | (no change)        |
 * | ------ | ------ | ------ |           | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = 0
 *
 * The following cases are possible only in other implementations. The recovery
 * code can handle them, but they are not generated at runtime because they can
 * be reduced to cases 0, 1 and 2:
 *
 * case 3:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+ +----+    +--------------------+
 * | ------ | -data- | -data- | | pp |    | data1 ^ data2 ^ pp |
 * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp |
 * | -data- | -data- | -data- | | -- | -> | xor all data       |
 * | -data- | -data- | ------ | | pp |    | data0 ^ data1 ^ pp |
 * +--------+--------+--------+ +----+    +--------------------+
 * pp_size = chunk_size
 *
 * case 4:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+ +----+    +--------------------+
 * | ------ | -data- | ------ | | pp |    | data1 ^ pp         |
 * | ------ | ------ | ------ | | -- | -> | (no change)        |
 * | ------ | ------ | ------ | | -- | -> | (no change)        |
 * | -data- | ------ | ------ | | pp |    | data0 ^ pp         |
 * +--------+--------+--------+ +----+    +--------------------+
 * pp_size = chunk_size
 */
static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
			     sector_t ppl_sector)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct mddev *mddev = ppl_conf->mddev;
	struct r5conf *conf = mddev->private;
	int block_size = ppl_conf->block_size;
	struct page *page1;
	struct page *page2;
	sector_t r_sector_first;
	sector_t r_sector_last;
	int strip_sectors;
	int data_disks;
	int i;
	int ret = 0;
	char b[BDEVNAME_SIZE];
	unsigned int pp_size = le32_to_cpu(e->pp_size);
	unsigned int data_size = le32_to_cpu(e->data_size);

	page1 = alloc_page(GFP_KERNEL);
	page2 = alloc_page(GFP_KERNEL);

	if (!page1 || !page2) {
		ret = -ENOMEM;
		goto out;
	}

	r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9);

	if ((pp_size >> 9) < conf->chunk_sectors) {
		if (pp_size > 0) {
			data_disks = data_size / pp_size;
			strip_sectors = pp_size >> 9;
		} else {
			data_disks = conf->raid_disks - conf->max_degraded;
			strip_sectors = (data_size >> 9) / data_disks;
		}
		r_sector_last = r_sector_first +
				(data_disks - 1) * conf->chunk_sectors +
				strip_sectors;
	} else {
		data_disks = conf->raid_disks - conf->max_degraded;
		strip_sectors = conf->chunk_sectors;
		r_sector_last = r_sector_first + (data_size >> 9);
	}

	pr_debug("%s: array sector first: %llu last: %llu\n", __func__,
		 (unsigned long long)r_sector_first,
		 (unsigned long long)r_sector_last);

	/* if start and end are 4k aligned, use a 4k block */
	if (block_size == 512 &&
	    (r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
	    (r_sector_last & (STRIPE_SECTORS - 1)) == 0)
		block_size = STRIPE_SIZE;

	/* iterate through blocks in strip */
	for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
		bool update_parity = false;
		sector_t parity_sector;
		struct md_rdev *parity_rdev;
		struct stripe_head sh;
		int disk;
		int indent = 0;

		pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i);
		indent += 2;

		memset(page_address(page1), 0, PAGE_SIZE);

		/* iterate through data member disks */
		for (disk = 0; disk < data_disks; disk++) {
			int dd_idx;
			struct md_rdev *rdev;
			sector_t sector;
			sector_t r_sector = r_sector_first + i +
					    (disk * conf->chunk_sectors);

			pr_debug("%s:%*s data member disk %d start\n",
				 __func__, indent, "", disk);
			indent += 2;

			if (r_sector >= r_sector_last) {
				pr_debug("%s:%*s array sector %llu doesn't need parity update\n",
					 __func__, indent, "",
					 (unsigned long long)r_sector);
				indent -= 2;
				continue;
			}

			update_parity = true;

			/* map raid sector to member disk */
			sector = raid5_compute_sector(conf, r_sector, 0,
						      &dd_idx, NULL);
			pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n",
				 __func__, indent, "",
				 (unsigned long long)r_sector, dd_idx,
				 (unsigned long long)sector);

			rdev = conf->disks[dd_idx].rdev;
			if (!rdev) {
				pr_debug("%s:%*s data member disk %d missing\n",
					 __func__, indent, "", dd_idx);
				update_parity = false;
				break;
			}

			pr_debug("%s:%*s reading data member disk %s sector %llu\n",
				 __func__, indent, "", bdevname(rdev->bdev, b),
				 (unsigned long long)sector);
			if (!sync_page_io(rdev, sector, block_size, page2,
					  REQ_OP_READ, 0, false)) {
				md_error(mddev, rdev);
				pr_debug("%s:%*s read failed!\n", __func__,
					 indent, "");
				ret = -EIO;
				goto out;
			}

			ppl_xor(block_size, page1, page2);

			indent -= 2;
		}

		if (!update_parity)
			continue;

		if (pp_size > 0) {
			pr_debug("%s:%*s reading pp disk sector %llu\n",
				 __func__, indent, "",
				 (unsigned long long)(ppl_sector + i));
			if (!sync_page_io(log->rdev,
					ppl_sector - log->rdev->data_offset + i,
					block_size, page2, REQ_OP_READ, 0,
					false)) {
				pr_debug("%s:%*s read failed!\n", __func__,
					 indent, "");
				md_error(mddev, log->rdev);
				ret = -EIO;
				goto out;
			}

			ppl_xor(block_size, page1, page2);
		}

		/* map raid sector to parity disk */
		parity_sector = raid5_compute_sector(conf, r_sector_first + i,
						     0, &disk, &sh);
		BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
		parity_rdev = conf->disks[sh.pd_idx].rdev;

		BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
		pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
			 __func__, indent, "",
			 (unsigned long long)parity_sector,
			 bdevname(parity_rdev->bdev, b));
		if (!sync_page_io(parity_rdev, parity_sector, block_size,
				  page1, REQ_OP_WRITE, 0, false)) {
			pr_debug("%s:%*s parity write error!\n", __func__,
				 indent, "");
			md_error(mddev, parity_rdev);
			ret = -EIO;
			goto out;
		}
	}
out:
	if (page1)
		__free_page(page1);
	if (page2)
		__free_page(page2);
	return ret;
}
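
/*
 * Walk all entries of a loaded PPL header, verify the partial parity
 * checksum of each entry and, if it matches, rebuild the corresponding
 * parity blocks with ppl_recover_entry(). Entries with a bad checksum are
 * skipped (counted in mismatch_count) rather than failing the whole
 * recovery. The member disk cache is flushed at the end so the rewritten
 * parity is durable.
 */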

static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct md_rdev *rdev = log->rdev;
	struct mddev *mddev = rdev->mddev;
	sector_t ppl_sector = rdev->ppl.sector + (PPL_HEADER_SIZE >> 9);
	struct page *page;
	int i;
	int ret = 0;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	/* iterate through all PPL entries saved */
	for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) {
		struct ppl_header_entry *e = &pplhdr->entries[i];
		u32 pp_size = le32_to_cpu(e->pp_size);
		sector_t sector = ppl_sector;
		int ppl_entry_sectors = pp_size >> 9;
		u32 crc, crc_stored;

		pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n",
			 __func__, rdev->raid_disk, i,
			 (unsigned long long)ppl_sector, pp_size);

		crc = ~0;
		crc_stored = le32_to_cpu(e->checksum);

		/* read partial parity for this entry and calculate its checksum */
		while (pp_size) {
			int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size;

			if (!sync_page_io(rdev, sector - rdev->data_offset,
					  s, page, REQ_OP_READ, 0, false)) {
				md_error(mddev, rdev);
				ret = -EIO;
				goto out;
			}

			crc = crc32c_le(crc, page_address(page), s);

			pp_size -= s;
			sector += s >> 9;
		}

		crc = ~crc;

		if (crc != crc_stored) {
			/*
			 * Don't recover this entry if the checksum does not
			 * match, but keep going and try to recover other
			 * entries.
			 */
			pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
				 __func__, crc_stored, crc);
			ppl_conf->mismatch_count++;
		} else {
			ret = ppl_recover_entry(log, e, ppl_sector);
			if (ret)
				goto out;
			ppl_conf->recovered_entries++;
		}

		ppl_sector += ppl_entry_sectors;
	}

	/* flush the disk cache after recovery if necessary */
	ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL);
out:
	__free_page(page);
	return ret;
}

static int ppl_write_empty_header(struct ppl_log *log)
{
	struct page *page;
	struct ppl_header *pplhdr;
	struct md_rdev *rdev = log->rdev;
	int ret = 0;

	pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__,
		 rdev->raid_disk, (unsigned long long)rdev->ppl.sector);

	page = alloc_page(GFP_NOIO | __GFP_ZERO);
	if (!page)
		return -ENOMEM;

	pplhdr = page_address(page);
	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
	pplhdr->signature = cpu_to_le32(log->ppl_conf->signature);
	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));

	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
			  PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_FUA, 0,
			  false)) {
		md_error(rdev->mddev, rdev);
		ret = -EIO;
	}

	__free_page(page);
	return ret;
}
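
/*
 * Note on the header checksum convention used above and verified below:
 * the checksum is ~crc32c over the whole 4KB header computed with the
 * checksum field itself set to zero. ppl_write_empty_header() relies on the
 * page being zero-initialized, and ppl_load_distributed() clears the stored
 * field before recalculating, so both sides agree.
 */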

static int ppl_load_distributed(struct ppl_log *log)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct md_rdev *rdev = log->rdev;
	struct mddev *mddev = rdev->mddev;
	struct page *page;
	struct ppl_header *pplhdr;
	u32 crc, crc_stored;
	u32 signature;
	int ret = 0;

	pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk);

	/* read PPL header */
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
			  PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
		md_error(mddev, rdev);
		ret = -EIO;
		goto out;
	}
	pplhdr = page_address(page);

	/* check header validity */
	crc_stored = le32_to_cpu(pplhdr->checksum);
	pplhdr->checksum = 0;
	crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);

	if (crc_stored != crc) {
		pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x\n",
			 __func__, crc_stored, crc);
		ppl_conf->mismatch_count++;
		goto out;
	}

	signature = le32_to_cpu(pplhdr->signature);

	if (mddev->external) {
		/*
		 * For external metadata the header signature is set and
		 * validated in userspace.
		 */
		ppl_conf->signature = signature;
	} else if (ppl_conf->signature != signature) {
		pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x\n",
			 __func__, signature, ppl_conf->signature);
		ppl_conf->mismatch_count++;
		goto out;
	}

	/* attempt to recover from log if we are starting a dirty array */
	if (!mddev->pers && mddev->recovery_cp != MaxSector)
		ret = ppl_recover(log, pplhdr);
out:
	/* write empty header if we are starting the array */
	if (!ret && !mddev->pers)
		ret = ppl_write_empty_header(log);

	__free_page(page);

	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
		 __func__, ret, ppl_conf->mismatch_count,
		 ppl_conf->recovered_entries);
	return ret;
}
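
/*
 * Load (and possibly recover from) the PPL of every member disk. For
 * external metadata the per-disk signatures can't be validated here, so
 * they are only cross-checked for consistency across all drives.
 */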

static int ppl_load(struct ppl_conf *ppl_conf)
{
	int ret = 0;
	u32 signature = 0;
	bool signature_set = false;
	int i;

	for (i = 0; i < ppl_conf->count; i++) {
		struct ppl_log *log = &ppl_conf->child_logs[i];

		/* skip missing drive */
		if (!log->rdev)
			continue;

		ret = ppl_load_distributed(log);
		if (ret)
			break;

		/*
		 * For external metadata we can't check if the signature is
		 * correct on a single drive, but we can check if it is the same
		 * on all drives.
		 */
		if (ppl_conf->mddev->external) {
			if (!signature_set) {
				signature = ppl_conf->signature;
				signature_set = true;
			} else if (signature != ppl_conf->signature) {
				pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n",
					mdname(ppl_conf->mddev));
				ret = -EINVAL;
				break;
			}
		}
	}

	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
		 __func__, ret, ppl_conf->mismatch_count,
		 ppl_conf->recovered_entries);
	return ret;
}

static void __ppl_exit_log(struct ppl_conf *ppl_conf)
{
	clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);

	kfree(ppl_conf->child_logs);

	mempool_destroy(ppl_conf->meta_pool);
	if (ppl_conf->bs)
		bioset_free(ppl_conf->bs);
	mempool_destroy(ppl_conf->io_pool);
	kmem_cache_destroy(ppl_conf->io_kc);

	kfree(ppl_conf);
}

void ppl_exit_log(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf = conf->log_private;

	if (ppl_conf) {
		__ppl_exit_log(ppl_conf);
		conf->log_private = NULL;
	}
}

static int ppl_validate_rdev(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];
	int ppl_data_sectors;
	int ppl_size_new;

	/*
	 * The configured PPL size must be enough to store
	 * the header and (at the very least) partial parity
	 * for one stripe. Round it down to ensure the data
	 * space is cleanly divisible by stripe size.
	 */
	ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);

	if (ppl_data_sectors > 0)
		ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);

	if (ppl_data_sectors <= 0) {
		pr_warn("md/raid:%s: PPL space too small on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -ENOSPC;
	}

	ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9);

	if ((rdev->ppl.sector < rdev->data_offset &&
	     rdev->ppl.sector + ppl_size_new > rdev->data_offset) ||
	    (rdev->ppl.sector >= rdev->data_offset &&
	     rdev->data_offset + rdev->sectors > rdev->ppl.sector)) {
		pr_warn("md/raid:%s: PPL space overlaps with data on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -EINVAL;
	}

	if (!rdev->mddev->external &&
	    ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) ||
	     (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) {
		pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -EINVAL;
	}

	rdev->ppl.size = ppl_size_new;

	return 0;
}
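
/*
 * Example of the size validation above: with the minimal layout described
 * at the top of this file (a 4KB header plus 128KB of partial parity
 * space), rdev->ppl.size is 264 sectors. Subtracting the 8 header sectors
 * leaves 256 data sectors, already a multiple of STRIPE_SECTORS (8), so
 * nothing is rounded off and ppl.size stays 264.
 */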

int ppl_init_log(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf;
	struct mddev *mddev = conf->mddev;
	int ret = 0;
	int i;
	bool need_cache_flush = false;

	pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
		 mdname(conf->mddev));

	if (PAGE_SIZE != 4096)
		return -EINVAL;

	if (mddev->level != 5) {
		pr_warn("md/raid:%s PPL is not compatible with raid level %d\n",
			mdname(mddev), mddev->level);
		return -EINVAL;
	}

	if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
		pr_warn("md/raid:%s PPL is not compatible with bitmap\n",
			mdname(mddev));
		return -EINVAL;
	}

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		pr_warn("md/raid:%s PPL is not compatible with journal\n",
			mdname(mddev));
		return -EINVAL;
	}

	ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
	if (!ppl_conf)
		return -ENOMEM;

	ppl_conf->mddev = mddev;

	ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
	if (!ppl_conf->io_kc) {
		ret = -EINVAL;
		goto err;
	}

	ppl_conf->io_pool = mempool_create_slab_pool(conf->raid_disks, ppl_conf->io_kc);
	if (!ppl_conf->io_pool) {
		ret = -EINVAL;
		goto err;
	}

	ppl_conf->bs = bioset_create(conf->raid_disks, 0);
	if (!ppl_conf->bs) {
		ret = -EINVAL;
		goto err;
	}

	ppl_conf->meta_pool = mempool_create_page_pool(conf->raid_disks, 0);
	if (!ppl_conf->meta_pool) {
		ret = -EINVAL;
		goto err;
	}

	ppl_conf->count = conf->raid_disks;
	ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
				       GFP_KERNEL);
	if (!ppl_conf->child_logs) {
		ret = -ENOMEM;
		goto err;
	}

	atomic64_set(&ppl_conf->seq, 0);

	if (!mddev->external) {
		ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
		ppl_conf->block_size = 512;
	} else {
		ppl_conf->block_size = queue_logical_block_size(mddev->queue);
	}

	for (i = 0; i < ppl_conf->count; i++) {
		struct ppl_log *log = &ppl_conf->child_logs[i];
		struct md_rdev *rdev = conf->disks[i].rdev;

		mutex_init(&log->io_mutex);
		spin_lock_init(&log->io_list_lock);
		INIT_LIST_HEAD(&log->io_list);
		INIT_LIST_HEAD(&log->no_mem_stripes);

		log->ppl_conf = ppl_conf;
		log->rdev = rdev;

		if (rdev) {
			struct request_queue *q;

			ret = ppl_validate_rdev(rdev);
			if (ret)
				goto err;

			q = bdev_get_queue(rdev->bdev);
			if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
				need_cache_flush = true;
		}
	}

	if (need_cache_flush)
		pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
			mdname(mddev));

	/* load and possibly recover the logs from the member disks */
	ret = ppl_load(ppl_conf);

	if (ret) {
		goto err;
	} else if (!mddev->pers &&
		   mddev->recovery_cp == 0 && !mddev->degraded &&
		   ppl_conf->recovered_entries > 0 &&
		   ppl_conf->mismatch_count == 0) {
		/*
		 * If we are starting a dirty array and the recovery succeeds
		 * without any issues, set the array as clean.
		 */
		mddev->recovery_cp = MaxSector;
		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
	}

	conf->log_private = ppl_conf;

	return 0;
err:
	__ppl_exit_log(ppl_conf);
	return ret;
}