/*
 * Copyright (C) 2014 Facebook. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>

#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/uio.h>

#define DM_MSG_PREFIX "log-writes"

/*
 * This target will sequentially log all writes to the target device onto the
 * log device. This is helpful for replaying writes to check for fs consistency
 * at all times. This target provides a mechanism to mark specific events to
 * check data at a later time. So for example you would:
 *
 * write data
 * fsync
 * dmsetup message /dev/whatever mark mymark
 * unmount /mnt/test
 *
 * Then replay the log up to mymark and check the contents of the replay to
 * verify it matches what was written.
 *
 * We log writes only after they have been flushed; this makes the log describe
 * close to the order in which the data hits the actual disk, not its cache. So
 * for example the following sequence (W means write, C means complete):
 *
 * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
 *
 * would result in the log looking like this:
 *
 * c,a,flush,fuad,b,<other writes>,<next flush>
 *
 * This is meant to help expose problems where file systems do not properly wait
 * on data being written before invoking a FLUSH. FUA bypasses cache so once it
 * completes it is added to the log as it should be on disk.
 *
 * We treat DISCARDs as if they don't bypass cache so that they are logged in
 * order of completion along with the normal writes. If we didn't do it this
 * way we would process all the discards first and then write all the data, when
 * in fact we want to do the data and the discard in the order that they
 * completed.
 */
#define LOG_FLUSH_FLAG		(1 << 0)
#define LOG_FUA_FLAG		(1 << 1)
#define LOG_DISCARD_FLAG	(1 << 2)
#define LOG_MARK_FLAG		(1 << 3)

#define WRITE_LOG_VERSION 1ULL
#define WRITE_LOG_MAGIC 0x6a736677736872ULL

/*
 * The disk format for this is braindead simple.
 *
 * At byte 0 we have our super, followed by the following sequence for
 * nr_entries:
 *
 * [   1 sector    ][  entry->nr_sectors ]
 * [log_write_entry][    data written    ]
 *
 * The log_write_entry takes up a full sector so we can have arbitrary length
 * marks and it leaves us room for extra content in the future.
 */

/*
 * Basic info about the log for userspace.
 */
struct log_write_super {
	__le64 magic;
	__le64 version;
	__le64 nr_entries;
	__le32 sectorsize;
};

/*
 * sector - the sector we wrote.
 * nr_sectors - the number of sectors we wrote.
 * flags - flags for this log entry.
 * data_len - the size of the data in this log entry; this is for private log
 *            entry data, for example the MARK data provided by userspace.
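 *
 * Illustrative example only (not dictated by the format beyond what is stated
 * above): with a 512 byte sectorsize, a MARK of "mymark" is logged as one
 * sector holding this entry (flags = LOG_MARK_FLAG, data_len = 6) followed
 * immediately by the string, the rest of the sector zeroed.  A normal 4K
 * write is logged as one entry sector (nr_sectors = 8, data_len = 0) followed
 * by 8 sectors of copied data.  All fields are little-endian on disk.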
 */
struct log_write_entry {
	__le64 sector;
	__le64 nr_sectors;
	__le64 flags;
	__le64 data_len;
};

struct log_writes_c {
	struct dm_dev *dev;
	struct dm_dev *logdev;
	u64 logged_entries;
	u32 sectorsize;
	u32 sectorshift;
	atomic_t io_blocks;
	atomic_t pending_blocks;
	sector_t next_sector;
	sector_t end_sector;
	bool logging_enabled;
	bool device_supports_discard;
	spinlock_t blocks_lock;
	struct list_head unflushed_blocks;
	struct list_head logging_blocks;
	wait_queue_head_t wait;
	struct task_struct *log_kthread;
};

struct pending_block {
	int vec_cnt;
	u64 flags;
	sector_t sector;
	sector_t nr_sectors;
	char *data;
	u32 datalen;
	struct list_head list;
	struct bio_vec vecs[0];
};

struct per_bio_data {
	struct pending_block *block;
};

static inline sector_t bio_to_dev_sectors(struct log_writes_c *lc,
					  sector_t sectors)
{
	return sectors >> (lc->sectorshift - SECTOR_SHIFT);
}

static inline sector_t dev_to_bio_sectors(struct log_writes_c *lc,
					  sector_t sectors)
{
	return sectors << (lc->sectorshift - SECTOR_SHIFT);
}

static void put_pending_block(struct log_writes_c *lc)
{
	if (atomic_dec_and_test(&lc->pending_blocks)) {
		smp_mb__after_atomic();
		if (waitqueue_active(&lc->wait))
			wake_up(&lc->wait);
	}
}

static void put_io_block(struct log_writes_c *lc)
{
	if (atomic_dec_and_test(&lc->io_blocks)) {
		smp_mb__after_atomic();
		if (waitqueue_active(&lc->wait))
			wake_up(&lc->wait);
	}
}

static void log_end_io(struct bio *bio)
{
	struct log_writes_c *lc = bio->bi_private;

	if (bio->bi_status) {
		unsigned long flags;

		DMERR("Error writing log block, error=%d", bio->bi_status);
		spin_lock_irqsave(&lc->blocks_lock, flags);
		lc->logging_enabled = false;
		spin_unlock_irqrestore(&lc->blocks_lock, flags);
	}

	bio_free_pages(bio);
	put_io_block(lc);
	bio_put(bio);
}

/*
 * Meant to be called if there is an error, it will free all the pages
 * associated with the block.
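 *
 * (On the success path log_one_block() instead hands the pages to the log
 * bios, which are freed by log_end_io(), and frees the rest of the block
 * itself.)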
 */
static void free_pending_block(struct log_writes_c *lc,
			       struct pending_block *block)
{
	int i;

	for (i = 0; i < block->vec_cnt; i++) {
		if (block->vecs[i].bv_page)
			__free_page(block->vecs[i].bv_page);
	}
	kfree(block->data);
	kfree(block);
	put_pending_block(lc);
}

static int write_metadata(struct log_writes_c *lc, void *entry,
			  size_t entrylen, void *data, size_t datalen,
			  sector_t sector)
{
	struct bio *bio;
	struct page *page;
	void *ptr;
	size_t ret;

	bio = bio_alloc(GFP_KERNEL, 1);
	if (!bio) {
		DMERR("Couldn't alloc log bio");
		goto error;
	}
	bio->bi_iter.bi_size = 0;
	bio->bi_iter.bi_sector = sector;
	bio_set_dev(bio, lc->logdev->bdev);
	bio->bi_end_io = log_end_io;
	bio->bi_private = lc;
	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		DMERR("Couldn't alloc log page");
		bio_put(bio);
		goto error;
	}

	ptr = kmap_atomic(page);
	memcpy(ptr, entry, entrylen);
	if (datalen)
		memcpy(ptr + entrylen, data, datalen);
	memset(ptr + entrylen + datalen, 0,
	       lc->sectorsize - entrylen - datalen);
	kunmap_atomic(ptr);

	ret = bio_add_page(bio, page, lc->sectorsize, 0);
	if (ret != lc->sectorsize) {
		DMERR("Couldn't add page to the log block");
		goto error_bio;
	}
	submit_bio(bio);
	return 0;
error_bio:
	bio_put(bio);
	__free_page(page);
error:
	put_io_block(lc);
	return -1;
}

static int write_inline_data(struct log_writes_c *lc, void *entry,
			     size_t entrylen, void *data, size_t datalen,
			     sector_t sector)
{
	int num_pages, bio_pages, pg_datalen, pg_sectorlen, i;
	struct page *page;
	struct bio *bio;
	size_t ret;
	void *ptr;

	while (datalen) {
		num_pages = ALIGN(datalen, PAGE_SIZE) >> PAGE_SHIFT;
		bio_pages = min(num_pages, BIO_MAX_PAGES);

		atomic_inc(&lc->io_blocks);

		bio = bio_alloc(GFP_KERNEL, bio_pages);
		if (!bio) {
			DMERR("Couldn't alloc inline data bio");
			goto error;
		}

		bio->bi_iter.bi_size = 0;
		bio->bi_iter.bi_sector = sector;
		bio_set_dev(bio, lc->logdev->bdev);
		bio->bi_end_io = log_end_io;
		bio->bi_private = lc;
		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

		for (i = 0; i < bio_pages; i++) {
			pg_datalen = min_t(int, datalen, PAGE_SIZE);
			pg_sectorlen = ALIGN(pg_datalen, lc->sectorsize);

			page = alloc_page(GFP_KERNEL);
			if (!page) {
				DMERR("Couldn't alloc inline data page");
				goto error_bio;
			}

			ptr = kmap_atomic(page);
			memcpy(ptr, data, pg_datalen);
			if (pg_sectorlen > pg_datalen)
				memset(ptr + pg_datalen, 0, pg_sectorlen - pg_datalen);
			kunmap_atomic(ptr);

			ret = bio_add_page(bio, page, pg_sectorlen, 0);
			if (ret != pg_sectorlen) {
				DMERR("Couldn't add page of inline data");
				__free_page(page);
				goto error_bio;
			}

			datalen -= pg_datalen;
			data += pg_datalen;
		}
		submit_bio(bio);

		sector += bio_pages * PAGE_SECTORS;
	}
	return 0;
error_bio:
	bio_free_pages(bio);
	bio_put(bio);
error:
	put_io_block(lc);
	return -1;
}

static int log_one_block(struct log_writes_c *lc,
			 struct pending_block *block, sector_t sector)
{
	struct bio *bio;
	struct log_write_entry entry;
	size_t metadatalen, ret;
	int i;

	entry.sector = cpu_to_le64(block->sector);
	entry.nr_sectors =
		cpu_to_le64(block->nr_sectors);
	entry.flags = cpu_to_le64(block->flags);
	entry.data_len = cpu_to_le64(block->datalen);

	metadatalen = (block->flags & LOG_MARK_FLAG) ? block->datalen : 0;
	if (write_metadata(lc, &entry, sizeof(entry), block->data,
			   metadatalen, sector)) {
		free_pending_block(lc, block);
		return -1;
	}

	sector += dev_to_bio_sectors(lc, 1);

	if (block->datalen && metadatalen == 0) {
		if (write_inline_data(lc, &entry, sizeof(entry), block->data,
				      block->datalen, sector)) {
			free_pending_block(lc, block);
			return -1;
		}
		/* we don't support both inline data & bio data */
		goto out;
	}

	if (!block->vec_cnt)
		goto out;

	atomic_inc(&lc->io_blocks);
	bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES));
	if (!bio) {
		DMERR("Couldn't alloc log bio");
		goto error;
	}
	bio->bi_iter.bi_size = 0;
	bio->bi_iter.bi_sector = sector;
	bio_set_dev(bio, lc->logdev->bdev);
	bio->bi_end_io = log_end_io;
	bio->bi_private = lc;
	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

	for (i = 0; i < block->vec_cnt; i++) {
		/*
		 * The page offset is always 0 because we allocate a new page
		 * for every bvec in the original bio for simplicity's sake.
		 */
		ret = bio_add_page(bio, block->vecs[i].bv_page,
				   block->vecs[i].bv_len, 0);
		if (ret != block->vecs[i].bv_len) {
			atomic_inc(&lc->io_blocks);
			submit_bio(bio);
			bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt - i, BIO_MAX_PAGES));
			if (!bio) {
				DMERR("Couldn't alloc log bio");
				goto error;
			}
			bio->bi_iter.bi_size = 0;
			bio->bi_iter.bi_sector = sector;
			bio_set_dev(bio, lc->logdev->bdev);
			bio->bi_end_io = log_end_io;
			bio->bi_private = lc;
			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

			ret = bio_add_page(bio, block->vecs[i].bv_page,
					   block->vecs[i].bv_len, 0);
			if (ret != block->vecs[i].bv_len) {
				DMERR("Couldn't add page on new bio?");
				bio_put(bio);
				goto error;
			}
		}
		sector += block->vecs[i].bv_len >> SECTOR_SHIFT;
	}
	submit_bio(bio);
out:
	kfree(block->data);
	kfree(block);
	put_pending_block(lc);
	return 0;
error:
	free_pending_block(lc, block);
	put_io_block(lc);
	return -1;
}

static int log_super(struct log_writes_c *lc)
{
	struct log_write_super super;

	super.magic = cpu_to_le64(WRITE_LOG_MAGIC);
	super.version = cpu_to_le64(WRITE_LOG_VERSION);
	super.nr_entries = cpu_to_le64(lc->logged_entries);
	super.sectorsize = cpu_to_le32(lc->sectorsize);

	if (write_metadata(lc, &super, sizeof(super), NULL, 0, 0)) {
		DMERR("Couldn't write super");
		return -1;
	}

	return 0;
}

static inline sector_t logdev_last_sector(struct log_writes_c *lc)
{
	return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT;
}

static int log_writes_kthread(void *arg)
{
	struct log_writes_c *lc = (struct log_writes_c *)arg;
	sector_t sector = 0;

	while (!kthread_should_stop()) {
		bool super = false;
		bool logging_enabled;
		struct pending_block *block = NULL;
		int ret;

		spin_lock_irq(&lc->blocks_lock);
		if (!list_empty(&lc->logging_blocks)) {
			block = list_first_entry(&lc->logging_blocks,
						 struct pending_block, list);
			list_del_init(&block->list);
			if (!lc->logging_enabled)
				goto next;

			sector = lc->next_sector;
			if (!(block->flags & LOG_DISCARD_FLAG))
				lc->next_sector +=
					dev_to_bio_sectors(lc, block->nr_sectors);
			lc->next_sector += dev_to_bio_sectors(lc, 1);

			/*
			 * Apparently the size of the device may not be known
			 * right away, so handle this properly.
			 */
			if (!lc->end_sector)
				lc->end_sector = logdev_last_sector(lc);
			if (lc->end_sector &&
			    lc->next_sector >= lc->end_sector) {
				DMERR("Ran out of space on the logdev");
				lc->logging_enabled = false;
				goto next;
			}
			lc->logged_entries++;
			atomic_inc(&lc->io_blocks);

			super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG));
			if (super)
				atomic_inc(&lc->io_blocks);
		}
next:
		logging_enabled = lc->logging_enabled;
		spin_unlock_irq(&lc->blocks_lock);
		if (block) {
			if (logging_enabled) {
				ret = log_one_block(lc, block, sector);
				if (!ret && super)
					ret = log_super(lc);
				if (ret) {
					spin_lock_irq(&lc->blocks_lock);
					lc->logging_enabled = false;
					spin_unlock_irq(&lc->blocks_lock);
				}
			} else
				free_pending_block(lc, block);
			continue;
		}

		if (!try_to_freeze()) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (!kthread_should_stop() &&
			    list_empty(&lc->logging_blocks))
				schedule();
			__set_current_state(TASK_RUNNING);
		}
	}
	return 0;
}

/*
 * Construct a log-writes mapping:
 * log-writes <dev_path> <log_dev_path>
 */
static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct log_writes_c *lc;
	struct dm_arg_set as;
	const char *devname, *logdevname;
	int ret;

	as.argc = argc;
	as.argv = argv;

	if (argc < 2) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL);
	if (!lc) {
		ti->error = "Cannot allocate context";
		return -ENOMEM;
	}
	spin_lock_init(&lc->blocks_lock);
	INIT_LIST_HEAD(&lc->unflushed_blocks);
	INIT_LIST_HEAD(&lc->logging_blocks);
	init_waitqueue_head(&lc->wait);
	atomic_set(&lc->io_blocks, 0);
	atomic_set(&lc->pending_blocks, 0);

	devname = dm_shift_arg(&as);
	ret = dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev);
	if (ret) {
		ti->error = "Device lookup failed";
		goto bad;
	}

	logdevname = dm_shift_arg(&as);
	ret = dm_get_device(ti, logdevname, dm_table_get_mode(ti->table),
			    &lc->logdev);
	if (ret) {
		ti->error = "Log device lookup failed";
		dm_put_device(ti, lc->dev);
		goto bad;
	}

	lc->sectorsize = bdev_logical_block_size(lc->dev->bdev);
	lc->sectorshift = ilog2(lc->sectorsize);
	lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
	if (IS_ERR(lc->log_kthread)) {
		ret = PTR_ERR(lc->log_kthread);
		ti->error = "Couldn't alloc kthread";
		dm_put_device(ti, lc->dev);
		dm_put_device(ti, lc->logdev);
		goto bad;
	}

	/*
	 * next_sector is in 512b sectors to correspond to what bi_sector expects.
	 * The super starts at sector 0, and the next_sector is the next logical
	 * one based on the sectorsize of the device.
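	 *
	 * Worked example (assuming a 4096 byte logical block size): sectorshift
	 * is 12, the super occupies device sector 0 (512b sectors 0-7), and the
	 * first log entry therefore starts at 512b sector 8, which is exactly
	 * lc->sectorsize >> SECTOR_SHIFT.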
	 */
	lc->next_sector = lc->sectorsize >> SECTOR_SHIFT;
	lc->logging_enabled = true;
	lc->end_sector = logdev_last_sector(lc);
	lc->device_supports_discard = true;

	ti->num_flush_bios = 1;
	ti->flush_supported = true;
	ti->num_discard_bios = 1;
	ti->discards_supported = true;
	ti->per_io_data_size = sizeof(struct per_bio_data);
	ti->private = lc;
	return 0;

bad:
	kfree(lc);
	return ret;
}

static int log_mark(struct log_writes_c *lc, char *data)
{
	struct pending_block *block;
	size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry);

	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
	if (!block) {
		DMERR("Error allocating pending block");
		return -ENOMEM;
	}

	block->data = kstrndup(data, maxsize - 1, GFP_KERNEL);
	if (!block->data) {
		DMERR("Error copying mark data");
		kfree(block);
		return -ENOMEM;
	}
	atomic_inc(&lc->pending_blocks);
	block->datalen = strlen(block->data);
	block->flags |= LOG_MARK_FLAG;
	spin_lock_irq(&lc->blocks_lock);
	list_add_tail(&block->list, &lc->logging_blocks);
	spin_unlock_irq(&lc->blocks_lock);
	wake_up_process(lc->log_kthread);
	return 0;
}

static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes,
		   struct iov_iter *i)
{
	struct pending_block *block;

	if (!bytes)
		return 0;

	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
	if (!block) {
		DMERR("Error allocating dax pending block");
		return -ENOMEM;
	}

	block->data = kzalloc(bytes, GFP_KERNEL);
	if (!block->data) {
		DMERR("Error allocating dax data space");
		kfree(block);
		return -ENOMEM;
	}

	/* write data provided via the iterator */
	if (!copy_from_iter(block->data, bytes, i)) {
		DMERR("Error copying dax data");
		kfree(block->data);
		kfree(block);
		return -EIO;
	}

	/* rewind the iterator so that the block driver can use it */
	iov_iter_revert(i, bytes);

	block->datalen = bytes;
	block->sector = bio_to_dev_sectors(lc, sector);
	block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift;

	atomic_inc(&lc->pending_blocks);
	spin_lock_irq(&lc->blocks_lock);
	list_add_tail(&block->list, &lc->unflushed_blocks);
	spin_unlock_irq(&lc->blocks_lock);
	wake_up_process(lc->log_kthread);

	return 0;
}

static void log_writes_dtr(struct dm_target *ti)
{
	struct log_writes_c *lc = ti->private;

	spin_lock_irq(&lc->blocks_lock);
	list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks);
	spin_unlock_irq(&lc->blocks_lock);

	/*
	 * This is just nice to have since it'll update the super to include the
	 * unflushed blocks; if it fails we don't really care.
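	 *
	 * (The mark is queued like any other block; because MARK entries, like
	 * FUA, make the kthread rewrite the super afterwards, this is what gets
	 * the final nr_entries count onto the log device.)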
	 */
	log_mark(lc, "dm-log-writes-end");
	wake_up_process(lc->log_kthread);
	wait_event(lc->wait, !atomic_read(&lc->io_blocks) &&
		   !atomic_read(&lc->pending_blocks));
	kthread_stop(lc->log_kthread);

	WARN_ON(!list_empty(&lc->logging_blocks));
	WARN_ON(!list_empty(&lc->unflushed_blocks));
	dm_put_device(ti, lc->dev);
	dm_put_device(ti, lc->logdev);
	kfree(lc);
}

static void normal_map_bio(struct dm_target *ti, struct bio *bio)
{
	struct log_writes_c *lc = ti->private;

	bio_set_dev(bio, lc->dev->bdev);
}

static int log_writes_map(struct dm_target *ti, struct bio *bio)
{
	struct log_writes_c *lc = ti->private;
	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
	struct pending_block *block;
	struct bvec_iter iter;
	struct bio_vec bv;
	size_t alloc_size;
	int i = 0;
	bool flush_bio = (bio->bi_opf & REQ_PREFLUSH);
	bool fua_bio = (bio->bi_opf & REQ_FUA);
	bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD);

	pb->block = NULL;

	/* Don't bother doing anything if logging has been disabled */
	if (!lc->logging_enabled)
		goto map_bio;

	/*
	 * Map reads as normal.
	 */
	if (bio_data_dir(bio) == READ)
		goto map_bio;

	/* No sectors and not a flush? Don't care */
	if (!bio_sectors(bio) && !flush_bio)
		goto map_bio;

	/*
	 * Discards will have bi_size set but there's no actual data, so just
	 * allocate the size of the pending block.
	 */
	if (discard_bio)
		alloc_size = sizeof(struct pending_block);
	else
		alloc_size = sizeof(struct pending_block) + sizeof(struct bio_vec) * bio_segments(bio);

	block = kzalloc(alloc_size, GFP_NOIO);
	if (!block) {
		DMERR("Error allocating pending block");
		spin_lock_irq(&lc->blocks_lock);
		lc->logging_enabled = false;
		spin_unlock_irq(&lc->blocks_lock);
		return DM_MAPIO_KILL;
	}
	INIT_LIST_HEAD(&block->list);
	pb->block = block;
	atomic_inc(&lc->pending_blocks);

	if (flush_bio)
		block->flags |= LOG_FLUSH_FLAG;
	if (fua_bio)
		block->flags |= LOG_FUA_FLAG;
	if (discard_bio)
		block->flags |= LOG_DISCARD_FLAG;

	block->sector = bio_to_dev_sectors(lc, bio->bi_iter.bi_sector);
	block->nr_sectors = bio_to_dev_sectors(lc, bio_sectors(bio));

	/* We don't need the data, just submit */
	if (discard_bio) {
		WARN_ON(flush_bio || fua_bio);
		if (lc->device_supports_discard)
			goto map_bio;
		bio_endio(bio);
		return DM_MAPIO_SUBMITTED;
	}

	/* Flush bio, splice the unflushed blocks onto this list and submit */
	if (flush_bio && !bio_sectors(bio)) {
		spin_lock_irq(&lc->blocks_lock);
		list_splice_init(&lc->unflushed_blocks, &block->list);
		spin_unlock_irq(&lc->blocks_lock);
		goto map_bio;
	}

	/*
	 * We will write this bio somewhere else way later so we need to copy
	 * the actual contents into new pages so we know the data will always be
	 * there.
	 *
	 * We do this because this could be a bio from O_DIRECT in which case we
	 * can't just hold onto the page until some later point; we have to
	 * manually copy the contents.
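	 *
	 * (The copies below use GFP_NOIO because we are on the I/O submission
	 * path and must not recurse into reclaim that could issue more I/O.)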
	 */
	bio_for_each_segment(bv, bio, iter) {
		struct page *page;
		void *src, *dst;

		page = alloc_page(GFP_NOIO);
		if (!page) {
			DMERR("Error allocating page");
			free_pending_block(lc, block);
			spin_lock_irq(&lc->blocks_lock);
			lc->logging_enabled = false;
			spin_unlock_irq(&lc->blocks_lock);
			return DM_MAPIO_KILL;
		}

		src = kmap_atomic(bv.bv_page);
		dst = kmap_atomic(page);
		memcpy(dst, src + bv.bv_offset, bv.bv_len);
		kunmap_atomic(dst);
		kunmap_atomic(src);
		block->vecs[i].bv_page = page;
		block->vecs[i].bv_len = bv.bv_len;
		block->vec_cnt++;
		i++;
	}

	/* Had a flush with data in it, weird */
	if (flush_bio) {
		spin_lock_irq(&lc->blocks_lock);
		list_splice_init(&lc->unflushed_blocks, &block->list);
		spin_unlock_irq(&lc->blocks_lock);
	}
map_bio:
	normal_map_bio(ti, bio);
	return DM_MAPIO_REMAPPED;
}

static int normal_end_io(struct dm_target *ti, struct bio *bio,
			 blk_status_t *error)
{
	struct log_writes_c *lc = ti->private;
	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));

	if (bio_data_dir(bio) == WRITE && pb->block) {
		struct pending_block *block = pb->block;
		unsigned long flags;

		spin_lock_irqsave(&lc->blocks_lock, flags);
		if (block->flags & LOG_FLUSH_FLAG) {
			list_splice_tail_init(&block->list, &lc->logging_blocks);
			list_add_tail(&block->list, &lc->logging_blocks);
			wake_up_process(lc->log_kthread);
		} else if (block->flags & LOG_FUA_FLAG) {
			list_add_tail(&block->list, &lc->logging_blocks);
			wake_up_process(lc->log_kthread);
		} else
			list_add_tail(&block->list, &lc->unflushed_blocks);
		spin_unlock_irqrestore(&lc->blocks_lock, flags);
	}

	return DM_ENDIO_DONE;
}

/*
 * INFO format: <logged entries> <highest allocated sector>
 */
static void log_writes_status(struct dm_target *ti, status_type_t type,
			      unsigned status_flags, char *result,
			      unsigned maxlen)
{
	unsigned sz = 0;
	struct log_writes_c *lc = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%llu %llu", lc->logged_entries,
		       (unsigned long long)lc->next_sector - 1);
		if (!lc->logging_enabled)
			DMEMIT(" logging_disabled");
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%s %s", lc->dev->name, lc->logdev->name);
		break;
	}
}

static int log_writes_prepare_ioctl(struct dm_target *ti,
				    struct block_device **bdev, fmode_t *mode)
{
	struct log_writes_c *lc = ti->private;
	struct dm_dev *dev = lc->dev;

	*bdev = dev->bdev;
	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
		return 1;
	return 0;
}

static int log_writes_iterate_devices(struct dm_target *ti,
				      iterate_devices_callout_fn fn,
				      void *data)
{
	struct log_writes_c *lc = ti->private;

	return fn(ti, lc->dev, 0, ti->len, data);
}

/*
 * Messages supported:
 *   mark <mark data> - specify the marked data.
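 *
 * Example invocation (device name and sector are illustrative):
 *   dmsetup message <mapped-device> 0 mark mymark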
 */
static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct log_writes_c *lc = ti->private;

	if (argc != 2) {
		DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc);
		return r;
	}

	if (!strcasecmp(argv[0], "mark"))
		r = log_mark(lc, argv[1]);
	else
		DMWARN("Unrecognised log writes target message received: %s", argv[0]);

	return r;
}

static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct log_writes_c *lc = ti->private;
	struct request_queue *q = bdev_get_queue(lc->dev->bdev);

	if (!q || !blk_queue_discard(q)) {
		lc->device_supports_discard = false;
		limits->discard_granularity = lc->sectorsize;
		limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
	}
	limits->logical_block_size = bdev_logical_block_size(lc->dev->bdev);
	limits->physical_block_size = bdev_physical_block_size(lc->dev->bdev);
	limits->io_min = limits->physical_block_size;
}

static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
					 long nr_pages, void **kaddr, pfn_t *pfn)
{
	struct log_writes_c *lc = ti->private;
	sector_t sector = pgoff * PAGE_SECTORS;
	int ret;

	ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages * PAGE_SIZE, &pgoff);
	if (ret)
		return ret;
	return dax_direct_access(lc->dev->dax_dev, pgoff, nr_pages, kaddr, pfn);
}

static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
					    pgoff_t pgoff, void *addr, size_t bytes,
					    struct iov_iter *i)
{
	struct log_writes_c *lc = ti->private;
	sector_t sector = pgoff * PAGE_SECTORS;
	int err;

	if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
		return 0;

	/* Don't bother doing anything if logging has been disabled */
	if (!lc->logging_enabled)
		goto dax_copy;

	err = log_dax(lc, sector, bytes, i);
	if (err) {
		DMWARN("Error %d logging DAX write", err);
		return 0;
	}
dax_copy:
	return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
}

static struct target_type log_writes_target = {
	.name   = "log-writes",
	.version = {1, 1, 0},
	.module = THIS_MODULE,
	.ctr    = log_writes_ctr,
	.dtr    = log_writes_dtr,
	.map    = log_writes_map,
	.end_io = normal_end_io,
	.status = log_writes_status,
	.prepare_ioctl = log_writes_prepare_ioctl,
	.message = log_writes_message,
	.iterate_devices = log_writes_iterate_devices,
	.io_hints = log_writes_io_hints,
	.direct_access = log_writes_dax_direct_access,
	.dax_copy_from_iter = log_writes_dax_copy_from_iter,
};

static int __init dm_log_writes_init(void)
{
	int r = dm_register_target(&log_writes_target);

	if (r < 0)
		DMERR("register failed %d", r);

	return r;
}

static void __exit dm_log_writes_exit(void)
{
	dm_unregister_target(&log_writes_target);
}

module_init(dm_log_writes_init);
module_exit(dm_log_writes_exit);

MODULE_DESCRIPTION(DM_NAME " log writes target");
MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
MODULE_LICENSE("GPL");