// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2018 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/dax.h>
#include <linux/pfn_t.h>
#include <linux/libnvdimm.h>
#include <linux/delay.h>
#include "dm-io-tracker.h"

#define DM_MSG_PREFIX "writecache"

/*
 * Built-in tunables. Units are inferred from the names (percentages, block
 * counts, milliseconds, jiffies); confirm at their points of use.
 *
 * MAX_AGE_DIV: the max-age timer is re-armed every max_age / MAX_AGE_DIV
 * jiffies.
 * MAX_AGE_UNSPECIFIED: sentinel value of wc->max_age for which the max-age
 * timer is never armed on resume.
 */
#define HIGH_WATERMARK			50
#define LOW_WATERMARK			45
#define MAX_WRITEBACK_JOBS		min(0x10000000 / PAGE_SIZE, totalram_pages() / 16)
#define ENDIO_LATENCY			16
#define WRITEBACK_LATENCY		64
#define AUTOCOMMIT_BLOCKS_SSD		65536
#define AUTOCOMMIT_BLOCKS_PMEM		64
#define AUTOCOMMIT_MSEC			1000
#define MAX_AGE_DIV			16
#define MAX_AGE_UNSPECIFIED		-1UL
#define PAUSE_WRITEBACK			(HZ * 3)

/*
 * Number of bytes of the in-memory metadata image covered by one bit of
 * wc->dirty_bitmap. Never smaller than a page.
 */
#define BITMAP_GRANULARITY	65536
#if BITMAP_GRANULARITY < PAGE_SIZE
#undef BITMAP_GRANULARITY
#define BITMAP_GRANULARITY	PAGE_SIZE
#endif

/* Persistent-memory support needs both the arch pmem API and filesystem DAX. */
#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX)
#define DM_WRITECACHE_HAS_PMEM
#endif

/*
 * pmem_assign(dest, src) - store src into the metadata lvalue dest.
 *
 * With pmem support the value is first materialised in a local of dest's
 * type and then copied with memcpy_flushcache(); without pmem support it is
 * a plain assignment.
 */
#ifdef DM_WRITECACHE_HAS_PMEM
#define pmem_assign(dest, src)					\
do {								\
	typeof(dest) uniq = (src);				\
	memcpy_flushcache(&(dest), &uniq, sizeof(dest));	\
} while (0)
#else
#define pmem_assign(dest, src)	((dest) = (src))
#endif

/*
 * When defined, struct wc_entry keeps its own copies of original_sector and
 * seq_count, and the read_*() accessors return those copies instead of
 * reading the metadata image.
 */
#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
#endif

#define MEMORY_SUPERBLOCK_MAGIC		0x23489321
#define MEMORY_SUPERBLOCK_VERSION	1

/*
 * Per-block metadata record as stored in the metadata image (little-endian).
 * One record exists per cache block; it is addressed as
 * sb(wc)->entries[e->index]. A seq_count of -1 is what clear_seq_count()
 * writes when an entry is released.
 */
struct wc_memory_entry {
	__le64 original_sector;
	__le64 seq_count;
};

/*
 * Header of the metadata image, followed directly by the entry array.
 * The union pads the header fields out to eight 64-bit words.
 */
struct wc_memory_superblock {
	union {
		struct {
			__le32 magic;
			__le32 version;
			__le32 block_size;
			__le32 pad;
			__le64 n_blocks;
			__le64 seq_count;
		};
		__le64 padding[8];
	};
	struct wc_memory_entry entries[];
};

/*
 * In-core descriptor of one cache block.
 *
 * rb_node links the entry into wc->tree (in use) or wc->freetree (free,
 * sorted-freelist mode); lru links it into wc->lru (in use) or wc->freelist
 * (free, list mode). index is the entry's position in wc->entries and in the
 * metadata entry array. age is set to jiffies when the entry is inserted.
 * On 64-bit builds write_in_progress and index are packed as bitfields.
 */
struct wc_entry {
	struct rb_node rb_node;
	struct list_head lru;
	unsigned short wc_list_contiguous;
	bool write_in_progress
#if BITS_PER_LONG == 64
		:1
#endif
	;
	unsigned long index
#if BITS_PER_LONG == 64
		:47
#endif
	;
	unsigned long age;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	uint64_t original_sector;
	uint64_t seq_count;
#endif
};

/*
 * Mode predicates. Without pmem support both PMEM and FUA modes are
 * compile-time false. The free entries are kept in an rb-tree (rather than
 * a list) whenever the cache is not in pmem mode.
 */
#ifdef DM_WRITECACHE_HAS_PMEM
#define WC_MODE_PMEM(wc)			((wc)->pmem_mode)
#define WC_MODE_FUA(wc)				((wc)->writeback_fua)
#else
#define WC_MODE_PMEM(wc)			false
#define WC_MODE_FUA(wc)				false
#endif
#define WC_MODE_SORT_FREELIST(wc)		(!WC_MODE_PMEM(wc))

/* Per-target state of one writecache instance. */
struct dm_writecache {
	/* Taken and released through wc_lock() / wc_unlock(). */
	struct mutex lock;
	/* In-use entries; writecache_insert_entry() adds at the head. */
	struct list_head lru;
	/* Free entries: plain list, or rb-tree plus cursor in sorted mode. */
	union {
		struct list_head freelist;
		struct {
			struct rb_root freetree;
			struct wc_entry *current_free;
		};
	};
	/* In-use entries keyed by original sector. */
	struct rb_root tree;

	/* Number of entries currently on the free list/tree. */
	size_t freelist_size;
	size_t writeback_size;
	size_t freelist_high_watermark;
	size_t freelist_low_watermark;
	unsigned long max_age;
	unsigned long pause;

	unsigned uncommitted_blocks;
	unsigned autocommit_blocks;
	unsigned max_writeback_jobs;

	/* First error recorded by writecache_error(); 0 while healthy. */
	int error;

	unsigned long autocommit_jiffies;
	struct timer_list autocommit_timer;
	/* Woken when an entry is freed and when an error is recorded. */
	struct wait_queue_head freelist_wait;

	struct timer_list max_age_timer;

	/* Indexed by I/O direction (READ / WRITE). */
	atomic_t bio_in_progress[2];
	struct wait_queue_head bio_in_progress_wait[2];

	struct dm_target *ti;
	/* Device whose size is sampled into data_device_sectors on resume. */
	struct dm_dev *dev;
	/* Device holding the cache metadata and cached blocks. */
	struct dm_dev *ssd_dev;
	sector_t start_sector;
	/* Start of the metadata image (superblock followed by entries). */
	void *memory_map;
	uint64_t memory_map_size;
	size_t metadata_sectors;
	size_t n_blocks;
	/* Current sequence number; entries with a lower one are committed. */
	uint64_t seq_count;
	sector_t data_device_sectors;
	void *block_start;
	struct wc_entry *entries;
	unsigned block_size;
	unsigned char block_size_bits;

	bool pmem_mode:1;
	bool writeback_fua:1;

	bool overwrote_committed:1;
	/* memory_map was created with vmap() and must be vunmap()ed. */
	bool memory_vmapped:1;

	bool start_sector_set:1;
	bool high_wm_percent_set:1;
	bool low_wm_percent_set:1;
	bool max_writeback_jobs_set:1;
	bool autocommit_blocks_set:1;
	bool autocommit_time_set:1;
	bool max_age_set:1;
	bool writeback_fua_set:1;
	bool flush_on_suspend:1;
	bool cleaner:1;
	bool cleaner_set:1;
	bool metadata_only:1;
	bool pause_set:1;

	unsigned high_wm_percent_value;
	unsigned low_wm_percent_value;
	unsigned autocommit_time_value;
	unsigned max_age_value;
	unsigned pause_value;

	/* Incremented while a caller wants everything written back. */
	unsigned writeback_all;
	struct workqueue_struct *writeback_wq;
	struct work_struct writeback_work;
	struct work_struct flush_work;

	struct dm_io_tracker iot;

	struct dm_io_client *dm_io;

	raw_spinlock_t endio_list_lock;
	struct list_head endio_list;
	struct task_struct *endio_thread;

	struct task_struct *flush_thread;
	struct bio_list flush_list;

	struct dm_kcopyd_client *dm_kcopyd;
	/* One bit per BITMAP_GRANULARITY bytes of metadata; size is in bytes. */
	unsigned long *dirty_bitmap;
	unsigned dirty_bitmap_size;

	struct bio_set bio_set;
	mempool_t copy_pool;

	/* Counters; zeroed by the "clear_stats" message. */
	struct {
		unsigned long long reads;
		unsigned long long read_hits;
		unsigned long long writes;
		unsigned long long write_hits_uncommitted;
		unsigned long long write_hits_committed;
		unsigned long long writes_around;
		unsigned long long writes_allocate;
		unsigned long long writes_blocked_on_freelist;
		unsigned long long flushes;
		unsigned long long discards;
	} stats;
};

/* Number of entry pointers stored inline in struct writeback_struct. */
#define WB_LIST_INLINE		16

/* Writeback job descriptor with an embedded bio and a list of entries. */
struct writeback_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry **wc_list;
	unsigned wc_list_n;
	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
	struct bio bio;
};

/* Copy job descriptor: a starting entry, an entry count and a result code. */
struct copy_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry *e;
	unsigned n_entries;
	int error;
};

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle, 244 "A percentage of time allocated for data copying"); 245 246 static void wc_lock(struct dm_writecache *wc) 247 { 248 mutex_lock(&wc->lock); 249 } 250 251 static void wc_unlock(struct dm_writecache *wc) 252 { 253 mutex_unlock(&wc->lock); 254 } 255 256 #ifdef DM_WRITECACHE_HAS_PMEM 257 static int persistent_memory_claim(struct dm_writecache *wc) 258 { 259 int r; 260 loff_t s; 261 long p, da; 262 pfn_t pfn; 263 int id; 264 struct page **pages; 265 sector_t offset; 266 267 wc->memory_vmapped = false; 268 269 s = wc->memory_map_size; 270 p = s >> PAGE_SHIFT; 271 if (!p) { 272 r = -EINVAL; 273 goto err1; 274 } 275 if (p != s >> PAGE_SHIFT) { 276 r = -EOVERFLOW; 277 goto err1; 278 } 279 280 offset = get_start_sect(wc->ssd_dev->bdev); 281 if (offset & (PAGE_SIZE / 512 - 1)) { 282 r = -EINVAL; 283 goto err1; 284 } 285 offset >>= PAGE_SHIFT - 9; 286 287 id = dax_read_lock(); 288 289 da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS, 290 &wc->memory_map, &pfn); 291 if (da < 0) { 292 wc->memory_map = NULL; 293 r = da; 294 goto err2; 295 } 296 if (!pfn_t_has_page(pfn)) { 297 wc->memory_map = NULL; 298 r = -EOPNOTSUPP; 299 goto err2; 300 } 301 if (da != p) { 302 long i; 303 wc->memory_map = NULL; 304 pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL); 305 if (!pages) { 306 r = -ENOMEM; 307 goto err2; 308 } 309 i = 0; 310 do { 311 long daa; 312 daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, 313 p - i, DAX_ACCESS, NULL, &pfn); 314 if (daa <= 0) { 315 r = daa ? 
daa : -EINVAL; 316 goto err3; 317 } 318 if (!pfn_t_has_page(pfn)) { 319 r = -EOPNOTSUPP; 320 goto err3; 321 } 322 while (daa-- && i < p) { 323 pages[i++] = pfn_t_to_page(pfn); 324 pfn.val++; 325 if (!(i & 15)) 326 cond_resched(); 327 } 328 } while (i < p); 329 wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL); 330 if (!wc->memory_map) { 331 r = -ENOMEM; 332 goto err3; 333 } 334 kvfree(pages); 335 wc->memory_vmapped = true; 336 } 337 338 dax_read_unlock(id); 339 340 wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT; 341 wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT; 342 343 return 0; 344 err3: 345 kvfree(pages); 346 err2: 347 dax_read_unlock(id); 348 err1: 349 return r; 350 } 351 #else 352 static int persistent_memory_claim(struct dm_writecache *wc) 353 { 354 return -EOPNOTSUPP; 355 } 356 #endif 357 358 static void persistent_memory_release(struct dm_writecache *wc) 359 { 360 if (wc->memory_vmapped) 361 vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT)); 362 } 363 364 static struct page *persistent_memory_page(void *addr) 365 { 366 if (is_vmalloc_addr(addr)) 367 return vmalloc_to_page(addr); 368 else 369 return virt_to_page(addr); 370 } 371 372 static unsigned persistent_memory_page_offset(void *addr) 373 { 374 return (unsigned long)addr & (PAGE_SIZE - 1); 375 } 376 377 static void persistent_memory_flush_cache(void *ptr, size_t size) 378 { 379 if (is_vmalloc_addr(ptr)) 380 flush_kernel_vmap_range(ptr, size); 381 } 382 383 static void persistent_memory_invalidate_cache(void *ptr, size_t size) 384 { 385 if (is_vmalloc_addr(ptr)) 386 invalidate_kernel_vmap_range(ptr, size); 387 } 388 389 static struct wc_memory_superblock *sb(struct dm_writecache *wc) 390 { 391 return wc->memory_map; 392 } 393 394 static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e) 395 { 396 return &sb(wc)->entries[e->index]; 397 } 398 399 static void *memory_data(struct dm_writecache *wc, struct wc_entry *e) 400 { 
401 return (char *)wc->block_start + (e->index << wc->block_size_bits); 402 } 403 404 static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e) 405 { 406 return wc->start_sector + wc->metadata_sectors + 407 ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT)); 408 } 409 410 static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e) 411 { 412 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 413 return e->original_sector; 414 #else 415 return le64_to_cpu(memory_entry(wc, e)->original_sector); 416 #endif 417 } 418 419 static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e) 420 { 421 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 422 return e->seq_count; 423 #else 424 return le64_to_cpu(memory_entry(wc, e)->seq_count); 425 #endif 426 } 427 428 static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e) 429 { 430 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 431 e->seq_count = -1; 432 #endif 433 pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1)); 434 } 435 436 static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e, 437 uint64_t original_sector, uint64_t seq_count) 438 { 439 struct wc_memory_entry me; 440 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 441 e->original_sector = original_sector; 442 e->seq_count = seq_count; 443 #endif 444 me.original_sector = cpu_to_le64(original_sector); 445 me.seq_count = cpu_to_le64(seq_count); 446 pmem_assign(*memory_entry(wc, e), me); 447 } 448 449 #define writecache_error(wc, err, msg, arg...) 
\ 450 do { \ 451 if (!cmpxchg(&(wc)->error, 0, err)) \ 452 DMERR(msg, ##arg); \ 453 wake_up(&(wc)->freelist_wait); \ 454 } while (0) 455 456 #define writecache_has_error(wc) (unlikely(READ_ONCE((wc)->error))) 457 458 static void writecache_flush_all_metadata(struct dm_writecache *wc) 459 { 460 if (!WC_MODE_PMEM(wc)) 461 memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size); 462 } 463 464 static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size) 465 { 466 if (!WC_MODE_PMEM(wc)) 467 __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY, 468 wc->dirty_bitmap); 469 } 470 471 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev); 472 473 struct io_notify { 474 struct dm_writecache *wc; 475 struct completion c; 476 atomic_t count; 477 }; 478 479 static void writecache_notify_io(unsigned long error, void *context) 480 { 481 struct io_notify *endio = context; 482 483 if (unlikely(error != 0)) 484 writecache_error(endio->wc, -EIO, "error writing metadata"); 485 BUG_ON(atomic_read(&endio->count) <= 0); 486 if (atomic_dec_and_test(&endio->count)) 487 complete(&endio->c); 488 } 489 490 static void writecache_wait_for_ios(struct dm_writecache *wc, int direction) 491 { 492 wait_event(wc->bio_in_progress_wait[direction], 493 !atomic_read(&wc->bio_in_progress[direction])); 494 } 495 496 static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) 497 { 498 struct dm_io_region region; 499 struct dm_io_request req; 500 struct io_notify endio = { 501 wc, 502 COMPLETION_INITIALIZER_ONSTACK(endio.c), 503 ATOMIC_INIT(1), 504 }; 505 unsigned bitmap_bits = wc->dirty_bitmap_size * 8; 506 unsigned i = 0; 507 508 while (1) { 509 unsigned j; 510 i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i); 511 if (unlikely(i == bitmap_bits)) 512 break; 513 j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i); 514 515 region.bdev = wc->ssd_dev->bdev; 516 region.sector = (sector_t)i * (BITMAP_GRANULARITY >> 
SECTOR_SHIFT); 517 region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT); 518 519 if (unlikely(region.sector >= wc->metadata_sectors)) 520 break; 521 if (unlikely(region.sector + region.count > wc->metadata_sectors)) 522 region.count = wc->metadata_sectors - region.sector; 523 524 region.sector += wc->start_sector; 525 atomic_inc(&endio.count); 526 req.bi_opf = REQ_OP_WRITE | REQ_SYNC; 527 req.mem.type = DM_IO_VMA; 528 req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY; 529 req.client = wc->dm_io; 530 req.notify.fn = writecache_notify_io; 531 req.notify.context = &endio; 532 533 /* writing via async dm-io (implied by notify.fn above) won't return an error */ 534 (void) dm_io(&req, 1, ®ion, NULL); 535 i = j; 536 } 537 538 writecache_notify_io(0, &endio); 539 wait_for_completion_io(&endio.c); 540 541 if (wait_for_ios) 542 writecache_wait_for_ios(wc, WRITE); 543 544 writecache_disk_flush(wc, wc->ssd_dev); 545 546 memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size); 547 } 548 549 static void ssd_commit_superblock(struct dm_writecache *wc) 550 { 551 int r; 552 struct dm_io_region region; 553 struct dm_io_request req; 554 555 region.bdev = wc->ssd_dev->bdev; 556 region.sector = 0; 557 region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT; 558 559 if (unlikely(region.sector + region.count > wc->metadata_sectors)) 560 region.count = wc->metadata_sectors - region.sector; 561 562 region.sector += wc->start_sector; 563 564 req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA; 565 req.mem.type = DM_IO_VMA; 566 req.mem.ptr.vma = (char *)wc->memory_map; 567 req.client = wc->dm_io; 568 req.notify.fn = NULL; 569 req.notify.context = NULL; 570 571 r = dm_io(&req, 1, ®ion, NULL); 572 if (unlikely(r)) 573 writecache_error(wc, r, "error writing superblock"); 574 } 575 576 static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) 577 { 578 if (WC_MODE_PMEM(wc)) 579 pmem_wmb(); 580 else 581 ssd_commit_flushed(wc, 
wait_for_ios); 582 } 583 584 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev) 585 { 586 int r; 587 struct dm_io_region region; 588 struct dm_io_request req; 589 590 region.bdev = dev->bdev; 591 region.sector = 0; 592 region.count = 0; 593 req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; 594 req.mem.type = DM_IO_KMEM; 595 req.mem.ptr.addr = NULL; 596 req.client = wc->dm_io; 597 req.notify.fn = NULL; 598 599 r = dm_io(&req, 1, ®ion, NULL); 600 if (unlikely(r)) 601 writecache_error(wc, r, "error flushing metadata: %d", r); 602 } 603 604 #define WFE_RETURN_FOLLOWING 1 605 #define WFE_LOWEST_SEQ 2 606 607 static struct wc_entry *writecache_find_entry(struct dm_writecache *wc, 608 uint64_t block, int flags) 609 { 610 struct wc_entry *e; 611 struct rb_node *node = wc->tree.rb_node; 612 613 if (unlikely(!node)) 614 return NULL; 615 616 while (1) { 617 e = container_of(node, struct wc_entry, rb_node); 618 if (read_original_sector(wc, e) == block) 619 break; 620 621 node = (read_original_sector(wc, e) >= block ? 
622 e->rb_node.rb_left : e->rb_node.rb_right); 623 if (unlikely(!node)) { 624 if (!(flags & WFE_RETURN_FOLLOWING)) 625 return NULL; 626 if (read_original_sector(wc, e) >= block) { 627 return e; 628 } else { 629 node = rb_next(&e->rb_node); 630 if (unlikely(!node)) 631 return NULL; 632 e = container_of(node, struct wc_entry, rb_node); 633 return e; 634 } 635 } 636 } 637 638 while (1) { 639 struct wc_entry *e2; 640 if (flags & WFE_LOWEST_SEQ) 641 node = rb_prev(&e->rb_node); 642 else 643 node = rb_next(&e->rb_node); 644 if (unlikely(!node)) 645 return e; 646 e2 = container_of(node, struct wc_entry, rb_node); 647 if (read_original_sector(wc, e2) != block) 648 return e; 649 e = e2; 650 } 651 } 652 653 static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins) 654 { 655 struct wc_entry *e; 656 struct rb_node **node = &wc->tree.rb_node, *parent = NULL; 657 658 while (*node) { 659 e = container_of(*node, struct wc_entry, rb_node); 660 parent = &e->rb_node; 661 if (read_original_sector(wc, e) > read_original_sector(wc, ins)) 662 node = &parent->rb_left; 663 else 664 node = &parent->rb_right; 665 } 666 rb_link_node(&ins->rb_node, parent, node); 667 rb_insert_color(&ins->rb_node, &wc->tree); 668 list_add(&ins->lru, &wc->lru); 669 ins->age = jiffies; 670 } 671 672 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e) 673 { 674 list_del(&e->lru); 675 rb_erase(&e->rb_node, &wc->tree); 676 } 677 678 static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e) 679 { 680 if (WC_MODE_SORT_FREELIST(wc)) { 681 struct rb_node **node = &wc->freetree.rb_node, *parent = NULL; 682 if (unlikely(!*node)) 683 wc->current_free = e; 684 while (*node) { 685 parent = *node; 686 if (&e->rb_node < *node) 687 node = &parent->rb_left; 688 else 689 node = &parent->rb_right; 690 } 691 rb_link_node(&e->rb_node, parent, node); 692 rb_insert_color(&e->rb_node, &wc->freetree); 693 } else { 694 list_add_tail(&e->lru, &wc->freelist); 695 
} 696 wc->freelist_size++; 697 } 698 699 static inline void writecache_verify_watermark(struct dm_writecache *wc) 700 { 701 if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark)) 702 queue_work(wc->writeback_wq, &wc->writeback_work); 703 } 704 705 static void writecache_max_age_timer(struct timer_list *t) 706 { 707 struct dm_writecache *wc = from_timer(wc, t, max_age_timer); 708 709 if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) { 710 queue_work(wc->writeback_wq, &wc->writeback_work); 711 mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); 712 } 713 } 714 715 static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector) 716 { 717 struct wc_entry *e; 718 719 if (WC_MODE_SORT_FREELIST(wc)) { 720 struct rb_node *next; 721 if (unlikely(!wc->current_free)) 722 return NULL; 723 e = wc->current_free; 724 if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) 725 return NULL; 726 next = rb_next(&e->rb_node); 727 rb_erase(&e->rb_node, &wc->freetree); 728 if (unlikely(!next)) 729 next = rb_first(&wc->freetree); 730 wc->current_free = next ? 
container_of(next, struct wc_entry, rb_node) : NULL; 731 } else { 732 if (unlikely(list_empty(&wc->freelist))) 733 return NULL; 734 e = container_of(wc->freelist.next, struct wc_entry, lru); 735 if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) 736 return NULL; 737 list_del(&e->lru); 738 } 739 wc->freelist_size--; 740 741 writecache_verify_watermark(wc); 742 743 return e; 744 } 745 746 static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e) 747 { 748 writecache_unlink(wc, e); 749 writecache_add_to_freelist(wc, e); 750 clear_seq_count(wc, e); 751 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 752 if (unlikely(waitqueue_active(&wc->freelist_wait))) 753 wake_up(&wc->freelist_wait); 754 } 755 756 static void writecache_wait_on_freelist(struct dm_writecache *wc) 757 { 758 DEFINE_WAIT(wait); 759 760 prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE); 761 wc_unlock(wc); 762 io_schedule(); 763 finish_wait(&wc->freelist_wait, &wait); 764 wc_lock(wc); 765 } 766 767 static void writecache_poison_lists(struct dm_writecache *wc) 768 { 769 /* 770 * Catch incorrect access to these values while the device is suspended. 
771 */ 772 memset(&wc->tree, -1, sizeof wc->tree); 773 wc->lru.next = LIST_POISON1; 774 wc->lru.prev = LIST_POISON2; 775 wc->freelist.next = LIST_POISON1; 776 wc->freelist.prev = LIST_POISON2; 777 } 778 779 static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e) 780 { 781 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 782 if (WC_MODE_PMEM(wc)) 783 writecache_flush_region(wc, memory_data(wc, e), wc->block_size); 784 } 785 786 static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e) 787 { 788 return read_seq_count(wc, e) < wc->seq_count; 789 } 790 791 static void writecache_flush(struct dm_writecache *wc) 792 { 793 struct wc_entry *e, *e2; 794 bool need_flush_after_free; 795 796 wc->uncommitted_blocks = 0; 797 del_timer(&wc->autocommit_timer); 798 799 if (list_empty(&wc->lru)) 800 return; 801 802 e = container_of(wc->lru.next, struct wc_entry, lru); 803 if (writecache_entry_is_committed(wc, e)) { 804 if (wc->overwrote_committed) { 805 writecache_wait_for_ios(wc, WRITE); 806 writecache_disk_flush(wc, wc->ssd_dev); 807 wc->overwrote_committed = false; 808 } 809 return; 810 } 811 while (1) { 812 writecache_flush_entry(wc, e); 813 if (unlikely(e->lru.next == &wc->lru)) 814 break; 815 e2 = container_of(e->lru.next, struct wc_entry, lru); 816 if (writecache_entry_is_committed(wc, e2)) 817 break; 818 e = e2; 819 cond_resched(); 820 } 821 writecache_commit_flushed(wc, true); 822 823 wc->seq_count++; 824 pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); 825 if (WC_MODE_PMEM(wc)) 826 writecache_commit_flushed(wc, false); 827 else 828 ssd_commit_superblock(wc); 829 830 wc->overwrote_committed = false; 831 832 need_flush_after_free = false; 833 while (1) { 834 /* Free another committed entry with lower seq-count */ 835 struct rb_node *rb_node = rb_prev(&e->rb_node); 836 837 if (rb_node) { 838 e2 = container_of(rb_node, struct wc_entry, rb_node); 839 if (read_original_sector(wc, 
e2) == read_original_sector(wc, e) && 840 likely(!e2->write_in_progress)) { 841 writecache_free_entry(wc, e2); 842 need_flush_after_free = true; 843 } 844 } 845 if (unlikely(e->lru.prev == &wc->lru)) 846 break; 847 e = container_of(e->lru.prev, struct wc_entry, lru); 848 cond_resched(); 849 } 850 851 if (need_flush_after_free) 852 writecache_commit_flushed(wc, false); 853 } 854 855 static void writecache_flush_work(struct work_struct *work) 856 { 857 struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work); 858 859 wc_lock(wc); 860 writecache_flush(wc); 861 wc_unlock(wc); 862 } 863 864 static void writecache_autocommit_timer(struct timer_list *t) 865 { 866 struct dm_writecache *wc = from_timer(wc, t, autocommit_timer); 867 if (!writecache_has_error(wc)) 868 queue_work(wc->writeback_wq, &wc->flush_work); 869 } 870 871 static void writecache_schedule_autocommit(struct dm_writecache *wc) 872 { 873 if (!timer_pending(&wc->autocommit_timer)) 874 mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies); 875 } 876 877 static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end) 878 { 879 struct wc_entry *e; 880 bool discarded_something = false; 881 882 e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ); 883 if (unlikely(!e)) 884 return; 885 886 while (read_original_sector(wc, e) < end) { 887 struct rb_node *node = rb_next(&e->rb_node); 888 889 if (likely(!e->write_in_progress)) { 890 if (!discarded_something) { 891 if (!WC_MODE_PMEM(wc)) { 892 writecache_wait_for_ios(wc, READ); 893 writecache_wait_for_ios(wc, WRITE); 894 } 895 discarded_something = true; 896 } 897 if (!writecache_entry_is_committed(wc, e)) 898 wc->uncommitted_blocks--; 899 writecache_free_entry(wc, e); 900 } 901 902 if (unlikely(!node)) 903 break; 904 905 e = container_of(node, struct wc_entry, rb_node); 906 } 907 908 if (discarded_something) 909 writecache_commit_flushed(wc, false); 910 } 911 912 static bool 
writecache_wait_for_writeback(struct dm_writecache *wc) 913 { 914 if (wc->writeback_size) { 915 writecache_wait_on_freelist(wc); 916 return true; 917 } 918 return false; 919 } 920 921 static void writecache_suspend(struct dm_target *ti) 922 { 923 struct dm_writecache *wc = ti->private; 924 bool flush_on_suspend; 925 926 del_timer_sync(&wc->autocommit_timer); 927 del_timer_sync(&wc->max_age_timer); 928 929 wc_lock(wc); 930 writecache_flush(wc); 931 flush_on_suspend = wc->flush_on_suspend; 932 if (flush_on_suspend) { 933 wc->flush_on_suspend = false; 934 wc->writeback_all++; 935 queue_work(wc->writeback_wq, &wc->writeback_work); 936 } 937 wc_unlock(wc); 938 939 drain_workqueue(wc->writeback_wq); 940 941 wc_lock(wc); 942 if (flush_on_suspend) 943 wc->writeback_all--; 944 while (writecache_wait_for_writeback(wc)); 945 946 if (WC_MODE_PMEM(wc)) 947 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size); 948 949 writecache_poison_lists(wc); 950 951 wc_unlock(wc); 952 } 953 954 static int writecache_alloc_entries(struct dm_writecache *wc) 955 { 956 size_t b; 957 958 if (wc->entries) 959 return 0; 960 wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks)); 961 if (!wc->entries) 962 return -ENOMEM; 963 for (b = 0; b < wc->n_blocks; b++) { 964 struct wc_entry *e = &wc->entries[b]; 965 e->index = b; 966 e->write_in_progress = false; 967 cond_resched(); 968 } 969 970 return 0; 971 } 972 973 static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors) 974 { 975 struct dm_io_region region; 976 struct dm_io_request req; 977 978 region.bdev = wc->ssd_dev->bdev; 979 region.sector = wc->start_sector; 980 region.count = n_sectors; 981 req.bi_opf = REQ_OP_READ | REQ_SYNC; 982 req.mem.type = DM_IO_VMA; 983 req.mem.ptr.vma = (char *)wc->memory_map; 984 req.client = wc->dm_io; 985 req.notify.fn = NULL; 986 987 return dm_io(&req, 1, ®ion, NULL); 988 } 989 990 static void writecache_resume(struct dm_target *ti) 991 { 992 struct 
dm_writecache *wc = ti->private; 993 size_t b; 994 bool need_flush = false; 995 __le64 sb_seq_count; 996 int r; 997 998 wc_lock(wc); 999 1000 wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev); 1001 1002 if (WC_MODE_PMEM(wc)) { 1003 persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size); 1004 } else { 1005 r = writecache_read_metadata(wc, wc->metadata_sectors); 1006 if (r) { 1007 size_t sb_entries_offset; 1008 writecache_error(wc, r, "unable to read metadata: %d", r); 1009 sb_entries_offset = offsetof(struct wc_memory_superblock, entries); 1010 memset((char *)wc->memory_map + sb_entries_offset, -1, 1011 (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset); 1012 } 1013 } 1014 1015 wc->tree = RB_ROOT; 1016 INIT_LIST_HEAD(&wc->lru); 1017 if (WC_MODE_SORT_FREELIST(wc)) { 1018 wc->freetree = RB_ROOT; 1019 wc->current_free = NULL; 1020 } else { 1021 INIT_LIST_HEAD(&wc->freelist); 1022 } 1023 wc->freelist_size = 0; 1024 1025 r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count, 1026 sizeof(uint64_t)); 1027 if (r) { 1028 writecache_error(wc, r, "hardware memory error when reading superblock: %d", r); 1029 sb_seq_count = cpu_to_le64(0); 1030 } 1031 wc->seq_count = le64_to_cpu(sb_seq_count); 1032 1033 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 1034 for (b = 0; b < wc->n_blocks; b++) { 1035 struct wc_entry *e = &wc->entries[b]; 1036 struct wc_memory_entry wme; 1037 if (writecache_has_error(wc)) { 1038 e->original_sector = -1; 1039 e->seq_count = -1; 1040 continue; 1041 } 1042 r = copy_mc_to_kernel(&wme, memory_entry(wc, e), 1043 sizeof(struct wc_memory_entry)); 1044 if (r) { 1045 writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d", 1046 (unsigned long)b, r); 1047 e->original_sector = -1; 1048 e->seq_count = -1; 1049 } else { 1050 e->original_sector = le64_to_cpu(wme.original_sector); 1051 e->seq_count = le64_to_cpu(wme.seq_count); 1052 } 1053 cond_resched(); 1054 } 1055 #endif 1056 for (b = 0; b < wc->n_blocks; 
b++) { 1057 struct wc_entry *e = &wc->entries[b]; 1058 if (!writecache_entry_is_committed(wc, e)) { 1059 if (read_seq_count(wc, e) != -1) { 1060 erase_this: 1061 clear_seq_count(wc, e); 1062 need_flush = true; 1063 } 1064 writecache_add_to_freelist(wc, e); 1065 } else { 1066 struct wc_entry *old; 1067 1068 old = writecache_find_entry(wc, read_original_sector(wc, e), 0); 1069 if (!old) { 1070 writecache_insert_entry(wc, e); 1071 } else { 1072 if (read_seq_count(wc, old) == read_seq_count(wc, e)) { 1073 writecache_error(wc, -EINVAL, 1074 "two identical entries, position %llu, sector %llu, sequence %llu", 1075 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e), 1076 (unsigned long long)read_seq_count(wc, e)); 1077 } 1078 if (read_seq_count(wc, old) > read_seq_count(wc, e)) { 1079 goto erase_this; 1080 } else { 1081 writecache_free_entry(wc, old); 1082 writecache_insert_entry(wc, e); 1083 need_flush = true; 1084 } 1085 } 1086 } 1087 cond_resched(); 1088 } 1089 1090 if (need_flush) { 1091 writecache_flush_all_metadata(wc); 1092 writecache_commit_flushed(wc, false); 1093 } 1094 1095 writecache_verify_watermark(wc); 1096 1097 if (wc->max_age != MAX_AGE_UNSPECIFIED) 1098 mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); 1099 1100 wc_unlock(wc); 1101 } 1102 1103 static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc) 1104 { 1105 if (argc != 1) 1106 return -EINVAL; 1107 1108 wc_lock(wc); 1109 if (dm_suspended(wc->ti)) { 1110 wc_unlock(wc); 1111 return -EBUSY; 1112 } 1113 if (writecache_has_error(wc)) { 1114 wc_unlock(wc); 1115 return -EIO; 1116 } 1117 1118 writecache_flush(wc); 1119 wc->writeback_all++; 1120 queue_work(wc->writeback_wq, &wc->writeback_work); 1121 wc_unlock(wc); 1122 1123 flush_workqueue(wc->writeback_wq); 1124 1125 wc_lock(wc); 1126 wc->writeback_all--; 1127 if (writecache_has_error(wc)) { 1128 wc_unlock(wc); 1129 return -EIO; 1130 } 1131 wc_unlock(wc); 1132 1133 return 0; 1134 } 1135 1136 
static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc) 1137 { 1138 if (argc != 1) 1139 return -EINVAL; 1140 1141 wc_lock(wc); 1142 wc->flush_on_suspend = true; 1143 wc_unlock(wc); 1144 1145 return 0; 1146 } 1147 1148 static void activate_cleaner(struct dm_writecache *wc) 1149 { 1150 wc->flush_on_suspend = true; 1151 wc->cleaner = true; 1152 wc->freelist_high_watermark = wc->n_blocks; 1153 wc->freelist_low_watermark = wc->n_blocks; 1154 } 1155 1156 static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc) 1157 { 1158 if (argc != 1) 1159 return -EINVAL; 1160 1161 wc_lock(wc); 1162 activate_cleaner(wc); 1163 if (!dm_suspended(wc->ti)) 1164 writecache_verify_watermark(wc); 1165 wc_unlock(wc); 1166 1167 return 0; 1168 } 1169 1170 static int process_clear_stats_mesg(unsigned argc, char **argv, struct dm_writecache *wc) 1171 { 1172 if (argc != 1) 1173 return -EINVAL; 1174 1175 wc_lock(wc); 1176 memset(&wc->stats, 0, sizeof wc->stats); 1177 wc_unlock(wc); 1178 1179 return 0; 1180 } 1181 1182 static int writecache_message(struct dm_target *ti, unsigned argc, char **argv, 1183 char *result, unsigned maxlen) 1184 { 1185 int r = -EINVAL; 1186 struct dm_writecache *wc = ti->private; 1187 1188 if (!strcasecmp(argv[0], "flush")) 1189 r = process_flush_mesg(argc, argv, wc); 1190 else if (!strcasecmp(argv[0], "flush_on_suspend")) 1191 r = process_flush_on_suspend_mesg(argc, argv, wc); 1192 else if (!strcasecmp(argv[0], "cleaner")) 1193 r = process_cleaner_mesg(argc, argv, wc); 1194 else if (!strcasecmp(argv[0], "clear_stats")) 1195 r = process_clear_stats_mesg(argc, argv, wc); 1196 else 1197 DMERR("unrecognised message received: %s", argv[0]); 1198 1199 return r; 1200 } 1201 1202 static void memcpy_flushcache_optimized(void *dest, void *source, size_t size) 1203 { 1204 /* 1205 * clflushopt performs better with block size 1024, 2048, 4096 1206 * non-temporal stores perform better with block size 512 1207 * 1208 * 
block size 512 1024 2048 4096 1209 * movnti 496 MB/s 642 MB/s 725 MB/s 744 MB/s 1210 * clflushopt 373 MB/s 688 MB/s 1.1 GB/s 1.2 GB/s 1211 * 1212 * We see that movnti performs better for 512-byte blocks, and 1213 * clflushopt performs better for 1024-byte and larger blocks. So, we 1214 * prefer clflushopt for sizes >= 768. 1215 * 1216 * NOTE: this happens to be the case now (with dm-writecache's single 1217 * threaded model) but re-evaluate this once memcpy_flushcache() is 1218 * enabled to use movdir64b which might invalidate this performance 1219 * advantage seen with cache-allocating-writes plus flushing. 1220 */ 1221 #ifdef CONFIG_X86 1222 if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) && 1223 likely(boot_cpu_data.x86_clflush_size == 64) && 1224 likely(size >= 768)) { 1225 do { 1226 memcpy((void *)dest, (void *)source, 64); 1227 clflushopt((void *)dest); 1228 dest += 64; 1229 source += 64; 1230 size -= 64; 1231 } while (size >= 64); 1232 return; 1233 } 1234 #endif 1235 memcpy_flushcache(dest, source, size); 1236 } 1237 1238 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data) 1239 { 1240 void *buf; 1241 unsigned size; 1242 int rw = bio_data_dir(bio); 1243 unsigned remaining_size = wc->block_size; 1244 1245 do { 1246 struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter); 1247 buf = bvec_kmap_local(&bv); 1248 size = bv.bv_len; 1249 if (unlikely(size > remaining_size)) 1250 size = remaining_size; 1251 1252 if (rw == READ) { 1253 int r; 1254 r = copy_mc_to_kernel(buf, data, size); 1255 flush_dcache_page(bio_page(bio)); 1256 if (unlikely(r)) { 1257 writecache_error(wc, r, "hardware memory error when reading data: %d", r); 1258 bio->bi_status = BLK_STS_IOERR; 1259 } 1260 } else { 1261 flush_dcache_page(bio_page(bio)); 1262 memcpy_flushcache_optimized(data, buf, size); 1263 } 1264 1265 kunmap_local(buf); 1266 1267 data = (char *)data + size; 1268 remaining_size -= size; 1269 bio_advance(bio, size); 1270 } while (unlikely(remaining_size)); 1271 
} 1272 1273 static int writecache_flush_thread(void *data) 1274 { 1275 struct dm_writecache *wc = data; 1276 1277 while (1) { 1278 struct bio *bio; 1279 1280 wc_lock(wc); 1281 bio = bio_list_pop(&wc->flush_list); 1282 if (!bio) { 1283 set_current_state(TASK_INTERRUPTIBLE); 1284 wc_unlock(wc); 1285 1286 if (unlikely(kthread_should_stop())) { 1287 set_current_state(TASK_RUNNING); 1288 break; 1289 } 1290 1291 schedule(); 1292 continue; 1293 } 1294 1295 if (bio_op(bio) == REQ_OP_DISCARD) { 1296 writecache_discard(wc, bio->bi_iter.bi_sector, 1297 bio_end_sector(bio)); 1298 wc_unlock(wc); 1299 bio_set_dev(bio, wc->dev->bdev); 1300 submit_bio_noacct(bio); 1301 } else { 1302 writecache_flush(wc); 1303 wc_unlock(wc); 1304 if (writecache_has_error(wc)) 1305 bio->bi_status = BLK_STS_IOERR; 1306 bio_endio(bio); 1307 } 1308 } 1309 1310 return 0; 1311 } 1312 1313 static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio) 1314 { 1315 if (bio_list_empty(&wc->flush_list)) 1316 wake_up_process(wc->flush_thread); 1317 bio_list_add(&wc->flush_list, bio); 1318 } 1319 1320 enum wc_map_op { 1321 WC_MAP_SUBMIT, 1322 WC_MAP_REMAP, 1323 WC_MAP_REMAP_ORIGIN, 1324 WC_MAP_RETURN, 1325 WC_MAP_ERROR, 1326 }; 1327 1328 static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio, 1329 struct wc_entry *e) 1330 { 1331 if (e) { 1332 sector_t next_boundary = 1333 read_original_sector(wc, e) - bio->bi_iter.bi_sector; 1334 if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) 1335 dm_accept_partial_bio(bio, next_boundary); 1336 } 1337 } 1338 1339 static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio) 1340 { 1341 enum wc_map_op map_op; 1342 struct wc_entry *e; 1343 1344 read_next_block: 1345 wc->stats.reads++; 1346 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); 1347 if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) { 1348 wc->stats.read_hits++; 1349 if (WC_MODE_PMEM(wc)) { 1350 
bio_copy_block(wc, bio, memory_data(wc, e));
			/* More data left in the bio: look up the next block. */
			if (bio->bi_iter.bi_size)
				goto read_next_block;
			map_op = WC_MAP_SUBMIT;
		} else {
			/* Non-pmem mode: redirect one block of the bio to the cache device. */
			dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
			bio_set_dev(bio, wc->ssd_dev->bdev);
			bio->bi_iter.bi_sector = cache_sector(wc, e);
			if (!writecache_entry_is_committed(wc, e))
				writecache_wait_for_ios(wc, WRITE);
			map_op = WC_MAP_REMAP;
		}
	} else {
		/* No entry at this sector: read from the origin, up to the next entry. */
		writecache_map_remap_origin(wc, bio, e);
		wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
		map_op = WC_MAP_REMAP_ORIGIN;
	}

	return map_op;
}

/*
 * Non-pmem write path: redirect @bio to the cache device starting at the
 * cache sector of @e, extending it over as many following blocks as possible.
 *
 * With @search_used false, further blocks are taken from the freelist at the
 * next consecutive cache sector. With @search_used true, the bio is extended
 * over existing entries that are adjacent both in the tree and in the entry
 * array, cover the next original sector and are not being written back.
 */
static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio,
				    struct wc_entry *e, bool search_used)
{
	unsigned bio_size = wc->block_size;
	sector_t start_cache_sec = cache_sector(wc, e);
	sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);

	while (bio_size < bio->bi_iter.bi_size) {
		if (!search_used) {
			struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
			if (!f)
				break;
			write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
							(bio_size >> SECTOR_SHIFT), wc->seq_count);
			writecache_insert_entry(wc, f);
			wc->uncommitted_blocks++;
		} else {
			struct wc_entry *f;
			struct rb_node *next = rb_next(&e->rb_node);
			if (!next)
				break;
			f = container_of(next, struct wc_entry, rb_node);
			/* The next tree entry must also be the next array slot. */
			if (f != e + 1)
				break;
			if (read_original_sector(wc, f) !=
			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (unlikely(f->write_in_progress))
				break;
			if (writecache_entry_is_committed(wc, f))
				wc->overwrote_committed = true;
			e = f;
		}
		bio_size += wc->block_size;
		current_cache_sec += wc->block_size >> SECTOR_SHIFT;
	}

	bio_set_dev(bio, wc->ssd_dev->bdev);
	bio->bi_iter.bi_sector =
start_cache_sec;
	dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);

	wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
	wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;

	/* Enough uncommitted blocks: queue the flush work, else arm autocommit. */
	if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
		wc->uncommitted_blocks = 0;
		queue_work(wc->writeback_wq, &wc->flush_work);
	} else {
		writecache_schedule_autocommit(wc);
	}
}

/*
 * Map a write bio, one cache block per loop iteration.
 *
 * Returns WC_MAP_ERROR when the cache is in the error state,
 * WC_MAP_REMAP_ORIGIN when the write bypasses the cache, WC_MAP_REMAP when
 * the bio was redirected to the cache device (non-pmem mode) and
 * WC_MAP_SUBMIT once the whole bio has been copied (pmem mode).
 */
static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio)
{
	struct wc_entry *e;

	do {
		bool found_entry = false;
		bool search_used = false;
		if (writecache_has_error(wc)) {
			wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
			return WC_MAP_ERROR;
		}
		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
		if (e) {
			/* An uncommitted entry is overwritten in place. */
			if (!writecache_entry_is_committed(wc, e)) {
				wc->stats.write_hits_uncommitted++;
				search_used = true;
				goto bio_copy;
			}
			wc->stats.write_hits_committed++;
			/* Non-pmem mode: a committed entry not under writeback is reused too. */
			if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
				wc->overwrote_committed = true;
				search_used = true;
				goto bio_copy;
			}
			found_entry = true;
		} else {
			if (unlikely(wc->cleaner) ||
			    (wc->metadata_only && !(bio->bi_opf & REQ_META)))
				goto direct_write;
		}
		e = writecache_pop_from_freelist(wc, (sector_t)-1);
		if (unlikely(!e)) {
			if (!WC_MODE_PMEM(wc) && !found_entry) {
direct_write:
				/* Write around the cache, stopping at the next cached entry. */
				e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
				writecache_map_remap_origin(wc, bio, e);
				wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits;
				wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
				return WC_MAP_REMAP_ORIGIN;
			}
			/* No free block: wait on the freelist and retry this block. */
			wc->stats.writes_blocked_on_freelist++;
			writecache_wait_on_freelist(wc);
			continue;
		}
		write_original_sector_seq_count(wc, e,
bio->bi_iter.bi_sector, wc->seq_count); 1468 writecache_insert_entry(wc, e); 1469 wc->uncommitted_blocks++; 1470 wc->stats.writes_allocate++; 1471 bio_copy: 1472 if (WC_MODE_PMEM(wc)) { 1473 bio_copy_block(wc, bio, memory_data(wc, e)); 1474 wc->stats.writes++; 1475 } else { 1476 writecache_bio_copy_ssd(wc, bio, e, search_used); 1477 return WC_MAP_REMAP; 1478 } 1479 } while (bio->bi_iter.bi_size); 1480 1481 if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks)) 1482 writecache_flush(wc); 1483 else 1484 writecache_schedule_autocommit(wc); 1485 1486 return WC_MAP_SUBMIT; 1487 } 1488 1489 static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio) 1490 { 1491 if (writecache_has_error(wc)) 1492 return WC_MAP_ERROR; 1493 1494 if (WC_MODE_PMEM(wc)) { 1495 wc->stats.flushes++; 1496 writecache_flush(wc); 1497 if (writecache_has_error(wc)) 1498 return WC_MAP_ERROR; 1499 else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only)) 1500 return WC_MAP_REMAP_ORIGIN; 1501 return WC_MAP_SUBMIT; 1502 } 1503 /* SSD: */ 1504 if (dm_bio_get_target_bio_nr(bio)) 1505 return WC_MAP_REMAP_ORIGIN; 1506 wc->stats.flushes++; 1507 writecache_offload_bio(wc, bio); 1508 return WC_MAP_RETURN; 1509 } 1510 1511 static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio) 1512 { 1513 wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits; 1514 1515 if (writecache_has_error(wc)) 1516 return WC_MAP_ERROR; 1517 1518 if (WC_MODE_PMEM(wc)) { 1519 writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio)); 1520 return WC_MAP_REMAP_ORIGIN; 1521 } 1522 /* SSD: */ 1523 writecache_offload_bio(wc, bio); 1524 return WC_MAP_RETURN; 1525 } 1526 1527 static int writecache_map(struct dm_target *ti, struct bio *bio) 1528 { 1529 struct dm_writecache *wc = ti->private; 1530 enum wc_map_op map_op; 1531 1532 bio->bi_private = NULL; 1533 1534 wc_lock(wc); 1535 1536 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { 1537 
map_op = writecache_map_flush(wc, bio);
		goto done;
	}

	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);

	/* Both the start sector and the length must be block-size aligned. */
	if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
				(wc->block_size / 512 - 1)) != 0)) {
		DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
		      (unsigned long long)bio->bi_iter.bi_sector,
		      bio->bi_iter.bi_size, wc->block_size);
		map_op = WC_MAP_ERROR;
		goto done;
	}

	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
		map_op = writecache_map_discard(wc, bio);
		goto done;
	}

	if (bio_data_dir(bio) == READ)
		map_op = writecache_map_read(wc, bio);
	else
		map_op = writecache_map_write(wc, bio);
done:
	/* Every case below drops the lock taken at the top of the function. */
	switch (map_op) {
	case WC_MAP_REMAP_ORIGIN:
		if (likely(wc->pause != 0)) {
			if (bio_op(bio) == REQ_OP_WRITE) {
				dm_iot_io_begin(&wc->iot, 1);
				/* tag 2: writecache_end_io calls dm_iot_io_end */
				bio->bi_private = (void *)2;
			}
		}
		bio_set_dev(bio, wc->dev->bdev);
		wc_unlock(wc);
		return DM_MAPIO_REMAPPED;

	case WC_MAP_REMAP:
		/* make sure that writecache_end_io decrements bio_in_progress: */
		bio->bi_private = (void *)1;
		atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
		wc_unlock(wc);
		return DM_MAPIO_REMAPPED;

	case WC_MAP_SUBMIT:
		wc_unlock(wc);
		bio_endio(bio);
		return DM_MAPIO_SUBMITTED;

	case WC_MAP_RETURN:
		wc_unlock(wc);
		return DM_MAPIO_SUBMITTED;

	case WC_MAP_ERROR:
		wc_unlock(wc);
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;

	default:
		BUG();
		wc_unlock(wc);
		return DM_MAPIO_KILL;
	}
}

/*
 * End-io hook: undo the accounting selected by the bi_private tag.
 * Tag 1 drops the bio_in_progress counter for the bio's direction and wakes
 * waiters when it reaches zero; tag 2 ends the I/O tracker accounting.
 * Always returns 0.
 */
static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
{
	struct dm_writecache *wc = ti->private;

	if (bio->bi_private == (void *)1) {
		int dir = bio_data_dir(bio);
		if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
			if
(unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir]))) 1610 wake_up(&wc->bio_in_progress_wait[dir]); 1611 } else if (bio->bi_private == (void *)2) { 1612 dm_iot_io_end(&wc->iot, 1); 1613 } 1614 return 0; 1615 } 1616 1617 static int writecache_iterate_devices(struct dm_target *ti, 1618 iterate_devices_callout_fn fn, void *data) 1619 { 1620 struct dm_writecache *wc = ti->private; 1621 1622 return fn(ti, wc->dev, 0, ti->len, data); 1623 } 1624 1625 static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits) 1626 { 1627 struct dm_writecache *wc = ti->private; 1628 1629 if (limits->logical_block_size < wc->block_size) 1630 limits->logical_block_size = wc->block_size; 1631 1632 if (limits->physical_block_size < wc->block_size) 1633 limits->physical_block_size = wc->block_size; 1634 1635 if (limits->io_min < wc->block_size) 1636 limits->io_min = wc->block_size; 1637 } 1638 1639 1640 static void writecache_writeback_endio(struct bio *bio) 1641 { 1642 struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio); 1643 struct dm_writecache *wc = wb->wc; 1644 unsigned long flags; 1645 1646 raw_spin_lock_irqsave(&wc->endio_list_lock, flags); 1647 if (unlikely(list_empty(&wc->endio_list))) 1648 wake_up_process(wc->endio_thread); 1649 list_add_tail(&wb->endio_entry, &wc->endio_list); 1650 raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags); 1651 } 1652 1653 static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr) 1654 { 1655 struct copy_struct *c = ptr; 1656 struct dm_writecache *wc = c->wc; 1657 1658 c->error = likely(!(read_err | write_err)) ? 
0 : -EIO; 1659 1660 raw_spin_lock_irq(&wc->endio_list_lock); 1661 if (unlikely(list_empty(&wc->endio_list))) 1662 wake_up_process(wc->endio_thread); 1663 list_add_tail(&c->endio_entry, &wc->endio_list); 1664 raw_spin_unlock_irq(&wc->endio_list_lock); 1665 } 1666 1667 static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list) 1668 { 1669 unsigned i; 1670 struct writeback_struct *wb; 1671 struct wc_entry *e; 1672 unsigned long n_walked = 0; 1673 1674 do { 1675 wb = list_entry(list->next, struct writeback_struct, endio_entry); 1676 list_del(&wb->endio_entry); 1677 1678 if (unlikely(wb->bio.bi_status != BLK_STS_OK)) 1679 writecache_error(wc, blk_status_to_errno(wb->bio.bi_status), 1680 "write error %d", wb->bio.bi_status); 1681 i = 0; 1682 do { 1683 e = wb->wc_list[i]; 1684 BUG_ON(!e->write_in_progress); 1685 e->write_in_progress = false; 1686 INIT_LIST_HEAD(&e->lru); 1687 if (!writecache_has_error(wc)) 1688 writecache_free_entry(wc, e); 1689 BUG_ON(!wc->writeback_size); 1690 wc->writeback_size--; 1691 n_walked++; 1692 if (unlikely(n_walked >= ENDIO_LATENCY)) { 1693 writecache_commit_flushed(wc, false); 1694 wc_unlock(wc); 1695 wc_lock(wc); 1696 n_walked = 0; 1697 } 1698 } while (++i < wb->wc_list_n); 1699 1700 if (wb->wc_list != wb->wc_list_inline) 1701 kfree(wb->wc_list); 1702 bio_put(&wb->bio); 1703 } while (!list_empty(list)); 1704 } 1705 1706 static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list) 1707 { 1708 struct copy_struct *c; 1709 struct wc_entry *e; 1710 1711 do { 1712 c = list_entry(list->next, struct copy_struct, endio_entry); 1713 list_del(&c->endio_entry); 1714 1715 if (unlikely(c->error)) 1716 writecache_error(wc, c->error, "copy error"); 1717 1718 e = c->e; 1719 do { 1720 BUG_ON(!e->write_in_progress); 1721 e->write_in_progress = false; 1722 INIT_LIST_HEAD(&e->lru); 1723 if (!writecache_has_error(wc)) 1724 writecache_free_entry(wc, e); 1725 1726 BUG_ON(!wc->writeback_size); 1727 
wc->writeback_size--; 1728 e++; 1729 } while (--c->n_entries); 1730 mempool_free(c, &wc->copy_pool); 1731 } while (!list_empty(list)); 1732 } 1733 1734 static int writecache_endio_thread(void *data) 1735 { 1736 struct dm_writecache *wc = data; 1737 1738 while (1) { 1739 struct list_head list; 1740 1741 raw_spin_lock_irq(&wc->endio_list_lock); 1742 if (!list_empty(&wc->endio_list)) 1743 goto pop_from_list; 1744 set_current_state(TASK_INTERRUPTIBLE); 1745 raw_spin_unlock_irq(&wc->endio_list_lock); 1746 1747 if (unlikely(kthread_should_stop())) { 1748 set_current_state(TASK_RUNNING); 1749 break; 1750 } 1751 1752 schedule(); 1753 1754 continue; 1755 1756 pop_from_list: 1757 list = wc->endio_list; 1758 list.next->prev = list.prev->next = &list; 1759 INIT_LIST_HEAD(&wc->endio_list); 1760 raw_spin_unlock_irq(&wc->endio_list_lock); 1761 1762 if (!WC_MODE_FUA(wc)) 1763 writecache_disk_flush(wc, wc->dev); 1764 1765 wc_lock(wc); 1766 1767 if (WC_MODE_PMEM(wc)) { 1768 __writecache_endio_pmem(wc, &list); 1769 } else { 1770 __writecache_endio_ssd(wc, &list); 1771 writecache_wait_for_ios(wc, READ); 1772 } 1773 1774 writecache_commit_flushed(wc, false); 1775 1776 wc_unlock(wc); 1777 } 1778 1779 return 0; 1780 } 1781 1782 static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e) 1783 { 1784 struct dm_writecache *wc = wb->wc; 1785 unsigned block_size = wc->block_size; 1786 void *address = memory_data(wc, e); 1787 1788 persistent_memory_flush_cache(address, block_size); 1789 1790 if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors)) 1791 return true; 1792 1793 return bio_add_page(&wb->bio, persistent_memory_page(address), 1794 block_size, persistent_memory_page_offset(address)) != 0; 1795 } 1796 1797 struct writeback_list { 1798 struct list_head list; 1799 size_t size; 1800 }; 1801 1802 static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl) 1803 { 1804 if (unlikely(wc->max_writeback_jobs)) { 1805 if 
(READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
			/* Re-check under the lock and wait until the count drops. */
			wc_lock(wc);
			while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
				writecache_wait_on_freelist(wc);
			wc_unlock(wc);
		}
	}
	cond_resched();
}

/*
 * Submit writeback bios for the entries on @wbl (pmem mode).
 *
 * Entries are taken from the tail of the list. Each bio starts with one
 * entry and is extended with following list entries while they cover the next
 * original sector and wc_add_block() accepts them, up to max_pages entries.
 */
static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
{
	struct wc_entry *e, *f;
	struct bio *bio;
	struct writeback_struct *wb;
	unsigned max_pages;

	while (wbl->size) {
		wbl->size--;
		e = container_of(wbl->list.prev, struct wc_entry, lru);
		list_del(&e->lru);

		max_pages = e->wc_list_contiguous;

		bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE,
				       GFP_NOIO, &wc->bio_set);
		wb = container_of(bio, struct writeback_struct, bio);
		wb->wc = wc;
		bio->bi_end_io = writecache_writeback_endio;
		bio->bi_iter.bi_sector = read_original_sector(wc, e);
		/*
		 * Use the inline entry array when it is large enough or when the
		 * allocation of a bigger one fails; in both cases the bio is
		 * limited to WB_LIST_INLINE entries.
		 */
		if (max_pages <= WB_LIST_INLINE ||
		    unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
							   GFP_NOIO | __GFP_NORETRY |
							   __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
			wb->wc_list = wb->wc_list_inline;
			max_pages = WB_LIST_INLINE;
		}

		BUG_ON(!wc_add_block(wb, e));

		wb->wc_list[0] = e;
		wb->wc_list_n = 1;

		while (wbl->size && wb->wc_list_n < max_pages) {
			f = container_of(wbl->list.prev, struct wc_entry, lru);
			if (read_original_sector(wc, f) !=
			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (!wc_add_block(wb, f))
				break;
			wbl->size--;
			list_del(&f->lru);
			wb->wc_list[wb->wc_list_n++] = f;
			e = f;
		}
		if (WC_MODE_FUA(wc))
			bio->bi_opf |= REQ_FUA;
		/* Error state or an empty bio: complete it here instead of submitting. */
		if (writecache_has_error(wc)) {
			bio->bi_status = BLK_STS_IOERR;
			bio_endio(bio);
		} else if (unlikely(!bio_sectors(bio))) {
			bio->bi_status = BLK_STS_OK;
			bio_endio(bio);
		} else {
			submit_bio(bio);
		}
1871 1872 __writeback_throttle(wc, wbl); 1873 } 1874 } 1875 1876 static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl) 1877 { 1878 struct wc_entry *e, *f; 1879 struct dm_io_region from, to; 1880 struct copy_struct *c; 1881 1882 while (wbl->size) { 1883 unsigned n_sectors; 1884 1885 wbl->size--; 1886 e = container_of(wbl->list.prev, struct wc_entry, lru); 1887 list_del(&e->lru); 1888 1889 n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT); 1890 1891 from.bdev = wc->ssd_dev->bdev; 1892 from.sector = cache_sector(wc, e); 1893 from.count = n_sectors; 1894 to.bdev = wc->dev->bdev; 1895 to.sector = read_original_sector(wc, e); 1896 to.count = n_sectors; 1897 1898 c = mempool_alloc(&wc->copy_pool, GFP_NOIO); 1899 c->wc = wc; 1900 c->e = e; 1901 c->n_entries = e->wc_list_contiguous; 1902 1903 while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) { 1904 wbl->size--; 1905 f = container_of(wbl->list.prev, struct wc_entry, lru); 1906 BUG_ON(f != e + 1); 1907 list_del(&f->lru); 1908 e = f; 1909 } 1910 1911 if (unlikely(to.sector + to.count > wc->data_device_sectors)) { 1912 if (to.sector >= wc->data_device_sectors) { 1913 writecache_copy_endio(0, 0, c); 1914 continue; 1915 } 1916 from.count = to.count = wc->data_device_sectors - to.sector; 1917 } 1918 1919 dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c); 1920 1921 __writeback_throttle(wc, wbl); 1922 } 1923 } 1924 1925 static void writecache_writeback(struct work_struct *work) 1926 { 1927 struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); 1928 struct blk_plug plug; 1929 struct wc_entry *f, *g, *e = NULL; 1930 struct rb_node *node, *next_node; 1931 struct list_head skipped; 1932 struct writeback_list wbl; 1933 unsigned long n_walked; 1934 1935 if (!WC_MODE_PMEM(wc)) { 1936 /* Wait for any active kcopyd work on behalf of ssd writeback */ 1937 dm_kcopyd_client_flush(wc->dm_kcopyd); 1938 } 1939 1940 if 
(likely(wc->pause != 0)) {
		/*
		 * Hold writeback back until dm_iot_idle_time() reports at least
		 * wc->pause, sleeping at most HZ jiffies per round. Cleaner
		 * mode, writeback_all and a suspended target skip the wait.
		 * NOTE(review): presumably the idle time is in jiffies - confirm.
		 */
		while (1) {
			unsigned long idle;
			if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
			    unlikely(dm_suspended(wc->ti)))
				break;
			idle = dm_iot_idle_time(&wc->iot);
			if (idle >= wc->pause)
				break;
			idle = wc->pause - idle;
			if (idle > HZ)
				idle = HZ;
			schedule_timeout_idle(idle);
		}
	}

	wc_lock(wc);
restart:
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return;
	}

	if (unlikely(wc->writeback_all)) {
		if (writecache_wait_for_writeback(wc))
			goto restart;
	}

	if (wc->overwrote_committed) {
		writecache_wait_for_ios(wc, WRITE);
	}

	n_walked = 0;
	INIT_LIST_HEAD(&skipped);
	INIT_LIST_HEAD(&wbl.list);
	wbl.size = 0;
	/*
	 * Keep selecting entries while the LRU is non-empty and either
	 * everything must be written back, free plus in-writeback blocks are
	 * at or below the low watermark, or the entry at the LRU tail is old
	 * enough.
	 */
	while (!list_empty(&wc->lru) &&
	       (wc->writeback_all ||
		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
		(jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
		 wc->max_age - wc->max_age / MAX_AGE_DIV))) {

		n_walked++;
		/* Bound the work done per run; requeue unless suspended. */
		if (unlikely(n_walked > WRITEBACK_LATENCY) &&
		    likely(!wc->writeback_all)) {
			if (likely(!dm_suspended(wc->ti)))
				queue_work(wc->writeback_wq, &wc->writeback_work);
			break;
		}

		/* writeback_all walks the tree in order; otherwise take the LRU tail. */
		if (unlikely(wc->writeback_all)) {
			if (unlikely(!e)) {
				writecache_flush(wc);
				e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
			} else
				e = g;
		} else
			e = container_of(wc->lru.prev, struct wc_entry, lru);
		BUG_ON(e->write_in_progress);
		if (unlikely(!writecache_entry_is_committed(wc, e))) {
			writecache_flush(wc);
		}
		node = rb_prev(&e->rb_node);
		if (node) {
			f = container_of(node, struct wc_entry, rb_node);
			/*
			 * The previous tree entry maps the same sector and is
			 * being written back: set this entry aside for now.
			 */
			if (unlikely(read_original_sector(wc, f) ==
				     read_original_sector(wc, e))) {
				BUG_ON(!f->write_in_progress);
				list_move(&e->lru, &skipped);
				cond_resched();
				continue;
			}
		}
wc->writeback_size++;
		list_move(&e->lru, &wbl.list);
		wbl.size++;
		e->write_in_progress = true;
		e->wc_list_contiguous = 1;

		f = e;

		/* Chain following tree entries that continue the run started by e. */
		while (1) {
			next_node = rb_next(&f->rb_node);
			if (unlikely(!next_node))
				break;
			g = container_of(next_node, struct wc_entry, rb_node);
			/* Another entry for the same sector: step over it. */
			if (unlikely(read_original_sector(wc, g) ==
			    read_original_sector(wc, f))) {
				f = g;
				continue;
			}
			if (read_original_sector(wc, g) !=
			    read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (unlikely(g->write_in_progress))
				break;
			if (unlikely(!writecache_entry_is_committed(wc, g)))
				break;

			/* Non-pmem mode also requires adjacent slots in the entry array. */
			if (!WC_MODE_PMEM(wc)) {
				if (g != f + 1)
					break;
			}

			/*
			 * Chained entries count towards n_walked, but the
			 * WRITEBACK_LATENCY cutoff is only checked at the top
			 * of the outer loop, never in the middle of a run.
			 */
			n_walked++;

			wc->writeback_size++;
			list_move(&g->lru, &wbl.list);
			wbl.size++;
			g->write_in_progress = true;
			g->wc_list_contiguous = BIO_MAX_VECS;
			f = g;
			e->wc_list_contiguous++;
			/* Run length limit reached. */
			if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
				if (unlikely(wc->writeback_all)) {
					/* Leave g at the entry the outer loop continues from. */
					next_node = rb_next(&f->rb_node);
					if (likely(next_node))
						g = container_of(next_node, struct wc_entry, rb_node);
				}
				break;
			}
		}
		cond_resched();
	}

	if (!list_empty(&skipped)) {
		list_splice_tail(&skipped, &wc->lru);
		/*
		 * If we didn't do any progress, we must wait until some
		 * writeback finishes to avoid burning CPU in a loop
		 */
		if (unlikely(!wbl.size))
			writecache_wait_for_writeback(wc);
	}

	wc_unlock(wc);

	/* The selected entries are submitted without holding the lock. */
	blk_start_plug(&plug);

	if (WC_MODE_PMEM(wc))
		__writecache_writeback_pmem(wc, &wbl);
	else
		__writecache_writeback_ssd(wc, &wbl);

	blk_finish_plug(&plug);

	if (unlikely(wc->writeback_all)) {
		wc_lock(wc);
		while (writecache_wait_for_writeback(wc));
2091 wc_unlock(wc); 2092 } 2093 } 2094 2095 static int calculate_memory_size(uint64_t device_size, unsigned block_size, 2096 size_t *n_blocks_p, size_t *n_metadata_blocks_p) 2097 { 2098 uint64_t n_blocks, offset; 2099 struct wc_entry e; 2100 2101 n_blocks = device_size; 2102 do_div(n_blocks, block_size + sizeof(struct wc_memory_entry)); 2103 2104 while (1) { 2105 if (!n_blocks) 2106 return -ENOSPC; 2107 /* Verify the following entries[n_blocks] won't overflow */ 2108 if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) / 2109 sizeof(struct wc_memory_entry))) 2110 return -EFBIG; 2111 offset = offsetof(struct wc_memory_superblock, entries[n_blocks]); 2112 offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1); 2113 if (offset + n_blocks * block_size <= device_size) 2114 break; 2115 n_blocks--; 2116 } 2117 2118 /* check if the bit field overflows */ 2119 e.index = n_blocks; 2120 if (e.index != n_blocks) 2121 return -EFBIG; 2122 2123 if (n_blocks_p) 2124 *n_blocks_p = n_blocks; 2125 if (n_metadata_blocks_p) 2126 *n_metadata_blocks_p = offset >> __ffs(block_size); 2127 return 0; 2128 } 2129 2130 static int init_memory(struct dm_writecache *wc) 2131 { 2132 size_t b; 2133 int r; 2134 2135 r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL); 2136 if (r) 2137 return r; 2138 2139 r = writecache_alloc_entries(wc); 2140 if (r) 2141 return r; 2142 2143 for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++) 2144 pmem_assign(sb(wc)->padding[b], cpu_to_le64(0)); 2145 pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION)); 2146 pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size)); 2147 pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks)); 2148 pmem_assign(sb(wc)->seq_count, cpu_to_le64(0)); 2149 2150 for (b = 0; b < wc->n_blocks; b++) { 2151 write_original_sector_seq_count(wc, &wc->entries[b], -1, -1); 2152 cond_resched(); 2153 } 2154 2155 writecache_flush_all_metadata(wc); 2156 writecache_commit_flushed(wc, 
false);
	/* The magic is written and flushed only after everything else was committed. */
	pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
	writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
	writecache_commit_flushed(wc, false);

	return 0;
}

/*
 * Target destructor: stop the helper threads and release everything hanging
 * off the dm_writecache structure, then the structure itself.
 *
 * Does nothing when ti->private is NULL. Resources held through pointers
 * that may not have been set up are checked before they are released.
 */
static void writecache_dtr(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;

	if (!wc)
		return;

	if (wc->endio_thread)
		kthread_stop(wc->endio_thread);

	if (wc->flush_thread)
		kthread_stop(wc->flush_thread);

	bioset_exit(&wc->bio_set);

	mempool_exit(&wc->copy_pool);

	if (wc->writeback_wq)
		destroy_workqueue(wc->writeback_wq);

	if (wc->dev)
		dm_put_device(ti, wc->dev);

	if (wc->ssd_dev)
		dm_put_device(ti, wc->ssd_dev);

	vfree(wc->entries);

	/* The memory map is released differently in pmem and non-pmem mode. */
	if (wc->memory_map) {
		if (WC_MODE_PMEM(wc))
			persistent_memory_release(wc);
		else
			vfree(wc->memory_map);
	}

	if (wc->dm_kcopyd)
		dm_kcopyd_client_destroy(wc->dm_kcopyd);

	if (wc->dm_io)
		dm_io_client_destroy(wc->dm_io);

	vfree(wc->dirty_bitmap);

	kfree(wc);
}

static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	struct dm_writecache *wc;
	struct dm_arg_set as;
	const char *string;
	unsigned opt_params;
	size_t offset, data_size;
	int i, r;
	char dummy;
	int high_wm_percent = HIGH_WATERMARK;
	int low_wm_percent = LOW_WATERMARK;
	uint64_t x;
	struct wc_memory_superblock s;

	static struct dm_arg _args[] = {
		{0, 18, "Invalid number of feature args"},
	};

	as.argc = argc;
	as.argv = argv;

	wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
	if (!wc) {
		ti->error = "Cannot allocate writecache structure";
		r = -ENOMEM;
		goto bad;
	}
	ti->private = wc;
	wc->ti = ti;

	mutex_init(&wc->lock);
	wc->max_age = MAX_AGE_UNSPECIFIED;
writecache_poison_lists(wc); 2243 init_waitqueue_head(&wc->freelist_wait); 2244 timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0); 2245 timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0); 2246 2247 for (i = 0; i < 2; i++) { 2248 atomic_set(&wc->bio_in_progress[i], 0); 2249 init_waitqueue_head(&wc->bio_in_progress_wait[i]); 2250 } 2251 2252 wc->dm_io = dm_io_client_create(); 2253 if (IS_ERR(wc->dm_io)) { 2254 r = PTR_ERR(wc->dm_io); 2255 ti->error = "Unable to allocate dm-io client"; 2256 wc->dm_io = NULL; 2257 goto bad; 2258 } 2259 2260 wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1); 2261 if (!wc->writeback_wq) { 2262 r = -ENOMEM; 2263 ti->error = "Could not allocate writeback workqueue"; 2264 goto bad; 2265 } 2266 INIT_WORK(&wc->writeback_work, writecache_writeback); 2267 INIT_WORK(&wc->flush_work, writecache_flush_work); 2268 2269 dm_iot_init(&wc->iot); 2270 2271 raw_spin_lock_init(&wc->endio_list_lock); 2272 INIT_LIST_HEAD(&wc->endio_list); 2273 wc->endio_thread = kthread_run(writecache_endio_thread, wc, "writecache_endio"); 2274 if (IS_ERR(wc->endio_thread)) { 2275 r = PTR_ERR(wc->endio_thread); 2276 wc->endio_thread = NULL; 2277 ti->error = "Couldn't spawn endio thread"; 2278 goto bad; 2279 } 2280 2281 /* 2282 * Parse the mode (pmem or ssd) 2283 */ 2284 string = dm_shift_arg(&as); 2285 if (!string) 2286 goto bad_arguments; 2287 2288 if (!strcasecmp(string, "s")) { 2289 wc->pmem_mode = false; 2290 } else if (!strcasecmp(string, "p")) { 2291 #ifdef DM_WRITECACHE_HAS_PMEM 2292 wc->pmem_mode = true; 2293 wc->writeback_fua = true; 2294 #else 2295 /* 2296 * If the architecture doesn't support persistent memory or 2297 * the kernel doesn't support any DAX drivers, this driver can 2298 * only be used in SSD-only mode. 
2299 */ 2300 r = -EOPNOTSUPP; 2301 ti->error = "Persistent memory or DAX not supported on this system"; 2302 goto bad; 2303 #endif 2304 } else { 2305 goto bad_arguments; 2306 } 2307 2308 if (WC_MODE_PMEM(wc)) { 2309 r = bioset_init(&wc->bio_set, BIO_POOL_SIZE, 2310 offsetof(struct writeback_struct, bio), 2311 BIOSET_NEED_BVECS); 2312 if (r) { 2313 ti->error = "Could not allocate bio set"; 2314 goto bad; 2315 } 2316 } else { 2317 wc->pause = PAUSE_WRITEBACK; 2318 r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct)); 2319 if (r) { 2320 ti->error = "Could not allocate mempool"; 2321 goto bad; 2322 } 2323 } 2324 2325 /* 2326 * Parse the origin data device 2327 */ 2328 string = dm_shift_arg(&as); 2329 if (!string) 2330 goto bad_arguments; 2331 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev); 2332 if (r) { 2333 ti->error = "Origin data device lookup failed"; 2334 goto bad; 2335 } 2336 2337 /* 2338 * Parse cache data device (be it pmem or ssd) 2339 */ 2340 string = dm_shift_arg(&as); 2341 if (!string) 2342 goto bad_arguments; 2343 2344 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev); 2345 if (r) { 2346 ti->error = "Cache data device lookup failed"; 2347 goto bad; 2348 } 2349 wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev); 2350 2351 /* 2352 * Parse the cache block size 2353 */ 2354 string = dm_shift_arg(&as); 2355 if (!string) 2356 goto bad_arguments; 2357 if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 || 2358 wc->block_size < 512 || wc->block_size > PAGE_SIZE || 2359 (wc->block_size & (wc->block_size - 1))) { 2360 r = -EINVAL; 2361 ti->error = "Invalid block size"; 2362 goto bad; 2363 } 2364 if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) || 2365 wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) { 2366 r = -EINVAL; 2367 ti->error = "Block size is smaller than device logical block size"; 2368 goto bad; 2369 } 2370 wc->block_size_bits = __ffs(wc->block_size); 
2371 2372 wc->max_writeback_jobs = MAX_WRITEBACK_JOBS; 2373 wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM; 2374 wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC); 2375 2376 /* 2377 * Parse optional arguments 2378 */ 2379 r = dm_read_arg_group(_args, &as, &opt_params, &ti->error); 2380 if (r) 2381 goto bad; 2382 2383 while (opt_params) { 2384 string = dm_shift_arg(&as), opt_params--; 2385 if (!strcasecmp(string, "start_sector") && opt_params >= 1) { 2386 unsigned long long start_sector; 2387 string = dm_shift_arg(&as), opt_params--; 2388 if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1) 2389 goto invalid_optional; 2390 wc->start_sector = start_sector; 2391 wc->start_sector_set = true; 2392 if (wc->start_sector != start_sector || 2393 wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT) 2394 goto invalid_optional; 2395 } else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) { 2396 string = dm_shift_arg(&as), opt_params--; 2397 if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1) 2398 goto invalid_optional; 2399 if (high_wm_percent < 0 || high_wm_percent > 100) 2400 goto invalid_optional; 2401 wc->high_wm_percent_value = high_wm_percent; 2402 wc->high_wm_percent_set = true; 2403 } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) { 2404 string = dm_shift_arg(&as), opt_params--; 2405 if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1) 2406 goto invalid_optional; 2407 if (low_wm_percent < 0 || low_wm_percent > 100) 2408 goto invalid_optional; 2409 wc->low_wm_percent_value = low_wm_percent; 2410 wc->low_wm_percent_set = true; 2411 } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) { 2412 string = dm_shift_arg(&as), opt_params--; 2413 if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1) 2414 goto invalid_optional; 2415 wc->max_writeback_jobs_set = true; 2416 } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) { 2417 
string = dm_shift_arg(&as), opt_params--; 2418 if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1) 2419 goto invalid_optional; 2420 wc->autocommit_blocks_set = true; 2421 } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) { 2422 unsigned autocommit_msecs; 2423 string = dm_shift_arg(&as), opt_params--; 2424 if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1) 2425 goto invalid_optional; 2426 if (autocommit_msecs > 3600000) 2427 goto invalid_optional; 2428 wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs); 2429 wc->autocommit_time_value = autocommit_msecs; 2430 wc->autocommit_time_set = true; 2431 } else if (!strcasecmp(string, "max_age") && opt_params >= 1) { 2432 unsigned max_age_msecs; 2433 string = dm_shift_arg(&as), opt_params--; 2434 if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1) 2435 goto invalid_optional; 2436 if (max_age_msecs > 86400000) 2437 goto invalid_optional; 2438 wc->max_age = msecs_to_jiffies(max_age_msecs); 2439 wc->max_age_set = true; 2440 wc->max_age_value = max_age_msecs; 2441 } else if (!strcasecmp(string, "cleaner")) { 2442 wc->cleaner_set = true; 2443 wc->cleaner = true; 2444 } else if (!strcasecmp(string, "fua")) { 2445 if (WC_MODE_PMEM(wc)) { 2446 wc->writeback_fua = true; 2447 wc->writeback_fua_set = true; 2448 } else goto invalid_optional; 2449 } else if (!strcasecmp(string, "nofua")) { 2450 if (WC_MODE_PMEM(wc)) { 2451 wc->writeback_fua = false; 2452 wc->writeback_fua_set = true; 2453 } else goto invalid_optional; 2454 } else if (!strcasecmp(string, "metadata_only")) { 2455 wc->metadata_only = true; 2456 } else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) { 2457 unsigned pause_msecs; 2458 if (WC_MODE_PMEM(wc)) 2459 goto invalid_optional; 2460 string = dm_shift_arg(&as), opt_params--; 2461 if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1) 2462 goto invalid_optional; 2463 if (pause_msecs > 60000) 2464 goto invalid_optional; 2465 wc->pause = 
msecs_to_jiffies(pause_msecs); 2466 wc->pause_set = true; 2467 wc->pause_value = pause_msecs; 2468 } else { 2469 invalid_optional: 2470 r = -EINVAL; 2471 ti->error = "Invalid optional argument"; 2472 goto bad; 2473 } 2474 } 2475 2476 if (high_wm_percent < low_wm_percent) { 2477 r = -EINVAL; 2478 ti->error = "High watermark must be greater than or equal to low watermark"; 2479 goto bad; 2480 } 2481 2482 if (WC_MODE_PMEM(wc)) { 2483 if (!dax_synchronous(wc->ssd_dev->dax_dev)) { 2484 r = -EOPNOTSUPP; 2485 ti->error = "Asynchronous persistent memory not supported as pmem cache"; 2486 goto bad; 2487 } 2488 2489 r = persistent_memory_claim(wc); 2490 if (r) { 2491 ti->error = "Unable to map persistent memory for cache"; 2492 goto bad; 2493 } 2494 } else { 2495 size_t n_blocks, n_metadata_blocks; 2496 uint64_t n_bitmap_bits; 2497 2498 wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT; 2499 2500 bio_list_init(&wc->flush_list); 2501 wc->flush_thread = kthread_run(writecache_flush_thread, wc, "dm_writecache_flush"); 2502 if (IS_ERR(wc->flush_thread)) { 2503 r = PTR_ERR(wc->flush_thread); 2504 wc->flush_thread = NULL; 2505 ti->error = "Couldn't spawn flush thread"; 2506 goto bad; 2507 } 2508 2509 r = calculate_memory_size(wc->memory_map_size, wc->block_size, 2510 &n_blocks, &n_metadata_blocks); 2511 if (r) { 2512 ti->error = "Invalid device size"; 2513 goto bad; 2514 } 2515 2516 n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) + 2517 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY; 2518 /* this is limitation of test_bit functions */ 2519 if (n_bitmap_bits > 1U << 31) { 2520 r = -EFBIG; 2521 ti->error = "Invalid device size"; 2522 goto bad; 2523 } 2524 2525 wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits); 2526 if (!wc->memory_map) { 2527 r = -ENOMEM; 2528 ti->error = "Unable to allocate memory for metadata"; 2529 goto bad; 2530 } 2531 2532 wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2533 if 
(IS_ERR(wc->dm_kcopyd)) { 2534 r = PTR_ERR(wc->dm_kcopyd); 2535 ti->error = "Unable to allocate dm-kcopyd client"; 2536 wc->dm_kcopyd = NULL; 2537 goto bad; 2538 } 2539 2540 wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT); 2541 wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) / 2542 BITS_PER_LONG * sizeof(unsigned long); 2543 wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size); 2544 if (!wc->dirty_bitmap) { 2545 r = -ENOMEM; 2546 ti->error = "Unable to allocate dirty bitmap"; 2547 goto bad; 2548 } 2549 2550 r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT); 2551 if (r) { 2552 ti->error = "Unable to read first block of metadata"; 2553 goto bad; 2554 } 2555 } 2556 2557 r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock)); 2558 if (r) { 2559 ti->error = "Hardware memory error when reading superblock"; 2560 goto bad; 2561 } 2562 if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) { 2563 r = init_memory(wc); 2564 if (r) { 2565 ti->error = "Unable to initialize device"; 2566 goto bad; 2567 } 2568 r = copy_mc_to_kernel(&s, sb(wc), 2569 sizeof(struct wc_memory_superblock)); 2570 if (r) { 2571 ti->error = "Hardware memory error when reading superblock"; 2572 goto bad; 2573 } 2574 } 2575 2576 if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) { 2577 ti->error = "Invalid magic in the superblock"; 2578 r = -EINVAL; 2579 goto bad; 2580 } 2581 2582 if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) { 2583 ti->error = "Invalid version in the superblock"; 2584 r = -EINVAL; 2585 goto bad; 2586 } 2587 2588 if (le32_to_cpu(s.block_size) != wc->block_size) { 2589 ti->error = "Block size does not match superblock"; 2590 r = -EINVAL; 2591 goto bad; 2592 } 2593 2594 wc->n_blocks = le64_to_cpu(s.n_blocks); 2595 2596 offset = wc->n_blocks * sizeof(struct wc_memory_entry); 2597 if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) { 2598 overflow: 2599 ti->error = "Overflow in size 
calculation"; 2600 r = -EINVAL; 2601 goto bad; 2602 } 2603 offset += sizeof(struct wc_memory_superblock); 2604 if (offset < sizeof(struct wc_memory_superblock)) 2605 goto overflow; 2606 offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1); 2607 data_size = wc->n_blocks * (size_t)wc->block_size; 2608 if (!offset || (data_size / wc->block_size != wc->n_blocks) || 2609 (offset + data_size < offset)) 2610 goto overflow; 2611 if (offset + data_size > wc->memory_map_size) { 2612 ti->error = "Memory area is too small"; 2613 r = -EINVAL; 2614 goto bad; 2615 } 2616 2617 wc->metadata_sectors = offset >> SECTOR_SHIFT; 2618 wc->block_start = (char *)sb(wc) + offset; 2619 2620 x = (uint64_t)wc->n_blocks * (100 - high_wm_percent); 2621 x += 50; 2622 do_div(x, 100); 2623 wc->freelist_high_watermark = x; 2624 x = (uint64_t)wc->n_blocks * (100 - low_wm_percent); 2625 x += 50; 2626 do_div(x, 100); 2627 wc->freelist_low_watermark = x; 2628 2629 if (wc->cleaner) 2630 activate_cleaner(wc); 2631 2632 r = writecache_alloc_entries(wc); 2633 if (r) { 2634 ti->error = "Cannot allocate memory"; 2635 goto bad; 2636 } 2637 2638 ti->num_flush_bios = WC_MODE_PMEM(wc) ? 
1 : 2; 2639 ti->flush_supported = true; 2640 ti->num_discard_bios = 1; 2641 2642 if (WC_MODE_PMEM(wc)) 2643 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size); 2644 2645 return 0; 2646 2647 bad_arguments: 2648 r = -EINVAL; 2649 ti->error = "Bad arguments"; 2650 bad: 2651 writecache_dtr(ti); 2652 return r; 2653 } 2654 2655 static void writecache_status(struct dm_target *ti, status_type_t type, 2656 unsigned status_flags, char *result, unsigned maxlen) 2657 { 2658 struct dm_writecache *wc = ti->private; 2659 unsigned extra_args; 2660 unsigned sz = 0; 2661 2662 switch (type) { 2663 case STATUSTYPE_INFO: 2664 DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu", 2665 writecache_has_error(wc), 2666 (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size, 2667 (unsigned long long)wc->writeback_size, 2668 wc->stats.reads, 2669 wc->stats.read_hits, 2670 wc->stats.writes, 2671 wc->stats.write_hits_uncommitted, 2672 wc->stats.write_hits_committed, 2673 wc->stats.writes_around, 2674 wc->stats.writes_allocate, 2675 wc->stats.writes_blocked_on_freelist, 2676 wc->stats.flushes, 2677 wc->stats.discards); 2678 break; 2679 case STATUSTYPE_TABLE: 2680 DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 
'p' : 's', 2681 wc->dev->name, wc->ssd_dev->name, wc->block_size); 2682 extra_args = 0; 2683 if (wc->start_sector_set) 2684 extra_args += 2; 2685 if (wc->high_wm_percent_set) 2686 extra_args += 2; 2687 if (wc->low_wm_percent_set) 2688 extra_args += 2; 2689 if (wc->max_writeback_jobs_set) 2690 extra_args += 2; 2691 if (wc->autocommit_blocks_set) 2692 extra_args += 2; 2693 if (wc->autocommit_time_set) 2694 extra_args += 2; 2695 if (wc->max_age_set) 2696 extra_args += 2; 2697 if (wc->cleaner_set) 2698 extra_args++; 2699 if (wc->writeback_fua_set) 2700 extra_args++; 2701 if (wc->metadata_only) 2702 extra_args++; 2703 if (wc->pause_set) 2704 extra_args += 2; 2705 2706 DMEMIT("%u", extra_args); 2707 if (wc->start_sector_set) 2708 DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector); 2709 if (wc->high_wm_percent_set) 2710 DMEMIT(" high_watermark %u", wc->high_wm_percent_value); 2711 if (wc->low_wm_percent_set) 2712 DMEMIT(" low_watermark %u", wc->low_wm_percent_value); 2713 if (wc->max_writeback_jobs_set) 2714 DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs); 2715 if (wc->autocommit_blocks_set) 2716 DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks); 2717 if (wc->autocommit_time_set) 2718 DMEMIT(" autocommit_time %u", wc->autocommit_time_value); 2719 if (wc->max_age_set) 2720 DMEMIT(" max_age %u", wc->max_age_value); 2721 if (wc->cleaner_set) 2722 DMEMIT(" cleaner"); 2723 if (wc->writeback_fua_set) 2724 DMEMIT(" %sfua", wc->writeback_fua ? 
"" : "no"); 2725 if (wc->metadata_only) 2726 DMEMIT(" metadata_only"); 2727 if (wc->pause_set) 2728 DMEMIT(" pause_writeback %u", wc->pause_value); 2729 break; 2730 case STATUSTYPE_IMA: 2731 *result = '\0'; 2732 break; 2733 } 2734 } 2735 2736 static struct target_type writecache_target = { 2737 .name = "writecache", 2738 .version = {1, 6, 0}, 2739 .module = THIS_MODULE, 2740 .ctr = writecache_ctr, 2741 .dtr = writecache_dtr, 2742 .status = writecache_status, 2743 .postsuspend = writecache_suspend, 2744 .resume = writecache_resume, 2745 .message = writecache_message, 2746 .map = writecache_map, 2747 .end_io = writecache_end_io, 2748 .iterate_devices = writecache_iterate_devices, 2749 .io_hints = writecache_io_hints, 2750 }; 2751 2752 static int __init dm_writecache_init(void) 2753 { 2754 int r; 2755 2756 r = dm_register_target(&writecache_target); 2757 if (r < 0) { 2758 DMERR("register failed %d", r); 2759 return r; 2760 } 2761 2762 return 0; 2763 } 2764 2765 static void __exit dm_writecache_exit(void) 2766 { 2767 dm_unregister_target(&writecache_target); 2768 } 2769 2770 module_init(dm_writecache_init); 2771 module_exit(dm_writecache_exit); 2772 2773 MODULE_DESCRIPTION(DM_NAME " writecache target"); 2774 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>"); 2775 MODULE_LICENSE("GPL"); 2776