// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2018 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/dax.h>
#include <linux/pfn_t.h>
#include <linux/libnvdimm.h>
#include <linux/delay.h>
#include "dm-io-tracker.h"

#define DM_MSG_PREFIX "writecache"

/* Default free-list watermarks, expressed as a percentage of cache blocks. */
#define HIGH_WATERMARK			50
#define LOW_WATERMARK			45
/* Cap on in-flight writeback jobs; bounded by both a byte limit and RAM size. */
#define MAX_WRITEBACK_JOBS		min(0x10000000 / PAGE_SIZE, totalram_pages() / 16)
#define ENDIO_LATENCY			16
#define WRITEBACK_LATENCY		64
/* Default autocommit thresholds (blocks) for SSD-backed vs pmem-backed caches. */
#define AUTOCOMMIT_BLOCKS_SSD		65536
#define AUTOCOMMIT_BLOCKS_PMEM		64
#define AUTOCOMMIT_MSEC			1000
/* The max_age timer fires at max_age / MAX_AGE_DIV intervals. */
#define MAX_AGE_DIV			16
#define MAX_AGE_UNSPECIFIED		-1UL
#define PAUSE_WRITEBACK			(HZ * 3)

/*
 * Granularity of the dirty-metadata bitmap used in SSD mode; never smaller
 * than a page.
 */
#define BITMAP_GRANULARITY	65536
#if BITMAP_GRANULARITY < PAGE_SIZE
#undef BITMAP_GRANULARITY
#define BITMAP_GRANULARITY	PAGE_SIZE
#endif

#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX)
#define DM_WRITECACHE_HAS_PMEM
#endif

#ifdef DM_WRITECACHE_HAS_PMEM
/*
 * Assignment to persistent memory must go through memcpy_flushcache() so the
 * store reaches the persistence domain; without pmem support a plain
 * assignment suffices.
 */
#define pmem_assign(dest, src)					\
do {								\
	typeof(dest) uniq = (src);				\
	memcpy_flushcache(&(dest), &uniq, sizeof(dest));	\
} while (0)
#else
#define pmem_assign(dest, src)	((dest) = (src))
#endif

#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
#endif

#define MEMORY_SUPERBLOCK_MAGIC		0x23489321
#define MEMORY_SUPERBLOCK_VERSION	1

/* On-media per-block metadata entry. */
struct wc_memory_entry {
	__le64 original_sector;
	__le64 seq_count;
};

/* On-media superblock; padded to 64 bytes, followed by the entry array. */
struct wc_memory_superblock {
	union {
		struct {
			__le32 magic;
			__le32 version;
			__le32 block_size;
			__le32 pad;
			__le64 n_blocks;
			__le64 seq_count;
		};
		__le64 padding[8];
	};
	struct wc_memory_entry entries[];
};

/*
 * In-core state for one cache block.  On 64-bit, write_in_progress and index
 * are packed as bitfields into a single word to keep the entry small.
 */
struct wc_entry {
	struct rb_node rb_node;
	struct list_head lru;
	unsigned short wc_list_contiguous;
	bool write_in_progress
#if BITS_PER_LONG == 64
		:1
#endif
	;
	unsigned long index
#if BITS_PER_LONG == 64
		:47
#endif
	;
	unsigned long age;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	/* shadow copies of the pmem metadata, read once with copy_mc_to_kernel */
	uint64_t original_sector;
	uint64_t seq_count;
#endif
};

#ifdef DM_WRITECACHE_HAS_PMEM
#define WC_MODE_PMEM(wc)			((wc)->pmem_mode)
#define WC_MODE_FUA(wc)				((wc)->writeback_fua)
#else
#define WC_MODE_PMEM(wc)			false
#define WC_MODE_FUA(wc)				false
#endif
/* SSD mode keeps the freelist sorted (an rb-tree); pmem mode uses a plain list. */
#define WC_MODE_SORT_FREELIST(wc)	(!WC_MODE_PMEM(wc))

struct dm_writecache {
	struct mutex lock;		/* protects all of the state below */
	struct list_head lru;		/* used entries, most recent first */
	union {
		struct list_head freelist;		/* pmem mode */
		struct {				/* SSD mode: sorted free entries */
			struct rb_root freetree;
			struct wc_entry *current_free;
		};
	};
	struct rb_root tree;		/* used entries keyed by original_sector */

	size_t freelist_size;
	size_t writeback_size;
	size_t freelist_high_watermark;
	size_t freelist_low_watermark;
	unsigned long max_age;
	unsigned long pause;

	unsigned uncommitted_blocks;
	unsigned autocommit_blocks;
	unsigned max_writeback_jobs;

	int error;			/* first error, latched by writecache_error() */

	unsigned long autocommit_jiffies;
	struct timer_list autocommit_timer;
	struct wait_queue_head freelist_wait;

	struct timer_list max_age_timer;

	/* in-flight bio counts, indexed by READ/WRITE */
	atomic_t bio_in_progress[2];
	struct wait_queue_head bio_in_progress_wait[2];

	struct dm_target *ti;
	struct dm_dev *dev;		/* origin device */
	struct dm_dev *ssd_dev;		/* cache device */
	sector_t start_sector;
	void *memory_map;		/* mapped metadata (and data, in pmem mode) */
	uint64_t memory_map_size;
	size_t metadata_sectors;
	size_t n_blocks;
	uint64_t seq_count;
	sector_t data_device_sectors;
	void *block_start;
	struct wc_entry *entries;
	unsigned block_size;
	unsigned char block_size_bits;

	bool pmem_mode:1;
	bool writeback_fua:1;

	bool overwrote_committed:1;
	bool memory_vmapped:1;		/* memory_map was vmap()ed, not direct */

	/* which optional ctr arguments were explicitly set (for status output) */
	bool start_sector_set:1;
	bool high_wm_percent_set:1;
	bool low_wm_percent_set:1;
	bool max_writeback_jobs_set:1;
	bool autocommit_blocks_set:1;
	bool autocommit_time_set:1;
	bool max_age_set:1;
	bool writeback_fua_set:1;
	bool flush_on_suspend:1;
	bool cleaner:1;
	bool cleaner_set:1;
	bool metadata_only:1;
	bool pause_set:1;

	unsigned high_wm_percent_value;
	unsigned low_wm_percent_value;
	unsigned autocommit_time_value;
	unsigned max_age_value;
	unsigned pause_value;

	unsigned writeback_all;
	struct workqueue_struct *writeback_wq;
	struct work_struct writeback_work;
	struct work_struct flush_work;

	struct dm_io_tracker iot;

	struct dm_io_client *dm_io;

	raw_spinlock_t endio_list_lock;
	struct list_head endio_list;
	struct task_struct *endio_thread;

	struct task_struct *flush_thread;
	struct bio_list flush_list;

	struct dm_kcopyd_client *dm_kcopyd;
	unsigned long *dirty_bitmap;	/* dirty metadata regions (SSD mode) */
	unsigned dirty_bitmap_size;

	struct bio_set bio_set;
	mempool_t copy_pool;

	struct {
		unsigned long long reads;
		unsigned long long read_hits;
		unsigned long long writes;
		unsigned long long write_hits_uncommitted;
		unsigned long long write_hits_committed;
		unsigned long long writes_around;
		unsigned long long writes_allocate;
		unsigned long long writes_blocked_on_freelist;
		unsigned long long flushes;
		unsigned long long discards;
	} stats;
};

#define WB_LIST_INLINE		16

/* Per-bio writeback context (pmem mode); small entry lists stay inline. */
struct writeback_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry **wc_list;
	unsigned wc_list_n;
	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
	struct bio bio;
};

/* Per-kcopyd-job writeback context (SSD mode). */
struct copy_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry *e;
	unsigned n_entries;
	int error;
};

/* Module parameter limiting the fraction of time kcopyd may spend copying. */
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
		"A percentage of time allocated for data copying");

/* Acquire/release the main mutex protecting all writecache state. */
static void wc_lock(struct dm_writecache *wc)
{
	mutex_lock(&wc->lock);
}

static void wc_unlock(struct dm_writecache *wc)
{
	mutex_unlock(&wc->lock);
}

#ifdef DM_WRITECACHE_HAS_PMEM
/*
 * Map the persistent-memory cache device into the kernel address space via
 * dax_direct_access().  If the device does not return the whole range as one
 * contiguous mapping, collect the individual pages and vmap() them instead
 * (memory_vmapped is set so release knows to vunmap).  Returns 0 or -errno.
 */
static int persistent_memory_claim(struct dm_writecache *wc)
{
	int r;
	loff_t s;
	long p, da;
	pfn_t pfn;
	int id;
	struct page **pages;
	sector_t offset;

	wc->memory_vmapped = false;

	s = wc->memory_map_size;
	p = s >> PAGE_SHIFT;
	if (!p) {
		r = -EINVAL;
		goto err1;
	}
	if (p != s >> PAGE_SHIFT) {
		/* size in pages does not fit in "long" */
		r = -EOVERFLOW;
		goto err1;
	}

	offset = get_start_sect(wc->ssd_dev->bdev);
	if (offset & (PAGE_SIZE / 512 - 1)) {
		/* partition start must be page-aligned for DAX access */
		r = -EINVAL;
		goto err1;
	}
	offset >>= PAGE_SHIFT - 9;

	id = dax_read_lock();

	da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS,
			&wc->memory_map, &pfn);
	if (da < 0) {
		wc->memory_map = NULL;
		r = da;
		goto err2;
	}
	if (!pfn_t_has_page(pfn)) {
		wc->memory_map = NULL;
		r = -EOPNOTSUPP;
		goto err2;
	}
	if (da != p) {
		/* mapping not contiguous: gather the pages and vmap them */
		long i;
		wc->memory_map = NULL;
		pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
		if (!pages) {
			r = -ENOMEM;
			goto err2;
		}
		i = 0;
		do {
			long daa;
			daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i,
					p - i, DAX_ACCESS, NULL, &pfn);
			if (daa <= 0) {
				r = daa ? daa : -EINVAL;
				goto err3;
			}
			if (!pfn_t_has_page(pfn)) {
				r = -EOPNOTSUPP;
				goto err3;
			}
			while (daa-- && i < p) {
				pages[i++] = pfn_t_to_page(pfn);
				pfn.val++;
				if (!(i & 15))
					cond_resched();
			}
		} while (i < p);
		wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
		if (!wc->memory_map) {
			r = -ENOMEM;
			goto err3;
		}
		kvfree(pages);
		wc->memory_vmapped = true;
	}

	dax_read_unlock(id);

	/* skip past the reserved start area */
	wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
	wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;

	return 0;
err3:
	kvfree(pages);
err2:
	dax_read_unlock(id);
err1:
	return r;
}
#else
static int persistent_memory_claim(struct dm_writecache *wc)
{
	return -EOPNOTSUPP;
}
#endif

/* Undo persistent_memory_claim()'s vmap, if one was made. */
static void persistent_memory_release(struct dm_writecache *wc)
{
	if (wc->memory_vmapped)
		vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
}

static struct page *persistent_memory_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	else
		return virt_to_page(addr);
}

static unsigned persistent_memory_page_offset(void *addr)
{
	return (unsigned long)addr & (PAGE_SIZE - 1);
}

/* Cache maintenance is only needed for vmap()ed (aliased) mappings. */
static void persistent_memory_flush_cache(void *ptr, size_t size)
{
	if (is_vmalloc_addr(ptr))
		flush_kernel_vmap_range(ptr, size);
}

static void persistent_memory_invalidate_cache(void *ptr, size_t size)
{
	if (is_vmalloc_addr(ptr))
		invalidate_kernel_vmap_range(ptr, size);
}

/* The on-media superblock is at the start of the mapped metadata. */
static struct wc_memory_superblock *sb(struct dm_writecache *wc)
{
	return wc->memory_map;
}

/* On-media metadata entry corresponding to in-core entry e. */
static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	return &sb(wc)->entries[e->index];
}

/* Address of e's cached data block inside the mapped region (pmem mode). */
static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
{
	return (char *)wc->block_start + (e->index << wc->block_size_bits);
}

/* Sector of e's data block on the cache device (SSD mode). */
static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
{
	return wc->start_sector + wc->metadata_sectors +
		((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
}

/*
 * Readers of entry metadata: with hardware-error handling enabled the in-core
 * shadow copies are used (populated once at resume via copy_mc_to_kernel),
 * otherwise the on-media entry is read directly.
 */
static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	return e->original_sector;
#else
	return le64_to_cpu(memory_entry(wc, e)->original_sector);
#endif
}

static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	return e->seq_count;
#else
	return le64_to_cpu(memory_entry(wc, e)->seq_count);
#endif
}

/* Mark e invalid on media (seq_count -1 means "free"). */
static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->seq_count = -1;
#endif
	pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
}

/* Write both metadata fields of e, updating the shadow copies if present. */
static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
					    uint64_t original_sector, uint64_t seq_count)
{
	struct wc_memory_entry me;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->original_sector = original_sector;
	e->seq_count = seq_count;
#endif
	me.original_sector = cpu_to_le64(original_sector);
	me.seq_count = cpu_to_le64(seq_count);
	pmem_assign(*memory_entry(wc, e), me);
}

/*
 * Latch the first error into wc->error (cmpxchg so only the first one is
 * logged) and wake anyone waiting on the freelist so they can observe it.
 */
#define writecache_error(wc, err, msg, arg...)			\
do {								\
	if (!cmpxchg(&(wc)->error, 0, err))			\
		DMERR(msg, ##arg);				\
	wake_up(&(wc)->freelist_wait);				\
} while (0)

#define writecache_has_error(wc)	(unlikely(READ_ONCE((wc)->error)))

/* SSD mode tracks dirty metadata in dirty_bitmap; pmem mode needs neither. */
static void writecache_flush_all_metadata(struct dm_writecache *wc)
{
	if (!WC_MODE_PMEM(wc))
		memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
}

static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
{
	if (!WC_MODE_PMEM(wc))
		__set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
			  wc->dirty_bitmap);
}

static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);

/* Completion tracking for the batched metadata writes in ssd_commit_flushed. */
struct io_notify {
	struct dm_writecache *wc;
	struct completion c;
	atomic_t count;
};

static void writecache_notify_io(unsigned long error, void *context)
{
	struct io_notify *endio = context;

	if (unlikely(error != 0))
		writecache_error(endio->wc, -EIO, "error writing metadata");
	BUG_ON(atomic_read(&endio->count) <= 0);
	if (atomic_dec_and_test(&endio->count))
		complete(&endio->c);
}

static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
{
	wait_event(wc->bio_in_progress_wait[direction],
		   !atomic_read(&wc->bio_in_progress[direction]));
}

/*
 * SSD mode commit: write every dirty metadata region (per dirty_bitmap) to
 * the cache device, wait for all of them, optionally wait for in-flight
 * writes, then issue a disk flush and clear the bitmap.
 */
static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
	struct dm_io_region region;
	struct dm_io_request req;
	struct io_notify endio = {
		wc,
		COMPLETION_INITIALIZER_ONSTACK(endio.c),
		ATOMIC_INIT(1),		/* bias; dropped after the submit loop */
	};
	unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
	unsigned i = 0;

	while (1) {
		unsigned j;
		/* [i, j) is the next run of dirty bits */
		i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
		if (unlikely(i == bitmap_bits))
			break;
		j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);

		region.bdev = wc->ssd_dev->bdev;
		region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
		region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);

		if (unlikely(region.sector >= wc->metadata_sectors))
			break;
		if (unlikely(region.sector + region.count > wc->metadata_sectors))
			region.count = wc->metadata_sectors - region.sector;

		region.sector += wc->start_sector;
		atomic_inc(&endio.count);
		req.bi_opf = REQ_OP_WRITE | REQ_SYNC;
		req.mem.type = DM_IO_VMA;
		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
		req.client = wc->dm_io;
		req.notify.fn = writecache_notify_io;
		req.notify.context = &endio;

		/* writing via async dm-io (implied by notify.fn above) won't return an error */
		(void) dm_io(&req, 1, &region, NULL);
		i = j;
	}

	/* drop the bias reference and wait for all submitted writes */
	writecache_notify_io(0, &endio);
	wait_for_completion_io(&endio.c);

	if (wait_for_ios)
		writecache_wait_for_ios(wc, WRITE);

	writecache_disk_flush(wc, wc->ssd_dev);

	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
}

/* Write the superblock region with FUA so seq_count hits stable media. */
static void ssd_commit_superblock(struct dm_writecache *wc)
{
	int r;
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = wc->ssd_dev->bdev;
	region.sector = 0;
	region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;

	if (unlikely(region.sector + region.count > wc->metadata_sectors))
		region.count = wc->metadata_sectors - region.sector;

	region.sector += wc->start_sector;

	req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA;
	req.mem.type = DM_IO_VMA;
	req.mem.ptr.vma = (char *)wc->memory_map;
	req.client = wc->dm_io;
	req.notify.fn = NULL;
	req.notify.context = NULL;

	r = dm_io(&req, 1, &region, NULL);
	if (unlikely(r))
		writecache_error(wc, r, "error writing superblock");
}

/* Persist previously flushed metadata: a pmem write barrier, or SSD commit. */
static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
	if (WC_MODE_PMEM(wc))
		pmem_wmb();
	else
		ssd_commit_flushed(wc, wait_for_ios);
}

/* Issue an empty REQ_PREFLUSH to drain the device's volatile write cache. */
static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
{
	int r;
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = dev->bdev;
	region.sector = 0;
	region.count = 0;
	req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
	req.mem.type = DM_IO_KMEM;
	req.mem.ptr.addr = NULL;
	req.client = wc->dm_io;
	req.notify.fn = NULL;

	r = dm_io(&req, 1, &region, NULL);
	if (unlikely(r))
		writecache_error(wc, r, "error flushing metadata: %d", r);
}

#define WFE_RETURN_FOLLOWING	1	/* on miss, return the next-higher entry */
#define WFE_LOWEST_SEQ		2	/* among duplicates, pick lowest seq_count */

/*
 * Look up the cache entry for original sector "block" in the rb-tree.
 * Duplicate sectors can exist transiently; the second loop walks neighbours
 * to pick the lowest- or highest-seq duplicate as requested by flags.
 */
static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
					      uint64_t block, int flags)
{
	struct wc_entry *e;
	struct rb_node *node = wc->tree.rb_node;

	if (unlikely(!node))
		return NULL;

	while (1) {
		e = container_of(node, struct wc_entry, rb_node);
		if (read_original_sector(wc, e) == block)
			break;

		node = (read_original_sector(wc, e) >= block ?
			e->rb_node.rb_left : e->rb_node.rb_right);
		if (unlikely(!node)) {
			if (!(flags & WFE_RETURN_FOLLOWING))
				return NULL;
			if (read_original_sector(wc, e) >= block) {
				return e;
			} else {
				node = rb_next(&e->rb_node);
				if (unlikely(!node))
					return NULL;
				e = container_of(node, struct wc_entry, rb_node);
				return e;
			}
		}
	}

	while (1) {
		struct wc_entry *e2;
		if (flags & WFE_LOWEST_SEQ)
			node = rb_prev(&e->rb_node);
		else
			node = rb_next(&e->rb_node);
		if (unlikely(!node))
			return e;
		e2 = container_of(node, struct wc_entry, rb_node);
		if (read_original_sector(wc, e2) != block)
			return e;
		e = e2;
	}
}

/* Insert ins into the tree (keyed by original sector) and onto the LRU. */
static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
{
	struct wc_entry *e;
	struct rb_node **node = &wc->tree.rb_node, *parent = NULL;

	while (*node) {
		e = container_of(*node, struct wc_entry, rb_node);
		parent = &e->rb_node;
		if (read_original_sector(wc, e) > read_original_sector(wc, ins))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}
	rb_link_node(&ins->rb_node, parent, node);
	rb_insert_color(&ins->rb_node, &wc->tree);
	list_add(&ins->lru, &wc->lru);
	ins->age = jiffies;
}

static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
{
	list_del(&e->lru);
	rb_erase(&e->rb_node, &wc->tree);
}

/*
 * Return e to the freelist.  In sorted (SSD) mode the freetree is keyed by
 * entry address, which orders entries by cache-device position so allocations
 * tend to be sequential on disk.
 */
static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
{
	if (WC_MODE_SORT_FREELIST(wc)) {
		struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
		if (unlikely(!*node))
			wc->current_free = e;
		while (*node) {
			parent = *node;
			if (&e->rb_node < *node)
				node = &parent->rb_left;
			else
				node = &parent->rb_right;
		}
		rb_link_node(&e->rb_node, parent, node);
		rb_insert_color(&e->rb_node, &wc->freetree);
	} else {
		list_add_tail(&e->lru, &wc->freelist);
	}
	wc->freelist_size++;
}

/* Kick writeback when free space (incl. blocks being written back) is low. */
static inline void writecache_verify_watermark(struct dm_writecache *wc)
{
	if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
		queue_work(wc->writeback_wq, &wc->writeback_work);
}

/* Periodic timer: push out aged entries and re-arm while the device is live. */
static void writecache_max_age_timer(struct timer_list *t)
{
	struct dm_writecache *wc = from_timer(wc, t, max_age_timer);

	if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
		queue_work(wc->writeback_wq, &wc->writeback_work);
		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
	}
}

/*
 * Take an entry off the freelist.  If expected_sector is not -1, only succeed
 * when the entry's cache position matches (used to keep allocations
 * contiguous).  Returns NULL when nothing suitable is free.
 */
static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
{
	struct wc_entry *e;

	if (WC_MODE_SORT_FREELIST(wc)) {
		struct rb_node *next;
		if (unlikely(!wc->current_free))
			return NULL;
		e = wc->current_free;
		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
			return NULL;
		next = rb_next(&e->rb_node);
		rb_erase(&e->rb_node, &wc->freetree);
		if (unlikely(!next))
			next = rb_first(&wc->freetree);
		wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
	} else {
		if (unlikely(list_empty(&wc->freelist)))
			return NULL;
		e = container_of(wc->freelist.next, struct wc_entry, lru);
		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
			return NULL;
		list_del(&e->lru);
	}
	wc->freelist_size--;

	writecache_verify_watermark(wc);

	return e;
}

/* Unlink e, invalidate it on media, and wake any freelist waiters. */
static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	writecache_unlink(wc, e);
	writecache_add_to_freelist(wc, e);
	clear_seq_count(wc, e);
	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
	if (unlikely(waitqueue_active(&wc->freelist_wait)))
		wake_up(&wc->freelist_wait);
}

/* Sleep until a free entry appears; drops and reacquires wc->lock. */
static void writecache_wait_on_freelist(struct dm_writecache *wc)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
	wc_unlock(wc);
	io_schedule();
	finish_wait(&wc->freelist_wait, &wait);
	wc_lock(wc);
}

static void writecache_poison_lists(struct dm_writecache *wc)
{
	/*
	 * Catch incorrect access to these values while the device is suspended.
	 */
	memset(&wc->tree, -1, sizeof wc->tree);
	wc->lru.next = LIST_POISON1;
	wc->lru.prev = LIST_POISON2;
	wc->freelist.next = LIST_POISON1;
	wc->freelist.prev = LIST_POISON2;
}

/* Mark e's metadata (and, on pmem, its data block) as needing commit. */
static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
	if (WC_MODE_PMEM(wc))
		writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
}

/* An entry is committed once its seq_count is below the current superblock's. */
static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
{
	return read_seq_count(wc, e) < wc->seq_count;
}

/*
 * Commit all uncommitted entries: flush their metadata/data, persist it,
 * bump and persist seq_count (which atomically commits them), then free any
 * older duplicate entries that the new commit superseded.
 */
static void writecache_flush(struct dm_writecache *wc)
{
	struct wc_entry *e, *e2;
	bool need_flush_after_free;

	wc->uncommitted_blocks = 0;
	del_timer(&wc->autocommit_timer);

	if (list_empty(&wc->lru))
		return;

	e = container_of(wc->lru.next, struct wc_entry, lru);
	if (writecache_entry_is_committed(wc, e)) {
		/* nothing new; just flush the disk if committed data was overwritten */
		if (wc->overwrote_committed) {
			writecache_wait_for_ios(wc, WRITE);
			writecache_disk_flush(wc, wc->ssd_dev);
			wc->overwrote_committed = false;
		}
		return;
	}
	while (1) {
		writecache_flush_entry(wc, e);
		if (unlikely(e->lru.next == &wc->lru))
			break;
		e2 = container_of(e->lru.next, struct wc_entry, lru);
		if (writecache_entry_is_committed(wc, e2))
			break;
		e = e2;
		cond_resched();
	}
	/* persist entry metadata/data before publishing the new seq_count */
	writecache_commit_flushed(wc, true);

	wc->seq_count++;
	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
	if (WC_MODE_PMEM(wc))
		writecache_commit_flushed(wc, false);
	else
		ssd_commit_superblock(wc);

	wc->overwrote_committed = false;

	need_flush_after_free = false;
	while (1) {
		/* Free another committed entry with lower seq-count */
		struct rb_node *rb_node = rb_prev(&e->rb_node);

		if (rb_node) {
			e2 = container_of(rb_node, struct wc_entry, rb_node);
			if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
			    likely(!e2->write_in_progress)) {
				writecache_free_entry(wc, e2);
				need_flush_after_free = true;
			}
		}
		if (unlikely(e->lru.prev == &wc->lru))
			break;
		e = container_of(e->lru.prev, struct wc_entry, lru);
		cond_resched();
	}

	if (need_flush_after_free)
		writecache_commit_flushed(wc, false);
}

static void writecache_flush_work(struct work_struct *work)
{
	struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);

	wc_lock(wc);
	writecache_flush(wc);
	wc_unlock(wc);
}

/* Timer callback: defer the flush to the workqueue (can't take mutex here). */
static void writecache_autocommit_timer(struct timer_list *t)
{
	struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
	if (!writecache_has_error(wc))
		queue_work(wc->writeback_wq, &wc->flush_work);
}

static void writecache_schedule_autocommit(struct dm_writecache *wc)
{
	if (!timer_pending(&wc->autocommit_timer))
		mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
}

/*
 * Drop all cache entries whose original sector falls in [start, end).
 * In SSD mode, in-flight bios must drain before the first entry is freed.
 */
static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
{
	struct wc_entry *e;
	bool discarded_something = false;

	e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
	if (unlikely(!e))
		return;

	while (read_original_sector(wc, e) < end) {
		struct rb_node *node = rb_next(&e->rb_node);

		if (likely(!e->write_in_progress)) {
			if (!discarded_something) {
				if (!WC_MODE_PMEM(wc)) {
					writecache_wait_for_ios(wc, READ);
					writecache_wait_for_ios(wc, WRITE);
				}
				discarded_something = true;
			}
			if (!writecache_entry_is_committed(wc, e))
				wc->uncommitted_blocks--;
			writecache_free_entry(wc, e);
		}

		if (unlikely(!node))
			break;

		e = container_of(node, struct wc_entry, rb_node);
	}

	if (discarded_something)
		writecache_commit_flushed(wc, false);
}

/* Returns true (after sleeping) while writeback is still in flight. */
static bool writecache_wait_for_writeback(struct dm_writecache *wc)
{
	if (wc->writeback_size) {
		writecache_wait_on_freelist(wc);
		return true;
	}
	return false;
}

/*
 * Target suspend: stop timers, commit, optionally write everything back
 * (flush_on_suspend), drain the workqueue, then poison the lists so stray
 * access while suspended oopses loudly.
 */
static void writecache_suspend(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	bool flush_on_suspend;

	del_timer_sync(&wc->autocommit_timer);
	del_timer_sync(&wc->max_age_timer);

	wc_lock(wc);
	writecache_flush(wc);
	flush_on_suspend = wc->flush_on_suspend;
	if (flush_on_suspend) {
		wc->flush_on_suspend = false;
		wc->writeback_all++;
		queue_work(wc->writeback_wq, &wc->writeback_work);
	}
	wc_unlock(wc);

	drain_workqueue(wc->writeback_wq);

	wc_lock(wc);
	if (flush_on_suspend)
		wc->writeback_all--;
	while (writecache_wait_for_writeback(wc));

	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	writecache_poison_lists(wc);

	wc_unlock(wc);
}

/* Allocate and initialize the in-core entry array (idempotent). */
static int writecache_alloc_entries(struct dm_writecache *wc)
{
	size_t b;

	if (wc->entries)
		return 0;
	wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
	if (!wc->entries)
		return -ENOMEM;
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		e->index = b;
		e->write_in_progress = false;
		cond_resched();
	}

	return 0;
}

/* Synchronously read n_sectors of metadata from the cache device (SSD mode). */
static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors)
{
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = wc->ssd_dev->bdev;
	region.sector = wc->start_sector;
	region.count = n_sectors;
	req.bi_opf = REQ_OP_READ | REQ_SYNC;
	req.mem.type = DM_IO_VMA;
	req.mem.ptr.vma = (char *)wc->memory_map;
	req.client = wc->dm_io;
	req.notify.fn = NULL;

	return dm_io(&req, 1, &region, NULL);
}

/*
 * Target resume: (re)load metadata, rebuild the rb-tree, LRU and freelist
 * from the on-media entries, resolving duplicate sectors by seq_count, and
 * re-arm the max_age timer.
 */
static void writecache_resume(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	size_t b;
	bool need_flush = false;
	__le64 sb_seq_count;
	int r;

	wc_lock(wc);

	wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev);

	if (WC_MODE_PMEM(wc)) {
		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
	} else {
		r = writecache_read_metadata(wc, wc->metadata_sectors);
		if (r) {
			size_t sb_entries_offset;
			writecache_error(wc, r, "unable to read metadata: %d", r);
			/* poison the entry area so every entry reads as invalid */
			sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
			memset((char *)wc->memory_map + sb_entries_offset, -1,
			       (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
		}
	}

	wc->tree = RB_ROOT;
	INIT_LIST_HEAD(&wc->lru);
	if (WC_MODE_SORT_FREELIST(wc)) {
		wc->freetree = RB_ROOT;
		wc->current_free = NULL;
	} else {
		INIT_LIST_HEAD(&wc->freelist);
	}
	wc->freelist_size = 0;

	r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
			      sizeof(uint64_t));
	if (r) {
		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
		sb_seq_count = cpu_to_le64(0);
	}
	wc->seq_count = le64_to_cpu(sb_seq_count);

#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	/* populate the in-core shadow metadata, tolerating media errors */
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		struct wc_memory_entry wme;
		if (writecache_has_error(wc)) {
			e->original_sector = -1;
			e->seq_count = -1;
			continue;
		}
		r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
				      sizeof(struct wc_memory_entry));
		if (r) {
			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
					 (unsigned long)b, r);
			e->original_sector = -1;
			e->seq_count = -1;
		} else {
			e->original_sector = le64_to_cpu(wme.original_sector);
			e->seq_count = le64_to_cpu(wme.seq_count);
		}
		cond_resched();
	}
#endif
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		if (!writecache_entry_is_committed(wc, e)) {
			/* uncommitted entries are discarded on resume */
			if (read_seq_count(wc, e) != -1) {
erase_this:
				clear_seq_count(wc, e);
				need_flush = true;
			}
			writecache_add_to_freelist(wc, e);
		} else {
			struct wc_entry *old;

			old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
			if (!old) {
				writecache_insert_entry(wc, e);
			} else {
				/* duplicate sector: keep the newer entry */
				if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
					writecache_error(wc, -EINVAL,
						 "two identical entries, position %llu, sector %llu, sequence %llu",
						 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
						 (unsigned long long)read_seq_count(wc, e));
				}
				if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
					goto erase_this;
				} else {
					writecache_free_entry(wc, old);
					writecache_insert_entry(wc, e);
					need_flush = true;
				}
			}
		}
		cond_resched();
	}

	if (need_flush) {
		writecache_flush_all_metadata(wc);
		writecache_commit_flushed(wc, false);
	}

	writecache_verify_watermark(wc);

	if (wc->max_age != MAX_AGE_UNSPECIFIED)
		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);

	wc_unlock(wc);
}

/*
 * "flush" message: commit everything and write it all back, waiting for the
 * writeback to finish.  Fails with -EBUSY while suspended, -EIO on error.
 */
static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	if (dm_suspended(wc->ti)) {
		wc_unlock(wc);
		return -EBUSY;
	}
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return -EIO;
	}

	writecache_flush(wc);
	wc->writeback_all++;
	queue_work(wc->writeback_wq, &wc->writeback_work);
	wc_unlock(wc);

	flush_workqueue(wc->writeback_wq);

	wc_lock(wc);
	wc->writeback_all--;
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return -EIO;
	}
	wc_unlock(wc);

	return 0;
}

static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc) 1137 { 1138 if (argc != 1) 1139 return -EINVAL; 1140 1141 wc_lock(wc); 1142 wc->flush_on_suspend = true; 1143 wc_unlock(wc); 1144 1145 return 0; 1146 } 1147 1148 static void activate_cleaner(struct dm_writecache *wc) 1149 { 1150 wc->flush_on_suspend = true; 1151 wc->cleaner = true; 1152 wc->freelist_high_watermark = wc->n_blocks; 1153 wc->freelist_low_watermark = wc->n_blocks; 1154 } 1155 1156 static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc) 1157 { 1158 if (argc != 1) 1159 return -EINVAL; 1160 1161 wc_lock(wc); 1162 activate_cleaner(wc); 1163 if (!dm_suspended(wc->ti)) 1164 writecache_verify_watermark(wc); 1165 wc_unlock(wc); 1166 1167 return 0; 1168 } 1169 1170 static int process_clear_stats_mesg(unsigned argc, char **argv, struct dm_writecache *wc) 1171 { 1172 if (argc != 1) 1173 return -EINVAL; 1174 1175 wc_lock(wc); 1176 memset(&wc->stats, 0, sizeof wc->stats); 1177 wc_unlock(wc); 1178 1179 return 0; 1180 } 1181 1182 static int writecache_message(struct dm_target *ti, unsigned argc, char **argv, 1183 char *result, unsigned maxlen) 1184 { 1185 int r = -EINVAL; 1186 struct dm_writecache *wc = ti->private; 1187 1188 if (!strcasecmp(argv[0], "flush")) 1189 r = process_flush_mesg(argc, argv, wc); 1190 else if (!strcasecmp(argv[0], "flush_on_suspend")) 1191 r = process_flush_on_suspend_mesg(argc, argv, wc); 1192 else if (!strcasecmp(argv[0], "cleaner")) 1193 r = process_cleaner_mesg(argc, argv, wc); 1194 else if (!strcasecmp(argv[0], "clear_stats")) 1195 r = process_clear_stats_mesg(argc, argv, wc); 1196 else 1197 DMERR("unrecognised message received: %s", argv[0]); 1198 1199 return r; 1200 } 1201 1202 static void memcpy_flushcache_optimized(void *dest, void *source, size_t size) 1203 { 1204 /* 1205 * clflushopt performs better with block size 1024, 2048, 4096 1206 * non-temporal stores perform better with block size 512 1207 * 1208 * 
block size 512 1024 2048 4096 1209 * movnti 496 MB/s 642 MB/s 725 MB/s 744 MB/s 1210 * clflushopt 373 MB/s 688 MB/s 1.1 GB/s 1.2 GB/s 1211 * 1212 * We see that movnti performs better for 512-byte blocks, and 1213 * clflushopt performs better for 1024-byte and larger blocks. So, we 1214 * prefer clflushopt for sizes >= 768. 1215 * 1216 * NOTE: this happens to be the case now (with dm-writecache's single 1217 * threaded model) but re-evaluate this once memcpy_flushcache() is 1218 * enabled to use movdir64b which might invalidate this performance 1219 * advantage seen with cache-allocating-writes plus flushing. 1220 */ 1221 #ifdef CONFIG_X86 1222 if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) && 1223 likely(boot_cpu_data.x86_clflush_size == 64) && 1224 likely(size >= 768)) { 1225 do { 1226 memcpy((void *)dest, (void *)source, 64); 1227 clflushopt((void *)dest); 1228 dest += 64; 1229 source += 64; 1230 size -= 64; 1231 } while (size >= 64); 1232 return; 1233 } 1234 #endif 1235 memcpy_flushcache(dest, source, size); 1236 } 1237 1238 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data) 1239 { 1240 void *buf; 1241 unsigned size; 1242 int rw = bio_data_dir(bio); 1243 unsigned remaining_size = wc->block_size; 1244 1245 do { 1246 struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter); 1247 buf = bvec_kmap_local(&bv); 1248 size = bv.bv_len; 1249 if (unlikely(size > remaining_size)) 1250 size = remaining_size; 1251 1252 if (rw == READ) { 1253 int r; 1254 r = copy_mc_to_kernel(buf, data, size); 1255 flush_dcache_page(bio_page(bio)); 1256 if (unlikely(r)) { 1257 writecache_error(wc, r, "hardware memory error when reading data: %d", r); 1258 bio->bi_status = BLK_STS_IOERR; 1259 } 1260 } else { 1261 flush_dcache_page(bio_page(bio)); 1262 memcpy_flushcache_optimized(data, buf, size); 1263 } 1264 1265 kunmap_local(buf); 1266 1267 data = (char *)data + size; 1268 remaining_size -= size; 1269 bio_advance(bio, size); 1270 } while (unlikely(remaining_size)); 1271 
} 1272 1273 static int writecache_flush_thread(void *data) 1274 { 1275 struct dm_writecache *wc = data; 1276 1277 while (1) { 1278 struct bio *bio; 1279 1280 wc_lock(wc); 1281 bio = bio_list_pop(&wc->flush_list); 1282 if (!bio) { 1283 set_current_state(TASK_INTERRUPTIBLE); 1284 wc_unlock(wc); 1285 1286 if (unlikely(kthread_should_stop())) { 1287 set_current_state(TASK_RUNNING); 1288 break; 1289 } 1290 1291 schedule(); 1292 continue; 1293 } 1294 1295 if (bio_op(bio) == REQ_OP_DISCARD) { 1296 writecache_discard(wc, bio->bi_iter.bi_sector, 1297 bio_end_sector(bio)); 1298 wc_unlock(wc); 1299 bio_set_dev(bio, wc->dev->bdev); 1300 submit_bio_noacct(bio); 1301 } else { 1302 writecache_flush(wc); 1303 wc_unlock(wc); 1304 if (writecache_has_error(wc)) 1305 bio->bi_status = BLK_STS_IOERR; 1306 bio_endio(bio); 1307 } 1308 } 1309 1310 return 0; 1311 } 1312 1313 static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio) 1314 { 1315 if (bio_list_empty(&wc->flush_list)) 1316 wake_up_process(wc->flush_thread); 1317 bio_list_add(&wc->flush_list, bio); 1318 } 1319 1320 enum wc_map_op { 1321 WC_MAP_SUBMIT, 1322 WC_MAP_REMAP, 1323 WC_MAP_REMAP_ORIGIN, 1324 WC_MAP_RETURN, 1325 WC_MAP_ERROR, 1326 }; 1327 1328 static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio, 1329 struct wc_entry *e) 1330 { 1331 if (e) { 1332 sector_t next_boundary = 1333 read_original_sector(wc, e) - bio->bi_iter.bi_sector; 1334 if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) 1335 dm_accept_partial_bio(bio, next_boundary); 1336 } 1337 } 1338 1339 static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio) 1340 { 1341 enum wc_map_op map_op; 1342 struct wc_entry *e; 1343 1344 read_next_block: 1345 wc->stats.reads++; 1346 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); 1347 if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) { 1348 wc->stats.read_hits++; 1349 if (WC_MODE_PMEM(wc)) { 1350 
bio_copy_block(wc, bio, memory_data(wc, e)); 1351 if (bio->bi_iter.bi_size) 1352 goto read_next_block; 1353 map_op = WC_MAP_SUBMIT; 1354 } else { 1355 dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT); 1356 bio_set_dev(bio, wc->ssd_dev->bdev); 1357 bio->bi_iter.bi_sector = cache_sector(wc, e); 1358 if (!writecache_entry_is_committed(wc, e)) 1359 writecache_wait_for_ios(wc, WRITE); 1360 map_op = WC_MAP_REMAP; 1361 } 1362 } else { 1363 writecache_map_remap_origin(wc, bio, e); 1364 wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits; 1365 map_op = WC_MAP_REMAP_ORIGIN; 1366 } 1367 1368 return map_op; 1369 } 1370 1371 static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio, 1372 struct wc_entry *e, bool search_used) 1373 { 1374 unsigned bio_size = wc->block_size; 1375 sector_t start_cache_sec = cache_sector(wc, e); 1376 sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT); 1377 1378 while (bio_size < bio->bi_iter.bi_size) { 1379 if (!search_used) { 1380 struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec); 1381 if (!f) 1382 break; 1383 write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + 1384 (bio_size >> SECTOR_SHIFT), wc->seq_count); 1385 writecache_insert_entry(wc, f); 1386 wc->uncommitted_blocks++; 1387 } else { 1388 struct wc_entry *f; 1389 struct rb_node *next = rb_next(&e->rb_node); 1390 if (!next) 1391 break; 1392 f = container_of(next, struct wc_entry, rb_node); 1393 if (f != e + 1) 1394 break; 1395 if (read_original_sector(wc, f) != 1396 read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) 1397 break; 1398 if (unlikely(f->write_in_progress)) 1399 break; 1400 if (writecache_entry_is_committed(wc, f)) 1401 wc->overwrote_committed = true; 1402 e = f; 1403 } 1404 bio_size += wc->block_size; 1405 current_cache_sec += wc->block_size >> SECTOR_SHIFT; 1406 } 1407 1408 bio_set_dev(bio, wc->ssd_dev->bdev); 1409 bio->bi_iter.bi_sector = 
start_cache_sec; 1410 dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT); 1411 1412 wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; 1413 wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits; 1414 1415 if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) { 1416 wc->uncommitted_blocks = 0; 1417 queue_work(wc->writeback_wq, &wc->flush_work); 1418 } else { 1419 writecache_schedule_autocommit(wc); 1420 } 1421 } 1422 1423 static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio) 1424 { 1425 struct wc_entry *e; 1426 1427 do { 1428 bool found_entry = false; 1429 bool search_used = false; 1430 if (writecache_has_error(wc)) { 1431 wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; 1432 return WC_MAP_ERROR; 1433 } 1434 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0); 1435 if (e) { 1436 if (!writecache_entry_is_committed(wc, e)) { 1437 wc->stats.write_hits_uncommitted++; 1438 search_used = true; 1439 goto bio_copy; 1440 } 1441 wc->stats.write_hits_committed++; 1442 if (!WC_MODE_PMEM(wc) && !e->write_in_progress) { 1443 wc->overwrote_committed = true; 1444 search_used = true; 1445 goto bio_copy; 1446 } 1447 found_entry = true; 1448 } else { 1449 if (unlikely(wc->cleaner) || 1450 (wc->metadata_only && !(bio->bi_opf & REQ_META))) 1451 goto direct_write; 1452 } 1453 e = writecache_pop_from_freelist(wc, (sector_t)-1); 1454 if (unlikely(!e)) { 1455 if (!WC_MODE_PMEM(wc) && !found_entry) { 1456 direct_write: 1457 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); 1458 writecache_map_remap_origin(wc, bio, e); 1459 wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits; 1460 wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits; 1461 return WC_MAP_REMAP_ORIGIN; 1462 } 1463 wc->stats.writes_blocked_on_freelist++; 1464 writecache_wait_on_freelist(wc); 1465 continue; 1466 } 1467 write_original_sector_seq_count(wc, e, 
bio->bi_iter.bi_sector, wc->seq_count); 1468 writecache_insert_entry(wc, e); 1469 wc->uncommitted_blocks++; 1470 wc->stats.writes_allocate++; 1471 bio_copy: 1472 if (WC_MODE_PMEM(wc)) { 1473 bio_copy_block(wc, bio, memory_data(wc, e)); 1474 wc->stats.writes++; 1475 } else { 1476 writecache_bio_copy_ssd(wc, bio, e, search_used); 1477 return WC_MAP_REMAP; 1478 } 1479 } while (bio->bi_iter.bi_size); 1480 1481 if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks)) 1482 writecache_flush(wc); 1483 else 1484 writecache_schedule_autocommit(wc); 1485 1486 return WC_MAP_SUBMIT; 1487 } 1488 1489 static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio) 1490 { 1491 if (writecache_has_error(wc)) 1492 return WC_MAP_ERROR; 1493 1494 if (WC_MODE_PMEM(wc)) { 1495 wc->stats.flushes++; 1496 writecache_flush(wc); 1497 if (writecache_has_error(wc)) 1498 return WC_MAP_ERROR; 1499 else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only)) 1500 return WC_MAP_REMAP_ORIGIN; 1501 return WC_MAP_SUBMIT; 1502 } 1503 /* SSD: */ 1504 if (dm_bio_get_target_bio_nr(bio)) 1505 return WC_MAP_REMAP_ORIGIN; 1506 wc->stats.flushes++; 1507 writecache_offload_bio(wc, bio); 1508 return WC_MAP_RETURN; 1509 } 1510 1511 static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio) 1512 { 1513 wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits; 1514 1515 if (writecache_has_error(wc)) 1516 return WC_MAP_ERROR; 1517 1518 if (WC_MODE_PMEM(wc)) { 1519 writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio)); 1520 return WC_MAP_REMAP_ORIGIN; 1521 } 1522 /* SSD: */ 1523 writecache_offload_bio(wc, bio); 1524 return WC_MAP_RETURN; 1525 } 1526 1527 static int writecache_map(struct dm_target *ti, struct bio *bio) 1528 { 1529 struct dm_writecache *wc = ti->private; 1530 enum wc_map_op map_op; 1531 1532 bio->bi_private = NULL; 1533 1534 wc_lock(wc); 1535 1536 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { 1537 
map_op = writecache_map_flush(wc, bio); 1538 goto done; 1539 } 1540 1541 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); 1542 1543 if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) & 1544 (wc->block_size / 512 - 1)) != 0)) { 1545 DMERR("I/O is not aligned, sector %llu, size %u, block size %u", 1546 (unsigned long long)bio->bi_iter.bi_sector, 1547 bio->bi_iter.bi_size, wc->block_size); 1548 map_op = WC_MAP_ERROR; 1549 goto done; 1550 } 1551 1552 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { 1553 map_op = writecache_map_discard(wc, bio); 1554 goto done; 1555 } 1556 1557 if (bio_data_dir(bio) == READ) 1558 map_op = writecache_map_read(wc, bio); 1559 else 1560 map_op = writecache_map_write(wc, bio); 1561 done: 1562 switch (map_op) { 1563 case WC_MAP_REMAP_ORIGIN: 1564 if (likely(wc->pause != 0)) { 1565 if (bio_op(bio) == REQ_OP_WRITE) { 1566 dm_iot_io_begin(&wc->iot, 1); 1567 bio->bi_private = (void *)2; 1568 } 1569 } 1570 bio_set_dev(bio, wc->dev->bdev); 1571 wc_unlock(wc); 1572 return DM_MAPIO_REMAPPED; 1573 1574 case WC_MAP_REMAP: 1575 /* make sure that writecache_end_io decrements bio_in_progress: */ 1576 bio->bi_private = (void *)1; 1577 atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]); 1578 wc_unlock(wc); 1579 return DM_MAPIO_REMAPPED; 1580 1581 case WC_MAP_SUBMIT: 1582 wc_unlock(wc); 1583 bio_endio(bio); 1584 return DM_MAPIO_SUBMITTED; 1585 1586 case WC_MAP_RETURN: 1587 wc_unlock(wc); 1588 return DM_MAPIO_SUBMITTED; 1589 1590 case WC_MAP_ERROR: 1591 wc_unlock(wc); 1592 bio_io_error(bio); 1593 return DM_MAPIO_SUBMITTED; 1594 1595 default: 1596 BUG(); 1597 return -1; 1598 } 1599 } 1600 1601 static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status) 1602 { 1603 struct dm_writecache *wc = ti->private; 1604 1605 if (bio->bi_private == (void *)1) { 1606 int dir = bio_data_dir(bio); 1607 if (atomic_dec_and_test(&wc->bio_in_progress[dir])) 1608 if 
(unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir]))) 1609 wake_up(&wc->bio_in_progress_wait[dir]); 1610 } else if (bio->bi_private == (void *)2) { 1611 dm_iot_io_end(&wc->iot, 1); 1612 } 1613 return 0; 1614 } 1615 1616 static int writecache_iterate_devices(struct dm_target *ti, 1617 iterate_devices_callout_fn fn, void *data) 1618 { 1619 struct dm_writecache *wc = ti->private; 1620 1621 return fn(ti, wc->dev, 0, ti->len, data); 1622 } 1623 1624 static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits) 1625 { 1626 struct dm_writecache *wc = ti->private; 1627 1628 if (limits->logical_block_size < wc->block_size) 1629 limits->logical_block_size = wc->block_size; 1630 1631 if (limits->physical_block_size < wc->block_size) 1632 limits->physical_block_size = wc->block_size; 1633 1634 if (limits->io_min < wc->block_size) 1635 limits->io_min = wc->block_size; 1636 } 1637 1638 1639 static void writecache_writeback_endio(struct bio *bio) 1640 { 1641 struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio); 1642 struct dm_writecache *wc = wb->wc; 1643 unsigned long flags; 1644 1645 raw_spin_lock_irqsave(&wc->endio_list_lock, flags); 1646 if (unlikely(list_empty(&wc->endio_list))) 1647 wake_up_process(wc->endio_thread); 1648 list_add_tail(&wb->endio_entry, &wc->endio_list); 1649 raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags); 1650 } 1651 1652 static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr) 1653 { 1654 struct copy_struct *c = ptr; 1655 struct dm_writecache *wc = c->wc; 1656 1657 c->error = likely(!(read_err | write_err)) ? 
0 : -EIO; 1658 1659 raw_spin_lock_irq(&wc->endio_list_lock); 1660 if (unlikely(list_empty(&wc->endio_list))) 1661 wake_up_process(wc->endio_thread); 1662 list_add_tail(&c->endio_entry, &wc->endio_list); 1663 raw_spin_unlock_irq(&wc->endio_list_lock); 1664 } 1665 1666 static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list) 1667 { 1668 unsigned i; 1669 struct writeback_struct *wb; 1670 struct wc_entry *e; 1671 unsigned long n_walked = 0; 1672 1673 do { 1674 wb = list_entry(list->next, struct writeback_struct, endio_entry); 1675 list_del(&wb->endio_entry); 1676 1677 if (unlikely(wb->bio.bi_status != BLK_STS_OK)) 1678 writecache_error(wc, blk_status_to_errno(wb->bio.bi_status), 1679 "write error %d", wb->bio.bi_status); 1680 i = 0; 1681 do { 1682 e = wb->wc_list[i]; 1683 BUG_ON(!e->write_in_progress); 1684 e->write_in_progress = false; 1685 INIT_LIST_HEAD(&e->lru); 1686 if (!writecache_has_error(wc)) 1687 writecache_free_entry(wc, e); 1688 BUG_ON(!wc->writeback_size); 1689 wc->writeback_size--; 1690 n_walked++; 1691 if (unlikely(n_walked >= ENDIO_LATENCY)) { 1692 writecache_commit_flushed(wc, false); 1693 wc_unlock(wc); 1694 wc_lock(wc); 1695 n_walked = 0; 1696 } 1697 } while (++i < wb->wc_list_n); 1698 1699 if (wb->wc_list != wb->wc_list_inline) 1700 kfree(wb->wc_list); 1701 bio_put(&wb->bio); 1702 } while (!list_empty(list)); 1703 } 1704 1705 static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list) 1706 { 1707 struct copy_struct *c; 1708 struct wc_entry *e; 1709 1710 do { 1711 c = list_entry(list->next, struct copy_struct, endio_entry); 1712 list_del(&c->endio_entry); 1713 1714 if (unlikely(c->error)) 1715 writecache_error(wc, c->error, "copy error"); 1716 1717 e = c->e; 1718 do { 1719 BUG_ON(!e->write_in_progress); 1720 e->write_in_progress = false; 1721 INIT_LIST_HEAD(&e->lru); 1722 if (!writecache_has_error(wc)) 1723 writecache_free_entry(wc, e); 1724 1725 BUG_ON(!wc->writeback_size); 1726 
wc->writeback_size--; 1727 e++; 1728 } while (--c->n_entries); 1729 mempool_free(c, &wc->copy_pool); 1730 } while (!list_empty(list)); 1731 } 1732 1733 static int writecache_endio_thread(void *data) 1734 { 1735 struct dm_writecache *wc = data; 1736 1737 while (1) { 1738 struct list_head list; 1739 1740 raw_spin_lock_irq(&wc->endio_list_lock); 1741 if (!list_empty(&wc->endio_list)) 1742 goto pop_from_list; 1743 set_current_state(TASK_INTERRUPTIBLE); 1744 raw_spin_unlock_irq(&wc->endio_list_lock); 1745 1746 if (unlikely(kthread_should_stop())) { 1747 set_current_state(TASK_RUNNING); 1748 break; 1749 } 1750 1751 schedule(); 1752 1753 continue; 1754 1755 pop_from_list: 1756 list = wc->endio_list; 1757 list.next->prev = list.prev->next = &list; 1758 INIT_LIST_HEAD(&wc->endio_list); 1759 raw_spin_unlock_irq(&wc->endio_list_lock); 1760 1761 if (!WC_MODE_FUA(wc)) 1762 writecache_disk_flush(wc, wc->dev); 1763 1764 wc_lock(wc); 1765 1766 if (WC_MODE_PMEM(wc)) { 1767 __writecache_endio_pmem(wc, &list); 1768 } else { 1769 __writecache_endio_ssd(wc, &list); 1770 writecache_wait_for_ios(wc, READ); 1771 } 1772 1773 writecache_commit_flushed(wc, false); 1774 1775 wc_unlock(wc); 1776 } 1777 1778 return 0; 1779 } 1780 1781 static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e) 1782 { 1783 struct dm_writecache *wc = wb->wc; 1784 unsigned block_size = wc->block_size; 1785 void *address = memory_data(wc, e); 1786 1787 persistent_memory_flush_cache(address, block_size); 1788 1789 if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors)) 1790 return true; 1791 1792 return bio_add_page(&wb->bio, persistent_memory_page(address), 1793 block_size, persistent_memory_page_offset(address)) != 0; 1794 } 1795 1796 struct writeback_list { 1797 struct list_head list; 1798 size_t size; 1799 }; 1800 1801 static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl) 1802 { 1803 if (unlikely(wc->max_writeback_jobs)) { 1804 if 
(READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) { 1805 wc_lock(wc); 1806 while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs) 1807 writecache_wait_on_freelist(wc); 1808 wc_unlock(wc); 1809 } 1810 } 1811 cond_resched(); 1812 } 1813 1814 static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl) 1815 { 1816 struct wc_entry *e, *f; 1817 struct bio *bio; 1818 struct writeback_struct *wb; 1819 unsigned max_pages; 1820 1821 while (wbl->size) { 1822 wbl->size--; 1823 e = container_of(wbl->list.prev, struct wc_entry, lru); 1824 list_del(&e->lru); 1825 1826 max_pages = e->wc_list_contiguous; 1827 1828 bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE, 1829 GFP_NOIO, &wc->bio_set); 1830 wb = container_of(bio, struct writeback_struct, bio); 1831 wb->wc = wc; 1832 bio->bi_end_io = writecache_writeback_endio; 1833 bio->bi_iter.bi_sector = read_original_sector(wc, e); 1834 if (max_pages <= WB_LIST_INLINE || 1835 unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *), 1836 GFP_NOIO | __GFP_NORETRY | 1837 __GFP_NOMEMALLOC | __GFP_NOWARN)))) { 1838 wb->wc_list = wb->wc_list_inline; 1839 max_pages = WB_LIST_INLINE; 1840 } 1841 1842 BUG_ON(!wc_add_block(wb, e)); 1843 1844 wb->wc_list[0] = e; 1845 wb->wc_list_n = 1; 1846 1847 while (wbl->size && wb->wc_list_n < max_pages) { 1848 f = container_of(wbl->list.prev, struct wc_entry, lru); 1849 if (read_original_sector(wc, f) != 1850 read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) 1851 break; 1852 if (!wc_add_block(wb, f)) 1853 break; 1854 wbl->size--; 1855 list_del(&f->lru); 1856 wb->wc_list[wb->wc_list_n++] = f; 1857 e = f; 1858 } 1859 if (WC_MODE_FUA(wc)) 1860 bio->bi_opf |= REQ_FUA; 1861 if (writecache_has_error(wc)) { 1862 bio->bi_status = BLK_STS_IOERR; 1863 bio_endio(bio); 1864 } else if (unlikely(!bio_sectors(bio))) { 1865 bio->bi_status = BLK_STS_OK; 1866 bio_endio(bio); 1867 } else { 1868 submit_bio(bio); 1869 } 
1870 1871 __writeback_throttle(wc, wbl); 1872 } 1873 } 1874 1875 static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl) 1876 { 1877 struct wc_entry *e, *f; 1878 struct dm_io_region from, to; 1879 struct copy_struct *c; 1880 1881 while (wbl->size) { 1882 unsigned n_sectors; 1883 1884 wbl->size--; 1885 e = container_of(wbl->list.prev, struct wc_entry, lru); 1886 list_del(&e->lru); 1887 1888 n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT); 1889 1890 from.bdev = wc->ssd_dev->bdev; 1891 from.sector = cache_sector(wc, e); 1892 from.count = n_sectors; 1893 to.bdev = wc->dev->bdev; 1894 to.sector = read_original_sector(wc, e); 1895 to.count = n_sectors; 1896 1897 c = mempool_alloc(&wc->copy_pool, GFP_NOIO); 1898 c->wc = wc; 1899 c->e = e; 1900 c->n_entries = e->wc_list_contiguous; 1901 1902 while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) { 1903 wbl->size--; 1904 f = container_of(wbl->list.prev, struct wc_entry, lru); 1905 BUG_ON(f != e + 1); 1906 list_del(&f->lru); 1907 e = f; 1908 } 1909 1910 if (unlikely(to.sector + to.count > wc->data_device_sectors)) { 1911 if (to.sector >= wc->data_device_sectors) { 1912 writecache_copy_endio(0, 0, c); 1913 continue; 1914 } 1915 from.count = to.count = wc->data_device_sectors - to.sector; 1916 } 1917 1918 dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c); 1919 1920 __writeback_throttle(wc, wbl); 1921 } 1922 } 1923 1924 static void writecache_writeback(struct work_struct *work) 1925 { 1926 struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); 1927 struct blk_plug plug; 1928 struct wc_entry *f, *g, *e = NULL; 1929 struct rb_node *node, *next_node; 1930 struct list_head skipped; 1931 struct writeback_list wbl; 1932 unsigned long n_walked; 1933 1934 if (!WC_MODE_PMEM(wc)) { 1935 /* Wait for any active kcopyd work on behalf of ssd writeback */ 1936 dm_kcopyd_client_flush(wc->dm_kcopyd); 1937 } 1938 1939 if 
(likely(wc->pause != 0)) { 1940 while (1) { 1941 unsigned long idle; 1942 if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) || 1943 unlikely(dm_suspended(wc->ti))) 1944 break; 1945 idle = dm_iot_idle_time(&wc->iot); 1946 if (idle >= wc->pause) 1947 break; 1948 idle = wc->pause - idle; 1949 if (idle > HZ) 1950 idle = HZ; 1951 schedule_timeout_idle(idle); 1952 } 1953 } 1954 1955 wc_lock(wc); 1956 restart: 1957 if (writecache_has_error(wc)) { 1958 wc_unlock(wc); 1959 return; 1960 } 1961 1962 if (unlikely(wc->writeback_all)) { 1963 if (writecache_wait_for_writeback(wc)) 1964 goto restart; 1965 } 1966 1967 if (wc->overwrote_committed) { 1968 writecache_wait_for_ios(wc, WRITE); 1969 } 1970 1971 n_walked = 0; 1972 INIT_LIST_HEAD(&skipped); 1973 INIT_LIST_HEAD(&wbl.list); 1974 wbl.size = 0; 1975 while (!list_empty(&wc->lru) && 1976 (wc->writeback_all || 1977 wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark || 1978 (jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >= 1979 wc->max_age - wc->max_age / MAX_AGE_DIV))) { 1980 1981 n_walked++; 1982 if (unlikely(n_walked > WRITEBACK_LATENCY) && 1983 likely(!wc->writeback_all)) { 1984 if (likely(!dm_suspended(wc->ti))) 1985 queue_work(wc->writeback_wq, &wc->writeback_work); 1986 break; 1987 } 1988 1989 if (unlikely(wc->writeback_all)) { 1990 if (unlikely(!e)) { 1991 writecache_flush(wc); 1992 e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node); 1993 } else 1994 e = g; 1995 } else 1996 e = container_of(wc->lru.prev, struct wc_entry, lru); 1997 BUG_ON(e->write_in_progress); 1998 if (unlikely(!writecache_entry_is_committed(wc, e))) { 1999 writecache_flush(wc); 2000 } 2001 node = rb_prev(&e->rb_node); 2002 if (node) { 2003 f = container_of(node, struct wc_entry, rb_node); 2004 if (unlikely(read_original_sector(wc, f) == 2005 read_original_sector(wc, e))) { 2006 BUG_ON(!f->write_in_progress); 2007 list_move(&e->lru, &skipped); 2008 cond_resched(); 2009 continue; 2010 } 2011 } 2012 
wc->writeback_size++; 2013 list_move(&e->lru, &wbl.list); 2014 wbl.size++; 2015 e->write_in_progress = true; 2016 e->wc_list_contiguous = 1; 2017 2018 f = e; 2019 2020 while (1) { 2021 next_node = rb_next(&f->rb_node); 2022 if (unlikely(!next_node)) 2023 break; 2024 g = container_of(next_node, struct wc_entry, rb_node); 2025 if (unlikely(read_original_sector(wc, g) == 2026 read_original_sector(wc, f))) { 2027 f = g; 2028 continue; 2029 } 2030 if (read_original_sector(wc, g) != 2031 read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT)) 2032 break; 2033 if (unlikely(g->write_in_progress)) 2034 break; 2035 if (unlikely(!writecache_entry_is_committed(wc, g))) 2036 break; 2037 2038 if (!WC_MODE_PMEM(wc)) { 2039 if (g != f + 1) 2040 break; 2041 } 2042 2043 n_walked++; 2044 //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all)) 2045 // break; 2046 2047 wc->writeback_size++; 2048 list_move(&g->lru, &wbl.list); 2049 wbl.size++; 2050 g->write_in_progress = true; 2051 g->wc_list_contiguous = BIO_MAX_VECS; 2052 f = g; 2053 e->wc_list_contiguous++; 2054 if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) { 2055 if (unlikely(wc->writeback_all)) { 2056 next_node = rb_next(&f->rb_node); 2057 if (likely(next_node)) 2058 g = container_of(next_node, struct wc_entry, rb_node); 2059 } 2060 break; 2061 } 2062 } 2063 cond_resched(); 2064 } 2065 2066 if (!list_empty(&skipped)) { 2067 list_splice_tail(&skipped, &wc->lru); 2068 /* 2069 * If we didn't do any progress, we must wait until some 2070 * writeback finishes to avoid burning CPU in a loop 2071 */ 2072 if (unlikely(!wbl.size)) 2073 writecache_wait_for_writeback(wc); 2074 } 2075 2076 wc_unlock(wc); 2077 2078 blk_start_plug(&plug); 2079 2080 if (WC_MODE_PMEM(wc)) 2081 __writecache_writeback_pmem(wc, &wbl); 2082 else 2083 __writecache_writeback_ssd(wc, &wbl); 2084 2085 blk_finish_plug(&plug); 2086 2087 if (unlikely(wc->writeback_all)) { 2088 wc_lock(wc); 2089 while (writecache_wait_for_writeback(wc)); 
2090 wc_unlock(wc); 2091 } 2092 } 2093 2094 static int calculate_memory_size(uint64_t device_size, unsigned block_size, 2095 size_t *n_blocks_p, size_t *n_metadata_blocks_p) 2096 { 2097 uint64_t n_blocks, offset; 2098 struct wc_entry e; 2099 2100 n_blocks = device_size; 2101 do_div(n_blocks, block_size + sizeof(struct wc_memory_entry)); 2102 2103 while (1) { 2104 if (!n_blocks) 2105 return -ENOSPC; 2106 /* Verify the following entries[n_blocks] won't overflow */ 2107 if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) / 2108 sizeof(struct wc_memory_entry))) 2109 return -EFBIG; 2110 offset = offsetof(struct wc_memory_superblock, entries[n_blocks]); 2111 offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1); 2112 if (offset + n_blocks * block_size <= device_size) 2113 break; 2114 n_blocks--; 2115 } 2116 2117 /* check if the bit field overflows */ 2118 e.index = n_blocks; 2119 if (e.index != n_blocks) 2120 return -EFBIG; 2121 2122 if (n_blocks_p) 2123 *n_blocks_p = n_blocks; 2124 if (n_metadata_blocks_p) 2125 *n_metadata_blocks_p = offset >> __ffs(block_size); 2126 return 0; 2127 } 2128 2129 static int init_memory(struct dm_writecache *wc) 2130 { 2131 size_t b; 2132 int r; 2133 2134 r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL); 2135 if (r) 2136 return r; 2137 2138 r = writecache_alloc_entries(wc); 2139 if (r) 2140 return r; 2141 2142 for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++) 2143 pmem_assign(sb(wc)->padding[b], cpu_to_le64(0)); 2144 pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION)); 2145 pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size)); 2146 pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks)); 2147 pmem_assign(sb(wc)->seq_count, cpu_to_le64(0)); 2148 2149 for (b = 0; b < wc->n_blocks; b++) { 2150 write_original_sector_seq_count(wc, &wc->entries[b], -1, -1); 2151 cond_resched(); 2152 } 2153 2154 writecache_flush_all_metadata(wc); 2155 writecache_commit_flushed(wc, 
false); 2156 pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC)); 2157 writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic); 2158 writecache_commit_flushed(wc, false); 2159 2160 return 0; 2161 } 2162 2163 static void writecache_dtr(struct dm_target *ti) 2164 { 2165 struct dm_writecache *wc = ti->private; 2166 2167 if (!wc) 2168 return; 2169 2170 if (wc->endio_thread) 2171 kthread_stop(wc->endio_thread); 2172 2173 if (wc->flush_thread) 2174 kthread_stop(wc->flush_thread); 2175 2176 bioset_exit(&wc->bio_set); 2177 2178 mempool_exit(&wc->copy_pool); 2179 2180 if (wc->writeback_wq) 2181 destroy_workqueue(wc->writeback_wq); 2182 2183 if (wc->dev) 2184 dm_put_device(ti, wc->dev); 2185 2186 if (wc->ssd_dev) 2187 dm_put_device(ti, wc->ssd_dev); 2188 2189 vfree(wc->entries); 2190 2191 if (wc->memory_map) { 2192 if (WC_MODE_PMEM(wc)) 2193 persistent_memory_release(wc); 2194 else 2195 vfree(wc->memory_map); 2196 } 2197 2198 if (wc->dm_kcopyd) 2199 dm_kcopyd_client_destroy(wc->dm_kcopyd); 2200 2201 if (wc->dm_io) 2202 dm_io_client_destroy(wc->dm_io); 2203 2204 vfree(wc->dirty_bitmap); 2205 2206 kfree(wc); 2207 } 2208 2209 static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2210 { 2211 struct dm_writecache *wc; 2212 struct dm_arg_set as; 2213 const char *string; 2214 unsigned opt_params; 2215 size_t offset, data_size; 2216 int i, r; 2217 char dummy; 2218 int high_wm_percent = HIGH_WATERMARK; 2219 int low_wm_percent = LOW_WATERMARK; 2220 uint64_t x; 2221 struct wc_memory_superblock s; 2222 2223 static struct dm_arg _args[] = { 2224 {0, 18, "Invalid number of feature args"}, 2225 }; 2226 2227 as.argc = argc; 2228 as.argv = argv; 2229 2230 wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL); 2231 if (!wc) { 2232 ti->error = "Cannot allocate writecache structure"; 2233 r = -ENOMEM; 2234 goto bad; 2235 } 2236 ti->private = wc; 2237 wc->ti = ti; 2238 2239 mutex_init(&wc->lock); 2240 wc->max_age = MAX_AGE_UNSPECIFIED; 2241 
	/* (writecache_ctr() continued from the previous chunk) */
	writecache_poison_lists(wc);
	init_waitqueue_head(&wc->freelist_wait);
	timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
	timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);

	/* One in-progress counter/waitqueue pair each for READ and WRITE. */
	for (i = 0; i < 2; i++) {
		atomic_set(&wc->bio_in_progress[i], 0);
		init_waitqueue_head(&wc->bio_in_progress_wait[i]);
	}

	wc->dm_io = dm_io_client_create();
	if (IS_ERR(wc->dm_io)) {
		r = PTR_ERR(wc->dm_io);
		ti->error = "Unable to allocate dm-io client";
		wc->dm_io = NULL;	/* so the dtr doesn't destroy an ERR_PTR */
		goto bad;
	}

	wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
	if (!wc->writeback_wq) {
		r = -ENOMEM;
		ti->error = "Could not allocate writeback workqueue";
		goto bad;
	}
	INIT_WORK(&wc->writeback_work, writecache_writeback);
	INIT_WORK(&wc->flush_work, writecache_flush_work);

	dm_iot_init(&wc->iot);

	raw_spin_lock_init(&wc->endio_list_lock);
	INIT_LIST_HEAD(&wc->endio_list);
	wc->endio_thread = kthread_run(writecache_endio_thread, wc, "writecache_endio");
	if (IS_ERR(wc->endio_thread)) {
		r = PTR_ERR(wc->endio_thread);
		wc->endio_thread = NULL;	/* keep dtr's kthread_stop() safe */
		ti->error = "Couldn't spawn endio thread";
		goto bad;
	}

	/*
	 * Parse the mode (pmem or ssd)
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;

	if (!strcasecmp(string, "s")) {
		wc->pmem_mode = false;
	} else if (!strcasecmp(string, "p")) {
#ifdef DM_WRITECACHE_HAS_PMEM
		wc->pmem_mode = true;
		wc->writeback_fua = true;	/* FUA writeback is the pmem default */
#else
		/*
		 * If the architecture doesn't support persistent memory or
		 * the kernel doesn't support any DAX drivers, this driver can
		 * only be used in SSD-only mode.
		 */
		r = -EOPNOTSUPP;
		ti->error = "Persistent memory or DAX not supported on this system";
		goto bad;
#endif
	} else {
		goto bad_arguments;
	}

	/* Mode-specific writeback machinery: bios for pmem, kcopyd for SSD. */
	if (WC_MODE_PMEM(wc)) {
		r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
				offsetof(struct writeback_struct, bio),
				BIOSET_NEED_BVECS);
		if (r) {
			ti->error = "Could not allocate bio set";
			goto bad;
		}
	} else {
		wc->pause = PAUSE_WRITEBACK;
		r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
		if (r) {
			ti->error = "Could not allocate mempool";
			goto bad;
		}
	}

	/*
	 * Parse the origin data device
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;
	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
	if (r) {
		ti->error = "Origin data device lookup failed";
		goto bad;
	}

	/*
	 * Parse cache data device (be it pmem or ssd)
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;

	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
	if (r) {
		ti->error = "Cache data device lookup failed";
		goto bad;
	}
	wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev);

	/*
	 * Parse the cache block size
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;
	/* Must be a power of two in [512, PAGE_SIZE]. */
	if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
	    wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
	    (wc->block_size & (wc->block_size - 1))) {
		r = -EINVAL;
		ti->error = "Invalid block size";
		goto bad;
	}
	if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) ||
	    wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) {
		r = -EINVAL;
		ti->error = "Block size is smaller than device logical block size";
		goto bad;
	}
	wc->block_size_bits = __ffs(wc->block_size);

	wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
	wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
	wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);

	/*
	 * Parse optional arguments
	 */
	r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
	if (r)
		goto bad;

	/*
	 * Each keyword consumes one counted arg; keywords taking a value
	 * require (and consume) a second one, hence the "opt_params >= 1"
	 * guards and the "dm_shift_arg(&as), opt_params--" pairs.
	 * The *_set / *_value members let writecache_status() echo back
	 * exactly what the user specified.
	 */
	while (opt_params) {
		string = dm_shift_arg(&as), opt_params--;
		if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
			unsigned long long start_sector;
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
				goto invalid_optional;
			wc->start_sector = start_sector;
			wc->start_sector_set = true;
			/* The != test catches truncation if sector_t is narrower. */
			if (wc->start_sector != start_sector ||
			    wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
				goto invalid_optional;
		} else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
				goto invalid_optional;
			if (high_wm_percent < 0 || high_wm_percent > 100)
				goto invalid_optional;
			wc->high_wm_percent_value = high_wm_percent;
			wc->high_wm_percent_set = true;
		} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
				goto invalid_optional;
			if (low_wm_percent < 0 || low_wm_percent > 100)
				goto invalid_optional;
			wc->low_wm_percent_value = low_wm_percent;
			wc->low_wm_percent_set = true;
		} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
				goto invalid_optional;
			wc->max_writeback_jobs_set = true;
		} else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
				goto invalid_optional;
			wc->autocommit_blocks_set = true;
		} else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
			unsigned autocommit_msecs;
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
				goto invalid_optional;
			if (autocommit_msecs > 3600000)	/* cap: one hour */
				goto invalid_optional;
			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
			wc->autocommit_time_value = autocommit_msecs;
			wc->autocommit_time_set = true;
		} else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
			unsigned max_age_msecs;
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
				goto invalid_optional;
			if (max_age_msecs > 86400000)	/* cap: one day */
				goto invalid_optional;
			wc->max_age = msecs_to_jiffies(max_age_msecs);
			wc->max_age_set = true;
			wc->max_age_value = max_age_msecs;
		} else if (!strcasecmp(string, "cleaner")) {
			wc->cleaner_set = true;
			wc->cleaner = true;
		} else if (!strcasecmp(string, "fua")) {
			/* fua/nofua are only meaningful in pmem mode. */
			if (WC_MODE_PMEM(wc)) {
				wc->writeback_fua = true;
				wc->writeback_fua_set = true;
			} else goto invalid_optional;
		} else if (!strcasecmp(string, "nofua")) {
			if (WC_MODE_PMEM(wc)) {
				wc->writeback_fua = false;
				wc->writeback_fua_set = true;
			} else goto invalid_optional;
		} else if (!strcasecmp(string, "metadata_only")) {
			wc->metadata_only = true;
		} else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
			unsigned pause_msecs;
			if (WC_MODE_PMEM(wc))	/* SSD mode only */
				goto invalid_optional;
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
				goto invalid_optional;
			if (pause_msecs > 60000)	/* cap: one minute */
				goto invalid_optional;
			wc->pause = msecs_to_jiffies(pause_msecs);
			wc->pause_set = true;
			wc->pause_value = pause_msecs;
		} else {
invalid_optional:
			r = -EINVAL;
			ti->error = "Invalid optional argument";
			goto bad;
		}
	}

	if (high_wm_percent < low_wm_percent) {
		r = -EINVAL;
		ti->error = "High watermark must be greater than or equal to low watermark";
		goto bad;
	}

	if (WC_MODE_PMEM(wc)) {
		/* Only synchronous (flush-free) pmem may back the cache. */
		if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
			r = -EOPNOTSUPP;
			ti->error = "Asynchronous persistent memory not supported as pmem cache";
			goto bad;
		}

		r = persistent_memory_claim(wc);
		if (r) {
			ti->error = "Unable to map persistent memory for cache";
			goto bad;
		}
	} else {
		size_t n_blocks, n_metadata_blocks;
		uint64_t n_bitmap_bits;

		wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;

		bio_list_init(&wc->flush_list);
		wc->flush_thread = kthread_run(writecache_flush_thread, wc, "dm_writecache_flush");
		if (IS_ERR(wc->flush_thread)) {
			r = PTR_ERR(wc->flush_thread);
			wc->flush_thread = NULL;	/* keep dtr's kthread_stop() safe */
			ti->error = "Couldn't spawn flush thread";
			goto bad;
		}

		r = calculate_memory_size(wc->memory_map_size, wc->block_size,
					  &n_blocks, &n_metadata_blocks);
		if (r) {
			ti->error = "Invalid device size";
			goto bad;
		}

		/* One dirty bit per BITMAP_GRANULARITY bytes of metadata. */
		n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
				 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
		/* this is limitation of test_bit functions */
		if (n_bitmap_bits > 1U << 31) {
			r = -EFBIG;
			ti->error = "Invalid device size";
			goto bad;
		}

		/* In SSD mode the metadata lives in an in-RAM shadow copy. */
		wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
		if (!wc->memory_map) {
			r = -ENOMEM;
			ti->error = "Unable to allocate memory for metadata";
			goto bad;
		}

		wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
		if (IS_ERR(wc->dm_kcopyd)) {
			r = PTR_ERR(wc->dm_kcopyd);
			ti->error = "Unable to allocate dm-kcopyd client";
			wc->dm_kcopyd = NULL;	/* keep dtr's destroy call safe */
			goto bad;
		}

		wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
		wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
			BITS_PER_LONG * sizeof(unsigned long);
		wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
		if (!wc->dirty_bitmap) {
			r = -ENOMEM;
			ti->error = "Unable to allocate dirty bitmap";
			goto bad;
		}

		r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
		if (r) {
			ti->error = "Unable to read first block of metadata";
			goto bad;
		}
	}

	/* copy_mc_to_kernel() survives machine-check (hardware memory) errors. */
	r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
	if (r) {
		ti->error = "Hardware memory error when reading superblock";
		goto bad;
	}
	/* Both magic and version zero => pristine device: format it. */
	if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
		r = init_memory(wc);
		if (r) {
			ti->error = "Unable to initialize device";
			goto bad;
		}
		r = copy_mc_to_kernel(&s, sb(wc),
				      sizeof(struct wc_memory_superblock));
		if (r) {
			ti->error = "Hardware memory error when reading superblock";
			goto bad;
		}
	}

	if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
		ti->error = "Invalid magic in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
		ti->error = "Invalid version in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.block_size) != wc->block_size) {
		ti->error = "Block size does not match superblock";
		r = -EINVAL;
		goto bad;
	}

	wc->n_blocks = le64_to_cpu(s.n_blocks);

	/*
	 * Compute the data offset (metadata entries + superblock, rounded up
	 * to a block boundary) with explicit overflow checks at every step —
	 * n_blocks comes from on-disk/on-pmem data and is untrusted.
	 */
	offset = wc->n_blocks * sizeof(struct wc_memory_entry);
	if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
overflow:
		ti->error = "Overflow in size calculation";
		r = -EINVAL;
		goto bad;
	}
	offset += sizeof(struct wc_memory_superblock);
	if (offset < sizeof(struct wc_memory_superblock))
		goto overflow;
	offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
	data_size = wc->n_blocks * (size_t)wc->block_size;
	if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
	    (offset + data_size < offset))
		goto overflow;
	if (offset + data_size > wc->memory_map_size) {
		ti->error = "Memory area is too small";
		r = -EINVAL;
		goto bad;
	}

	wc->metadata_sectors = offset >> SECTOR_SHIFT;
	wc->block_start = (char *)sb(wc) + offset;

	/* Watermarks: free-block thresholds = n_blocks * (100 - pct) / 100,
	 * with +50 for round-to-nearest. */
	x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_high_watermark = x;
	x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_low_watermark = x;

	if (wc->cleaner)
		activate_cleaner(wc);

	r = writecache_alloc_entries(wc);
	if (r) {
		ti->error = "Cannot allocate memory";
		goto bad;
	}

	/* Ternary completes on the next source line: pmem 1, SSD mode 2. */
	ti->num_flush_bios = WC_MODE_PMEM(wc) ?
		1 : 2;	/* completes ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2 */
	ti->flush_supported = true;
	ti->num_discard_bios = 1;

	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	return 0;

bad_arguments:
	r = -EINVAL;
	ti->error = "Bad arguments";
bad:
	/* The destructor tolerates a partially-constructed wc. */
	writecache_dtr(ti);
	return r;
}

/*
 * writecache_status - report target status.
 *
 * STATUSTYPE_INFO:  error flag, block counts, and the statistics counters.
 * STATUSTYPE_TABLE: reconstruct the table line; only options the user
 *                   explicitly set (tracked via the *_set flags in the ctr)
 *                   are echoed back, using the originally-supplied *_value
 *                   numbers rather than the converted jiffies.
 * STATUSTYPE_IMA:   deliberately empty.
 */
static void writecache_status(struct dm_target *ti, status_type_t type,
			      unsigned status_flags, char *result, unsigned maxlen)
{
	struct dm_writecache *wc = ti->private;
	unsigned extra_args;
	unsigned sz = 0;	/* cursor into result[], used implicitly by DMEMIT() */

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
		       writecache_has_error(wc),
		       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
		       (unsigned long long)wc->writeback_size,
		       wc->stats.reads,
		       wc->stats.read_hits,
		       wc->stats.writes,
		       wc->stats.write_hits_uncommitted,
		       wc->stats.write_hits_committed,
		       wc->stats.writes_around,
		       wc->stats.writes_allocate,
		       wc->stats.writes_blocked_on_freelist,
		       wc->stats.flushes,
		       wc->stats.discards);
		break;
	case STATUSTYPE_TABLE:
		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
		       wc->dev->name, wc->ssd_dev->name, wc->block_size);
		/*
		 * Count the optional args first so the emitted count matches
		 * the emits below: value-carrying options count as 2 (keyword
		 * + value), bare flags count as 1.
		 */
		extra_args = 0;
		if (wc->start_sector_set)
			extra_args += 2;
		if (wc->high_wm_percent_set)
			extra_args += 2;
		if (wc->low_wm_percent_set)
			extra_args += 2;
		if (wc->max_writeback_jobs_set)
			extra_args += 2;
		if (wc->autocommit_blocks_set)
			extra_args += 2;
		if (wc->autocommit_time_set)
			extra_args += 2;
		if (wc->max_age_set)
			extra_args += 2;
		if (wc->cleaner_set)
			extra_args++;
		if (wc->writeback_fua_set)
			extra_args++;
		if (wc->metadata_only)
			extra_args++;
		if (wc->pause_set)
			extra_args += 2;

		DMEMIT("%u", extra_args);
		if (wc->start_sector_set)
			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
		if (wc->high_wm_percent_set)
			DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
		if (wc->low_wm_percent_set)
			DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
		if (wc->max_writeback_jobs_set)
			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
		if (wc->autocommit_blocks_set)
			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
		if (wc->autocommit_time_set)
			DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
		if (wc->max_age_set)
			DMEMIT(" max_age %u", wc->max_age_value);
		if (wc->cleaner_set)
			DMEMIT(" cleaner");
		if (wc->writeback_fua_set)
			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
		if (wc->metadata_only)
			DMEMIT(" metadata_only");
		if (wc->pause_set)
			DMEMIT(" pause_writeback %u", wc->pause_value);
		break;
	case STATUSTYPE_IMA:
		*result = '\0';
		break;
	}
}

/* Target-type descriptor wiring the writecache callbacks into device-mapper. */
static struct target_type writecache_target = {
	.name			= "writecache",
	.version		= {1, 6, 0},
	.module			= THIS_MODULE,
	.ctr			= writecache_ctr,
	.dtr			= writecache_dtr,
	.status			= writecache_status,
	.postsuspend		= writecache_suspend,
	.resume			= writecache_resume,
	.message		= writecache_message,
	.map			= writecache_map,
	.end_io			= writecache_end_io,
	.iterate_devices	= writecache_iterate_devices,
	.io_hints		= writecache_io_hints,
};

/* Module init: register the target with the device-mapper core. */
static int __init dm_writecache_init(void)
{
	int r;

	r = dm_register_target(&writecache_target);
	if (r < 0) {
		DMERR("register failed %d", r);
		return r;
	}

	return 0;
}

/* Module exit: unregister the target. */
static void __exit dm_writecache_exit(void)
{
	dm_unregister_target(&writecache_target);
}

module_init(dm_writecache_init);
module_exit(dm_writecache_exit);

MODULE_DESCRIPTION(DM_NAME " writecache target");
MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");