// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2018 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/dax.h>
#include <linux/pfn_t.h>
#include <linux/libnvdimm.h>

#define DM_MSG_PREFIX "writecache"

#define HIGH_WATERMARK			50
#define LOW_WATERMARK			45
#define MAX_WRITEBACK_JOBS		0
#define ENDIO_LATENCY			16
#define WRITEBACK_LATENCY		64
#define AUTOCOMMIT_BLOCKS_SSD		65536
#define AUTOCOMMIT_BLOCKS_PMEM		64
#define AUTOCOMMIT_MSEC			1000

#define BITMAP_GRANULARITY	65536
#if BITMAP_GRANULARITY < PAGE_SIZE
#undef BITMAP_GRANULARITY
#define BITMAP_GRANULARITY	PAGE_SIZE
#endif

#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
#define DM_WRITECACHE_HAS_PMEM
#endif

#ifdef DM_WRITECACHE_HAS_PMEM
#define pmem_assign(dest, src)					\
do {								\
	typeof(dest) uniq = (src);				\
	memcpy_flushcache(&(dest), &uniq, sizeof(dest));	\
} while (0)
#else
#define pmem_assign(dest, src)	((dest) = (src))
#endif

#if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM)
#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
#endif

#define MEMORY_SUPERBLOCK_MAGIC		0x23489321
#define MEMORY_SUPERBLOCK_VERSION	1

struct wc_memory_entry {
	__le64 original_sector;
	__le64 seq_count;
};

struct wc_memory_superblock {
	union {
		struct {
			__le32 magic;
			__le32 version;
			__le32 block_size;
			__le32 pad;
			__le64 n_blocks;
			__le64 seq_count;
		};
		__le64 padding[8];
	};
	struct wc_memory_entry entries[0];
};

struct wc_entry {
	struct rb_node rb_node;
	struct list_head lru;
	unsigned short wc_list_contiguous;
	bool write_in_progress
#if BITS_PER_LONG == 64
		:1
#endif
	;
	unsigned long index
#if BITS_PER_LONG == 64
		:47
#endif
	;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	uint64_t original_sector;
	uint64_t seq_count;
#endif
};

#ifdef DM_WRITECACHE_HAS_PMEM
#define WC_MODE_PMEM(wc)		((wc)->pmem_mode)
#define WC_MODE_FUA(wc)			((wc)->writeback_fua)
#else
#define WC_MODE_PMEM(wc)		false
#define WC_MODE_FUA(wc)			false
#endif
#define WC_MODE_SORT_FREELIST(wc)	(!WC_MODE_PMEM(wc))

struct dm_writecache {
	struct mutex lock;
	struct list_head lru;
	union {
		struct list_head freelist;
		struct {
			struct rb_root freetree;
			struct wc_entry *current_free;
		};
	};
	struct rb_root tree;

	size_t freelist_size;
	size_t writeback_size;
	size_t freelist_high_watermark;
	size_t freelist_low_watermark;

	unsigned uncommitted_blocks;
	unsigned autocommit_blocks;
	unsigned max_writeback_jobs;

	int error;

	unsigned long autocommit_jiffies;
	struct timer_list autocommit_timer;
	struct wait_queue_head freelist_wait;

	atomic_t bio_in_progress[2];
	struct wait_queue_head bio_in_progress_wait[2];

	struct dm_target *ti;
	struct dm_dev *dev;
	struct dm_dev *ssd_dev;
	sector_t start_sector;
	void *memory_map;
	uint64_t memory_map_size;
	size_t metadata_sectors;
	size_t n_blocks;
	uint64_t seq_count;
	void *block_start;
	struct wc_entry *entries;
	unsigned block_size;
	unsigned char block_size_bits;

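	/*
	 * Operating mode and state bits; the *_set flags record which optional
	 * table arguments were given explicitly (reported back by status).
	 */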
	bool pmem_mode:1;
	bool writeback_fua:1;

	bool overwrote_committed:1;
	bool memory_vmapped:1;

	bool high_wm_percent_set:1;
	bool low_wm_percent_set:1;
	bool max_writeback_jobs_set:1;
	bool autocommit_blocks_set:1;
	bool autocommit_time_set:1;
	bool writeback_fua_set:1;
	bool flush_on_suspend:1;

	unsigned writeback_all;
	struct workqueue_struct *writeback_wq;
	struct work_struct writeback_work;
	struct work_struct flush_work;

	struct dm_io_client *dm_io;

	raw_spinlock_t endio_list_lock;
	struct list_head endio_list;
	struct task_struct *endio_thread;

	struct task_struct *flush_thread;
	struct bio_list flush_list;

	struct dm_kcopyd_client *dm_kcopyd;
	unsigned long *dirty_bitmap;
	unsigned dirty_bitmap_size;

	struct bio_set bio_set;
	mempool_t copy_pool;
};

#define WB_LIST_INLINE		16

struct writeback_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry **wc_list;
	unsigned wc_list_n;
	unsigned page_offset;
	struct page *page;
	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
	struct bio bio;
};

struct copy_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry *e;
	unsigned n_entries;
	int error;
};

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
					    "A percentage of time allocated for data copying");

static void wc_lock(struct dm_writecache *wc)
{
	mutex_lock(&wc->lock);
}

static void wc_unlock(struct dm_writecache *wc)
{
	mutex_unlock(&wc->lock);
}

#ifdef DM_WRITECACHE_HAS_PMEM
static int persistent_memory_claim(struct dm_writecache *wc)
{
	int r;
	loff_t s;
	long p, da;
	pfn_t pfn;
	int id;
	struct page **pages;

	wc->memory_vmapped = false;

	if (!wc->ssd_dev->dax_dev) {
		r = -EOPNOTSUPP;
		goto err1;
	}
	s = wc->memory_map_size;
	p = s >> PAGE_SHIFT;
	if (!p) {
		r = -EINVAL;
		goto err1;
	}
	if (p != s >> PAGE_SHIFT) {
		r = -EOVERFLOW;
		goto err1;
	}

	id = dax_read_lock();

	da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn);
	if (da < 0) {
		wc->memory_map = NULL;
		r = da;
		goto err2;
	}
	if (!pfn_t_has_page(pfn)) {
		wc->memory_map = NULL;
		r = -EOPNOTSUPP;
		goto err2;
	}
	if (da != p) {
		long i;
		wc->memory_map = NULL;
		pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
		if (!pages) {
			r = -ENOMEM;
			goto err2;
		}
		i = 0;
		do {
			long daa;
			void *dummy_addr;
			daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i,
						&dummy_addr, &pfn);
			if (daa <= 0) {
				r = daa ?
					daa : -EINVAL;
				goto err3;
			}
			if (!pfn_t_has_page(pfn)) {
				r = -EOPNOTSUPP;
				goto err3;
			}
			while (daa-- && i < p) {
				pages[i++] = pfn_t_to_page(pfn);
				pfn.val++;
			}
		} while (i < p);
		wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
		if (!wc->memory_map) {
			r = -ENOMEM;
			goto err3;
		}
		kvfree(pages);
		wc->memory_vmapped = true;
	}

	dax_read_unlock(id);

	wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
	wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;

	return 0;
err3:
	kvfree(pages);
err2:
	dax_read_unlock(id);
err1:
	return r;
}
#else
static int persistent_memory_claim(struct dm_writecache *wc)
{
	BUG();
}
#endif

static void persistent_memory_release(struct dm_writecache *wc)
{
	if (wc->memory_vmapped)
		vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
}

static struct page *persistent_memory_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	else
		return virt_to_page(addr);
}

static unsigned persistent_memory_page_offset(void *addr)
{
	return (unsigned long)addr & (PAGE_SIZE - 1);
}

static void persistent_memory_flush_cache(void *ptr, size_t size)
{
	if (is_vmalloc_addr(ptr))
		flush_kernel_vmap_range(ptr, size);
}

static void persistent_memory_invalidate_cache(void *ptr, size_t size)
{
	if (is_vmalloc_addr(ptr))
		invalidate_kernel_vmap_range(ptr, size);
}

static struct wc_memory_superblock *sb(struct dm_writecache *wc)
{
	return wc->memory_map;
}

static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	if (is_power_of_2(sizeof(struct wc_entry)) && 0)
		return &sb(wc)->entries[e - wc->entries];
	else
		return &sb(wc)->entries[e->index];
}

static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
{
	return (char *)wc->block_start + (e->index << wc->block_size_bits);
}

static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
{
	return wc->start_sector + wc->metadata_sectors +
		((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
}

static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	return e->original_sector;
#else
	return le64_to_cpu(memory_entry(wc, e)->original_sector);
#endif
}

static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	return e->seq_count;
#else
	return le64_to_cpu(memory_entry(wc, e)->seq_count);
#endif
}

static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->seq_count = -1;
#endif
	pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
}

static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
					    uint64_t original_sector, uint64_t seq_count)
{
	struct wc_memory_entry me;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->original_sector = original_sector;
	e->seq_count = seq_count;
#endif
	me.original_sector = cpu_to_le64(original_sector);
	me.seq_count = cpu_to_le64(seq_count);
	pmem_assign(*memory_entry(wc, e),
		    me);
}

#define writecache_error(wc, err, msg, arg...)			\
do {								\
	if (!cmpxchg(&(wc)->error, 0, err))			\
		DMERR(msg, ##arg);				\
	wake_up(&(wc)->freelist_wait);				\
} while (0)

#define writecache_has_error(wc)	(unlikely(READ_ONCE((wc)->error)))

static void writecache_flush_all_metadata(struct dm_writecache *wc)
{
	if (!WC_MODE_PMEM(wc))
		memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
}

static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
{
	if (!WC_MODE_PMEM(wc))
		__set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
			  wc->dirty_bitmap);
}

static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);

struct io_notify {
	struct dm_writecache *wc;
	struct completion c;
	atomic_t count;
};

static void writecache_notify_io(unsigned long error, void *context)
{
	struct io_notify *endio = context;

	if (unlikely(error != 0))
		writecache_error(endio->wc, -EIO, "error writing metadata");
	BUG_ON(atomic_read(&endio->count) <= 0);
	if (atomic_dec_and_test(&endio->count))
		complete(&endio->c);
}

static void ssd_commit_flushed(struct dm_writecache *wc)
{
	struct dm_io_region region;
	struct dm_io_request req;
	struct io_notify endio = {
		wc,
		COMPLETION_INITIALIZER_ONSTACK(endio.c),
		ATOMIC_INIT(1),
	};
	unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
	unsigned i = 0;

	while (1) {
		unsigned j;
		i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
		if (unlikely(i == bitmap_bits))
			break;
		j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);

		region.bdev = wc->ssd_dev->bdev;
		region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
		region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);

		if (unlikely(region.sector >= wc->metadata_sectors))
			break;
		if (unlikely(region.sector + region.count > wc->metadata_sectors))
			region.count = wc->metadata_sectors - region.sector;

		region.sector += wc->start_sector;
		atomic_inc(&endio.count);
		req.bi_op = REQ_OP_WRITE;
		req.bi_op_flags = REQ_SYNC;
		req.mem.type = DM_IO_VMA;
		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
		req.client = wc->dm_io;
		req.notify.fn = writecache_notify_io;
		req.notify.context = &endio;

		/* writing via async dm-io (implied by notify.fn above) won't return an error */
		(void) dm_io(&req, 1, &region, NULL);
		i = j;
	}

	writecache_notify_io(0, &endio);
	wait_for_completion_io(&endio.c);

	writecache_disk_flush(wc, wc->ssd_dev);

	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
}

static void writecache_commit_flushed(struct dm_writecache *wc)
{
	if (WC_MODE_PMEM(wc))
		wmb();
	else
		ssd_commit_flushed(wc);
}

static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
{
	int r;
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = dev->bdev;
	region.sector = 0;
	region.count = 0;
	req.bi_op = REQ_OP_WRITE;
	req.bi_op_flags = REQ_PREFLUSH;
	req.mem.type = DM_IO_KMEM;
	req.mem.ptr.addr = NULL;
	req.client = wc->dm_io;
	req.notify.fn = NULL;

	r = dm_io(&req, 1, &region, NULL);
	if (unlikely(r))
		writecache_error(wc, r, "error flushing metadata: %d", r);
}

static void
writecache_wait_for_ios(struct dm_writecache *wc, int direction) 532 { 533 wait_event(wc->bio_in_progress_wait[direction], 534 !atomic_read(&wc->bio_in_progress[direction])); 535 } 536 537 #define WFE_RETURN_FOLLOWING 1 538 #define WFE_LOWEST_SEQ 2 539 540 static struct wc_entry *writecache_find_entry(struct dm_writecache *wc, 541 uint64_t block, int flags) 542 { 543 struct wc_entry *e; 544 struct rb_node *node = wc->tree.rb_node; 545 546 if (unlikely(!node)) 547 return NULL; 548 549 while (1) { 550 e = container_of(node, struct wc_entry, rb_node); 551 if (read_original_sector(wc, e) == block) 552 break; 553 node = (read_original_sector(wc, e) >= block ? 554 e->rb_node.rb_left : e->rb_node.rb_right); 555 if (unlikely(!node)) { 556 if (!(flags & WFE_RETURN_FOLLOWING)) { 557 return NULL; 558 } 559 if (read_original_sector(wc, e) >= block) { 560 break; 561 } else { 562 node = rb_next(&e->rb_node); 563 if (unlikely(!node)) { 564 return NULL; 565 } 566 e = container_of(node, struct wc_entry, rb_node); 567 break; 568 } 569 } 570 } 571 572 while (1) { 573 struct wc_entry *e2; 574 if (flags & WFE_LOWEST_SEQ) 575 node = rb_prev(&e->rb_node); 576 else 577 node = rb_next(&e->rb_node); 578 if (!node) 579 return e; 580 e2 = container_of(node, struct wc_entry, rb_node); 581 if (read_original_sector(wc, e2) != block) 582 return e; 583 e = e2; 584 } 585 } 586 587 static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins) 588 { 589 struct wc_entry *e; 590 struct rb_node **node = &wc->tree.rb_node, *parent = NULL; 591 592 while (*node) { 593 e = container_of(*node, struct wc_entry, rb_node); 594 parent = &e->rb_node; 595 if (read_original_sector(wc, e) > read_original_sector(wc, ins)) 596 node = &parent->rb_left; 597 else 598 node = &parent->rb_right; 599 } 600 rb_link_node(&ins->rb_node, parent, node); 601 rb_insert_color(&ins->rb_node, &wc->tree); 602 list_add(&ins->lru, &wc->lru); 603 } 604 605 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e) 606 { 607 list_del(&e->lru); 608 rb_erase(&e->rb_node, &wc->tree); 609 } 610 611 static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e) 612 { 613 if (WC_MODE_SORT_FREELIST(wc)) { 614 struct rb_node **node = &wc->freetree.rb_node, *parent = NULL; 615 if (unlikely(!*node)) 616 wc->current_free = e; 617 while (*node) { 618 parent = *node; 619 if (&e->rb_node < *node) 620 node = &parent->rb_left; 621 else 622 node = &parent->rb_right; 623 } 624 rb_link_node(&e->rb_node, parent, node); 625 rb_insert_color(&e->rb_node, &wc->freetree); 626 } else { 627 list_add_tail(&e->lru, &wc->freelist); 628 } 629 wc->freelist_size++; 630 } 631 632 static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc) 633 { 634 struct wc_entry *e; 635 636 if (WC_MODE_SORT_FREELIST(wc)) { 637 struct rb_node *next; 638 if (unlikely(!wc->current_free)) 639 return NULL; 640 e = wc->current_free; 641 next = rb_next(&e->rb_node); 642 rb_erase(&e->rb_node, &wc->freetree); 643 if (unlikely(!next)) 644 next = rb_first(&wc->freetree); 645 wc->current_free = next ? 
container_of(next, struct wc_entry, rb_node) : NULL; 646 } else { 647 if (unlikely(list_empty(&wc->freelist))) 648 return NULL; 649 e = container_of(wc->freelist.next, struct wc_entry, lru); 650 list_del(&e->lru); 651 } 652 wc->freelist_size--; 653 if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark)) 654 queue_work(wc->writeback_wq, &wc->writeback_work); 655 656 return e; 657 } 658 659 static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e) 660 { 661 writecache_unlink(wc, e); 662 writecache_add_to_freelist(wc, e); 663 clear_seq_count(wc, e); 664 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 665 if (unlikely(waitqueue_active(&wc->freelist_wait))) 666 wake_up(&wc->freelist_wait); 667 } 668 669 static void writecache_wait_on_freelist(struct dm_writecache *wc) 670 { 671 DEFINE_WAIT(wait); 672 673 prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE); 674 wc_unlock(wc); 675 io_schedule(); 676 finish_wait(&wc->freelist_wait, &wait); 677 wc_lock(wc); 678 } 679 680 static void writecache_poison_lists(struct dm_writecache *wc) 681 { 682 /* 683 * Catch incorrect access to these values while the device is suspended. 684 */ 685 memset(&wc->tree, -1, sizeof wc->tree); 686 wc->lru.next = LIST_POISON1; 687 wc->lru.prev = LIST_POISON2; 688 wc->freelist.next = LIST_POISON1; 689 wc->freelist.prev = LIST_POISON2; 690 } 691 692 static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e) 693 { 694 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 695 if (WC_MODE_PMEM(wc)) 696 writecache_flush_region(wc, memory_data(wc, e), wc->block_size); 697 } 698 699 static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e) 700 { 701 return read_seq_count(wc, e) < wc->seq_count; 702 } 703 704 static void writecache_flush(struct dm_writecache *wc) 705 { 706 struct wc_entry *e, *e2; 707 bool need_flush_after_free; 708 709 wc->uncommitted_blocks = 0; 710 del_timer(&wc->autocommit_timer); 711 712 if (list_empty(&wc->lru)) 713 return; 714 715 e = container_of(wc->lru.next, struct wc_entry, lru); 716 if (writecache_entry_is_committed(wc, e)) { 717 if (wc->overwrote_committed) { 718 writecache_wait_for_ios(wc, WRITE); 719 writecache_disk_flush(wc, wc->ssd_dev); 720 wc->overwrote_committed = false; 721 } 722 return; 723 } 724 while (1) { 725 writecache_flush_entry(wc, e); 726 if (unlikely(e->lru.next == &wc->lru)) 727 break; 728 e2 = container_of(e->lru.next, struct wc_entry, lru); 729 if (writecache_entry_is_committed(wc, e2)) 730 break; 731 e = e2; 732 cond_resched(); 733 } 734 writecache_commit_flushed(wc); 735 736 writecache_wait_for_ios(wc, WRITE); 737 738 wc->seq_count++; 739 pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); 740 writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count); 741 writecache_commit_flushed(wc); 742 743 wc->overwrote_committed = false; 744 745 need_flush_after_free = false; 746 while (1) { 747 /* Free another committed entry with lower seq-count */ 748 struct rb_node *rb_node = rb_prev(&e->rb_node); 749 750 if (rb_node) { 751 e2 = container_of(rb_node, struct wc_entry, rb_node); 752 if (read_original_sector(wc, e2) == read_original_sector(wc, e) && 753 likely(!e2->write_in_progress)) { 754 writecache_free_entry(wc, e2); 755 need_flush_after_free = true; 756 } 757 } 758 if (unlikely(e->lru.prev == &wc->lru)) 759 break; 760 e = container_of(e->lru.prev, struct wc_entry, lru); 761 
cond_resched(); 762 } 763 764 if (need_flush_after_free) 765 writecache_commit_flushed(wc); 766 } 767 768 static void writecache_flush_work(struct work_struct *work) 769 { 770 struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work); 771 772 wc_lock(wc); 773 writecache_flush(wc); 774 wc_unlock(wc); 775 } 776 777 static void writecache_autocommit_timer(struct timer_list *t) 778 { 779 struct dm_writecache *wc = from_timer(wc, t, autocommit_timer); 780 if (!writecache_has_error(wc)) 781 queue_work(wc->writeback_wq, &wc->flush_work); 782 } 783 784 static void writecache_schedule_autocommit(struct dm_writecache *wc) 785 { 786 if (!timer_pending(&wc->autocommit_timer)) 787 mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies); 788 } 789 790 static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end) 791 { 792 struct wc_entry *e; 793 bool discarded_something = false; 794 795 e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ); 796 if (unlikely(!e)) 797 return; 798 799 while (read_original_sector(wc, e) < end) { 800 struct rb_node *node = rb_next(&e->rb_node); 801 802 if (likely(!e->write_in_progress)) { 803 if (!discarded_something) { 804 writecache_wait_for_ios(wc, READ); 805 writecache_wait_for_ios(wc, WRITE); 806 discarded_something = true; 807 } 808 writecache_free_entry(wc, e); 809 } 810 811 if (!node) 812 break; 813 814 e = container_of(node, struct wc_entry, rb_node); 815 } 816 817 if (discarded_something) 818 writecache_commit_flushed(wc); 819 } 820 821 static bool writecache_wait_for_writeback(struct dm_writecache *wc) 822 { 823 if (wc->writeback_size) { 824 writecache_wait_on_freelist(wc); 825 return true; 826 } 827 return false; 828 } 829 830 static void writecache_suspend(struct dm_target *ti) 831 { 832 struct dm_writecache *wc = ti->private; 833 bool flush_on_suspend; 834 835 del_timer_sync(&wc->autocommit_timer); 836 837 wc_lock(wc); 838 writecache_flush(wc); 839 flush_on_suspend = wc->flush_on_suspend; 840 if (flush_on_suspend) { 841 wc->flush_on_suspend = false; 842 wc->writeback_all++; 843 queue_work(wc->writeback_wq, &wc->writeback_work); 844 } 845 wc_unlock(wc); 846 847 flush_workqueue(wc->writeback_wq); 848 849 wc_lock(wc); 850 if (flush_on_suspend) 851 wc->writeback_all--; 852 while (writecache_wait_for_writeback(wc)); 853 854 if (WC_MODE_PMEM(wc)) 855 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size); 856 857 writecache_poison_lists(wc); 858 859 wc_unlock(wc); 860 } 861 862 static int writecache_alloc_entries(struct dm_writecache *wc) 863 { 864 size_t b; 865 866 if (wc->entries) 867 return 0; 868 wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks)); 869 if (!wc->entries) 870 return -ENOMEM; 871 for (b = 0; b < wc->n_blocks; b++) { 872 struct wc_entry *e = &wc->entries[b]; 873 e->index = b; 874 e->write_in_progress = false; 875 } 876 877 return 0; 878 } 879 880 static void writecache_resume(struct dm_target *ti) 881 { 882 struct dm_writecache *wc = ti->private; 883 size_t b; 884 bool need_flush = false; 885 __le64 sb_seq_count; 886 int r; 887 888 wc_lock(wc); 889 890 if (WC_MODE_PMEM(wc)) 891 persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size); 892 893 wc->tree = RB_ROOT; 894 INIT_LIST_HEAD(&wc->lru); 895 if (WC_MODE_SORT_FREELIST(wc)) { 896 wc->freetree = RB_ROOT; 897 wc->current_free = NULL; 898 } else { 899 INIT_LIST_HEAD(&wc->freelist); 900 } 901 wc->freelist_size = 0; 902 903 r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, 
sizeof(uint64_t)); 904 if (r) { 905 writecache_error(wc, r, "hardware memory error when reading superblock: %d", r); 906 sb_seq_count = cpu_to_le64(0); 907 } 908 wc->seq_count = le64_to_cpu(sb_seq_count); 909 910 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 911 for (b = 0; b < wc->n_blocks; b++) { 912 struct wc_entry *e = &wc->entries[b]; 913 struct wc_memory_entry wme; 914 if (writecache_has_error(wc)) { 915 e->original_sector = -1; 916 e->seq_count = -1; 917 continue; 918 } 919 r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 920 if (r) { 921 writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d", 922 (unsigned long)b, r); 923 e->original_sector = -1; 924 e->seq_count = -1; 925 } else { 926 e->original_sector = le64_to_cpu(wme.original_sector); 927 e->seq_count = le64_to_cpu(wme.seq_count); 928 } 929 } 930 #endif 931 for (b = 0; b < wc->n_blocks; b++) { 932 struct wc_entry *e = &wc->entries[b]; 933 if (!writecache_entry_is_committed(wc, e)) { 934 if (read_seq_count(wc, e) != -1) { 935 erase_this: 936 clear_seq_count(wc, e); 937 need_flush = true; 938 } 939 writecache_add_to_freelist(wc, e); 940 } else { 941 struct wc_entry *old; 942 943 old = writecache_find_entry(wc, read_original_sector(wc, e), 0); 944 if (!old) { 945 writecache_insert_entry(wc, e); 946 } else { 947 if (read_seq_count(wc, old) == read_seq_count(wc, e)) { 948 writecache_error(wc, -EINVAL, 949 "two identical entries, position %llu, sector %llu, sequence %llu", 950 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e), 951 (unsigned long long)read_seq_count(wc, e)); 952 } 953 if (read_seq_count(wc, old) > read_seq_count(wc, e)) { 954 goto erase_this; 955 } else { 956 writecache_free_entry(wc, old); 957 writecache_insert_entry(wc, e); 958 need_flush = true; 959 } 960 } 961 } 962 cond_resched(); 963 } 964 965 if (need_flush) { 966 writecache_flush_all_metadata(wc); 967 writecache_commit_flushed(wc); 968 } 969 970 wc_unlock(wc); 971 } 972 973 static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc) 974 { 975 if (argc != 1) 976 return -EINVAL; 977 978 wc_lock(wc); 979 if (dm_suspended(wc->ti)) { 980 wc_unlock(wc); 981 return -EBUSY; 982 } 983 if (writecache_has_error(wc)) { 984 wc_unlock(wc); 985 return -EIO; 986 } 987 988 writecache_flush(wc); 989 wc->writeback_all++; 990 queue_work(wc->writeback_wq, &wc->writeback_work); 991 wc_unlock(wc); 992 993 flush_workqueue(wc->writeback_wq); 994 995 wc_lock(wc); 996 wc->writeback_all--; 997 if (writecache_has_error(wc)) { 998 wc_unlock(wc); 999 return -EIO; 1000 } 1001 wc_unlock(wc); 1002 1003 return 0; 1004 } 1005 1006 static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc) 1007 { 1008 if (argc != 1) 1009 return -EINVAL; 1010 1011 wc_lock(wc); 1012 wc->flush_on_suspend = true; 1013 wc_unlock(wc); 1014 1015 return 0; 1016 } 1017 1018 static int writecache_message(struct dm_target *ti, unsigned argc, char **argv, 1019 char *result, unsigned maxlen) 1020 { 1021 int r = -EINVAL; 1022 struct dm_writecache *wc = ti->private; 1023 1024 if (!strcasecmp(argv[0], "flush")) 1025 r = process_flush_mesg(argc, argv, wc); 1026 else if (!strcasecmp(argv[0], "flush_on_suspend")) 1027 r = process_flush_on_suspend_mesg(argc, argv, wc); 1028 else 1029 DMERR("unrecognised message received: %s", argv[0]); 1030 1031 return r; 1032 } 1033 1034 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data) 1035 { 1036 void *buf; 1037 unsigned long 
flags; 1038 unsigned size; 1039 int rw = bio_data_dir(bio); 1040 unsigned remaining_size = wc->block_size; 1041 1042 do { 1043 struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter); 1044 buf = bvec_kmap_irq(&bv, &flags); 1045 size = bv.bv_len; 1046 if (unlikely(size > remaining_size)) 1047 size = remaining_size; 1048 1049 if (rw == READ) { 1050 int r; 1051 r = memcpy_mcsafe(buf, data, size); 1052 flush_dcache_page(bio_page(bio)); 1053 if (unlikely(r)) { 1054 writecache_error(wc, r, "hardware memory error when reading data: %d", r); 1055 bio->bi_status = BLK_STS_IOERR; 1056 } 1057 } else { 1058 flush_dcache_page(bio_page(bio)); 1059 memcpy_flushcache(data, buf, size); 1060 } 1061 1062 bvec_kunmap_irq(buf, &flags); 1063 1064 data = (char *)data + size; 1065 remaining_size -= size; 1066 bio_advance(bio, size); 1067 } while (unlikely(remaining_size)); 1068 } 1069 1070 static int writecache_flush_thread(void *data) 1071 { 1072 struct dm_writecache *wc = data; 1073 1074 while (1) { 1075 struct bio *bio; 1076 1077 wc_lock(wc); 1078 bio = bio_list_pop(&wc->flush_list); 1079 if (!bio) { 1080 set_current_state(TASK_INTERRUPTIBLE); 1081 wc_unlock(wc); 1082 1083 if (unlikely(kthread_should_stop())) { 1084 set_current_state(TASK_RUNNING); 1085 break; 1086 } 1087 1088 schedule(); 1089 continue; 1090 } 1091 1092 if (bio_op(bio) == REQ_OP_DISCARD) { 1093 writecache_discard(wc, bio->bi_iter.bi_sector, 1094 bio_end_sector(bio)); 1095 wc_unlock(wc); 1096 bio_set_dev(bio, wc->dev->bdev); 1097 generic_make_request(bio); 1098 } else { 1099 writecache_flush(wc); 1100 wc_unlock(wc); 1101 if (writecache_has_error(wc)) 1102 bio->bi_status = BLK_STS_IOERR; 1103 bio_endio(bio); 1104 } 1105 } 1106 1107 return 0; 1108 } 1109 1110 static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio) 1111 { 1112 if (bio_list_empty(&wc->flush_list)) 1113 wake_up_process(wc->flush_thread); 1114 bio_list_add(&wc->flush_list, bio); 1115 } 1116 1117 static int writecache_map(struct dm_target *ti, struct bio *bio) 1118 { 1119 struct wc_entry *e; 1120 struct dm_writecache *wc = ti->private; 1121 1122 bio->bi_private = NULL; 1123 1124 wc_lock(wc); 1125 1126 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { 1127 if (writecache_has_error(wc)) 1128 goto unlock_error; 1129 if (WC_MODE_PMEM(wc)) { 1130 writecache_flush(wc); 1131 if (writecache_has_error(wc)) 1132 goto unlock_error; 1133 goto unlock_submit; 1134 } else { 1135 writecache_offload_bio(wc, bio); 1136 goto unlock_return; 1137 } 1138 } 1139 1140 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); 1141 1142 if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) & 1143 (wc->block_size / 512 - 1)) != 0)) { 1144 DMERR("I/O is not aligned, sector %llu, size %u, block size %u", 1145 (unsigned long long)bio->bi_iter.bi_sector, 1146 bio->bi_iter.bi_size, wc->block_size); 1147 goto unlock_error; 1148 } 1149 1150 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { 1151 if (writecache_has_error(wc)) 1152 goto unlock_error; 1153 if (WC_MODE_PMEM(wc)) { 1154 writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio)); 1155 goto unlock_remap_origin; 1156 } else { 1157 writecache_offload_bio(wc, bio); 1158 goto unlock_return; 1159 } 1160 } 1161 1162 if (bio_data_dir(bio) == READ) { 1163 read_next_block: 1164 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); 1165 if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) { 1166 if (WC_MODE_PMEM(wc)) { 1167 bio_copy_block(wc, bio, memory_data(wc, e)); 1168 if 
(bio->bi_iter.bi_size) 1169 goto read_next_block; 1170 goto unlock_submit; 1171 } else { 1172 dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT); 1173 bio_set_dev(bio, wc->ssd_dev->bdev); 1174 bio->bi_iter.bi_sector = cache_sector(wc, e); 1175 if (!writecache_entry_is_committed(wc, e)) 1176 writecache_wait_for_ios(wc, WRITE); 1177 goto unlock_remap; 1178 } 1179 } else { 1180 if (e) { 1181 sector_t next_boundary = 1182 read_original_sector(wc, e) - bio->bi_iter.bi_sector; 1183 if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) { 1184 dm_accept_partial_bio(bio, next_boundary); 1185 } 1186 } 1187 goto unlock_remap_origin; 1188 } 1189 } else { 1190 do { 1191 if (writecache_has_error(wc)) 1192 goto unlock_error; 1193 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0); 1194 if (e) { 1195 if (!writecache_entry_is_committed(wc, e)) 1196 goto bio_copy; 1197 if (!WC_MODE_PMEM(wc) && !e->write_in_progress) { 1198 wc->overwrote_committed = true; 1199 goto bio_copy; 1200 } 1201 } 1202 e = writecache_pop_from_freelist(wc); 1203 if (unlikely(!e)) { 1204 writecache_wait_on_freelist(wc); 1205 continue; 1206 } 1207 write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count); 1208 writecache_insert_entry(wc, e); 1209 wc->uncommitted_blocks++; 1210 bio_copy: 1211 if (WC_MODE_PMEM(wc)) { 1212 bio_copy_block(wc, bio, memory_data(wc, e)); 1213 } else { 1214 dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT); 1215 bio_set_dev(bio, wc->ssd_dev->bdev); 1216 bio->bi_iter.bi_sector = cache_sector(wc, e); 1217 if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) { 1218 wc->uncommitted_blocks = 0; 1219 queue_work(wc->writeback_wq, &wc->flush_work); 1220 } else { 1221 writecache_schedule_autocommit(wc); 1222 } 1223 goto unlock_remap; 1224 } 1225 } while (bio->bi_iter.bi_size); 1226 1227 if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) 1228 writecache_flush(wc); 1229 else 1230 writecache_schedule_autocommit(wc); 1231 goto unlock_submit; 1232 } 1233 1234 unlock_remap_origin: 1235 bio_set_dev(bio, wc->dev->bdev); 1236 wc_unlock(wc); 1237 return DM_MAPIO_REMAPPED; 1238 1239 unlock_remap: 1240 /* make sure that writecache_end_io decrements bio_in_progress: */ 1241 bio->bi_private = (void *)1; 1242 atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]); 1243 wc_unlock(wc); 1244 return DM_MAPIO_REMAPPED; 1245 1246 unlock_submit: 1247 wc_unlock(wc); 1248 bio_endio(bio); 1249 return DM_MAPIO_SUBMITTED; 1250 1251 unlock_return: 1252 wc_unlock(wc); 1253 return DM_MAPIO_SUBMITTED; 1254 1255 unlock_error: 1256 wc_unlock(wc); 1257 bio_io_error(bio); 1258 return DM_MAPIO_SUBMITTED; 1259 } 1260 1261 static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status) 1262 { 1263 struct dm_writecache *wc = ti->private; 1264 1265 if (bio->bi_private != NULL) { 1266 int dir = bio_data_dir(bio); 1267 if (atomic_dec_and_test(&wc->bio_in_progress[dir])) 1268 if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir]))) 1269 wake_up(&wc->bio_in_progress_wait[dir]); 1270 } 1271 return 0; 1272 } 1273 1274 static int writecache_iterate_devices(struct dm_target *ti, 1275 iterate_devices_callout_fn fn, void *data) 1276 { 1277 struct dm_writecache *wc = ti->private; 1278 1279 return fn(ti, wc->dev, 0, ti->len, data); 1280 } 1281 1282 static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits) 1283 { 1284 struct dm_writecache *wc = ti->private; 1285 1286 if (limits->logical_block_size < wc->block_size) 1287 limits->logical_block_size = 
wc->block_size; 1288 1289 if (limits->physical_block_size < wc->block_size) 1290 limits->physical_block_size = wc->block_size; 1291 1292 if (limits->io_min < wc->block_size) 1293 limits->io_min = wc->block_size; 1294 } 1295 1296 1297 static void writecache_writeback_endio(struct bio *bio) 1298 { 1299 struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio); 1300 struct dm_writecache *wc = wb->wc; 1301 unsigned long flags; 1302 1303 raw_spin_lock_irqsave(&wc->endio_list_lock, flags); 1304 if (unlikely(list_empty(&wc->endio_list))) 1305 wake_up_process(wc->endio_thread); 1306 list_add_tail(&wb->endio_entry, &wc->endio_list); 1307 raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags); 1308 } 1309 1310 static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr) 1311 { 1312 struct copy_struct *c = ptr; 1313 struct dm_writecache *wc = c->wc; 1314 1315 c->error = likely(!(read_err | write_err)) ? 0 : -EIO; 1316 1317 raw_spin_lock_irq(&wc->endio_list_lock); 1318 if (unlikely(list_empty(&wc->endio_list))) 1319 wake_up_process(wc->endio_thread); 1320 list_add_tail(&c->endio_entry, &wc->endio_list); 1321 raw_spin_unlock_irq(&wc->endio_list_lock); 1322 } 1323 1324 static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list) 1325 { 1326 unsigned i; 1327 struct writeback_struct *wb; 1328 struct wc_entry *e; 1329 unsigned long n_walked = 0; 1330 1331 do { 1332 wb = list_entry(list->next, struct writeback_struct, endio_entry); 1333 list_del(&wb->endio_entry); 1334 1335 if (unlikely(wb->bio.bi_status != BLK_STS_OK)) 1336 writecache_error(wc, blk_status_to_errno(wb->bio.bi_status), 1337 "write error %d", wb->bio.bi_status); 1338 i = 0; 1339 do { 1340 e = wb->wc_list[i]; 1341 BUG_ON(!e->write_in_progress); 1342 e->write_in_progress = false; 1343 INIT_LIST_HEAD(&e->lru); 1344 if (!writecache_has_error(wc)) 1345 writecache_free_entry(wc, e); 1346 BUG_ON(!wc->writeback_size); 1347 wc->writeback_size--; 1348 n_walked++; 1349 if (unlikely(n_walked >= ENDIO_LATENCY)) { 1350 writecache_commit_flushed(wc); 1351 wc_unlock(wc); 1352 wc_lock(wc); 1353 n_walked = 0; 1354 } 1355 } while (++i < wb->wc_list_n); 1356 1357 if (wb->wc_list != wb->wc_list_inline) 1358 kfree(wb->wc_list); 1359 bio_put(&wb->bio); 1360 } while (!list_empty(list)); 1361 } 1362 1363 static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list) 1364 { 1365 struct copy_struct *c; 1366 struct wc_entry *e; 1367 1368 do { 1369 c = list_entry(list->next, struct copy_struct, endio_entry); 1370 list_del(&c->endio_entry); 1371 1372 if (unlikely(c->error)) 1373 writecache_error(wc, c->error, "copy error"); 1374 1375 e = c->e; 1376 do { 1377 BUG_ON(!e->write_in_progress); 1378 e->write_in_progress = false; 1379 INIT_LIST_HEAD(&e->lru); 1380 if (!writecache_has_error(wc)) 1381 writecache_free_entry(wc, e); 1382 1383 BUG_ON(!wc->writeback_size); 1384 wc->writeback_size--; 1385 e++; 1386 } while (--c->n_entries); 1387 mempool_free(c, &wc->copy_pool); 1388 } while (!list_empty(list)); 1389 } 1390 1391 static int writecache_endio_thread(void *data) 1392 { 1393 struct dm_writecache *wc = data; 1394 1395 while (1) { 1396 struct list_head list; 1397 1398 raw_spin_lock_irq(&wc->endio_list_lock); 1399 if (!list_empty(&wc->endio_list)) 1400 goto pop_from_list; 1401 set_current_state(TASK_INTERRUPTIBLE); 1402 raw_spin_unlock_irq(&wc->endio_list_lock); 1403 1404 if (unlikely(kthread_should_stop())) { 1405 set_current_state(TASK_RUNNING); 1406 break; 1407 } 1408 1409 schedule(); 
1410 1411 continue; 1412 1413 pop_from_list: 1414 list = wc->endio_list; 1415 list.next->prev = list.prev->next = &list; 1416 INIT_LIST_HEAD(&wc->endio_list); 1417 raw_spin_unlock_irq(&wc->endio_list_lock); 1418 1419 if (!WC_MODE_FUA(wc)) 1420 writecache_disk_flush(wc, wc->dev); 1421 1422 wc_lock(wc); 1423 1424 if (WC_MODE_PMEM(wc)) { 1425 __writecache_endio_pmem(wc, &list); 1426 } else { 1427 __writecache_endio_ssd(wc, &list); 1428 writecache_wait_for_ios(wc, READ); 1429 } 1430 1431 writecache_commit_flushed(wc); 1432 1433 wc_unlock(wc); 1434 } 1435 1436 return 0; 1437 } 1438 1439 static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp) 1440 { 1441 struct dm_writecache *wc = wb->wc; 1442 unsigned block_size = wc->block_size; 1443 void *address = memory_data(wc, e); 1444 1445 persistent_memory_flush_cache(address, block_size); 1446 return bio_add_page(&wb->bio, persistent_memory_page(address), 1447 block_size, persistent_memory_page_offset(address)) != 0; 1448 } 1449 1450 struct writeback_list { 1451 struct list_head list; 1452 size_t size; 1453 }; 1454 1455 static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl) 1456 { 1457 if (unlikely(wc->max_writeback_jobs)) { 1458 if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) { 1459 wc_lock(wc); 1460 while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs) 1461 writecache_wait_on_freelist(wc); 1462 wc_unlock(wc); 1463 } 1464 } 1465 cond_resched(); 1466 } 1467 1468 static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl) 1469 { 1470 struct wc_entry *e, *f; 1471 struct bio *bio; 1472 struct writeback_struct *wb; 1473 unsigned max_pages; 1474 1475 while (wbl->size) { 1476 wbl->size--; 1477 e = container_of(wbl->list.prev, struct wc_entry, lru); 1478 list_del(&e->lru); 1479 1480 max_pages = e->wc_list_contiguous; 1481 1482 bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set); 1483 wb = container_of(bio, struct writeback_struct, bio); 1484 wb->wc = wc; 1485 wb->bio.bi_end_io = writecache_writeback_endio; 1486 bio_set_dev(&wb->bio, wc->dev->bdev); 1487 wb->bio.bi_iter.bi_sector = read_original_sector(wc, e); 1488 wb->page_offset = PAGE_SIZE; 1489 if (max_pages <= WB_LIST_INLINE || 1490 unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *), 1491 GFP_NOIO | __GFP_NORETRY | 1492 __GFP_NOMEMALLOC | __GFP_NOWARN)))) { 1493 wb->wc_list = wb->wc_list_inline; 1494 max_pages = WB_LIST_INLINE; 1495 } 1496 1497 BUG_ON(!wc_add_block(wb, e, GFP_NOIO)); 1498 1499 wb->wc_list[0] = e; 1500 wb->wc_list_n = 1; 1501 1502 while (wbl->size && wb->wc_list_n < max_pages) { 1503 f = container_of(wbl->list.prev, struct wc_entry, lru); 1504 if (read_original_sector(wc, f) != 1505 read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) 1506 break; 1507 if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN)) 1508 break; 1509 wbl->size--; 1510 list_del(&f->lru); 1511 wb->wc_list[wb->wc_list_n++] = f; 1512 e = f; 1513 } 1514 bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA); 1515 if (writecache_has_error(wc)) { 1516 bio->bi_status = BLK_STS_IOERR; 1517 bio_endio(&wb->bio); 1518 } else { 1519 submit_bio(&wb->bio); 1520 } 1521 1522 __writeback_throttle(wc, wbl); 1523 } 1524 } 1525 1526 static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl) 1527 { 1528 struct wc_entry *e, *f; 1529 struct dm_io_region from, to; 1530 struct copy_struct *c; 1531 1532 while (wbl->size) { 1533 
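		/* each iteration issues one dm-kcopyd copy covering a run of contiguous cache blocks */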
unsigned n_sectors; 1534 1535 wbl->size--; 1536 e = container_of(wbl->list.prev, struct wc_entry, lru); 1537 list_del(&e->lru); 1538 1539 n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT); 1540 1541 from.bdev = wc->ssd_dev->bdev; 1542 from.sector = cache_sector(wc, e); 1543 from.count = n_sectors; 1544 to.bdev = wc->dev->bdev; 1545 to.sector = read_original_sector(wc, e); 1546 to.count = n_sectors; 1547 1548 c = mempool_alloc(&wc->copy_pool, GFP_NOIO); 1549 c->wc = wc; 1550 c->e = e; 1551 c->n_entries = e->wc_list_contiguous; 1552 1553 while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) { 1554 wbl->size--; 1555 f = container_of(wbl->list.prev, struct wc_entry, lru); 1556 BUG_ON(f != e + 1); 1557 list_del(&f->lru); 1558 e = f; 1559 } 1560 1561 dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c); 1562 1563 __writeback_throttle(wc, wbl); 1564 } 1565 } 1566 1567 static void writecache_writeback(struct work_struct *work) 1568 { 1569 struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); 1570 struct blk_plug plug; 1571 struct wc_entry *e, *f, *g; 1572 struct rb_node *node, *next_node; 1573 struct list_head skipped; 1574 struct writeback_list wbl; 1575 unsigned long n_walked; 1576 1577 wc_lock(wc); 1578 restart: 1579 if (writecache_has_error(wc)) { 1580 wc_unlock(wc); 1581 return; 1582 } 1583 1584 if (unlikely(wc->writeback_all)) { 1585 if (writecache_wait_for_writeback(wc)) 1586 goto restart; 1587 } 1588 1589 if (wc->overwrote_committed) { 1590 writecache_wait_for_ios(wc, WRITE); 1591 } 1592 1593 n_walked = 0; 1594 INIT_LIST_HEAD(&skipped); 1595 INIT_LIST_HEAD(&wbl.list); 1596 wbl.size = 0; 1597 while (!list_empty(&wc->lru) && 1598 (wc->writeback_all || 1599 wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) { 1600 1601 n_walked++; 1602 if (unlikely(n_walked > WRITEBACK_LATENCY) && 1603 likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) { 1604 queue_work(wc->writeback_wq, &wc->writeback_work); 1605 break; 1606 } 1607 1608 e = container_of(wc->lru.prev, struct wc_entry, lru); 1609 BUG_ON(e->write_in_progress); 1610 if (unlikely(!writecache_entry_is_committed(wc, e))) { 1611 writecache_flush(wc); 1612 } 1613 node = rb_prev(&e->rb_node); 1614 if (node) { 1615 f = container_of(node, struct wc_entry, rb_node); 1616 if (unlikely(read_original_sector(wc, f) == 1617 read_original_sector(wc, e))) { 1618 BUG_ON(!f->write_in_progress); 1619 list_del(&e->lru); 1620 list_add(&e->lru, &skipped); 1621 cond_resched(); 1622 continue; 1623 } 1624 } 1625 wc->writeback_size++; 1626 list_del(&e->lru); 1627 list_add(&e->lru, &wbl.list); 1628 wbl.size++; 1629 e->write_in_progress = true; 1630 e->wc_list_contiguous = 1; 1631 1632 f = e; 1633 1634 while (1) { 1635 next_node = rb_next(&f->rb_node); 1636 if (unlikely(!next_node)) 1637 break; 1638 g = container_of(next_node, struct wc_entry, rb_node); 1639 if (read_original_sector(wc, g) == 1640 read_original_sector(wc, f)) { 1641 f = g; 1642 continue; 1643 } 1644 if (read_original_sector(wc, g) != 1645 read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT)) 1646 break; 1647 if (unlikely(g->write_in_progress)) 1648 break; 1649 if (unlikely(!writecache_entry_is_committed(wc, g))) 1650 break; 1651 1652 if (!WC_MODE_PMEM(wc)) { 1653 if (g != f + 1) 1654 break; 1655 } 1656 1657 n_walked++; 1658 //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all)) 1659 // break; 1660 1661 wc->writeback_size++; 1662 list_del(&g->lru); 1663 list_add(&g->lru, 
&wbl.list); 1664 wbl.size++; 1665 g->write_in_progress = true; 1666 g->wc_list_contiguous = BIO_MAX_PAGES; 1667 f = g; 1668 e->wc_list_contiguous++; 1669 if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) 1670 break; 1671 } 1672 cond_resched(); 1673 } 1674 1675 if (!list_empty(&skipped)) { 1676 list_splice_tail(&skipped, &wc->lru); 1677 /* 1678 * If we didn't do any progress, we must wait until some 1679 * writeback finishes to avoid burning CPU in a loop 1680 */ 1681 if (unlikely(!wbl.size)) 1682 writecache_wait_for_writeback(wc); 1683 } 1684 1685 wc_unlock(wc); 1686 1687 blk_start_plug(&plug); 1688 1689 if (WC_MODE_PMEM(wc)) 1690 __writecache_writeback_pmem(wc, &wbl); 1691 else 1692 __writecache_writeback_ssd(wc, &wbl); 1693 1694 blk_finish_plug(&plug); 1695 1696 if (unlikely(wc->writeback_all)) { 1697 wc_lock(wc); 1698 while (writecache_wait_for_writeback(wc)); 1699 wc_unlock(wc); 1700 } 1701 } 1702 1703 static int calculate_memory_size(uint64_t device_size, unsigned block_size, 1704 size_t *n_blocks_p, size_t *n_metadata_blocks_p) 1705 { 1706 uint64_t n_blocks, offset; 1707 struct wc_entry e; 1708 1709 n_blocks = device_size; 1710 do_div(n_blocks, block_size + sizeof(struct wc_memory_entry)); 1711 1712 while (1) { 1713 if (!n_blocks) 1714 return -ENOSPC; 1715 /* Verify the following entries[n_blocks] won't overflow */ 1716 if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) / 1717 sizeof(struct wc_memory_entry))) 1718 return -EFBIG; 1719 offset = offsetof(struct wc_memory_superblock, entries[n_blocks]); 1720 offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1); 1721 if (offset + n_blocks * block_size <= device_size) 1722 break; 1723 n_blocks--; 1724 } 1725 1726 /* check if the bit field overflows */ 1727 e.index = n_blocks; 1728 if (e.index != n_blocks) 1729 return -EFBIG; 1730 1731 if (n_blocks_p) 1732 *n_blocks_p = n_blocks; 1733 if (n_metadata_blocks_p) 1734 *n_metadata_blocks_p = offset >> __ffs(block_size); 1735 return 0; 1736 } 1737 1738 static int init_memory(struct dm_writecache *wc) 1739 { 1740 size_t b; 1741 int r; 1742 1743 r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL); 1744 if (r) 1745 return r; 1746 1747 r = writecache_alloc_entries(wc); 1748 if (r) 1749 return r; 1750 1751 for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++) 1752 pmem_assign(sb(wc)->padding[b], cpu_to_le64(0)); 1753 pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION)); 1754 pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size)); 1755 pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks)); 1756 pmem_assign(sb(wc)->seq_count, cpu_to_le64(0)); 1757 1758 for (b = 0; b < wc->n_blocks; b++) 1759 write_original_sector_seq_count(wc, &wc->entries[b], -1, -1); 1760 1761 writecache_flush_all_metadata(wc); 1762 writecache_commit_flushed(wc); 1763 pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC)); 1764 writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic); 1765 writecache_commit_flushed(wc); 1766 1767 return 0; 1768 } 1769 1770 static void writecache_dtr(struct dm_target *ti) 1771 { 1772 struct dm_writecache *wc = ti->private; 1773 1774 if (!wc) 1775 return; 1776 1777 if (wc->endio_thread) 1778 kthread_stop(wc->endio_thread); 1779 1780 if (wc->flush_thread) 1781 kthread_stop(wc->flush_thread); 1782 1783 bioset_exit(&wc->bio_set); 1784 1785 mempool_exit(&wc->copy_pool); 1786 1787 if (wc->writeback_wq) 1788 destroy_workqueue(wc->writeback_wq); 1789 1790 if (wc->dev) 1791 dm_put_device(ti, wc->dev); 1792 1793 if 
(wc->ssd_dev)
		dm_put_device(ti, wc->ssd_dev);

	if (wc->entries)
		vfree(wc->entries);

	if (wc->memory_map) {
		if (WC_MODE_PMEM(wc))
			persistent_memory_release(wc);
		else
			vfree(wc->memory_map);
	}

	if (wc->dm_kcopyd)
		dm_kcopyd_client_destroy(wc->dm_kcopyd);

	if (wc->dm_io)
		dm_io_client_destroy(wc->dm_io);

	if (wc->dirty_bitmap)
		vfree(wc->dirty_bitmap);

	kfree(wc);
}

static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	struct dm_writecache *wc;
	struct dm_arg_set as;
	const char *string;
	unsigned opt_params;
	size_t offset, data_size;
	int i, r;
	char dummy;
	int high_wm_percent = HIGH_WATERMARK;
	int low_wm_percent = LOW_WATERMARK;
	uint64_t x;
	struct wc_memory_superblock s;

	static struct dm_arg _args[] = {
		{0, 10, "Invalid number of feature args"},
	};

	as.argc = argc;
	as.argv = argv;

	wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
	if (!wc) {
		ti->error = "Cannot allocate writecache structure";
		r = -ENOMEM;
		goto bad;
	}
	ti->private = wc;
	wc->ti = ti;

	mutex_init(&wc->lock);
	writecache_poison_lists(wc);
	init_waitqueue_head(&wc->freelist_wait);
	timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);

	for (i = 0; i < 2; i++) {
		atomic_set(&wc->bio_in_progress[i], 0);
		init_waitqueue_head(&wc->bio_in_progress_wait[i]);
	}

	wc->dm_io = dm_io_client_create();
	if (IS_ERR(wc->dm_io)) {
		r = PTR_ERR(wc->dm_io);
		ti->error = "Unable to allocate dm-io client";
		wc->dm_io = NULL;
		goto bad;
	}

	wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
	if (!wc->writeback_wq) {
		r = -ENOMEM;
		ti->error = "Could not allocate writeback workqueue";
		goto bad;
	}
	INIT_WORK(&wc->writeback_work, writecache_writeback);
	INIT_WORK(&wc->flush_work, writecache_flush_work);

	raw_spin_lock_init(&wc->endio_list_lock);
	INIT_LIST_HEAD(&wc->endio_list);
	wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
	if (IS_ERR(wc->endio_thread)) {
		r = PTR_ERR(wc->endio_thread);
		wc->endio_thread = NULL;
		ti->error = "Couldn't spawn endio thread";
		goto bad;
	}
	wake_up_process(wc->endio_thread);

	/*
	 * Parse the mode (pmem or ssd)
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;

	if (!strcasecmp(string, "s")) {
		wc->pmem_mode = false;
	} else if (!strcasecmp(string, "p")) {
#ifdef DM_WRITECACHE_HAS_PMEM
		wc->pmem_mode = true;
		wc->writeback_fua = true;
#else
		/*
		 * If the architecture doesn't support persistent memory or
		 * the kernel doesn't support any DAX drivers, this driver can
		 * only be used in SSD-only mode.
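		 * (DM_WRITECACHE_HAS_PMEM is defined only when both
		 * CONFIG_ARCH_HAS_PMEM_API and CONFIG_DAX_DRIVER are enabled.)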
1904 */ 1905 r = -EOPNOTSUPP; 1906 ti->error = "Persistent memory or DAX not supported on this system"; 1907 goto bad; 1908 #endif 1909 } else { 1910 goto bad_arguments; 1911 } 1912 1913 if (WC_MODE_PMEM(wc)) { 1914 r = bioset_init(&wc->bio_set, BIO_POOL_SIZE, 1915 offsetof(struct writeback_struct, bio), 1916 BIOSET_NEED_BVECS); 1917 if (r) { 1918 ti->error = "Could not allocate bio set"; 1919 goto bad; 1920 } 1921 } else { 1922 r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct)); 1923 if (r) { 1924 ti->error = "Could not allocate mempool"; 1925 goto bad; 1926 } 1927 } 1928 1929 /* 1930 * Parse the origin data device 1931 */ 1932 string = dm_shift_arg(&as); 1933 if (!string) 1934 goto bad_arguments; 1935 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev); 1936 if (r) { 1937 ti->error = "Origin data device lookup failed"; 1938 goto bad; 1939 } 1940 1941 /* 1942 * Parse cache data device (be it pmem or ssd) 1943 */ 1944 string = dm_shift_arg(&as); 1945 if (!string) 1946 goto bad_arguments; 1947 1948 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev); 1949 if (r) { 1950 ti->error = "Cache data device lookup failed"; 1951 goto bad; 1952 } 1953 wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode); 1954 1955 /* 1956 * Parse the cache block size 1957 */ 1958 string = dm_shift_arg(&as); 1959 if (!string) 1960 goto bad_arguments; 1961 if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 || 1962 wc->block_size < 512 || wc->block_size > PAGE_SIZE || 1963 (wc->block_size & (wc->block_size - 1))) { 1964 r = -EINVAL; 1965 ti->error = "Invalid block size"; 1966 goto bad; 1967 } 1968 wc->block_size_bits = __ffs(wc->block_size); 1969 1970 wc->max_writeback_jobs = MAX_WRITEBACK_JOBS; 1971 wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? 
AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM; 1972 wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC); 1973 1974 /* 1975 * Parse optional arguments 1976 */ 1977 r = dm_read_arg_group(_args, &as, &opt_params, &ti->error); 1978 if (r) 1979 goto bad; 1980 1981 while (opt_params) { 1982 string = dm_shift_arg(&as), opt_params--; 1983 if (!strcasecmp(string, "start_sector") && opt_params >= 1) { 1984 unsigned long long start_sector; 1985 string = dm_shift_arg(&as), opt_params--; 1986 if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1) 1987 goto invalid_optional; 1988 wc->start_sector = start_sector; 1989 if (wc->start_sector != start_sector || 1990 wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT) 1991 goto invalid_optional; 1992 } else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) { 1993 string = dm_shift_arg(&as), opt_params--; 1994 if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1) 1995 goto invalid_optional; 1996 if (high_wm_percent < 0 || high_wm_percent > 100) 1997 goto invalid_optional; 1998 wc->high_wm_percent_set = true; 1999 } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) { 2000 string = dm_shift_arg(&as), opt_params--; 2001 if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1) 2002 goto invalid_optional; 2003 if (low_wm_percent < 0 || low_wm_percent > 100) 2004 goto invalid_optional; 2005 wc->low_wm_percent_set = true; 2006 } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) { 2007 string = dm_shift_arg(&as), opt_params--; 2008 if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1) 2009 goto invalid_optional; 2010 wc->max_writeback_jobs_set = true; 2011 } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) { 2012 string = dm_shift_arg(&as), opt_params--; 2013 if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1) 2014 goto invalid_optional; 2015 wc->autocommit_blocks_set = true; 2016 } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) { 2017 unsigned autocommit_msecs; 2018 string = dm_shift_arg(&as), opt_params--; 2019 if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1) 2020 goto invalid_optional; 2021 if (autocommit_msecs > 3600000) 2022 goto invalid_optional; 2023 wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs); 2024 wc->autocommit_time_set = true; 2025 } else if (!strcasecmp(string, "fua")) { 2026 if (WC_MODE_PMEM(wc)) { 2027 wc->writeback_fua = true; 2028 wc->writeback_fua_set = true; 2029 } else goto invalid_optional; 2030 } else if (!strcasecmp(string, "nofua")) { 2031 if (WC_MODE_PMEM(wc)) { 2032 wc->writeback_fua = false; 2033 wc->writeback_fua_set = true; 2034 } else goto invalid_optional; 2035 } else { 2036 invalid_optional: 2037 r = -EINVAL; 2038 ti->error = "Invalid optional argument"; 2039 goto bad; 2040 } 2041 } 2042 2043 if (high_wm_percent < low_wm_percent) { 2044 r = -EINVAL; 2045 ti->error = "High watermark must be greater than or equal to low watermark"; 2046 goto bad; 2047 } 2048 2049 if (WC_MODE_PMEM(wc)) { 2050 r = persistent_memory_claim(wc); 2051 if (r) { 2052 ti->error = "Unable to map persistent memory for cache"; 2053 goto bad; 2054 } 2055 } else { 2056 struct dm_io_region region; 2057 struct dm_io_request req; 2058 size_t n_blocks, n_metadata_blocks; 2059 uint64_t n_bitmap_bits; 2060 2061 wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT; 2062 2063 bio_list_init(&wc->flush_list); 2064 wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush"); 2065 if 
		wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
		if (IS_ERR(wc->flush_thread)) {
			r = PTR_ERR(wc->flush_thread);
			wc->flush_thread = NULL;
			ti->error = "Couldn't spawn flush thread";
			goto bad;
		}
		wake_up_process(wc->flush_thread);

		r = calculate_memory_size(wc->memory_map_size, wc->block_size,
					  &n_blocks, &n_metadata_blocks);
		if (r) {
			ti->error = "Invalid device size";
			goto bad;
		}

		n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
				 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
		/* this is limitation of test_bit functions */
		if (n_bitmap_bits > 1U << 31) {
			r = -EFBIG;
			ti->error = "Invalid device size";
			goto bad;
		}

		wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
		if (!wc->memory_map) {
			r = -ENOMEM;
			ti->error = "Unable to allocate memory for metadata";
			goto bad;
		}

		wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
		if (IS_ERR(wc->dm_kcopyd)) {
			r = PTR_ERR(wc->dm_kcopyd);
			ti->error = "Unable to allocate dm-kcopyd client";
			wc->dm_kcopyd = NULL;
			goto bad;
		}

		wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
		wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
			BITS_PER_LONG * sizeof(unsigned long);
		wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
		if (!wc->dirty_bitmap) {
			r = -ENOMEM;
			ti->error = "Unable to allocate dirty bitmap";
			goto bad;
		}

		region.bdev = wc->ssd_dev->bdev;
		region.sector = wc->start_sector;
		region.count = wc->metadata_sectors;
		req.bi_op = REQ_OP_READ;
		req.bi_op_flags = REQ_SYNC;
		req.mem.type = DM_IO_VMA;
		req.mem.ptr.vma = (char *)wc->memory_map;
		req.client = wc->dm_io;
		req.notify.fn = NULL;

		r = dm_io(&req, 1, &region, NULL);
		if (r) {
			ti->error = "Unable to read metadata";
			goto bad;
		}
	}

	r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
	if (r) {
		ti->error = "Hardware memory error when reading superblock";
		goto bad;
	}
	if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
		r = init_memory(wc);
		if (r) {
			ti->error = "Unable to initialize device";
			goto bad;
		}
		r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
		if (r) {
			ti->error = "Hardware memory error when reading superblock";
			goto bad;
		}
	}

	if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
		ti->error = "Invalid magic in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
		ti->error = "Invalid version in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.block_size) != wc->block_size) {
		ti->error = "Block size does not match superblock";
		r = -EINVAL;
		goto bad;
	}

	wc->n_blocks = le64_to_cpu(s.n_blocks);
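
	/*
	 * Layout implied by the calculation below (sizes are examples only):
	 * the cache device holds the superblock, then n_blocks 16-byte
	 * metadata entries, padded up to block_size, then n_blocks data
	 * blocks.  E.g. with block_size = 4096 and n_blocks = 1048576 the
	 * metadata area is roughly 16 MiB and the data area is 4 GiB.
	 */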

	offset = wc->n_blocks * sizeof(struct wc_memory_entry);
	if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
overflow:
		ti->error = "Overflow in size calculation";
		r = -EINVAL;
		goto bad;
	}
	offset += sizeof(struct wc_memory_superblock);
	if (offset < sizeof(struct wc_memory_superblock))
		goto overflow;
	offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
	data_size = wc->n_blocks * (size_t)wc->block_size;
	if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
	    (offset + data_size < offset))
		goto overflow;
	if (offset + data_size > wc->memory_map_size) {
		ti->error = "Memory area is too small";
		r = -EINVAL;
		goto bad;
	}

	wc->metadata_sectors = offset >> SECTOR_SHIFT;
	wc->block_start = (char *)sb(wc) + offset;

	x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_high_watermark = x;
	x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_low_watermark = x;

	r = writecache_alloc_entries(wc);
	if (r) {
		ti->error = "Cannot allocate memory";
		goto bad;
	}

	ti->num_flush_bios = 1;
	ti->flush_supported = true;
	ti->num_discard_bios = 1;

	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	return 0;

bad_arguments:
	r = -EINVAL;
	ti->error = "Bad arguments";
bad:
	writecache_dtr(ti);
	return r;
}
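
/*
 * Status reporting (example output, values illustrative): STATUSTYPE_INFO
 * emits "<error> <n_blocks> <free blocks> <writeback blocks>", e.g.
 * "0 1048576 1048576 0" for a freshly created cache; STATUSTYPE_TABLE
 * re-emits the constructor line, including only the optional arguments that
 * were explicitly set.
 */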
"" : "no"); 2281 break; 2282 } 2283 } 2284 2285 static struct target_type writecache_target = { 2286 .name = "writecache", 2287 .version = {1, 1, 1}, 2288 .module = THIS_MODULE, 2289 .ctr = writecache_ctr, 2290 .dtr = writecache_dtr, 2291 .status = writecache_status, 2292 .postsuspend = writecache_suspend, 2293 .resume = writecache_resume, 2294 .message = writecache_message, 2295 .map = writecache_map, 2296 .end_io = writecache_end_io, 2297 .iterate_devices = writecache_iterate_devices, 2298 .io_hints = writecache_io_hints, 2299 }; 2300 2301 static int __init dm_writecache_init(void) 2302 { 2303 int r; 2304 2305 r = dm_register_target(&writecache_target); 2306 if (r < 0) { 2307 DMERR("register failed %d", r); 2308 return r; 2309 } 2310 2311 return 0; 2312 } 2313 2314 static void __exit dm_writecache_exit(void) 2315 { 2316 dm_unregister_target(&writecache_target); 2317 } 2318 2319 module_init(dm_writecache_init); 2320 module_exit(dm_writecache_exit); 2321 2322 MODULE_DESCRIPTION(DM_NAME " writecache target"); 2323 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>"); 2324 MODULE_LICENSE("GPL"); 2325