1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2018 Red Hat. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include <linux/device-mapper.h> 9 #include <linux/module.h> 10 #include <linux/init.h> 11 #include <linux/vmalloc.h> 12 #include <linux/kthread.h> 13 #include <linux/dm-io.h> 14 #include <linux/dm-kcopyd.h> 15 #include <linux/dax.h> 16 #include <linux/pfn_t.h> 17 #include <linux/libnvdimm.h> 18 19 #define DM_MSG_PREFIX "writecache" 20 21 #define HIGH_WATERMARK 50 22 #define LOW_WATERMARK 45 23 #define MAX_WRITEBACK_JOBS 0 24 #define ENDIO_LATENCY 16 25 #define WRITEBACK_LATENCY 64 26 #define AUTOCOMMIT_BLOCKS_SSD 65536 27 #define AUTOCOMMIT_BLOCKS_PMEM 64 28 #define AUTOCOMMIT_MSEC 1000 29 #define MAX_AGE_DIV 16 30 #define MAX_AGE_UNSPECIFIED -1UL 31 32 #define BITMAP_GRANULARITY 65536 33 #if BITMAP_GRANULARITY < PAGE_SIZE 34 #undef BITMAP_GRANULARITY 35 #define BITMAP_GRANULARITY PAGE_SIZE 36 #endif 37 38 #if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER) 39 #define DM_WRITECACHE_HAS_PMEM 40 #endif 41 42 #ifdef DM_WRITECACHE_HAS_PMEM 43 #define pmem_assign(dest, src) \ 44 do { \ 45 typeof(dest) uniq = (src); \ 46 memcpy_flushcache(&(dest), &uniq, sizeof(dest)); \ 47 } while (0) 48 #else 49 #define pmem_assign(dest, src) ((dest) = (src)) 50 #endif 51 52 #if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM) 53 #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 54 #endif 55 56 #define MEMORY_SUPERBLOCK_MAGIC 0x23489321 57 #define MEMORY_SUPERBLOCK_VERSION 1 58 59 struct wc_memory_entry { 60 __le64 original_sector; 61 __le64 seq_count; 62 }; 63 64 struct wc_memory_superblock { 65 union { 66 struct { 67 __le32 magic; 68 __le32 version; 69 __le32 block_size; 70 __le32 pad; 71 __le64 n_blocks; 72 __le64 seq_count; 73 }; 74 __le64 padding[8]; 75 }; 76 struct wc_memory_entry entries[0]; 77 }; 78 79 struct wc_entry { 80 struct rb_node rb_node; 81 struct list_head lru; 82 unsigned short wc_list_contiguous; 83 bool write_in_progress 84 #if BITS_PER_LONG == 64 85 :1 86 #endif 87 ; 88 unsigned long index 89 #if BITS_PER_LONG == 64 90 :47 91 #endif 92 ; 93 unsigned long age; 94 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 95 uint64_t original_sector; 96 uint64_t seq_count; 97 #endif 98 }; 99 100 #ifdef DM_WRITECACHE_HAS_PMEM 101 #define WC_MODE_PMEM(wc) ((wc)->pmem_mode) 102 #define WC_MODE_FUA(wc) ((wc)->writeback_fua) 103 #else 104 #define WC_MODE_PMEM(wc) false 105 #define WC_MODE_FUA(wc) false 106 #endif 107 #define WC_MODE_SORT_FREELIST(wc) (!WC_MODE_PMEM(wc)) 108 109 struct dm_writecache { 110 struct mutex lock; 111 struct list_head lru; 112 union { 113 struct list_head freelist; 114 struct { 115 struct rb_root freetree; 116 struct wc_entry *current_free; 117 }; 118 }; 119 struct rb_root tree; 120 121 size_t freelist_size; 122 size_t writeback_size; 123 size_t freelist_high_watermark; 124 size_t freelist_low_watermark; 125 unsigned long max_age; 126 127 unsigned uncommitted_blocks; 128 unsigned autocommit_blocks; 129 unsigned max_writeback_jobs; 130 131 int error; 132 133 unsigned long autocommit_jiffies; 134 struct timer_list autocommit_timer; 135 struct wait_queue_head freelist_wait; 136 137 struct timer_list max_age_timer; 138 139 atomic_t bio_in_progress[2]; 140 struct wait_queue_head bio_in_progress_wait[2]; 141 142 struct dm_target *ti; 143 struct dm_dev *dev; 144 struct dm_dev *ssd_dev; 145 sector_t start_sector; 146 void *memory_map; 147 uint64_t memory_map_size; 148 size_t metadata_sectors; 149 size_t n_blocks; 150 uint64_t seq_count; 151 void *block_start; 152 struct wc_entry *entries; 153 unsigned block_size; 154 unsigned char block_size_bits; 155 156 bool pmem_mode:1; 157 bool writeback_fua:1; 158 159 bool overwrote_committed:1; 160 bool memory_vmapped:1; 161 162 bool high_wm_percent_set:1; 163 bool low_wm_percent_set:1; 164 bool max_writeback_jobs_set:1; 165 bool autocommit_blocks_set:1; 166 bool autocommit_time_set:1; 167 bool writeback_fua_set:1; 168 bool flush_on_suspend:1; 169 bool cleaner:1; 170 171 unsigned writeback_all; 172 struct workqueue_struct *writeback_wq; 173 struct work_struct writeback_work; 174 struct work_struct flush_work; 175 176 struct dm_io_client *dm_io; 177 178 raw_spinlock_t endio_list_lock; 179 struct list_head endio_list; 180 struct task_struct *endio_thread; 181 182 struct task_struct *flush_thread; 183 struct bio_list flush_list; 184 185 struct dm_kcopyd_client *dm_kcopyd; 186 unsigned long *dirty_bitmap; 187 unsigned dirty_bitmap_size; 188 189 struct bio_set bio_set; 190 mempool_t copy_pool; 191 }; 192 193 #define WB_LIST_INLINE 16 194 195 struct writeback_struct { 196 struct list_head endio_entry; 197 struct dm_writecache *wc; 198 struct wc_entry **wc_list; 199 unsigned wc_list_n; 200 struct wc_entry *wc_list_inline[WB_LIST_INLINE]; 201 struct bio bio; 202 }; 203 204 struct copy_struct { 205 struct list_head endio_entry; 206 struct dm_writecache *wc; 207 struct wc_entry *e; 208 unsigned n_entries; 209 int error; 210 }; 211 212 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle, 213 "A percentage of time allocated for data copying"); 214 215 static void wc_lock(struct dm_writecache *wc) 216 { 217 mutex_lock(&wc->lock); 218 } 219 220 static void wc_unlock(struct dm_writecache *wc) 221 { 222 mutex_unlock(&wc->lock); 223 } 224 225 #ifdef DM_WRITECACHE_HAS_PMEM 226 static int persistent_memory_claim(struct dm_writecache *wc) 227 { 228 int r; 229 loff_t s; 230 long p, da; 231 pfn_t pfn; 232 int id; 233 struct page **pages; 234 235 wc->memory_vmapped = false; 236 237 if (!wc->ssd_dev->dax_dev) { 238 r = -EOPNOTSUPP; 239 goto err1; 240 } 241 s = wc->memory_map_size; 242 p = s >> PAGE_SHIFT; 243 if (!p) { 244 r = -EINVAL; 245 goto err1; 246 } 247 if (p != s >> PAGE_SHIFT) { 248 r = -EOVERFLOW; 249 goto err1; 250 } 251 252 id = dax_read_lock(); 253 254 da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn); 255 if (da < 0) { 256 wc->memory_map = NULL; 257 r = da; 258 goto err2; 259 } 260 if (!pfn_t_has_page(pfn)) { 261 wc->memory_map = NULL; 262 r = -EOPNOTSUPP; 263 goto err2; 264 } 265 if (da != p) { 266 long i; 267 wc->memory_map = NULL; 268 pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL); 269 if (!pages) { 270 r = -ENOMEM; 271 goto err2; 272 } 273 i = 0; 274 do { 275 long daa; 276 daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i, 277 NULL, &pfn); 278 if (daa <= 0) { 279 r = daa ? daa : -EINVAL; 280 goto err3; 281 } 282 if (!pfn_t_has_page(pfn)) { 283 r = -EOPNOTSUPP; 284 goto err3; 285 } 286 while (daa-- && i < p) { 287 pages[i++] = pfn_t_to_page(pfn); 288 pfn.val++; 289 } 290 } while (i < p); 291 wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL); 292 if (!wc->memory_map) { 293 r = -ENOMEM; 294 goto err3; 295 } 296 kvfree(pages); 297 wc->memory_vmapped = true; 298 } 299 300 dax_read_unlock(id); 301 302 wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT; 303 wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT; 304 305 return 0; 306 err3: 307 kvfree(pages); 308 err2: 309 dax_read_unlock(id); 310 err1: 311 return r; 312 } 313 #else 314 static int persistent_memory_claim(struct dm_writecache *wc) 315 { 316 BUG(); 317 } 318 #endif 319 320 static void persistent_memory_release(struct dm_writecache *wc) 321 { 322 if (wc->memory_vmapped) 323 vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT)); 324 } 325 326 static struct page *persistent_memory_page(void *addr) 327 { 328 if (is_vmalloc_addr(addr)) 329 return vmalloc_to_page(addr); 330 else 331 return virt_to_page(addr); 332 } 333 334 static unsigned persistent_memory_page_offset(void *addr) 335 { 336 return (unsigned long)addr & (PAGE_SIZE - 1); 337 } 338 339 static void persistent_memory_flush_cache(void *ptr, size_t size) 340 { 341 if (is_vmalloc_addr(ptr)) 342 flush_kernel_vmap_range(ptr, size); 343 } 344 345 static void persistent_memory_invalidate_cache(void *ptr, size_t size) 346 { 347 if (is_vmalloc_addr(ptr)) 348 invalidate_kernel_vmap_range(ptr, size); 349 } 350 351 static struct wc_memory_superblock *sb(struct dm_writecache *wc) 352 { 353 return wc->memory_map; 354 } 355 356 static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e) 357 { 358 return &sb(wc)->entries[e->index]; 359 } 360 361 static void *memory_data(struct dm_writecache *wc, struct wc_entry *e) 362 { 363 return (char *)wc->block_start + (e->index << wc->block_size_bits); 364 } 365 366 static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e) 367 { 368 return wc->start_sector + wc->metadata_sectors + 369 ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT)); 370 } 371 372 static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e) 373 { 374 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 375 return e->original_sector; 376 #else 377 return le64_to_cpu(memory_entry(wc, e)->original_sector); 378 #endif 379 } 380 381 static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e) 382 { 383 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 384 return e->seq_count; 385 #else 386 return le64_to_cpu(memory_entry(wc, e)->seq_count); 387 #endif 388 } 389 390 static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e) 391 { 392 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 393 e->seq_count = -1; 394 #endif 395 pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1)); 396 } 397 398 static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e, 399 uint64_t original_sector, uint64_t seq_count) 400 { 401 struct wc_memory_entry me; 402 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 403 e->original_sector = original_sector; 404 e->seq_count = seq_count; 405 #endif 406 me.original_sector = cpu_to_le64(original_sector); 407 me.seq_count = cpu_to_le64(seq_count); 408 pmem_assign(*memory_entry(wc, e), me); 409 } 410 411 #define writecache_error(wc, err, msg, arg...) \ 412 do { \ 413 if (!cmpxchg(&(wc)->error, 0, err)) \ 414 DMERR(msg, ##arg); \ 415 wake_up(&(wc)->freelist_wait); \ 416 } while (0) 417 418 #define writecache_has_error(wc) (unlikely(READ_ONCE((wc)->error))) 419 420 static void writecache_flush_all_metadata(struct dm_writecache *wc) 421 { 422 if (!WC_MODE_PMEM(wc)) 423 memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size); 424 } 425 426 static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size) 427 { 428 if (!WC_MODE_PMEM(wc)) 429 __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY, 430 wc->dirty_bitmap); 431 } 432 433 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev); 434 435 struct io_notify { 436 struct dm_writecache *wc; 437 struct completion c; 438 atomic_t count; 439 }; 440 441 static void writecache_notify_io(unsigned long error, void *context) 442 { 443 struct io_notify *endio = context; 444 445 if (unlikely(error != 0)) 446 writecache_error(endio->wc, -EIO, "error writing metadata"); 447 BUG_ON(atomic_read(&endio->count) <= 0); 448 if (atomic_dec_and_test(&endio->count)) 449 complete(&endio->c); 450 } 451 452 static void writecache_wait_for_ios(struct dm_writecache *wc, int direction) 453 { 454 wait_event(wc->bio_in_progress_wait[direction], 455 !atomic_read(&wc->bio_in_progress[direction])); 456 } 457 458 static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) 459 { 460 struct dm_io_region region; 461 struct dm_io_request req; 462 struct io_notify endio = { 463 wc, 464 COMPLETION_INITIALIZER_ONSTACK(endio.c), 465 ATOMIC_INIT(1), 466 }; 467 unsigned bitmap_bits = wc->dirty_bitmap_size * 8; 468 unsigned i = 0; 469 470 while (1) { 471 unsigned j; 472 i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i); 473 if (unlikely(i == bitmap_bits)) 474 break; 475 j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i); 476 477 region.bdev = wc->ssd_dev->bdev; 478 region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT); 479 region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT); 480 481 if (unlikely(region.sector >= wc->metadata_sectors)) 482 break; 483 if (unlikely(region.sector + region.count > wc->metadata_sectors)) 484 region.count = wc->metadata_sectors - region.sector; 485 486 region.sector += wc->start_sector; 487 atomic_inc(&endio.count); 488 req.bi_op = REQ_OP_WRITE; 489 req.bi_op_flags = REQ_SYNC; 490 req.mem.type = DM_IO_VMA; 491 req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY; 492 req.client = wc->dm_io; 493 req.notify.fn = writecache_notify_io; 494 req.notify.context = &endio; 495 496 /* writing via async dm-io (implied by notify.fn above) won't return an error */ 497 (void) dm_io(&req, 1, ®ion, NULL); 498 i = j; 499 } 500 501 writecache_notify_io(0, &endio); 502 wait_for_completion_io(&endio.c); 503 504 if (wait_for_ios) 505 writecache_wait_for_ios(wc, WRITE); 506 507 writecache_disk_flush(wc, wc->ssd_dev); 508 509 memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size); 510 } 511 512 static void ssd_commit_superblock(struct dm_writecache *wc) 513 { 514 int r; 515 struct dm_io_region region; 516 struct dm_io_request req; 517 518 region.bdev = wc->ssd_dev->bdev; 519 region.sector = 0; 520 region.count = PAGE_SIZE; 521 522 if (unlikely(region.sector + region.count > wc->metadata_sectors)) 523 region.count = wc->metadata_sectors - region.sector; 524 525 region.sector += wc->start_sector; 526 527 req.bi_op = REQ_OP_WRITE; 528 req.bi_op_flags = REQ_SYNC | REQ_FUA; 529 req.mem.type = DM_IO_VMA; 530 req.mem.ptr.vma = (char *)wc->memory_map; 531 req.client = wc->dm_io; 532 req.notify.fn = NULL; 533 req.notify.context = NULL; 534 535 r = dm_io(&req, 1, ®ion, NULL); 536 if (unlikely(r)) 537 writecache_error(wc, r, "error writing superblock"); 538 } 539 540 static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) 541 { 542 if (WC_MODE_PMEM(wc)) 543 wmb(); 544 else 545 ssd_commit_flushed(wc, wait_for_ios); 546 } 547 548 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev) 549 { 550 int r; 551 struct dm_io_region region; 552 struct dm_io_request req; 553 554 region.bdev = dev->bdev; 555 region.sector = 0; 556 region.count = 0; 557 req.bi_op = REQ_OP_WRITE; 558 req.bi_op_flags = REQ_PREFLUSH; 559 req.mem.type = DM_IO_KMEM; 560 req.mem.ptr.addr = NULL; 561 req.client = wc->dm_io; 562 req.notify.fn = NULL; 563 564 r = dm_io(&req, 1, ®ion, NULL); 565 if (unlikely(r)) 566 writecache_error(wc, r, "error flushing metadata: %d", r); 567 } 568 569 #define WFE_RETURN_FOLLOWING 1 570 #define WFE_LOWEST_SEQ 2 571 572 static struct wc_entry *writecache_find_entry(struct dm_writecache *wc, 573 uint64_t block, int flags) 574 { 575 struct wc_entry *e; 576 struct rb_node *node = wc->tree.rb_node; 577 578 if (unlikely(!node)) 579 return NULL; 580 581 while (1) { 582 e = container_of(node, struct wc_entry, rb_node); 583 if (read_original_sector(wc, e) == block) 584 break; 585 586 node = (read_original_sector(wc, e) >= block ? 587 e->rb_node.rb_left : e->rb_node.rb_right); 588 if (unlikely(!node)) { 589 if (!(flags & WFE_RETURN_FOLLOWING)) 590 return NULL; 591 if (read_original_sector(wc, e) >= block) { 592 return e; 593 } else { 594 node = rb_next(&e->rb_node); 595 if (unlikely(!node)) 596 return NULL; 597 e = container_of(node, struct wc_entry, rb_node); 598 return e; 599 } 600 } 601 } 602 603 while (1) { 604 struct wc_entry *e2; 605 if (flags & WFE_LOWEST_SEQ) 606 node = rb_prev(&e->rb_node); 607 else 608 node = rb_next(&e->rb_node); 609 if (unlikely(!node)) 610 return e; 611 e2 = container_of(node, struct wc_entry, rb_node); 612 if (read_original_sector(wc, e2) != block) 613 return e; 614 e = e2; 615 } 616 } 617 618 static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins) 619 { 620 struct wc_entry *e; 621 struct rb_node **node = &wc->tree.rb_node, *parent = NULL; 622 623 while (*node) { 624 e = container_of(*node, struct wc_entry, rb_node); 625 parent = &e->rb_node; 626 if (read_original_sector(wc, e) > read_original_sector(wc, ins)) 627 node = &parent->rb_left; 628 else 629 node = &parent->rb_right; 630 } 631 rb_link_node(&ins->rb_node, parent, node); 632 rb_insert_color(&ins->rb_node, &wc->tree); 633 list_add(&ins->lru, &wc->lru); 634 ins->age = jiffies; 635 } 636 637 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e) 638 { 639 list_del(&e->lru); 640 rb_erase(&e->rb_node, &wc->tree); 641 } 642 643 static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e) 644 { 645 if (WC_MODE_SORT_FREELIST(wc)) { 646 struct rb_node **node = &wc->freetree.rb_node, *parent = NULL; 647 if (unlikely(!*node)) 648 wc->current_free = e; 649 while (*node) { 650 parent = *node; 651 if (&e->rb_node < *node) 652 node = &parent->rb_left; 653 else 654 node = &parent->rb_right; 655 } 656 rb_link_node(&e->rb_node, parent, node); 657 rb_insert_color(&e->rb_node, &wc->freetree); 658 } else { 659 list_add_tail(&e->lru, &wc->freelist); 660 } 661 wc->freelist_size++; 662 } 663 664 static inline void writecache_verify_watermark(struct dm_writecache *wc) 665 { 666 if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark)) 667 queue_work(wc->writeback_wq, &wc->writeback_work); 668 } 669 670 static void writecache_max_age_timer(struct timer_list *t) 671 { 672 struct dm_writecache *wc = from_timer(wc, t, max_age_timer); 673 674 if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) { 675 queue_work(wc->writeback_wq, &wc->writeback_work); 676 mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); 677 } 678 } 679 680 static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector) 681 { 682 struct wc_entry *e; 683 684 if (WC_MODE_SORT_FREELIST(wc)) { 685 struct rb_node *next; 686 if (unlikely(!wc->current_free)) 687 return NULL; 688 e = wc->current_free; 689 if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) 690 return NULL; 691 next = rb_next(&e->rb_node); 692 rb_erase(&e->rb_node, &wc->freetree); 693 if (unlikely(!next)) 694 next = rb_first(&wc->freetree); 695 wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL; 696 } else { 697 if (unlikely(list_empty(&wc->freelist))) 698 return NULL; 699 e = container_of(wc->freelist.next, struct wc_entry, lru); 700 if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) 701 return NULL; 702 list_del(&e->lru); 703 } 704 wc->freelist_size--; 705 706 writecache_verify_watermark(wc); 707 708 return e; 709 } 710 711 static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e) 712 { 713 writecache_unlink(wc, e); 714 writecache_add_to_freelist(wc, e); 715 clear_seq_count(wc, e); 716 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 717 if (unlikely(waitqueue_active(&wc->freelist_wait))) 718 wake_up(&wc->freelist_wait); 719 } 720 721 static void writecache_wait_on_freelist(struct dm_writecache *wc) 722 { 723 DEFINE_WAIT(wait); 724 725 prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE); 726 wc_unlock(wc); 727 io_schedule(); 728 finish_wait(&wc->freelist_wait, &wait); 729 wc_lock(wc); 730 } 731 732 static void writecache_poison_lists(struct dm_writecache *wc) 733 { 734 /* 735 * Catch incorrect access to these values while the device is suspended. 736 */ 737 memset(&wc->tree, -1, sizeof wc->tree); 738 wc->lru.next = LIST_POISON1; 739 wc->lru.prev = LIST_POISON2; 740 wc->freelist.next = LIST_POISON1; 741 wc->freelist.prev = LIST_POISON2; 742 } 743 744 static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e) 745 { 746 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 747 if (WC_MODE_PMEM(wc)) 748 writecache_flush_region(wc, memory_data(wc, e), wc->block_size); 749 } 750 751 static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e) 752 { 753 return read_seq_count(wc, e) < wc->seq_count; 754 } 755 756 static void writecache_flush(struct dm_writecache *wc) 757 { 758 struct wc_entry *e, *e2; 759 bool need_flush_after_free; 760 761 wc->uncommitted_blocks = 0; 762 del_timer(&wc->autocommit_timer); 763 764 if (list_empty(&wc->lru)) 765 return; 766 767 e = container_of(wc->lru.next, struct wc_entry, lru); 768 if (writecache_entry_is_committed(wc, e)) { 769 if (wc->overwrote_committed) { 770 writecache_wait_for_ios(wc, WRITE); 771 writecache_disk_flush(wc, wc->ssd_dev); 772 wc->overwrote_committed = false; 773 } 774 return; 775 } 776 while (1) { 777 writecache_flush_entry(wc, e); 778 if (unlikely(e->lru.next == &wc->lru)) 779 break; 780 e2 = container_of(e->lru.next, struct wc_entry, lru); 781 if (writecache_entry_is_committed(wc, e2)) 782 break; 783 e = e2; 784 cond_resched(); 785 } 786 writecache_commit_flushed(wc, true); 787 788 wc->seq_count++; 789 pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); 790 if (WC_MODE_PMEM(wc)) 791 writecache_commit_flushed(wc, false); 792 else 793 ssd_commit_superblock(wc); 794 795 wc->overwrote_committed = false; 796 797 need_flush_after_free = false; 798 while (1) { 799 /* Free another committed entry with lower seq-count */ 800 struct rb_node *rb_node = rb_prev(&e->rb_node); 801 802 if (rb_node) { 803 e2 = container_of(rb_node, struct wc_entry, rb_node); 804 if (read_original_sector(wc, e2) == read_original_sector(wc, e) && 805 likely(!e2->write_in_progress)) { 806 writecache_free_entry(wc, e2); 807 need_flush_after_free = true; 808 } 809 } 810 if (unlikely(e->lru.prev == &wc->lru)) 811 break; 812 e = container_of(e->lru.prev, struct wc_entry, lru); 813 cond_resched(); 814 } 815 816 if (need_flush_after_free) 817 writecache_commit_flushed(wc, false); 818 } 819 820 static void writecache_flush_work(struct work_struct *work) 821 { 822 struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work); 823 824 wc_lock(wc); 825 writecache_flush(wc); 826 wc_unlock(wc); 827 } 828 829 static void writecache_autocommit_timer(struct timer_list *t) 830 { 831 struct dm_writecache *wc = from_timer(wc, t, autocommit_timer); 832 if (!writecache_has_error(wc)) 833 queue_work(wc->writeback_wq, &wc->flush_work); 834 } 835 836 static void writecache_schedule_autocommit(struct dm_writecache *wc) 837 { 838 if (!timer_pending(&wc->autocommit_timer)) 839 mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies); 840 } 841 842 static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end) 843 { 844 struct wc_entry *e; 845 bool discarded_something = false; 846 847 e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ); 848 if (unlikely(!e)) 849 return; 850 851 while (read_original_sector(wc, e) < end) { 852 struct rb_node *node = rb_next(&e->rb_node); 853 854 if (likely(!e->write_in_progress)) { 855 if (!discarded_something) { 856 writecache_wait_for_ios(wc, READ); 857 writecache_wait_for_ios(wc, WRITE); 858 discarded_something = true; 859 } 860 writecache_free_entry(wc, e); 861 } 862 863 if (unlikely(!node)) 864 break; 865 866 e = container_of(node, struct wc_entry, rb_node); 867 } 868 869 if (discarded_something) 870 writecache_commit_flushed(wc, false); 871 } 872 873 static bool writecache_wait_for_writeback(struct dm_writecache *wc) 874 { 875 if (wc->writeback_size) { 876 writecache_wait_on_freelist(wc); 877 return true; 878 } 879 return false; 880 } 881 882 static void writecache_suspend(struct dm_target *ti) 883 { 884 struct dm_writecache *wc = ti->private; 885 bool flush_on_suspend; 886 887 del_timer_sync(&wc->autocommit_timer); 888 del_timer_sync(&wc->max_age_timer); 889 890 wc_lock(wc); 891 writecache_flush(wc); 892 flush_on_suspend = wc->flush_on_suspend; 893 if (flush_on_suspend) { 894 wc->flush_on_suspend = false; 895 wc->writeback_all++; 896 queue_work(wc->writeback_wq, &wc->writeback_work); 897 } 898 wc_unlock(wc); 899 900 drain_workqueue(wc->writeback_wq); 901 902 wc_lock(wc); 903 if (flush_on_suspend) 904 wc->writeback_all--; 905 while (writecache_wait_for_writeback(wc)); 906 907 if (WC_MODE_PMEM(wc)) 908 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size); 909 910 writecache_poison_lists(wc); 911 912 wc_unlock(wc); 913 } 914 915 static int writecache_alloc_entries(struct dm_writecache *wc) 916 { 917 size_t b; 918 919 if (wc->entries) 920 return 0; 921 wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks)); 922 if (!wc->entries) 923 return -ENOMEM; 924 for (b = 0; b < wc->n_blocks; b++) { 925 struct wc_entry *e = &wc->entries[b]; 926 e->index = b; 927 e->write_in_progress = false; 928 cond_resched(); 929 } 930 931 return 0; 932 } 933 934 static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors) 935 { 936 struct dm_io_region region; 937 struct dm_io_request req; 938 939 region.bdev = wc->ssd_dev->bdev; 940 region.sector = wc->start_sector; 941 region.count = n_sectors; 942 req.bi_op = REQ_OP_READ; 943 req.bi_op_flags = REQ_SYNC; 944 req.mem.type = DM_IO_VMA; 945 req.mem.ptr.vma = (char *)wc->memory_map; 946 req.client = wc->dm_io; 947 req.notify.fn = NULL; 948 949 return dm_io(&req, 1, ®ion, NULL); 950 } 951 952 static void writecache_resume(struct dm_target *ti) 953 { 954 struct dm_writecache *wc = ti->private; 955 size_t b; 956 bool need_flush = false; 957 __le64 sb_seq_count; 958 int r; 959 960 wc_lock(wc); 961 962 if (WC_MODE_PMEM(wc)) { 963 persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size); 964 } else { 965 r = writecache_read_metadata(wc, wc->metadata_sectors); 966 if (r) { 967 size_t sb_entries_offset; 968 writecache_error(wc, r, "unable to read metadata: %d", r); 969 sb_entries_offset = offsetof(struct wc_memory_superblock, entries); 970 memset((char *)wc->memory_map + sb_entries_offset, -1, 971 (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset); 972 } 973 } 974 975 wc->tree = RB_ROOT; 976 INIT_LIST_HEAD(&wc->lru); 977 if (WC_MODE_SORT_FREELIST(wc)) { 978 wc->freetree = RB_ROOT; 979 wc->current_free = NULL; 980 } else { 981 INIT_LIST_HEAD(&wc->freelist); 982 } 983 wc->freelist_size = 0; 984 985 r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t)); 986 if (r) { 987 writecache_error(wc, r, "hardware memory error when reading superblock: %d", r); 988 sb_seq_count = cpu_to_le64(0); 989 } 990 wc->seq_count = le64_to_cpu(sb_seq_count); 991 992 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 993 for (b = 0; b < wc->n_blocks; b++) { 994 struct wc_entry *e = &wc->entries[b]; 995 struct wc_memory_entry wme; 996 if (writecache_has_error(wc)) { 997 e->original_sector = -1; 998 e->seq_count = -1; 999 continue; 1000 } 1001 r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 1002 if (r) { 1003 writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d", 1004 (unsigned long)b, r); 1005 e->original_sector = -1; 1006 e->seq_count = -1; 1007 } else { 1008 e->original_sector = le64_to_cpu(wme.original_sector); 1009 e->seq_count = le64_to_cpu(wme.seq_count); 1010 } 1011 cond_resched(); 1012 } 1013 #endif 1014 for (b = 0; b < wc->n_blocks; b++) { 1015 struct wc_entry *e = &wc->entries[b]; 1016 if (!writecache_entry_is_committed(wc, e)) { 1017 if (read_seq_count(wc, e) != -1) { 1018 erase_this: 1019 clear_seq_count(wc, e); 1020 need_flush = true; 1021 } 1022 writecache_add_to_freelist(wc, e); 1023 } else { 1024 struct wc_entry *old; 1025 1026 old = writecache_find_entry(wc, read_original_sector(wc, e), 0); 1027 if (!old) { 1028 writecache_insert_entry(wc, e); 1029 } else { 1030 if (read_seq_count(wc, old) == read_seq_count(wc, e)) { 1031 writecache_error(wc, -EINVAL, 1032 "two identical entries, position %llu, sector %llu, sequence %llu", 1033 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e), 1034 (unsigned long long)read_seq_count(wc, e)); 1035 } 1036 if (read_seq_count(wc, old) > read_seq_count(wc, e)) { 1037 goto erase_this; 1038 } else { 1039 writecache_free_entry(wc, old); 1040 writecache_insert_entry(wc, e); 1041 need_flush = true; 1042 } 1043 } 1044 } 1045 cond_resched(); 1046 } 1047 1048 if (need_flush) { 1049 writecache_flush_all_metadata(wc); 1050 writecache_commit_flushed(wc, false); 1051 } 1052 1053 writecache_verify_watermark(wc); 1054 1055 if (wc->max_age != MAX_AGE_UNSPECIFIED) 1056 mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV); 1057 1058 wc_unlock(wc); 1059 } 1060 1061 static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc) 1062 { 1063 if (argc != 1) 1064 return -EINVAL; 1065 1066 wc_lock(wc); 1067 if (dm_suspended(wc->ti)) { 1068 wc_unlock(wc); 1069 return -EBUSY; 1070 } 1071 if (writecache_has_error(wc)) { 1072 wc_unlock(wc); 1073 return -EIO; 1074 } 1075 1076 writecache_flush(wc); 1077 wc->writeback_all++; 1078 queue_work(wc->writeback_wq, &wc->writeback_work); 1079 wc_unlock(wc); 1080 1081 flush_workqueue(wc->writeback_wq); 1082 1083 wc_lock(wc); 1084 wc->writeback_all--; 1085 if (writecache_has_error(wc)) { 1086 wc_unlock(wc); 1087 return -EIO; 1088 } 1089 wc_unlock(wc); 1090 1091 return 0; 1092 } 1093 1094 static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc) 1095 { 1096 if (argc != 1) 1097 return -EINVAL; 1098 1099 wc_lock(wc); 1100 wc->flush_on_suspend = true; 1101 wc_unlock(wc); 1102 1103 return 0; 1104 } 1105 1106 static void activate_cleaner(struct dm_writecache *wc) 1107 { 1108 wc->flush_on_suspend = true; 1109 wc->cleaner = true; 1110 wc->freelist_high_watermark = wc->n_blocks; 1111 wc->freelist_low_watermark = wc->n_blocks; 1112 } 1113 1114 static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc) 1115 { 1116 if (argc != 1) 1117 return -EINVAL; 1118 1119 wc_lock(wc); 1120 activate_cleaner(wc); 1121 if (!dm_suspended(wc->ti)) 1122 writecache_verify_watermark(wc); 1123 wc_unlock(wc); 1124 1125 return 0; 1126 } 1127 1128 static int writecache_message(struct dm_target *ti, unsigned argc, char **argv, 1129 char *result, unsigned maxlen) 1130 { 1131 int r = -EINVAL; 1132 struct dm_writecache *wc = ti->private; 1133 1134 if (!strcasecmp(argv[0], "flush")) 1135 r = process_flush_mesg(argc, argv, wc); 1136 else if (!strcasecmp(argv[0], "flush_on_suspend")) 1137 r = process_flush_on_suspend_mesg(argc, argv, wc); 1138 else if (!strcasecmp(argv[0], "cleaner")) 1139 r = process_cleaner_mesg(argc, argv, wc); 1140 else 1141 DMERR("unrecognised message received: %s", argv[0]); 1142 1143 return r; 1144 } 1145 1146 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data) 1147 { 1148 void *buf; 1149 unsigned long flags; 1150 unsigned size; 1151 int rw = bio_data_dir(bio); 1152 unsigned remaining_size = wc->block_size; 1153 1154 do { 1155 struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter); 1156 buf = bvec_kmap_irq(&bv, &flags); 1157 size = bv.bv_len; 1158 if (unlikely(size > remaining_size)) 1159 size = remaining_size; 1160 1161 if (rw == READ) { 1162 int r; 1163 r = memcpy_mcsafe(buf, data, size); 1164 flush_dcache_page(bio_page(bio)); 1165 if (unlikely(r)) { 1166 writecache_error(wc, r, "hardware memory error when reading data: %d", r); 1167 bio->bi_status = BLK_STS_IOERR; 1168 } 1169 } else { 1170 flush_dcache_page(bio_page(bio)); 1171 memcpy_flushcache(data, buf, size); 1172 } 1173 1174 bvec_kunmap_irq(buf, &flags); 1175 1176 data = (char *)data + size; 1177 remaining_size -= size; 1178 bio_advance(bio, size); 1179 } while (unlikely(remaining_size)); 1180 } 1181 1182 static int writecache_flush_thread(void *data) 1183 { 1184 struct dm_writecache *wc = data; 1185 1186 while (1) { 1187 struct bio *bio; 1188 1189 wc_lock(wc); 1190 bio = bio_list_pop(&wc->flush_list); 1191 if (!bio) { 1192 set_current_state(TASK_INTERRUPTIBLE); 1193 wc_unlock(wc); 1194 1195 if (unlikely(kthread_should_stop())) { 1196 set_current_state(TASK_RUNNING); 1197 break; 1198 } 1199 1200 schedule(); 1201 continue; 1202 } 1203 1204 if (bio_op(bio) == REQ_OP_DISCARD) { 1205 writecache_discard(wc, bio->bi_iter.bi_sector, 1206 bio_end_sector(bio)); 1207 wc_unlock(wc); 1208 bio_set_dev(bio, wc->dev->bdev); 1209 generic_make_request(bio); 1210 } else { 1211 writecache_flush(wc); 1212 wc_unlock(wc); 1213 if (writecache_has_error(wc)) 1214 bio->bi_status = BLK_STS_IOERR; 1215 bio_endio(bio); 1216 } 1217 } 1218 1219 return 0; 1220 } 1221 1222 static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio) 1223 { 1224 if (bio_list_empty(&wc->flush_list)) 1225 wake_up_process(wc->flush_thread); 1226 bio_list_add(&wc->flush_list, bio); 1227 } 1228 1229 static int writecache_map(struct dm_target *ti, struct bio *bio) 1230 { 1231 struct wc_entry *e; 1232 struct dm_writecache *wc = ti->private; 1233 1234 bio->bi_private = NULL; 1235 1236 wc_lock(wc); 1237 1238 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { 1239 if (writecache_has_error(wc)) 1240 goto unlock_error; 1241 if (WC_MODE_PMEM(wc)) { 1242 writecache_flush(wc); 1243 if (writecache_has_error(wc)) 1244 goto unlock_error; 1245 goto unlock_submit; 1246 } else { 1247 writecache_offload_bio(wc, bio); 1248 goto unlock_return; 1249 } 1250 } 1251 1252 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); 1253 1254 if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) & 1255 (wc->block_size / 512 - 1)) != 0)) { 1256 DMERR("I/O is not aligned, sector %llu, size %u, block size %u", 1257 (unsigned long long)bio->bi_iter.bi_sector, 1258 bio->bi_iter.bi_size, wc->block_size); 1259 goto unlock_error; 1260 } 1261 1262 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { 1263 if (writecache_has_error(wc)) 1264 goto unlock_error; 1265 if (WC_MODE_PMEM(wc)) { 1266 writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio)); 1267 goto unlock_remap_origin; 1268 } else { 1269 writecache_offload_bio(wc, bio); 1270 goto unlock_return; 1271 } 1272 } 1273 1274 if (bio_data_dir(bio) == READ) { 1275 read_next_block: 1276 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); 1277 if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) { 1278 if (WC_MODE_PMEM(wc)) { 1279 bio_copy_block(wc, bio, memory_data(wc, e)); 1280 if (bio->bi_iter.bi_size) 1281 goto read_next_block; 1282 goto unlock_submit; 1283 } else { 1284 dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT); 1285 bio_set_dev(bio, wc->ssd_dev->bdev); 1286 bio->bi_iter.bi_sector = cache_sector(wc, e); 1287 if (!writecache_entry_is_committed(wc, e)) 1288 writecache_wait_for_ios(wc, WRITE); 1289 goto unlock_remap; 1290 } 1291 } else { 1292 if (e) { 1293 sector_t next_boundary = 1294 read_original_sector(wc, e) - bio->bi_iter.bi_sector; 1295 if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) { 1296 dm_accept_partial_bio(bio, next_boundary); 1297 } 1298 } 1299 goto unlock_remap_origin; 1300 } 1301 } else { 1302 do { 1303 bool found_entry = false; 1304 if (writecache_has_error(wc)) 1305 goto unlock_error; 1306 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0); 1307 if (e) { 1308 if (!writecache_entry_is_committed(wc, e)) 1309 goto bio_copy; 1310 if (!WC_MODE_PMEM(wc) && !e->write_in_progress) { 1311 wc->overwrote_committed = true; 1312 goto bio_copy; 1313 } 1314 found_entry = true; 1315 } else { 1316 if (unlikely(wc->cleaner)) 1317 goto direct_write; 1318 } 1319 e = writecache_pop_from_freelist(wc, (sector_t)-1); 1320 if (unlikely(!e)) { 1321 if (!found_entry) { 1322 direct_write: 1323 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); 1324 if (e) { 1325 sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector; 1326 BUG_ON(!next_boundary); 1327 if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) { 1328 dm_accept_partial_bio(bio, next_boundary); 1329 } 1330 } 1331 goto unlock_remap_origin; 1332 } 1333 writecache_wait_on_freelist(wc); 1334 continue; 1335 } 1336 write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count); 1337 writecache_insert_entry(wc, e); 1338 wc->uncommitted_blocks++; 1339 bio_copy: 1340 if (WC_MODE_PMEM(wc)) { 1341 bio_copy_block(wc, bio, memory_data(wc, e)); 1342 } else { 1343 unsigned bio_size = wc->block_size; 1344 sector_t start_cache_sec = cache_sector(wc, e); 1345 sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT); 1346 1347 while (bio_size < bio->bi_iter.bi_size) { 1348 struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec); 1349 if (!f) 1350 break; 1351 write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + 1352 (bio_size >> SECTOR_SHIFT), wc->seq_count); 1353 writecache_insert_entry(wc, f); 1354 wc->uncommitted_blocks++; 1355 bio_size += wc->block_size; 1356 current_cache_sec += wc->block_size >> SECTOR_SHIFT; 1357 } 1358 1359 bio_set_dev(bio, wc->ssd_dev->bdev); 1360 bio->bi_iter.bi_sector = start_cache_sec; 1361 dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT); 1362 1363 if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) { 1364 wc->uncommitted_blocks = 0; 1365 queue_work(wc->writeback_wq, &wc->flush_work); 1366 } else { 1367 writecache_schedule_autocommit(wc); 1368 } 1369 goto unlock_remap; 1370 } 1371 } while (bio->bi_iter.bi_size); 1372 1373 if (unlikely(bio->bi_opf & REQ_FUA || 1374 wc->uncommitted_blocks >= wc->autocommit_blocks)) 1375 writecache_flush(wc); 1376 else 1377 writecache_schedule_autocommit(wc); 1378 goto unlock_submit; 1379 } 1380 1381 unlock_remap_origin: 1382 bio_set_dev(bio, wc->dev->bdev); 1383 wc_unlock(wc); 1384 return DM_MAPIO_REMAPPED; 1385 1386 unlock_remap: 1387 /* make sure that writecache_end_io decrements bio_in_progress: */ 1388 bio->bi_private = (void *)1; 1389 atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]); 1390 wc_unlock(wc); 1391 return DM_MAPIO_REMAPPED; 1392 1393 unlock_submit: 1394 wc_unlock(wc); 1395 bio_endio(bio); 1396 return DM_MAPIO_SUBMITTED; 1397 1398 unlock_return: 1399 wc_unlock(wc); 1400 return DM_MAPIO_SUBMITTED; 1401 1402 unlock_error: 1403 wc_unlock(wc); 1404 bio_io_error(bio); 1405 return DM_MAPIO_SUBMITTED; 1406 } 1407 1408 static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status) 1409 { 1410 struct dm_writecache *wc = ti->private; 1411 1412 if (bio->bi_private != NULL) { 1413 int dir = bio_data_dir(bio); 1414 if (atomic_dec_and_test(&wc->bio_in_progress[dir])) 1415 if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir]))) 1416 wake_up(&wc->bio_in_progress_wait[dir]); 1417 } 1418 return 0; 1419 } 1420 1421 static int writecache_iterate_devices(struct dm_target *ti, 1422 iterate_devices_callout_fn fn, void *data) 1423 { 1424 struct dm_writecache *wc = ti->private; 1425 1426 return fn(ti, wc->dev, 0, ti->len, data); 1427 } 1428 1429 static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits) 1430 { 1431 struct dm_writecache *wc = ti->private; 1432 1433 if (limits->logical_block_size < wc->block_size) 1434 limits->logical_block_size = wc->block_size; 1435 1436 if (limits->physical_block_size < wc->block_size) 1437 limits->physical_block_size = wc->block_size; 1438 1439 if (limits->io_min < wc->block_size) 1440 limits->io_min = wc->block_size; 1441 } 1442 1443 1444 static void writecache_writeback_endio(struct bio *bio) 1445 { 1446 struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio); 1447 struct dm_writecache *wc = wb->wc; 1448 unsigned long flags; 1449 1450 raw_spin_lock_irqsave(&wc->endio_list_lock, flags); 1451 if (unlikely(list_empty(&wc->endio_list))) 1452 wake_up_process(wc->endio_thread); 1453 list_add_tail(&wb->endio_entry, &wc->endio_list); 1454 raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags); 1455 } 1456 1457 static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr) 1458 { 1459 struct copy_struct *c = ptr; 1460 struct dm_writecache *wc = c->wc; 1461 1462 c->error = likely(!(read_err | write_err)) ? 0 : -EIO; 1463 1464 raw_spin_lock_irq(&wc->endio_list_lock); 1465 if (unlikely(list_empty(&wc->endio_list))) 1466 wake_up_process(wc->endio_thread); 1467 list_add_tail(&c->endio_entry, &wc->endio_list); 1468 raw_spin_unlock_irq(&wc->endio_list_lock); 1469 } 1470 1471 static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list) 1472 { 1473 unsigned i; 1474 struct writeback_struct *wb; 1475 struct wc_entry *e; 1476 unsigned long n_walked = 0; 1477 1478 do { 1479 wb = list_entry(list->next, struct writeback_struct, endio_entry); 1480 list_del(&wb->endio_entry); 1481 1482 if (unlikely(wb->bio.bi_status != BLK_STS_OK)) 1483 writecache_error(wc, blk_status_to_errno(wb->bio.bi_status), 1484 "write error %d", wb->bio.bi_status); 1485 i = 0; 1486 do { 1487 e = wb->wc_list[i]; 1488 BUG_ON(!e->write_in_progress); 1489 e->write_in_progress = false; 1490 INIT_LIST_HEAD(&e->lru); 1491 if (!writecache_has_error(wc)) 1492 writecache_free_entry(wc, e); 1493 BUG_ON(!wc->writeback_size); 1494 wc->writeback_size--; 1495 n_walked++; 1496 if (unlikely(n_walked >= ENDIO_LATENCY)) { 1497 writecache_commit_flushed(wc, false); 1498 wc_unlock(wc); 1499 wc_lock(wc); 1500 n_walked = 0; 1501 } 1502 } while (++i < wb->wc_list_n); 1503 1504 if (wb->wc_list != wb->wc_list_inline) 1505 kfree(wb->wc_list); 1506 bio_put(&wb->bio); 1507 } while (!list_empty(list)); 1508 } 1509 1510 static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list) 1511 { 1512 struct copy_struct *c; 1513 struct wc_entry *e; 1514 1515 do { 1516 c = list_entry(list->next, struct copy_struct, endio_entry); 1517 list_del(&c->endio_entry); 1518 1519 if (unlikely(c->error)) 1520 writecache_error(wc, c->error, "copy error"); 1521 1522 e = c->e; 1523 do { 1524 BUG_ON(!e->write_in_progress); 1525 e->write_in_progress = false; 1526 INIT_LIST_HEAD(&e->lru); 1527 if (!writecache_has_error(wc)) 1528 writecache_free_entry(wc, e); 1529 1530 BUG_ON(!wc->writeback_size); 1531 wc->writeback_size--; 1532 e++; 1533 } while (--c->n_entries); 1534 mempool_free(c, &wc->copy_pool); 1535 } while (!list_empty(list)); 1536 } 1537 1538 static int writecache_endio_thread(void *data) 1539 { 1540 struct dm_writecache *wc = data; 1541 1542 while (1) { 1543 struct list_head list; 1544 1545 raw_spin_lock_irq(&wc->endio_list_lock); 1546 if (!list_empty(&wc->endio_list)) 1547 goto pop_from_list; 1548 set_current_state(TASK_INTERRUPTIBLE); 1549 raw_spin_unlock_irq(&wc->endio_list_lock); 1550 1551 if (unlikely(kthread_should_stop())) { 1552 set_current_state(TASK_RUNNING); 1553 break; 1554 } 1555 1556 schedule(); 1557 1558 continue; 1559 1560 pop_from_list: 1561 list = wc->endio_list; 1562 list.next->prev = list.prev->next = &list; 1563 INIT_LIST_HEAD(&wc->endio_list); 1564 raw_spin_unlock_irq(&wc->endio_list_lock); 1565 1566 if (!WC_MODE_FUA(wc)) 1567 writecache_disk_flush(wc, wc->dev); 1568 1569 wc_lock(wc); 1570 1571 if (WC_MODE_PMEM(wc)) { 1572 __writecache_endio_pmem(wc, &list); 1573 } else { 1574 __writecache_endio_ssd(wc, &list); 1575 writecache_wait_for_ios(wc, READ); 1576 } 1577 1578 writecache_commit_flushed(wc, false); 1579 1580 wc_unlock(wc); 1581 } 1582 1583 return 0; 1584 } 1585 1586 static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp) 1587 { 1588 struct dm_writecache *wc = wb->wc; 1589 unsigned block_size = wc->block_size; 1590 void *address = memory_data(wc, e); 1591 1592 persistent_memory_flush_cache(address, block_size); 1593 return bio_add_page(&wb->bio, persistent_memory_page(address), 1594 block_size, persistent_memory_page_offset(address)) != 0; 1595 } 1596 1597 struct writeback_list { 1598 struct list_head list; 1599 size_t size; 1600 }; 1601 1602 static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl) 1603 { 1604 if (unlikely(wc->max_writeback_jobs)) { 1605 if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) { 1606 wc_lock(wc); 1607 while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs) 1608 writecache_wait_on_freelist(wc); 1609 wc_unlock(wc); 1610 } 1611 } 1612 cond_resched(); 1613 } 1614 1615 static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl) 1616 { 1617 struct wc_entry *e, *f; 1618 struct bio *bio; 1619 struct writeback_struct *wb; 1620 unsigned max_pages; 1621 1622 while (wbl->size) { 1623 wbl->size--; 1624 e = container_of(wbl->list.prev, struct wc_entry, lru); 1625 list_del(&e->lru); 1626 1627 max_pages = e->wc_list_contiguous; 1628 1629 bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set); 1630 wb = container_of(bio, struct writeback_struct, bio); 1631 wb->wc = wc; 1632 bio->bi_end_io = writecache_writeback_endio; 1633 bio_set_dev(bio, wc->dev->bdev); 1634 bio->bi_iter.bi_sector = read_original_sector(wc, e); 1635 if (max_pages <= WB_LIST_INLINE || 1636 unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *), 1637 GFP_NOIO | __GFP_NORETRY | 1638 __GFP_NOMEMALLOC | __GFP_NOWARN)))) { 1639 wb->wc_list = wb->wc_list_inline; 1640 max_pages = WB_LIST_INLINE; 1641 } 1642 1643 BUG_ON(!wc_add_block(wb, e, GFP_NOIO)); 1644 1645 wb->wc_list[0] = e; 1646 wb->wc_list_n = 1; 1647 1648 while (wbl->size && wb->wc_list_n < max_pages) { 1649 f = container_of(wbl->list.prev, struct wc_entry, lru); 1650 if (read_original_sector(wc, f) != 1651 read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) 1652 break; 1653 if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN)) 1654 break; 1655 wbl->size--; 1656 list_del(&f->lru); 1657 wb->wc_list[wb->wc_list_n++] = f; 1658 e = f; 1659 } 1660 bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA); 1661 if (writecache_has_error(wc)) { 1662 bio->bi_status = BLK_STS_IOERR; 1663 bio_endio(bio); 1664 } else { 1665 submit_bio(bio); 1666 } 1667 1668 __writeback_throttle(wc, wbl); 1669 } 1670 } 1671 1672 static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl) 1673 { 1674 struct wc_entry *e, *f; 1675 struct dm_io_region from, to; 1676 struct copy_struct *c; 1677 1678 while (wbl->size) { 1679 unsigned n_sectors; 1680 1681 wbl->size--; 1682 e = container_of(wbl->list.prev, struct wc_entry, lru); 1683 list_del(&e->lru); 1684 1685 n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT); 1686 1687 from.bdev = wc->ssd_dev->bdev; 1688 from.sector = cache_sector(wc, e); 1689 from.count = n_sectors; 1690 to.bdev = wc->dev->bdev; 1691 to.sector = read_original_sector(wc, e); 1692 to.count = n_sectors; 1693 1694 c = mempool_alloc(&wc->copy_pool, GFP_NOIO); 1695 c->wc = wc; 1696 c->e = e; 1697 c->n_entries = e->wc_list_contiguous; 1698 1699 while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) { 1700 wbl->size--; 1701 f = container_of(wbl->list.prev, struct wc_entry, lru); 1702 BUG_ON(f != e + 1); 1703 list_del(&f->lru); 1704 e = f; 1705 } 1706 1707 dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c); 1708 1709 __writeback_throttle(wc, wbl); 1710 } 1711 } 1712 1713 static void writecache_writeback(struct work_struct *work) 1714 { 1715 struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); 1716 struct blk_plug plug; 1717 struct wc_entry *f, *uninitialized_var(g), *e = NULL; 1718 struct rb_node *node, *next_node; 1719 struct list_head skipped; 1720 struct writeback_list wbl; 1721 unsigned long n_walked; 1722 1723 wc_lock(wc); 1724 restart: 1725 if (writecache_has_error(wc)) { 1726 wc_unlock(wc); 1727 return; 1728 } 1729 1730 if (unlikely(wc->writeback_all)) { 1731 if (writecache_wait_for_writeback(wc)) 1732 goto restart; 1733 } 1734 1735 if (wc->overwrote_committed) { 1736 writecache_wait_for_ios(wc, WRITE); 1737 } 1738 1739 n_walked = 0; 1740 INIT_LIST_HEAD(&skipped); 1741 INIT_LIST_HEAD(&wbl.list); 1742 wbl.size = 0; 1743 while (!list_empty(&wc->lru) && 1744 (wc->writeback_all || 1745 wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark || 1746 (jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >= 1747 wc->max_age - wc->max_age / MAX_AGE_DIV))) { 1748 1749 n_walked++; 1750 if (unlikely(n_walked > WRITEBACK_LATENCY) && 1751 likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) { 1752 queue_work(wc->writeback_wq, &wc->writeback_work); 1753 break; 1754 } 1755 1756 if (unlikely(wc->writeback_all)) { 1757 if (unlikely(!e)) { 1758 writecache_flush(wc); 1759 e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node); 1760 } else 1761 e = g; 1762 } else 1763 e = container_of(wc->lru.prev, struct wc_entry, lru); 1764 BUG_ON(e->write_in_progress); 1765 if (unlikely(!writecache_entry_is_committed(wc, e))) { 1766 writecache_flush(wc); 1767 } 1768 node = rb_prev(&e->rb_node); 1769 if (node) { 1770 f = container_of(node, struct wc_entry, rb_node); 1771 if (unlikely(read_original_sector(wc, f) == 1772 read_original_sector(wc, e))) { 1773 BUG_ON(!f->write_in_progress); 1774 list_del(&e->lru); 1775 list_add(&e->lru, &skipped); 1776 cond_resched(); 1777 continue; 1778 } 1779 } 1780 wc->writeback_size++; 1781 list_del(&e->lru); 1782 list_add(&e->lru, &wbl.list); 1783 wbl.size++; 1784 e->write_in_progress = true; 1785 e->wc_list_contiguous = 1; 1786 1787 f = e; 1788 1789 while (1) { 1790 next_node = rb_next(&f->rb_node); 1791 if (unlikely(!next_node)) 1792 break; 1793 g = container_of(next_node, struct wc_entry, rb_node); 1794 if (unlikely(read_original_sector(wc, g) == 1795 read_original_sector(wc, f))) { 1796 f = g; 1797 continue; 1798 } 1799 if (read_original_sector(wc, g) != 1800 read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT)) 1801 break; 1802 if (unlikely(g->write_in_progress)) 1803 break; 1804 if (unlikely(!writecache_entry_is_committed(wc, g))) 1805 break; 1806 1807 if (!WC_MODE_PMEM(wc)) { 1808 if (g != f + 1) 1809 break; 1810 } 1811 1812 n_walked++; 1813 //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all)) 1814 // break; 1815 1816 wc->writeback_size++; 1817 list_del(&g->lru); 1818 list_add(&g->lru, &wbl.list); 1819 wbl.size++; 1820 g->write_in_progress = true; 1821 g->wc_list_contiguous = BIO_MAX_PAGES; 1822 f = g; 1823 e->wc_list_contiguous++; 1824 if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) { 1825 if (unlikely(wc->writeback_all)) { 1826 next_node = rb_next(&f->rb_node); 1827 if (likely(next_node)) 1828 g = container_of(next_node, struct wc_entry, rb_node); 1829 } 1830 break; 1831 } 1832 } 1833 cond_resched(); 1834 } 1835 1836 if (!list_empty(&skipped)) { 1837 list_splice_tail(&skipped, &wc->lru); 1838 /* 1839 * If we didn't do any progress, we must wait until some 1840 * writeback finishes to avoid burning CPU in a loop 1841 */ 1842 if (unlikely(!wbl.size)) 1843 writecache_wait_for_writeback(wc); 1844 } 1845 1846 wc_unlock(wc); 1847 1848 blk_start_plug(&plug); 1849 1850 if (WC_MODE_PMEM(wc)) 1851 __writecache_writeback_pmem(wc, &wbl); 1852 else 1853 __writecache_writeback_ssd(wc, &wbl); 1854 1855 blk_finish_plug(&plug); 1856 1857 if (unlikely(wc->writeback_all)) { 1858 wc_lock(wc); 1859 while (writecache_wait_for_writeback(wc)); 1860 wc_unlock(wc); 1861 } 1862 } 1863 1864 static int calculate_memory_size(uint64_t device_size, unsigned block_size, 1865 size_t *n_blocks_p, size_t *n_metadata_blocks_p) 1866 { 1867 uint64_t n_blocks, offset; 1868 struct wc_entry e; 1869 1870 n_blocks = device_size; 1871 do_div(n_blocks, block_size + sizeof(struct wc_memory_entry)); 1872 1873 while (1) { 1874 if (!n_blocks) 1875 return -ENOSPC; 1876 /* Verify the following entries[n_blocks] won't overflow */ 1877 if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) / 1878 sizeof(struct wc_memory_entry))) 1879 return -EFBIG; 1880 offset = offsetof(struct wc_memory_superblock, entries[n_blocks]); 1881 offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1); 1882 if (offset + n_blocks * block_size <= device_size) 1883 break; 1884 n_blocks--; 1885 } 1886 1887 /* check if the bit field overflows */ 1888 e.index = n_blocks; 1889 if (e.index != n_blocks) 1890 return -EFBIG; 1891 1892 if (n_blocks_p) 1893 *n_blocks_p = n_blocks; 1894 if (n_metadata_blocks_p) 1895 *n_metadata_blocks_p = offset >> __ffs(block_size); 1896 return 0; 1897 } 1898 1899 static int init_memory(struct dm_writecache *wc) 1900 { 1901 size_t b; 1902 int r; 1903 1904 r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL); 1905 if (r) 1906 return r; 1907 1908 r = writecache_alloc_entries(wc); 1909 if (r) 1910 return r; 1911 1912 for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++) 1913 pmem_assign(sb(wc)->padding[b], cpu_to_le64(0)); 1914 pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION)); 1915 pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size)); 1916 pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks)); 1917 pmem_assign(sb(wc)->seq_count, cpu_to_le64(0)); 1918 1919 for (b = 0; b < wc->n_blocks; b++) { 1920 write_original_sector_seq_count(wc, &wc->entries[b], -1, -1); 1921 cond_resched(); 1922 } 1923 1924 writecache_flush_all_metadata(wc); 1925 writecache_commit_flushed(wc, false); 1926 pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC)); 1927 writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic); 1928 writecache_commit_flushed(wc, false); 1929 1930 return 0; 1931 } 1932 1933 static void writecache_dtr(struct dm_target *ti) 1934 { 1935 struct dm_writecache *wc = ti->private; 1936 1937 if (!wc) 1938 return; 1939 1940 if (wc->endio_thread) 1941 kthread_stop(wc->endio_thread); 1942 1943 if (wc->flush_thread) 1944 kthread_stop(wc->flush_thread); 1945 1946 bioset_exit(&wc->bio_set); 1947 1948 mempool_exit(&wc->copy_pool); 1949 1950 if (wc->writeback_wq) 1951 destroy_workqueue(wc->writeback_wq); 1952 1953 if (wc->dev) 1954 dm_put_device(ti, wc->dev); 1955 1956 if (wc->ssd_dev) 1957 dm_put_device(ti, wc->ssd_dev); 1958 1959 if (wc->entries) 1960 vfree(wc->entries); 1961 1962 if (wc->memory_map) { 1963 if (WC_MODE_PMEM(wc)) 1964 persistent_memory_release(wc); 1965 else 1966 vfree(wc->memory_map); 1967 } 1968 1969 if (wc->dm_kcopyd) 1970 dm_kcopyd_client_destroy(wc->dm_kcopyd); 1971 1972 if (wc->dm_io) 1973 dm_io_client_destroy(wc->dm_io); 1974 1975 if (wc->dirty_bitmap) 1976 vfree(wc->dirty_bitmap); 1977 1978 kfree(wc); 1979 } 1980 1981 static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) 1982 { 1983 struct dm_writecache *wc; 1984 struct dm_arg_set as; 1985 const char *string; 1986 unsigned opt_params; 1987 size_t offset, data_size; 1988 int i, r; 1989 char dummy; 1990 int high_wm_percent = HIGH_WATERMARK; 1991 int low_wm_percent = LOW_WATERMARK; 1992 uint64_t x; 1993 struct wc_memory_superblock s; 1994 1995 static struct dm_arg _args[] = { 1996 {0, 10, "Invalid number of feature args"}, 1997 }; 1998 1999 as.argc = argc; 2000 as.argv = argv; 2001 2002 wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL); 2003 if (!wc) { 2004 ti->error = "Cannot allocate writecache structure"; 2005 r = -ENOMEM; 2006 goto bad; 2007 } 2008 ti->private = wc; 2009 wc->ti = ti; 2010 2011 mutex_init(&wc->lock); 2012 wc->max_age = MAX_AGE_UNSPECIFIED; 2013 writecache_poison_lists(wc); 2014 init_waitqueue_head(&wc->freelist_wait); 2015 timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0); 2016 timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0); 2017 2018 for (i = 0; i < 2; i++) { 2019 atomic_set(&wc->bio_in_progress[i], 0); 2020 init_waitqueue_head(&wc->bio_in_progress_wait[i]); 2021 } 2022 2023 wc->dm_io = dm_io_client_create(); 2024 if (IS_ERR(wc->dm_io)) { 2025 r = PTR_ERR(wc->dm_io); 2026 ti->error = "Unable to allocate dm-io client"; 2027 wc->dm_io = NULL; 2028 goto bad; 2029 } 2030 2031 wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1); 2032 if (!wc->writeback_wq) { 2033 r = -ENOMEM; 2034 ti->error = "Could not allocate writeback workqueue"; 2035 goto bad; 2036 } 2037 INIT_WORK(&wc->writeback_work, writecache_writeback); 2038 INIT_WORK(&wc->flush_work, writecache_flush_work); 2039 2040 raw_spin_lock_init(&wc->endio_list_lock); 2041 INIT_LIST_HEAD(&wc->endio_list); 2042 wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio"); 2043 if (IS_ERR(wc->endio_thread)) { 2044 r = PTR_ERR(wc->endio_thread); 2045 wc->endio_thread = NULL; 2046 ti->error = "Couldn't spawn endio thread"; 2047 goto bad; 2048 } 2049 wake_up_process(wc->endio_thread); 2050 2051 /* 2052 * Parse the mode (pmem or ssd) 2053 */ 2054 string = dm_shift_arg(&as); 2055 if (!string) 2056 goto bad_arguments; 2057 2058 if (!strcasecmp(string, "s")) { 2059 wc->pmem_mode = false; 2060 } else if (!strcasecmp(string, "p")) { 2061 #ifdef DM_WRITECACHE_HAS_PMEM 2062 wc->pmem_mode = true; 2063 wc->writeback_fua = true; 2064 #else 2065 /* 2066 * If the architecture doesn't support persistent memory or 2067 * the kernel doesn't support any DAX drivers, this driver can 2068 * only be used in SSD-only mode. 2069 */ 2070 r = -EOPNOTSUPP; 2071 ti->error = "Persistent memory or DAX not supported on this system"; 2072 goto bad; 2073 #endif 2074 } else { 2075 goto bad_arguments; 2076 } 2077 2078 if (WC_MODE_PMEM(wc)) { 2079 r = bioset_init(&wc->bio_set, BIO_POOL_SIZE, 2080 offsetof(struct writeback_struct, bio), 2081 BIOSET_NEED_BVECS); 2082 if (r) { 2083 ti->error = "Could not allocate bio set"; 2084 goto bad; 2085 } 2086 } else { 2087 r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct)); 2088 if (r) { 2089 ti->error = "Could not allocate mempool"; 2090 goto bad; 2091 } 2092 } 2093 2094 /* 2095 * Parse the origin data device 2096 */ 2097 string = dm_shift_arg(&as); 2098 if (!string) 2099 goto bad_arguments; 2100 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev); 2101 if (r) { 2102 ti->error = "Origin data device lookup failed"; 2103 goto bad; 2104 } 2105 2106 /* 2107 * Parse cache data device (be it pmem or ssd) 2108 */ 2109 string = dm_shift_arg(&as); 2110 if (!string) 2111 goto bad_arguments; 2112 2113 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev); 2114 if (r) { 2115 ti->error = "Cache data device lookup failed"; 2116 goto bad; 2117 } 2118 wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode); 2119 2120 /* 2121 * Parse the cache block size 2122 */ 2123 string = dm_shift_arg(&as); 2124 if (!string) 2125 goto bad_arguments; 2126 if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 || 2127 wc->block_size < 512 || wc->block_size > PAGE_SIZE || 2128 (wc->block_size & (wc->block_size - 1))) { 2129 r = -EINVAL; 2130 ti->error = "Invalid block size"; 2131 goto bad; 2132 } 2133 if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) || 2134 wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) { 2135 r = -EINVAL; 2136 ti->error = "Block size is smaller than device logical block size"; 2137 goto bad; 2138 } 2139 wc->block_size_bits = __ffs(wc->block_size); 2140 2141 wc->max_writeback_jobs = MAX_WRITEBACK_JOBS; 2142 wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM; 2143 wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC); 2144 2145 /* 2146 * Parse optional arguments 2147 */ 2148 r = dm_read_arg_group(_args, &as, &opt_params, &ti->error); 2149 if (r) 2150 goto bad; 2151 2152 while (opt_params) { 2153 string = dm_shift_arg(&as), opt_params--; 2154 if (!strcasecmp(string, "start_sector") && opt_params >= 1) { 2155 unsigned long long start_sector; 2156 string = dm_shift_arg(&as), opt_params--; 2157 if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1) 2158 goto invalid_optional; 2159 wc->start_sector = start_sector; 2160 if (wc->start_sector != start_sector || 2161 wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT) 2162 goto invalid_optional; 2163 } else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) { 2164 string = dm_shift_arg(&as), opt_params--; 2165 if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1) 2166 goto invalid_optional; 2167 if (high_wm_percent < 0 || high_wm_percent > 100) 2168 goto invalid_optional; 2169 wc->high_wm_percent_set = true; 2170 } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) { 2171 string = dm_shift_arg(&as), opt_params--; 2172 if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1) 2173 goto invalid_optional; 2174 if (low_wm_percent < 0 || low_wm_percent > 100) 2175 goto invalid_optional; 2176 wc->low_wm_percent_set = true; 2177 } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) { 2178 string = dm_shift_arg(&as), opt_params--; 2179 if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1) 2180 goto invalid_optional; 2181 wc->max_writeback_jobs_set = true; 2182 } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) { 2183 string = dm_shift_arg(&as), opt_params--; 2184 if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1) 2185 goto invalid_optional; 2186 wc->autocommit_blocks_set = true; 2187 } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) { 2188 unsigned autocommit_msecs; 2189 string = dm_shift_arg(&as), opt_params--; 2190 if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1) 2191 goto invalid_optional; 2192 if (autocommit_msecs > 3600000) 2193 goto invalid_optional; 2194 wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs); 2195 wc->autocommit_time_set = true; 2196 } else if (!strcasecmp(string, "max_age") && opt_params >= 1) { 2197 unsigned max_age_msecs; 2198 string = dm_shift_arg(&as), opt_params--; 2199 if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1) 2200 goto invalid_optional; 2201 if (max_age_msecs > 86400000) 2202 goto invalid_optional; 2203 wc->max_age = msecs_to_jiffies(max_age_msecs); 2204 } else if (!strcasecmp(string, "cleaner")) { 2205 wc->cleaner = true; 2206 } else if (!strcasecmp(string, "fua")) { 2207 if (WC_MODE_PMEM(wc)) { 2208 wc->writeback_fua = true; 2209 wc->writeback_fua_set = true; 2210 } else goto invalid_optional; 2211 } else if (!strcasecmp(string, "nofua")) { 2212 if (WC_MODE_PMEM(wc)) { 2213 wc->writeback_fua = false; 2214 wc->writeback_fua_set = true; 2215 } else goto invalid_optional; 2216 } else { 2217 invalid_optional: 2218 r = -EINVAL; 2219 ti->error = "Invalid optional argument"; 2220 goto bad; 2221 } 2222 } 2223 2224 if (high_wm_percent < low_wm_percent) { 2225 r = -EINVAL; 2226 ti->error = "High watermark must be greater than or equal to low watermark"; 2227 goto bad; 2228 } 2229 2230 if (WC_MODE_PMEM(wc)) { 2231 r = persistent_memory_claim(wc); 2232 if (r) { 2233 ti->error = "Unable to map persistent memory for cache"; 2234 goto bad; 2235 } 2236 } else { 2237 size_t n_blocks, n_metadata_blocks; 2238 uint64_t n_bitmap_bits; 2239 2240 wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT; 2241 2242 bio_list_init(&wc->flush_list); 2243 wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush"); 2244 if (IS_ERR(wc->flush_thread)) { 2245 r = PTR_ERR(wc->flush_thread); 2246 wc->flush_thread = NULL; 2247 ti->error = "Couldn't spawn flush thread"; 2248 goto bad; 2249 } 2250 wake_up_process(wc->flush_thread); 2251 2252 r = calculate_memory_size(wc->memory_map_size, wc->block_size, 2253 &n_blocks, &n_metadata_blocks); 2254 if (r) { 2255 ti->error = "Invalid device size"; 2256 goto bad; 2257 } 2258 2259 n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) + 2260 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY; 2261 /* this is limitation of test_bit functions */ 2262 if (n_bitmap_bits > 1U << 31) { 2263 r = -EFBIG; 2264 ti->error = "Invalid device size"; 2265 goto bad; 2266 } 2267 2268 wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits); 2269 if (!wc->memory_map) { 2270 r = -ENOMEM; 2271 ti->error = "Unable to allocate memory for metadata"; 2272 goto bad; 2273 } 2274 2275 wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2276 if (IS_ERR(wc->dm_kcopyd)) { 2277 r = PTR_ERR(wc->dm_kcopyd); 2278 ti->error = "Unable to allocate dm-kcopyd client"; 2279 wc->dm_kcopyd = NULL; 2280 goto bad; 2281 } 2282 2283 wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT); 2284 wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) / 2285 BITS_PER_LONG * sizeof(unsigned long); 2286 wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size); 2287 if (!wc->dirty_bitmap) { 2288 r = -ENOMEM; 2289 ti->error = "Unable to allocate dirty bitmap"; 2290 goto bad; 2291 } 2292 2293 r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT); 2294 if (r) { 2295 ti->error = "Unable to read first block of metadata"; 2296 goto bad; 2297 } 2298 } 2299 2300 r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock)); 2301 if (r) { 2302 ti->error = "Hardware memory error when reading superblock"; 2303 goto bad; 2304 } 2305 if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) { 2306 r = init_memory(wc); 2307 if (r) { 2308 ti->error = "Unable to initialize device"; 2309 goto bad; 2310 } 2311 r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock)); 2312 if (r) { 2313 ti->error = "Hardware memory error when reading superblock"; 2314 goto bad; 2315 } 2316 } 2317 2318 if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) { 2319 ti->error = "Invalid magic in the superblock"; 2320 r = -EINVAL; 2321 goto bad; 2322 } 2323 2324 if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) { 2325 ti->error = "Invalid version in the superblock"; 2326 r = -EINVAL; 2327 goto bad; 2328 } 2329 2330 if (le32_to_cpu(s.block_size) != wc->block_size) { 2331 ti->error = "Block size does not match superblock"; 2332 r = -EINVAL; 2333 goto bad; 2334 } 2335 2336 wc->n_blocks = le64_to_cpu(s.n_blocks); 2337 2338 offset = wc->n_blocks * sizeof(struct wc_memory_entry); 2339 if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) { 2340 overflow: 2341 ti->error = "Overflow in size calculation"; 2342 r = -EINVAL; 2343 goto bad; 2344 } 2345 offset += sizeof(struct wc_memory_superblock); 2346 if (offset < sizeof(struct wc_memory_superblock)) 2347 goto overflow; 2348 offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1); 2349 data_size = wc->n_blocks * (size_t)wc->block_size; 2350 if (!offset || (data_size / wc->block_size != wc->n_blocks) || 2351 (offset + data_size < offset)) 2352 goto overflow; 2353 if (offset + data_size > wc->memory_map_size) { 2354 ti->error = "Memory area is too small"; 2355 r = -EINVAL; 2356 goto bad; 2357 } 2358 2359 wc->metadata_sectors = offset >> SECTOR_SHIFT; 2360 wc->block_start = (char *)sb(wc) + offset; 2361 2362 x = (uint64_t)wc->n_blocks * (100 - high_wm_percent); 2363 x += 50; 2364 do_div(x, 100); 2365 wc->freelist_high_watermark = x; 2366 x = (uint64_t)wc->n_blocks * (100 - low_wm_percent); 2367 x += 50; 2368 do_div(x, 100); 2369 wc->freelist_low_watermark = x; 2370 2371 if (wc->cleaner) 2372 activate_cleaner(wc); 2373 2374 r = writecache_alloc_entries(wc); 2375 if (r) { 2376 ti->error = "Cannot allocate memory"; 2377 goto bad; 2378 } 2379 2380 ti->num_flush_bios = 1; 2381 ti->flush_supported = true; 2382 ti->num_discard_bios = 1; 2383 2384 if (WC_MODE_PMEM(wc)) 2385 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size); 2386 2387 return 0; 2388 2389 bad_arguments: 2390 r = -EINVAL; 2391 ti->error = "Bad arguments"; 2392 bad: 2393 writecache_dtr(ti); 2394 return r; 2395 } 2396 2397 static void writecache_status(struct dm_target *ti, status_type_t type, 2398 unsigned status_flags, char *result, unsigned maxlen) 2399 { 2400 struct dm_writecache *wc = ti->private; 2401 unsigned extra_args; 2402 unsigned sz = 0; 2403 uint64_t x; 2404 2405 switch (type) { 2406 case STATUSTYPE_INFO: 2407 DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc), 2408 (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size, 2409 (unsigned long long)wc->writeback_size); 2410 break; 2411 case STATUSTYPE_TABLE: 2412 DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's', 2413 wc->dev->name, wc->ssd_dev->name, wc->block_size); 2414 extra_args = 0; 2415 if (wc->start_sector) 2416 extra_args += 2; 2417 if (wc->high_wm_percent_set && !wc->cleaner) 2418 extra_args += 2; 2419 if (wc->low_wm_percent_set && !wc->cleaner) 2420 extra_args += 2; 2421 if (wc->max_writeback_jobs_set) 2422 extra_args += 2; 2423 if (wc->autocommit_blocks_set) 2424 extra_args += 2; 2425 if (wc->autocommit_time_set) 2426 extra_args += 2; 2427 if (wc->cleaner) 2428 extra_args++; 2429 if (wc->writeback_fua_set) 2430 extra_args++; 2431 2432 DMEMIT("%u", extra_args); 2433 if (wc->start_sector) 2434 DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector); 2435 if (wc->high_wm_percent_set && !wc->cleaner) { 2436 x = (uint64_t)wc->freelist_high_watermark * 100; 2437 x += wc->n_blocks / 2; 2438 do_div(x, (size_t)wc->n_blocks); 2439 DMEMIT(" high_watermark %u", 100 - (unsigned)x); 2440 } 2441 if (wc->low_wm_percent_set && !wc->cleaner) { 2442 x = (uint64_t)wc->freelist_low_watermark * 100; 2443 x += wc->n_blocks / 2; 2444 do_div(x, (size_t)wc->n_blocks); 2445 DMEMIT(" low_watermark %u", 100 - (unsigned)x); 2446 } 2447 if (wc->max_writeback_jobs_set) 2448 DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs); 2449 if (wc->autocommit_blocks_set) 2450 DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks); 2451 if (wc->autocommit_time_set) 2452 DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies)); 2453 if (wc->max_age != MAX_AGE_UNSPECIFIED) 2454 DMEMIT(" max_age %u", jiffies_to_msecs(wc->max_age)); 2455 if (wc->cleaner) 2456 DMEMIT(" cleaner"); 2457 if (wc->writeback_fua_set) 2458 DMEMIT(" %sfua", wc->writeback_fua ? "" : "no"); 2459 break; 2460 } 2461 } 2462 2463 static struct target_type writecache_target = { 2464 .name = "writecache", 2465 .version = {1, 3, 0}, 2466 .module = THIS_MODULE, 2467 .ctr = writecache_ctr, 2468 .dtr = writecache_dtr, 2469 .status = writecache_status, 2470 .postsuspend = writecache_suspend, 2471 .resume = writecache_resume, 2472 .message = writecache_message, 2473 .map = writecache_map, 2474 .end_io = writecache_end_io, 2475 .iterate_devices = writecache_iterate_devices, 2476 .io_hints = writecache_io_hints, 2477 }; 2478 2479 static int __init dm_writecache_init(void) 2480 { 2481 int r; 2482 2483 r = dm_register_target(&writecache_target); 2484 if (r < 0) { 2485 DMERR("register failed %d", r); 2486 return r; 2487 } 2488 2489 return 0; 2490 } 2491 2492 static void __exit dm_writecache_exit(void) 2493 { 2494 dm_unregister_target(&writecache_target); 2495 } 2496 2497 module_init(dm_writecache_init); 2498 module_exit(dm_writecache_exit); 2499 2500 MODULE_DESCRIPTION(DM_NAME " writecache target"); 2501 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>"); 2502 MODULE_LICENSE("GPL"); 2503