xref: /openbmc/linux/drivers/md/dm-writecache.c (revision 6614a3c3164a5df2b54abb0b3559f51041cf705b)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2018 Red Hat. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7 
8 #include <linux/device-mapper.h>
9 #include <linux/module.h>
10 #include <linux/init.h>
11 #include <linux/vmalloc.h>
12 #include <linux/kthread.h>
13 #include <linux/dm-io.h>
14 #include <linux/dm-kcopyd.h>
15 #include <linux/dax.h>
16 #include <linux/pfn_t.h>
17 #include <linux/libnvdimm.h>
18 #include <linux/delay.h>
19 #include "dm-io-tracker.h"
20 
/* Message prefix picked up by the device-mapper logging macros (e.g. DMERR). */
#define DM_MSG_PREFIX "writecache"

/*
 * NOTE(review): the consumers of the watermark, latency and autocommit
 * defaults below are outside this part of the file; confirm their exact
 * units against the constructor before relying on these descriptions.
 */
/* Presumably the default high/low free-list watermarks, in percent. */
#define HIGH_WATERMARK			50
#define LOW_WATERMARK			45
/*
 * Evaluated at every use: the smaller of "number of pages in 256 MiB"
 * and one sixteenth of the pages of RAM.
 */
#define MAX_WRITEBACK_JOBS		min(0x10000000 / PAGE_SIZE, totalram_pages() / 16)
#define ENDIO_LATENCY			16
#define WRITEBACK_LATENCY		64
/* Presumably the default autocommit thresholds for SSD and PMEM mode. */
#define AUTOCOMMIT_BLOCKS_SSD		65536
#define AUTOCOMMIT_BLOCKS_PMEM		64
#define AUTOCOMMIT_MSEC			1000
/* The max-age timer is re-armed every max_age / MAX_AGE_DIV jiffies. */
#define MAX_AGE_DIV			16
/* Sentinel for wc->max_age: the max-age timer is not armed while it is set. */
#define MAX_AGE_UNSPECIFIED		-1UL
/* HZ * 3 jiffies, i.e. three seconds. */
#define PAUSE_WRITEBACK			(HZ * 3)

/*
 * Number of metadata bytes covered by one bit of the dirty bitmap:
 * 64 KiB, raised to PAGE_SIZE when pages are larger than that.
 */
#define BITMAP_GRANULARITY	65536
#if BITMAP_GRANULARITY < PAGE_SIZE
#undef BITMAP_GRANULARITY
#define BITMAP_GRANULARITY	PAGE_SIZE
#endif
40 
/*
 * Persistent-memory support is compiled in only when the architecture
 * provides the PMEM API and filesystem DAX is enabled.
 */
#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX)
#define DM_WRITECACHE_HAS_PMEM
#endif

/*
 * pmem_assign(dest, src) - store src into the lvalue dest.
 *
 * With PMEM support the value is first copied into a temporary of dest's
 * type and then written with memcpy_flushcache(); without it this is a
 * plain assignment. Note that the PMEM variant evaluates "dest" more than
 * once (typeof, address-of and sizeof), so it must be free of side effects.
 */
#ifdef DM_WRITECACHE_HAS_PMEM
#define pmem_assign(dest, src)					\
do {								\
	typeof(dest) uniq = (src);				\
	memcpy_flushcache(&(dest), &uniq, sizeof(dest));	\
} while (0)
#else
#define pmem_assign(dest, src)	((dest) = (src))
#endif

/*
 * When the architecture has machine-check-safe copies and PMEM is compiled
 * in, each wc_entry keeps its own copy of original_sector/seq_count, and the
 * read accessors return that copy rather than reading the mapped metadata.
 */
#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
#endif
58 
/*
 * Identification constants for the superblock.
 * NOTE(review): presumably stored in the "magic" and "version" fields of
 * struct wc_memory_superblock; the code that writes/validates them is not
 * in this part of the file.
 */
#define MEMORY_SUPERBLOCK_MAGIC		0x23489321
#define MEMORY_SUPERBLOCK_VERSION	1
61 
/*
 * Metadata record for one cache block as stored in the metadata area
 * (little-endian).
 */
struct wc_memory_entry {
	/* Key under which the block is looked up in the in-core rb-tree. */
	__le64 original_sector;
	/*
	 * An entry is committed when this is lower than the superblock's
	 * seq_count; clear_seq_count() stores -1 here when it is freed.
	 */
	__le64 seq_count;
};
66 
/*
 * Layout of the start of the metadata area: a header padded to
 * 8 * sizeof(__le64) = 64 bytes by the union, followed directly by the
 * array of per-block metadata entries.
 */
struct wc_memory_superblock {
	union {
		struct {
			__le32 magic;
			__le32 version;
			__le32 block_size;
			__le32 pad;
			__le64 n_blocks;
			/* Current sequence number; loaded into wc->seq_count on resume. */
			__le64 seq_count;
		};
		__le64 padding[8];
	};
	/* Indexed by wc_entry.index (see memory_entry()). */
	struct wc_memory_entry entries[];
};
81 
/*
 * In-core descriptor of one cache block. There is one per block, allocated
 * as an array in writecache_alloc_entries().
 */
struct wc_entry {
	/* Links the entry into wc->tree (in use) or wc->freetree (free, sorted mode). */
	struct rb_node rb_node;
	/* Links the entry into wc->lru (in use) or wc->freelist (free, list mode). */
	struct list_head lru;
	unsigned short wc_list_contiguous;
	/* Entries with this flag set are skipped by discard and by the post-commit cleanup. */
	bool write_in_progress
#if BITS_PER_LONG == 64
		:1
#endif
	;
	/* Position of this block in sb->entries[] and in the data area. */
	unsigned long index
#if BITS_PER_LONG == 64
		:47
#endif
	;
	/* jiffies value taken when the entry was inserted into the tree. */
	unsigned long age;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	/* CPU-endian copies of the metadata fields, used by the read accessors. */
	uint64_t original_sector;
	uint64_t seq_count;
#endif
};
102 
/*
 * Mode predicates. Without PMEM support they are the constant false, so
 * PMEM-only branches are compiled out.
 */
#ifdef DM_WRITECACHE_HAS_PMEM
#define WC_MODE_PMEM(wc)			((wc)->pmem_mode)
#define WC_MODE_FUA(wc)				((wc)->writeback_fua)
#else
#define WC_MODE_PMEM(wc)			false
#define WC_MODE_FUA(wc)				false
#endif
/* The free entries are kept in an rb-tree (instead of a list) whenever not in PMEM mode. */
#define WC_MODE_SORT_FREELIST(wc)		(!WC_MODE_PMEM(wc))
111 
/*
 * Per-target state of one writecache instance.
 *
 * NOTE(review): only part of the file is visible here; fields without a
 * comment are used by code outside this section.
 */
struct dm_writecache {
	/* Taken by wc_lock()/wc_unlock(). */
	struct mutex lock;
	/* In-use entries; writecache_insert_entry() adds at the head. */
	struct list_head lru;
	/*
	 * Free entries. Which member is live depends on WC_MODE_SORT_FREELIST():
	 * a plain list, or an rb-tree ordered by entry address together with
	 * the entry that will be handed out next.
	 */
	union {
		struct list_head freelist;
		struct {
			struct rb_root freetree;
			struct wc_entry *current_free;
		};
	};
	/* In-use entries keyed by original sector. */
	struct rb_root tree;

	/* Number of entries currently on the free list/tree. */
	size_t freelist_size;
	size_t writeback_size;
	/* Writeback is queued when freelist_size + writeback_size drops to this value or below. */
	size_t freelist_high_watermark;
	size_t freelist_low_watermark;
	/* In jiffies; MAX_AGE_UNSPECIFIED disables the max-age timer. */
	unsigned long max_age;
	unsigned long pause;

	/* Reset to zero by writecache_flush(). */
	unsigned uncommitted_blocks;
	unsigned autocommit_blocks;
	unsigned max_writeback_jobs;

	/* First error recorded through writecache_error(); non-zero means failed. */
	int error;

	unsigned long autocommit_jiffies;
	/* On expiry queues flush_work unless an error is recorded. */
	struct timer_list autocommit_timer;
	/* Woken when an entry is freed and whenever writecache_error() runs. */
	struct wait_queue_head freelist_wait;

	/* On expiry queues writeback_work and re-arms itself. */
	struct timer_list max_age_timer;

	/* Indexed by I/O direction (READ / WRITE); see writecache_wait_for_ios(). */
	atomic_t bio_in_progress[2];
	struct wait_queue_head bio_in_progress_wait[2];

	struct dm_target *ti;
	/* Device whose size is sampled into data_device_sectors on resume. */
	struct dm_dev *dev;
	/* Device holding the cache metadata and data (DAX device in PMEM mode). */
	struct dm_dev *ssd_dev;
	sector_t start_sector;
	/* Start of the metadata area; the superblock lives at this address. */
	void *memory_map;
	uint64_t memory_map_size;
	size_t metadata_sectors;
	size_t n_blocks;
	/* Entries whose seq_count is lower than this are committed. */
	uint64_t seq_count;
	sector_t data_device_sectors;
	/* Base address of the cached data blocks (see memory_data()). */
	void *block_start;
	/* Array of n_blocks in-core entries. */
	struct wc_entry *entries;
	unsigned block_size;
	unsigned char block_size_bits;

	bool pmem_mode:1;
	bool writeback_fua:1;

	bool overwrote_committed:1;
	/* Set when memory_map was created with vmap() and must be vunmap()ed. */
	bool memory_vmapped:1;

	bool start_sector_set:1;
	bool high_wm_percent_set:1;
	bool low_wm_percent_set:1;
	bool max_writeback_jobs_set:1;
	bool autocommit_blocks_set:1;
	bool autocommit_time_set:1;
	bool max_age_set:1;
	bool writeback_fua_set:1;
	/* One-shot: consumed (cleared) by writecache_suspend(). */
	bool flush_on_suspend:1;
	bool cleaner:1;
	bool cleaner_set:1;
	bool metadata_only:1;
	bool pause_set:1;

	unsigned high_wm_percent_value;
	unsigned low_wm_percent_value;
	unsigned autocommit_time_value;
	unsigned max_age_value;
	unsigned pause_value;

	/* Counter raised while a "write back everything" request is outstanding. */
	unsigned writeback_all;
	struct workqueue_struct *writeback_wq;
	struct work_struct writeback_work;
	struct work_struct flush_work;

	struct dm_io_tracker iot;

	struct dm_io_client *dm_io;

	raw_spinlock_t endio_list_lock;
	struct list_head endio_list;
	struct task_struct *endio_thread;

	struct task_struct *flush_thread;
	struct bio_list flush_list;

	struct dm_kcopyd_client *dm_kcopyd;
	/* SSD mode: one bit per BITMAP_GRANULARITY bytes of metadata to write out. */
	unsigned long *dirty_bitmap;
	/* Size of dirty_bitmap in bytes. */
	unsigned dirty_bitmap_size;

	struct bio_set bio_set;
	mempool_t copy_pool;

	/* Counters zeroed by the "clear_stats" message. */
	struct {
		unsigned long long reads;
		unsigned long long read_hits;
		unsigned long long writes;
		unsigned long long write_hits_uncommitted;
		unsigned long long write_hits_committed;
		unsigned long long writes_around;
		unsigned long long writes_allocate;
		unsigned long long writes_blocked_on_freelist;
		unsigned long long flushes;
		unsigned long long discards;
	} stats;
};
223 
/* Capacity of the entry-pointer array embedded in struct writeback_struct. */
#define WB_LIST_INLINE		16

/*
 * Bookkeeping attached to one writeback bio.
 * NOTE(review): the users of this structure are outside this part of the
 * file; presumably wc_list points at wc_list_inline until more than
 * WB_LIST_INLINE entries are needed - confirm against the writeback code.
 */
struct writeback_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry **wc_list;
	unsigned wc_list_n;
	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
	struct bio bio;
};
234 
/*
 * Bookkeeping for one copy job.
 * NOTE(review): the users of this structure are outside this part of the
 * file; presumably it describes a kcopyd copy of n_entries blocks starting
 * at entry e - confirm against the writeback code.
 */
struct copy_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry *e;
	unsigned n_entries;
	int error;
};
242 
/* Declares the kcopyd throttle "dm_writecache_throttle" together with its module parameter. */
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
					    "A percentage of time allocated for data copying");
245 
246 static void wc_lock(struct dm_writecache *wc)
247 {
248 	mutex_lock(&wc->lock);
249 }
250 
251 static void wc_unlock(struct dm_writecache *wc)
252 {
253 	mutex_unlock(&wc->lock);
254 }
255 
#ifdef DM_WRITECACHE_HAS_PMEM
/*
 * Map the persistent-memory cache device into the kernel address space.
 *
 * On success wc->memory_map points start_sector sectors into the mapping,
 * wc->memory_map_size is reduced by the same amount, wc->memory_vmapped
 * tells whether the mapping was built with vmap(), and 0 is returned.
 * On failure a negative errno is returned.
 */
static int persistent_memory_claim(struct dm_writecache *wc)
{
	int r;
	loff_t s;
	long p, da;
	pfn_t pfn;
	int id;
	struct page **pages;
	sector_t offset;

	wc->memory_vmapped = false;

	/* p = number of whole pages to map; reject empty or non-representable sizes. */
	s = wc->memory_map_size;
	p = s >> PAGE_SHIFT;
	if (!p) {
		r = -EINVAL;
		goto err1;
	}
	if (p != s >> PAGE_SHIFT) {
		r = -EOVERFLOW;
		goto err1;
	}

	/* The device's start sector must be page aligned; convert it to a page offset. */
	offset = get_start_sect(wc->ssd_dev->bdev);
	if (offset & (PAGE_SIZE / 512 - 1)) {
		r = -EINVAL;
		goto err1;
	}
	offset >>= PAGE_SHIFT - 9;

	id = dax_read_lock();

	/* First try to obtain the whole range as one directly addressable mapping. */
	da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS,
			&wc->memory_map, &pfn);
	if (da < 0) {
		wc->memory_map = NULL;
		r = da;
		goto err2;
	}
	if (!pfn_t_has_page(pfn)) {
		wc->memory_map = NULL;
		r = -EOPNOTSUPP;
		goto err2;
	}
	if (da != p) {
		/*
		 * Fewer pages than requested were returned: collect the pages
		 * piece by piece and build one mapping with vmap().
		 */
		long i;
		wc->memory_map = NULL;
		pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
		if (!pages) {
			r = -ENOMEM;
			goto err2;
		}
		i = 0;
		do {
			long daa;
			daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i,
					p - i, DAX_ACCESS, NULL, &pfn);
			if (daa <= 0) {
				/* A zero-length result would loop forever; treat it as an error. */
				r = daa ? daa : -EINVAL;
				goto err3;
			}
			if (!pfn_t_has_page(pfn)) {
				r = -EOPNOTSUPP;
				goto err3;
			}
			/* Record the pages of this piece, never going past p entries. */
			while (daa-- && i < p) {
				pages[i++] = pfn_t_to_page(pfn);
				pfn.val++;
				if (!(i & 15))
					cond_resched();
			}
		} while (i < p);
		wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
		if (!wc->memory_map) {
			r = -ENOMEM;
			goto err3;
		}
		/* The page array is only needed to set the mapping up. */
		kvfree(pages);
		wc->memory_vmapped = true;
	}

	dax_read_unlock(id);

	/* Skip the first start_sector sectors; persistent_memory_release() undoes this. */
	wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
	wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;

	return 0;
err3:
	kvfree(pages);
err2:
	dax_read_unlock(id);
err1:
	return r;
}
#else
/* Built without PMEM support: claiming persistent memory always fails. */
static int persistent_memory_claim(struct dm_writecache *wc)
{
	return -EOPNOTSUPP;
}
#endif
357 
358 static void persistent_memory_release(struct dm_writecache *wc)
359 {
360 	if (wc->memory_vmapped)
361 		vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
362 }
363 
/*
 * Return the struct page backing @addr, using the vmalloc lookup for
 * vmalloc-range addresses and virt_to_page() otherwise.
 */
static struct page *persistent_memory_page(void *addr)
{
	if (!is_vmalloc_addr(addr))
		return virt_to_page(addr);

	return vmalloc_to_page(addr);
}
371 
372 static unsigned persistent_memory_page_offset(void *addr)
373 {
374 	return (unsigned long)addr & (PAGE_SIZE - 1);
375 }
376 
377 static void persistent_memory_flush_cache(void *ptr, size_t size)
378 {
379 	if (is_vmalloc_addr(ptr))
380 		flush_kernel_vmap_range(ptr, size);
381 }
382 
383 static void persistent_memory_invalidate_cache(void *ptr, size_t size)
384 {
385 	if (is_vmalloc_addr(ptr))
386 		invalidate_kernel_vmap_range(ptr, size);
387 }
388 
389 static struct wc_memory_superblock *sb(struct dm_writecache *wc)
390 {
391 	return wc->memory_map;
392 }
393 
394 static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
395 {
396 	return &sb(wc)->entries[e->index];
397 }
398 
399 static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
400 {
401 	return (char *)wc->block_start + (e->index << wc->block_size_bits);
402 }
403 
404 static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
405 {
406 	return wc->start_sector + wc->metadata_sectors +
407 		((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
408 }
409 
410 static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
411 {
412 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
413 	return e->original_sector;
414 #else
415 	return le64_to_cpu(memory_entry(wc, e)->original_sector);
416 #endif
417 }
418 
419 static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
420 {
421 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
422 	return e->seq_count;
423 #else
424 	return le64_to_cpu(memory_entry(wc, e)->seq_count);
425 #endif
426 }
427 
428 static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
429 {
430 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
431 	e->seq_count = -1;
432 #endif
433 	pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
434 }
435 
436 static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
437 					    uint64_t original_sector, uint64_t seq_count)
438 {
439 	struct wc_memory_entry me;
440 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
441 	e->original_sector = original_sector;
442 	e->seq_count = seq_count;
443 #endif
444 	me.original_sector = cpu_to_le64(original_sector);
445 	me.seq_count = cpu_to_le64(seq_count);
446 	pmem_assign(*memory_entry(wc, e), me);
447 }
448 
/*
 * writecache_error(wc, err, msg, ...) - record a failure.
 *
 * Only the first error is stored in wc->error (cmpxchg from 0) and only that
 * one is logged; every call wakes the freelist waiters. Note that "wc" is
 * evaluated twice.
 */
#define writecache_error(wc, err, msg, arg...)				\
do {									\
	if (!cmpxchg(&(wc)->error, 0, err))				\
		DMERR(msg, ##arg);					\
	wake_up(&(wc)->freelist_wait);					\
} while (0)

/* True once any error has been recorded; hinted as the unlikely case. */
#define writecache_has_error(wc)	(unlikely(READ_ONCE((wc)->error)))
457 
458 static void writecache_flush_all_metadata(struct dm_writecache *wc)
459 {
460 	if (!WC_MODE_PMEM(wc))
461 		memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
462 }
463 
464 static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
465 {
466 	if (!WC_MODE_PMEM(wc))
467 		__set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
468 			  wc->dirty_bitmap);
469 }
470 
/* Defined further down; needed by ssd_commit_flushed(). */
static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);

/*
 * Completion context shared by a batch of asynchronous metadata writes.
 * "count" holds one reference per submitted write plus one held by the
 * submitter; "c" is completed when the count drops to zero.
 */
struct io_notify {
	struct dm_writecache *wc;
	struct completion c;
	atomic_t count;
};
478 
479 static void writecache_notify_io(unsigned long error, void *context)
480 {
481 	struct io_notify *endio = context;
482 
483 	if (unlikely(error != 0))
484 		writecache_error(endio->wc, -EIO, "error writing metadata");
485 	BUG_ON(atomic_read(&endio->count) <= 0);
486 	if (atomic_dec_and_test(&endio->count))
487 		complete(&endio->c);
488 }
489 
490 static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
491 {
492 	wait_event(wc->bio_in_progress_wait[direction],
493 		   !atomic_read(&wc->bio_in_progress[direction]));
494 }
495 
496 static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
497 {
498 	struct dm_io_region region;
499 	struct dm_io_request req;
500 	struct io_notify endio = {
501 		wc,
502 		COMPLETION_INITIALIZER_ONSTACK(endio.c),
503 		ATOMIC_INIT(1),
504 	};
505 	unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
506 	unsigned i = 0;
507 
508 	while (1) {
509 		unsigned j;
510 		i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
511 		if (unlikely(i == bitmap_bits))
512 			break;
513 		j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
514 
515 		region.bdev = wc->ssd_dev->bdev;
516 		region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
517 		region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
518 
519 		if (unlikely(region.sector >= wc->metadata_sectors))
520 			break;
521 		if (unlikely(region.sector + region.count > wc->metadata_sectors))
522 			region.count = wc->metadata_sectors - region.sector;
523 
524 		region.sector += wc->start_sector;
525 		atomic_inc(&endio.count);
526 		req.bi_opf = REQ_OP_WRITE | REQ_SYNC;
527 		req.mem.type = DM_IO_VMA;
528 		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
529 		req.client = wc->dm_io;
530 		req.notify.fn = writecache_notify_io;
531 		req.notify.context = &endio;
532 
533 		/* writing via async dm-io (implied by notify.fn above) won't return an error */
534 	        (void) dm_io(&req, 1, &region, NULL);
535 		i = j;
536 	}
537 
538 	writecache_notify_io(0, &endio);
539 	wait_for_completion_io(&endio.c);
540 
541 	if (wait_for_ios)
542 		writecache_wait_for_ios(wc, WRITE);
543 
544 	writecache_disk_flush(wc, wc->ssd_dev);
545 
546 	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
547 }
548 
549 static void ssd_commit_superblock(struct dm_writecache *wc)
550 {
551 	int r;
552 	struct dm_io_region region;
553 	struct dm_io_request req;
554 
555 	region.bdev = wc->ssd_dev->bdev;
556 	region.sector = 0;
557 	region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;
558 
559 	if (unlikely(region.sector + region.count > wc->metadata_sectors))
560 		region.count = wc->metadata_sectors - region.sector;
561 
562 	region.sector += wc->start_sector;
563 
564 	req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA;
565 	req.mem.type = DM_IO_VMA;
566 	req.mem.ptr.vma = (char *)wc->memory_map;
567 	req.client = wc->dm_io;
568 	req.notify.fn = NULL;
569 	req.notify.context = NULL;
570 
571 	r = dm_io(&req, 1, &region, NULL);
572 	if (unlikely(r))
573 		writecache_error(wc, r, "error writing superblock");
574 }
575 
576 static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
577 {
578 	if (WC_MODE_PMEM(wc))
579 		pmem_wmb();
580 	else
581 		ssd_commit_flushed(wc, wait_for_ios);
582 }
583 
584 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
585 {
586 	int r;
587 	struct dm_io_region region;
588 	struct dm_io_request req;
589 
590 	region.bdev = dev->bdev;
591 	region.sector = 0;
592 	region.count = 0;
593 	req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
594 	req.mem.type = DM_IO_KMEM;
595 	req.mem.ptr.addr = NULL;
596 	req.client = wc->dm_io;
597 	req.notify.fn = NULL;
598 
599 	r = dm_io(&req, 1, &region, NULL);
600 	if (unlikely(r))
601 		writecache_error(wc, r, "error flushing metadata: %d", r);
602 }
603 
604 #define WFE_RETURN_FOLLOWING	1
605 #define WFE_LOWEST_SEQ		2
606 
607 static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
608 					      uint64_t block, int flags)
609 {
610 	struct wc_entry *e;
611 	struct rb_node *node = wc->tree.rb_node;
612 
613 	if (unlikely(!node))
614 		return NULL;
615 
616 	while (1) {
617 		e = container_of(node, struct wc_entry, rb_node);
618 		if (read_original_sector(wc, e) == block)
619 			break;
620 
621 		node = (read_original_sector(wc, e) >= block ?
622 			e->rb_node.rb_left : e->rb_node.rb_right);
623 		if (unlikely(!node)) {
624 			if (!(flags & WFE_RETURN_FOLLOWING))
625 				return NULL;
626 			if (read_original_sector(wc, e) >= block) {
627 				return e;
628 			} else {
629 				node = rb_next(&e->rb_node);
630 				if (unlikely(!node))
631 					return NULL;
632 				e = container_of(node, struct wc_entry, rb_node);
633 				return e;
634 			}
635 		}
636 	}
637 
638 	while (1) {
639 		struct wc_entry *e2;
640 		if (flags & WFE_LOWEST_SEQ)
641 			node = rb_prev(&e->rb_node);
642 		else
643 			node = rb_next(&e->rb_node);
644 		if (unlikely(!node))
645 			return e;
646 		e2 = container_of(node, struct wc_entry, rb_node);
647 		if (read_original_sector(wc, e2) != block)
648 			return e;
649 		e = e2;
650 	}
651 }
652 
653 static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
654 {
655 	struct wc_entry *e;
656 	struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
657 
658 	while (*node) {
659 		e = container_of(*node, struct wc_entry, rb_node);
660 		parent = &e->rb_node;
661 		if (read_original_sector(wc, e) > read_original_sector(wc, ins))
662 			node = &parent->rb_left;
663 		else
664 			node = &parent->rb_right;
665 	}
666 	rb_link_node(&ins->rb_node, parent, node);
667 	rb_insert_color(&ins->rb_node, &wc->tree);
668 	list_add(&ins->lru, &wc->lru);
669 	ins->age = jiffies;
670 }
671 
672 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
673 {
674 	list_del(&e->lru);
675 	rb_erase(&e->rb_node, &wc->tree);
676 }
677 
678 static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
679 {
680 	if (WC_MODE_SORT_FREELIST(wc)) {
681 		struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
682 		if (unlikely(!*node))
683 			wc->current_free = e;
684 		while (*node) {
685 			parent = *node;
686 			if (&e->rb_node < *node)
687 				node = &parent->rb_left;
688 			else
689 				node = &parent->rb_right;
690 		}
691 		rb_link_node(&e->rb_node, parent, node);
692 		rb_insert_color(&e->rb_node, &wc->freetree);
693 	} else {
694 		list_add_tail(&e->lru, &wc->freelist);
695 	}
696 	wc->freelist_size++;
697 }
698 
699 static inline void writecache_verify_watermark(struct dm_writecache *wc)
700 {
701 	if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
702 		queue_work(wc->writeback_wq, &wc->writeback_work);
703 }
704 
705 static void writecache_max_age_timer(struct timer_list *t)
706 {
707 	struct dm_writecache *wc = from_timer(wc, t, max_age_timer);
708 
709 	if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
710 		queue_work(wc->writeback_wq, &wc->writeback_work);
711 		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
712 	}
713 }
714 
715 static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
716 {
717 	struct wc_entry *e;
718 
719 	if (WC_MODE_SORT_FREELIST(wc)) {
720 		struct rb_node *next;
721 		if (unlikely(!wc->current_free))
722 			return NULL;
723 		e = wc->current_free;
724 		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
725 			return NULL;
726 		next = rb_next(&e->rb_node);
727 		rb_erase(&e->rb_node, &wc->freetree);
728 		if (unlikely(!next))
729 			next = rb_first(&wc->freetree);
730 		wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
731 	} else {
732 		if (unlikely(list_empty(&wc->freelist)))
733 			return NULL;
734 		e = container_of(wc->freelist.next, struct wc_entry, lru);
735 		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
736 			return NULL;
737 		list_del(&e->lru);
738 	}
739 	wc->freelist_size--;
740 
741 	writecache_verify_watermark(wc);
742 
743 	return e;
744 }
745 
746 static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
747 {
748 	writecache_unlink(wc, e);
749 	writecache_add_to_freelist(wc, e);
750 	clear_seq_count(wc, e);
751 	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
752 	if (unlikely(waitqueue_active(&wc->freelist_wait)))
753 		wake_up(&wc->freelist_wait);
754 }
755 
/*
 * Sleep once on freelist_wait. Called with the writecache lock held; the
 * lock is dropped while sleeping and re-taken before returning, so the
 * caller must re-check its condition afterwards.
 */
static void writecache_wait_on_freelist(struct dm_writecache *wc)
{
	DEFINE_WAIT(wait);

	/* Queue ourselves before releasing the lock so a wake-up issued in between is not lost. */
	prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
	wc_unlock(wc);
	io_schedule();
	finish_wait(&wc->freelist_wait, &wait);
	wc_lock(wc);
}
766 
767 static void writecache_poison_lists(struct dm_writecache *wc)
768 {
769 	/*
770 	 * Catch incorrect access to these values while the device is suspended.
771 	 */
772 	memset(&wc->tree, -1, sizeof wc->tree);
773 	wc->lru.next = LIST_POISON1;
774 	wc->lru.prev = LIST_POISON2;
775 	wc->freelist.next = LIST_POISON1;
776 	wc->freelist.prev = LIST_POISON2;
777 }
778 
779 static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
780 {
781 	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
782 	if (WC_MODE_PMEM(wc))
783 		writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
784 }
785 
786 static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
787 {
788 	return read_seq_count(wc, e) < wc->seq_count;
789 }
790 
/*
 * Commit all uncommitted entries.
 *
 * The uncommitted entries form a run at the head of the LRU list. Their
 * metadata is flushed and committed, then the sequence count is advanced
 * and written to the superblock, which is what turns them into committed
 * entries. Finally, older entries for the same sectors are freed.
 *
 * Callers in this file invoke it with the writecache lock held.
 */
static void writecache_flush(struct dm_writecache *wc)
{
	struct wc_entry *e, *e2;
	bool need_flush_after_free;

	wc->uncommitted_blocks = 0;
	del_timer(&wc->autocommit_timer);

	if (list_empty(&wc->lru))
		return;

	e = container_of(wc->lru.next, struct wc_entry, lru);
	if (writecache_entry_is_committed(wc, e)) {
		/*
		 * The newest entry is already committed, so there is no
		 * metadata to commit. Data of committed entries may still
		 * have been overwritten; make that durable.
		 */
		if (wc->overwrote_committed) {
			writecache_wait_for_ios(wc, WRITE);
			writecache_disk_flush(wc, wc->ssd_dev);
			wc->overwrote_committed = false;
		}
		return;
	}
	/* Flush every uncommitted entry; on exit e is the last (oldest) one. */
	while (1) {
		writecache_flush_entry(wc, e);
		if (unlikely(e->lru.next == &wc->lru))
			break;
		e2 = container_of(e->lru.next, struct wc_entry, lru);
		if (writecache_entry_is_committed(wc, e2))
			break;
		e = e2;
		cond_resched();
	}
	writecache_commit_flushed(wc, true);

	/* Advancing the sequence count is the commit point. */
	wc->seq_count++;
	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
	if (WC_MODE_PMEM(wc))
		writecache_commit_flushed(wc, false);
	else
		ssd_commit_superblock(wc);

	wc->overwrote_committed = false;

	/*
	 * Walk back over the entries just committed (towards the list head)
	 * and free the tree predecessor of each when it maps the same sector.
	 */
	need_flush_after_free = false;
	while (1) {
		/* Free another committed entry with lower seq-count */
		struct rb_node *rb_node = rb_prev(&e->rb_node);

		if (rb_node) {
			e2 = container_of(rb_node, struct wc_entry, rb_node);
			if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
			    likely(!e2->write_in_progress)) {
				writecache_free_entry(wc, e2);
				need_flush_after_free = true;
			}
		}
		if (unlikely(e->lru.prev == &wc->lru))
			break;
		e = container_of(e->lru.prev, struct wc_entry, lru);
		cond_resched();
	}

	/* Freeing entries dirtied their metadata; commit that as well. */
	if (need_flush_after_free)
		writecache_commit_flushed(wc, false);
}
854 
855 static void writecache_flush_work(struct work_struct *work)
856 {
857 	struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
858 
859 	wc_lock(wc);
860 	writecache_flush(wc);
861 	wc_unlock(wc);
862 }
863 
864 static void writecache_autocommit_timer(struct timer_list *t)
865 {
866 	struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
867 	if (!writecache_has_error(wc))
868 		queue_work(wc->writeback_wq, &wc->flush_work);
869 }
870 
871 static void writecache_schedule_autocommit(struct dm_writecache *wc)
872 {
873 	if (!timer_pending(&wc->autocommit_timer))
874 		mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
875 }
876 
877 static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
878 {
879 	struct wc_entry *e;
880 	bool discarded_something = false;
881 
882 	e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
883 	if (unlikely(!e))
884 		return;
885 
886 	while (read_original_sector(wc, e) < end) {
887 		struct rb_node *node = rb_next(&e->rb_node);
888 
889 		if (likely(!e->write_in_progress)) {
890 			if (!discarded_something) {
891 				if (!WC_MODE_PMEM(wc)) {
892 					writecache_wait_for_ios(wc, READ);
893 					writecache_wait_for_ios(wc, WRITE);
894 				}
895 				discarded_something = true;
896 			}
897 			if (!writecache_entry_is_committed(wc, e))
898 				wc->uncommitted_blocks--;
899 			writecache_free_entry(wc, e);
900 		}
901 
902 		if (unlikely(!node))
903 			break;
904 
905 		e = container_of(node, struct wc_entry, rb_node);
906 	}
907 
908 	if (discarded_something)
909 		writecache_commit_flushed(wc, false);
910 }
911 
912 static bool writecache_wait_for_writeback(struct dm_writecache *wc)
913 {
914 	if (wc->writeback_size) {
915 		writecache_wait_on_freelist(wc);
916 		return true;
917 	}
918 	return false;
919 }
920 
921 static void writecache_suspend(struct dm_target *ti)
922 {
923 	struct dm_writecache *wc = ti->private;
924 	bool flush_on_suspend;
925 
926 	del_timer_sync(&wc->autocommit_timer);
927 	del_timer_sync(&wc->max_age_timer);
928 
929 	wc_lock(wc);
930 	writecache_flush(wc);
931 	flush_on_suspend = wc->flush_on_suspend;
932 	if (flush_on_suspend) {
933 		wc->flush_on_suspend = false;
934 		wc->writeback_all++;
935 		queue_work(wc->writeback_wq, &wc->writeback_work);
936 	}
937 	wc_unlock(wc);
938 
939 	drain_workqueue(wc->writeback_wq);
940 
941 	wc_lock(wc);
942 	if (flush_on_suspend)
943 		wc->writeback_all--;
944 	while (writecache_wait_for_writeback(wc));
945 
946 	if (WC_MODE_PMEM(wc))
947 		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
948 
949 	writecache_poison_lists(wc);
950 
951 	wc_unlock(wc);
952 }
953 
954 static int writecache_alloc_entries(struct dm_writecache *wc)
955 {
956 	size_t b;
957 
958 	if (wc->entries)
959 		return 0;
960 	wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
961 	if (!wc->entries)
962 		return -ENOMEM;
963 	for (b = 0; b < wc->n_blocks; b++) {
964 		struct wc_entry *e = &wc->entries[b];
965 		e->index = b;
966 		e->write_in_progress = false;
967 		cond_resched();
968 	}
969 
970 	return 0;
971 }
972 
973 static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors)
974 {
975 	struct dm_io_region region;
976 	struct dm_io_request req;
977 
978 	region.bdev = wc->ssd_dev->bdev;
979 	region.sector = wc->start_sector;
980 	region.count = n_sectors;
981 	req.bi_opf = REQ_OP_READ | REQ_SYNC;
982 	req.mem.type = DM_IO_VMA;
983 	req.mem.ptr.vma = (char *)wc->memory_map;
984 	req.client = wc->dm_io;
985 	req.notify.fn = NULL;
986 
987 	return dm_io(&req, 1, &region, NULL);
988 }
989 
/*
 * Resume handler: rebuild the in-core state from the metadata.
 *
 * Re-reads (SSD) or invalidates the CPU cache for (PMEM) the metadata,
 * re-initializes the tree and lists, loads the sequence count from the
 * superblock and then sorts every entry into either the free pool or the
 * tree of in-use entries. Finally the watermark is checked and the max-age
 * timer is started when a max age is configured.
 */
static void writecache_resume(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	size_t b;
	bool need_flush = false;
	__le64 sb_seq_count;
	int r;

	wc_lock(wc);

	wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev);

	if (WC_MODE_PMEM(wc)) {
		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
	} else {
		r = writecache_read_metadata(wc, wc->metadata_sectors);
		if (r) {
			/*
			 * The metadata could not be read: record the error and
			 * fill everything after the superblock header with
			 * 0xff bytes, so every entry has seq_count == -1.
			 */
			size_t sb_entries_offset;
			writecache_error(wc, r, "unable to read metadata: %d", r);
			sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
			memset((char *)wc->memory_map + sb_entries_offset, -1,
			       (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
		}
	}

	/* These were poisoned by writecache_suspend(). */
	wc->tree = RB_ROOT;
	INIT_LIST_HEAD(&wc->lru);
	if (WC_MODE_SORT_FREELIST(wc)) {
		wc->freetree = RB_ROOT;
		wc->current_free = NULL;
	} else {
		INIT_LIST_HEAD(&wc->freelist);
	}
	wc->freelist_size = 0;

	/* Machine-check-safe read of the superblock's sequence count; falls back to 0. */
	r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
			      sizeof(uint64_t));
	if (r) {
		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
		sb_seq_count = cpu_to_le64(0);
	}
	wc->seq_count = le64_to_cpu(sb_seq_count);

#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	/*
	 * Fill the in-core copies of the metadata. Once any error has been
	 * recorded, the remaining entries are not read but marked unused.
	 */
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		struct wc_memory_entry wme;
		if (writecache_has_error(wc)) {
			e->original_sector = -1;
			e->seq_count = -1;
			continue;
		}
		r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
				      sizeof(struct wc_memory_entry));
		if (r) {
			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
					 (unsigned long)b, r);
			e->original_sector = -1;
			e->seq_count = -1;
		} else {
			e->original_sector = le64_to_cpu(wme.original_sector);
			e->seq_count = le64_to_cpu(wme.seq_count);
		}
		cond_resched();
	}
#endif
	/* Classify every entry as free or in use. */
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		if (!writecache_entry_is_committed(wc, e)) {
			/*
			 * Not committed: the entry is free. If it carries a
			 * sequence count other than -1, erase that.
			 * "erase_this" is also the target for a committed
			 * entry that lost against a newer duplicate below.
			 */
			if (read_seq_count(wc, e) != -1) {
erase_this:
				clear_seq_count(wc, e);
				need_flush = true;
			}
			writecache_add_to_freelist(wc, e);
		} else {
			struct wc_entry *old;

			old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
			if (!old) {
				writecache_insert_entry(wc, e);
			} else {
				/*
				 * Two committed entries for one sector: keep
				 * the one with the higher sequence count.
				 * Equal counts are reported as an error and
				 * the new entry replaces the old one.
				 */
				if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
					writecache_error(wc, -EINVAL,
						 "two identical entries, position %llu, sector %llu, sequence %llu",
						 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
						 (unsigned long long)read_seq_count(wc, e));
				}
				if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
					goto erase_this;
				} else {
					writecache_free_entry(wc, old);
					writecache_insert_entry(wc, e);
					need_flush = true;
				}
			}
		}
		cond_resched();
	}

	/* Metadata was modified during recovery: write all of it back. */
	if (need_flush) {
		writecache_flush_all_metadata(wc);
		writecache_commit_flushed(wc, false);
	}

	writecache_verify_watermark(wc);

	if (wc->max_age != MAX_AGE_UNSPECIFIED)
		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);

	wc_unlock(wc);
}
1102 
1103 static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1104 {
1105 	if (argc != 1)
1106 		return -EINVAL;
1107 
1108 	wc_lock(wc);
1109 	if (dm_suspended(wc->ti)) {
1110 		wc_unlock(wc);
1111 		return -EBUSY;
1112 	}
1113 	if (writecache_has_error(wc)) {
1114 		wc_unlock(wc);
1115 		return -EIO;
1116 	}
1117 
1118 	writecache_flush(wc);
1119 	wc->writeback_all++;
1120 	queue_work(wc->writeback_wq, &wc->writeback_work);
1121 	wc_unlock(wc);
1122 
1123 	flush_workqueue(wc->writeback_wq);
1124 
1125 	wc_lock(wc);
1126 	wc->writeback_all--;
1127 	if (writecache_has_error(wc)) {
1128 		wc_unlock(wc);
1129 		return -EIO;
1130 	}
1131 	wc_unlock(wc);
1132 
1133 	return 0;
1134 }
1135 
1136 static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1137 {
1138 	if (argc != 1)
1139 		return -EINVAL;
1140 
1141 	wc_lock(wc);
1142 	wc->flush_on_suspend = true;
1143 	wc_unlock(wc);
1144 
1145 	return 0;
1146 }
1147 
1148 static void activate_cleaner(struct dm_writecache *wc)
1149 {
1150 	wc->flush_on_suspend = true;
1151 	wc->cleaner = true;
1152 	wc->freelist_high_watermark = wc->n_blocks;
1153 	wc->freelist_low_watermark = wc->n_blocks;
1154 }
1155 
1156 static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1157 {
1158 	if (argc != 1)
1159 		return -EINVAL;
1160 
1161 	wc_lock(wc);
1162 	activate_cleaner(wc);
1163 	if (!dm_suspended(wc->ti))
1164 		writecache_verify_watermark(wc);
1165 	wc_unlock(wc);
1166 
1167 	return 0;
1168 }
1169 
1170 static int process_clear_stats_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1171 {
1172 	if (argc != 1)
1173 		return -EINVAL;
1174 
1175 	wc_lock(wc);
1176 	memset(&wc->stats, 0, sizeof wc->stats);
1177 	wc_unlock(wc);
1178 
1179 	return 0;
1180 }
1181 
1182 static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
1183 			      char *result, unsigned maxlen)
1184 {
1185 	int r = -EINVAL;
1186 	struct dm_writecache *wc = ti->private;
1187 
1188 	if (!strcasecmp(argv[0], "flush"))
1189 		r = process_flush_mesg(argc, argv, wc);
1190 	else if (!strcasecmp(argv[0], "flush_on_suspend"))
1191 		r = process_flush_on_suspend_mesg(argc, argv, wc);
1192 	else if (!strcasecmp(argv[0], "cleaner"))
1193 		r = process_cleaner_mesg(argc, argv, wc);
1194 	else if (!strcasecmp(argv[0], "clear_stats"))
1195 		r = process_clear_stats_mesg(argc, argv, wc);
1196 	else
1197 		DMERR("unrecognised message received: %s", argv[0]);
1198 
1199 	return r;
1200 }
1201 
1202 static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
1203 {
1204 	/*
1205 	 * clflushopt performs better with block size 1024, 2048, 4096
1206 	 * non-temporal stores perform better with block size 512
1207 	 *
1208 	 * block size   512             1024            2048            4096
1209 	 * movnti       496 MB/s        642 MB/s        725 MB/s        744 MB/s
1210 	 * clflushopt   373 MB/s        688 MB/s        1.1 GB/s        1.2 GB/s
1211 	 *
1212 	 * We see that movnti performs better for 512-byte blocks, and
1213 	 * clflushopt performs better for 1024-byte and larger blocks. So, we
1214 	 * prefer clflushopt for sizes >= 768.
1215 	 *
1216 	 * NOTE: this happens to be the case now (with dm-writecache's single
1217 	 * threaded model) but re-evaluate this once memcpy_flushcache() is
1218 	 * enabled to use movdir64b which might invalidate this performance
1219 	 * advantage seen with cache-allocating-writes plus flushing.
1220 	 */
1221 #ifdef CONFIG_X86
1222 	if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
1223 	    likely(boot_cpu_data.x86_clflush_size == 64) &&
1224 	    likely(size >= 768)) {
1225 		do {
1226 			memcpy((void *)dest, (void *)source, 64);
1227 			clflushopt((void *)dest);
1228 			dest += 64;
1229 			source += 64;
1230 			size -= 64;
1231 		} while (size >= 64);
1232 		return;
1233 	}
1234 #endif
1235 	memcpy_flushcache(dest, source, size);
1236 }
1237 
1238 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
1239 {
1240 	void *buf;
1241 	unsigned size;
1242 	int rw = bio_data_dir(bio);
1243 	unsigned remaining_size = wc->block_size;
1244 
1245 	do {
1246 		struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
1247 		buf = bvec_kmap_local(&bv);
1248 		size = bv.bv_len;
1249 		if (unlikely(size > remaining_size))
1250 			size = remaining_size;
1251 
1252 		if (rw == READ) {
1253 			int r;
1254 			r = copy_mc_to_kernel(buf, data, size);
1255 			flush_dcache_page(bio_page(bio));
1256 			if (unlikely(r)) {
1257 				writecache_error(wc, r, "hardware memory error when reading data: %d", r);
1258 				bio->bi_status = BLK_STS_IOERR;
1259 			}
1260 		} else {
1261 			flush_dcache_page(bio_page(bio));
1262 			memcpy_flushcache_optimized(data, buf, size);
1263 		}
1264 
1265 		kunmap_local(buf);
1266 
1267 		data = (char *)data + size;
1268 		remaining_size -= size;
1269 		bio_advance(bio, size);
1270 	} while (unlikely(remaining_size));
1271 }
1272 
1273 static int writecache_flush_thread(void *data)
1274 {
1275 	struct dm_writecache *wc = data;
1276 
1277 	while (1) {
1278 		struct bio *bio;
1279 
1280 		wc_lock(wc);
1281 		bio = bio_list_pop(&wc->flush_list);
1282 		if (!bio) {
1283 			set_current_state(TASK_INTERRUPTIBLE);
1284 			wc_unlock(wc);
1285 
1286 			if (unlikely(kthread_should_stop())) {
1287 				set_current_state(TASK_RUNNING);
1288 				break;
1289 			}
1290 
1291 			schedule();
1292 			continue;
1293 		}
1294 
1295 		if (bio_op(bio) == REQ_OP_DISCARD) {
1296 			writecache_discard(wc, bio->bi_iter.bi_sector,
1297 					   bio_end_sector(bio));
1298 			wc_unlock(wc);
1299 			bio_set_dev(bio, wc->dev->bdev);
1300 			submit_bio_noacct(bio);
1301 		} else {
1302 			writecache_flush(wc);
1303 			wc_unlock(wc);
1304 			if (writecache_has_error(wc))
1305 				bio->bi_status = BLK_STS_IOERR;
1306 			bio_endio(bio);
1307 		}
1308 	}
1309 
1310 	return 0;
1311 }
1312 
1313 static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
1314 {
1315 	if (bio_list_empty(&wc->flush_list))
1316 		wake_up_process(wc->flush_thread);
1317 	bio_list_add(&wc->flush_list, bio);
1318 }
1319 
/*
 * Outcome of the per-bio mapping helpers; writecache_map() translates it
 * into the final action and return code.
 */
enum wc_map_op {
	WC_MAP_SUBMIT = 0,	/* bio fully handled: complete it now */
	WC_MAP_REMAP,		/* bio was redirected; bio_in_progress gets counted */
	WC_MAP_REMAP_ORIGIN,	/* bio goes to the origin device */
	WC_MAP_RETURN,		/* bio was handed off; nothing more to do */
	WC_MAP_ERROR,		/* fail the bio with an I/O error */
};
1327 
1328 static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio,
1329 					struct wc_entry *e)
1330 {
1331 	if (e) {
1332 		sector_t next_boundary =
1333 			read_original_sector(wc, e) - bio->bi_iter.bi_sector;
1334 		if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT)
1335 			dm_accept_partial_bio(bio, next_boundary);
1336 	}
1337 }
1338 
1339 static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio)
1340 {
1341 	enum wc_map_op map_op;
1342 	struct wc_entry *e;
1343 
1344 read_next_block:
1345 	wc->stats.reads++;
1346 	e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1347 	if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
1348 		wc->stats.read_hits++;
1349 		if (WC_MODE_PMEM(wc)) {
1350 			bio_copy_block(wc, bio, memory_data(wc, e));
1351 			if (bio->bi_iter.bi_size)
1352 				goto read_next_block;
1353 			map_op = WC_MAP_SUBMIT;
1354 		} else {
1355 			dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1356 			bio_set_dev(bio, wc->ssd_dev->bdev);
1357 			bio->bi_iter.bi_sector = cache_sector(wc, e);
1358 			if (!writecache_entry_is_committed(wc, e))
1359 				writecache_wait_for_ios(wc, WRITE);
1360 			map_op = WC_MAP_REMAP;
1361 		}
1362 	} else {
1363 		writecache_map_remap_origin(wc, bio, e);
1364 		wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
1365 		map_op = WC_MAP_REMAP_ORIGIN;
1366 	}
1367 
1368 	return map_op;
1369 }
1370 
1371 static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio,
1372 				    struct wc_entry *e, bool search_used)
1373 {
1374 	unsigned bio_size = wc->block_size;
1375 	sector_t start_cache_sec = cache_sector(wc, e);
1376 	sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
1377 
1378 	while (bio_size < bio->bi_iter.bi_size) {
1379 		if (!search_used) {
1380 			struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
1381 			if (!f)
1382 				break;
1383 			write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
1384 							(bio_size >> SECTOR_SHIFT), wc->seq_count);
1385 			writecache_insert_entry(wc, f);
1386 			wc->uncommitted_blocks++;
1387 		} else {
1388 			struct wc_entry *f;
1389 			struct rb_node *next = rb_next(&e->rb_node);
1390 			if (!next)
1391 				break;
1392 			f = container_of(next, struct wc_entry, rb_node);
1393 			if (f != e + 1)
1394 				break;
1395 			if (read_original_sector(wc, f) !=
1396 			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1397 				break;
1398 			if (unlikely(f->write_in_progress))
1399 				break;
1400 			if (writecache_entry_is_committed(wc, f))
1401 				wc->overwrote_committed = true;
1402 			e = f;
1403 		}
1404 		bio_size += wc->block_size;
1405 		current_cache_sec += wc->block_size >> SECTOR_SHIFT;
1406 	}
1407 
1408 	bio_set_dev(bio, wc->ssd_dev->bdev);
1409 	bio->bi_iter.bi_sector = start_cache_sec;
1410 	dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
1411 
1412 	wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
1413 	wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
1414 
1415 	if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
1416 		wc->uncommitted_blocks = 0;
1417 		queue_work(wc->writeback_wq, &wc->flush_work);
1418 	} else {
1419 		writecache_schedule_autocommit(wc);
1420 	}
1421 }
1422 
/*
 * Decide how to serve a write bio.
 *
 * Each pass of the loop handles the block at the bio's current start
 * sector:
 *  - an existing uncommitted entry is overwritten in place;
 *  - an existing committed entry is overwritten in place only in SSD mode
 *    and only if it is not being written back, otherwise a new entry is
 *    allocated for the same sector;
 *  - with no existing entry, the write bypasses the cache when the cleaner
 *    is active, or when metadata_only is set and the bio lacks REQ_META;
 *  - when the freelist is empty, SSD mode without an existing entry writes
 *    around the cache, otherwise the thread waits for a free block and
 *    retries.
 *
 * In PMEM mode the data is copied block by block until the bio is
 * consumed and WC_MAP_SUBMIT is returned.  In SSD mode the bio is
 * redirected once and WC_MAP_REMAP is returned.  WC_MAP_ERROR is returned
 * as soon as the cache is found to have an error recorded.
 */
static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio)
{
	struct wc_entry *e;

	do {
		/* found_entry: a committed entry exists that cannot be reused */
		bool found_entry = false;
		/* search_used: reuse existing entries instead of allocating */
		bool search_used = false;
		if (writecache_has_error(wc)) {
			wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
			return WC_MAP_ERROR;
		}
		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
		if (e) {
			if (!writecache_entry_is_committed(wc, e)) {
				wc->stats.write_hits_uncommitted++;
				search_used = true;
				goto bio_copy;
			}
			wc->stats.write_hits_committed++;
			if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
				wc->overwrote_committed = true;
				search_used = true;
				goto bio_copy;
			}
			found_entry = true;
		} else {
			if (unlikely(wc->cleaner) ||
			    (wc->metadata_only && !(bio->bi_opf & REQ_META)))
				goto direct_write;
		}
		e = writecache_pop_from_freelist(wc, (sector_t)-1);
		if (unlikely(!e)) {
			if (!WC_MODE_PMEM(wc) && !found_entry) {
direct_write:
				/* write around the cache, stopping before the next cached block */
				e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
				writecache_map_remap_origin(wc, bio, e);
				wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits;
				wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
				return WC_MAP_REMAP_ORIGIN;
			}
			wc->stats.writes_blocked_on_freelist++;
			writecache_wait_on_freelist(wc);
			/* re-run the lookup from the top of the loop */
			continue;
		}
		/* fresh entry: bind it to this sector under the current sequence */
		write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
		writecache_insert_entry(wc, e);
		wc->uncommitted_blocks++;
		wc->stats.writes_allocate++;
bio_copy:
		if (WC_MODE_PMEM(wc)) {
			bio_copy_block(wc, bio, memory_data(wc, e));
			wc->stats.writes++;
		} else {
			writecache_bio_copy_ssd(wc, bio, e, search_used);
			return WC_MAP_REMAP;
		}
	} while (bio->bi_iter.bi_size);

	/* only the PMEM path falls out of the loop */
	if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks))
		writecache_flush(wc);
	else
		writecache_schedule_autocommit(wc);

	return WC_MAP_SUBMIT;
}
1488 
1489 static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio)
1490 {
1491 	if (writecache_has_error(wc))
1492 		return WC_MAP_ERROR;
1493 
1494 	if (WC_MODE_PMEM(wc)) {
1495 		wc->stats.flushes++;
1496 		writecache_flush(wc);
1497 		if (writecache_has_error(wc))
1498 			return WC_MAP_ERROR;
1499 		else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
1500 			return WC_MAP_REMAP_ORIGIN;
1501 		return WC_MAP_SUBMIT;
1502 	}
1503 	/* SSD: */
1504 	if (dm_bio_get_target_bio_nr(bio))
1505 		return WC_MAP_REMAP_ORIGIN;
1506 	wc->stats.flushes++;
1507 	writecache_offload_bio(wc, bio);
1508 	return WC_MAP_RETURN;
1509 }
1510 
1511 static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio)
1512 {
1513 	wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits;
1514 
1515 	if (writecache_has_error(wc))
1516 		return WC_MAP_ERROR;
1517 
1518 	if (WC_MODE_PMEM(wc)) {
1519 		writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
1520 		return WC_MAP_REMAP_ORIGIN;
1521 	}
1522 	/* SSD: */
1523 	writecache_offload_bio(wc, bio);
1524 	return WC_MAP_RETURN;
1525 }
1526 
1527 static int writecache_map(struct dm_target *ti, struct bio *bio)
1528 {
1529 	struct dm_writecache *wc = ti->private;
1530 	enum wc_map_op map_op;
1531 
1532 	bio->bi_private = NULL;
1533 
1534 	wc_lock(wc);
1535 
1536 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1537 		map_op = writecache_map_flush(wc, bio);
1538 		goto done;
1539 	}
1540 
1541 	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1542 
1543 	if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
1544 				(wc->block_size / 512 - 1)) != 0)) {
1545 		DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
1546 		      (unsigned long long)bio->bi_iter.bi_sector,
1547 		      bio->bi_iter.bi_size, wc->block_size);
1548 		map_op = WC_MAP_ERROR;
1549 		goto done;
1550 	}
1551 
1552 	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
1553 		map_op = writecache_map_discard(wc, bio);
1554 		goto done;
1555 	}
1556 
1557 	if (bio_data_dir(bio) == READ)
1558 		map_op = writecache_map_read(wc, bio);
1559 	else
1560 		map_op = writecache_map_write(wc, bio);
1561 done:
1562 	switch (map_op) {
1563 	case WC_MAP_REMAP_ORIGIN:
1564 		if (likely(wc->pause != 0)) {
1565 			if (bio_op(bio) == REQ_OP_WRITE) {
1566 				dm_iot_io_begin(&wc->iot, 1);
1567 				bio->bi_private = (void *)2;
1568 			}
1569 		}
1570 		bio_set_dev(bio, wc->dev->bdev);
1571 		wc_unlock(wc);
1572 		return DM_MAPIO_REMAPPED;
1573 
1574 	case WC_MAP_REMAP:
1575 		/* make sure that writecache_end_io decrements bio_in_progress: */
1576 		bio->bi_private = (void *)1;
1577 		atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
1578 		wc_unlock(wc);
1579 		return DM_MAPIO_REMAPPED;
1580 
1581 	case WC_MAP_SUBMIT:
1582 		wc_unlock(wc);
1583 		bio_endio(bio);
1584 		return DM_MAPIO_SUBMITTED;
1585 
1586 	case WC_MAP_RETURN:
1587 		wc_unlock(wc);
1588 		return DM_MAPIO_SUBMITTED;
1589 
1590 	case WC_MAP_ERROR:
1591 		wc_unlock(wc);
1592 		bio_io_error(bio);
1593 		return DM_MAPIO_SUBMITTED;
1594 
1595 	default:
1596 		BUG();
1597 		return -1;
1598 	}
1599 }
1600 
1601 static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
1602 {
1603 	struct dm_writecache *wc = ti->private;
1604 
1605 	if (bio->bi_private == (void *)1) {
1606 		int dir = bio_data_dir(bio);
1607 		if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
1608 			if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
1609 				wake_up(&wc->bio_in_progress_wait[dir]);
1610 	} else if (bio->bi_private == (void *)2) {
1611 		dm_iot_io_end(&wc->iot, 1);
1612 	}
1613 	return 0;
1614 }
1615 
1616 static int writecache_iterate_devices(struct dm_target *ti,
1617 				      iterate_devices_callout_fn fn, void *data)
1618 {
1619 	struct dm_writecache *wc = ti->private;
1620 
1621 	return fn(ti, wc->dev, 0, ti->len, data);
1622 }
1623 
1624 static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
1625 {
1626 	struct dm_writecache *wc = ti->private;
1627 
1628 	if (limits->logical_block_size < wc->block_size)
1629 		limits->logical_block_size = wc->block_size;
1630 
1631 	if (limits->physical_block_size < wc->block_size)
1632 		limits->physical_block_size = wc->block_size;
1633 
1634 	if (limits->io_min < wc->block_size)
1635 		limits->io_min = wc->block_size;
1636 }
1637 
1638 
1639 static void writecache_writeback_endio(struct bio *bio)
1640 {
1641 	struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
1642 	struct dm_writecache *wc = wb->wc;
1643 	unsigned long flags;
1644 
1645 	raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
1646 	if (unlikely(list_empty(&wc->endio_list)))
1647 		wake_up_process(wc->endio_thread);
1648 	list_add_tail(&wb->endio_entry, &wc->endio_list);
1649 	raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
1650 }
1651 
1652 static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
1653 {
1654 	struct copy_struct *c = ptr;
1655 	struct dm_writecache *wc = c->wc;
1656 
1657 	c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
1658 
1659 	raw_spin_lock_irq(&wc->endio_list_lock);
1660 	if (unlikely(list_empty(&wc->endio_list)))
1661 		wake_up_process(wc->endio_thread);
1662 	list_add_tail(&c->endio_entry, &wc->endio_list);
1663 	raw_spin_unlock_irq(&wc->endio_list_lock);
1664 }
1665 
1666 static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
1667 {
1668 	unsigned i;
1669 	struct writeback_struct *wb;
1670 	struct wc_entry *e;
1671 	unsigned long n_walked = 0;
1672 
1673 	do {
1674 		wb = list_entry(list->next, struct writeback_struct, endio_entry);
1675 		list_del(&wb->endio_entry);
1676 
1677 		if (unlikely(wb->bio.bi_status != BLK_STS_OK))
1678 			writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
1679 					"write error %d", wb->bio.bi_status);
1680 		i = 0;
1681 		do {
1682 			e = wb->wc_list[i];
1683 			BUG_ON(!e->write_in_progress);
1684 			e->write_in_progress = false;
1685 			INIT_LIST_HEAD(&e->lru);
1686 			if (!writecache_has_error(wc))
1687 				writecache_free_entry(wc, e);
1688 			BUG_ON(!wc->writeback_size);
1689 			wc->writeback_size--;
1690 			n_walked++;
1691 			if (unlikely(n_walked >= ENDIO_LATENCY)) {
1692 				writecache_commit_flushed(wc, false);
1693 				wc_unlock(wc);
1694 				wc_lock(wc);
1695 				n_walked = 0;
1696 			}
1697 		} while (++i < wb->wc_list_n);
1698 
1699 		if (wb->wc_list != wb->wc_list_inline)
1700 			kfree(wb->wc_list);
1701 		bio_put(&wb->bio);
1702 	} while (!list_empty(list));
1703 }
1704 
1705 static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
1706 {
1707 	struct copy_struct *c;
1708 	struct wc_entry *e;
1709 
1710 	do {
1711 		c = list_entry(list->next, struct copy_struct, endio_entry);
1712 		list_del(&c->endio_entry);
1713 
1714 		if (unlikely(c->error))
1715 			writecache_error(wc, c->error, "copy error");
1716 
1717 		e = c->e;
1718 		do {
1719 			BUG_ON(!e->write_in_progress);
1720 			e->write_in_progress = false;
1721 			INIT_LIST_HEAD(&e->lru);
1722 			if (!writecache_has_error(wc))
1723 				writecache_free_entry(wc, e);
1724 
1725 			BUG_ON(!wc->writeback_size);
1726 			wc->writeback_size--;
1727 			e++;
1728 		} while (--c->n_entries);
1729 		mempool_free(c, &wc->copy_pool);
1730 	} while (!list_empty(list));
1731 }
1732 
1733 static int writecache_endio_thread(void *data)
1734 {
1735 	struct dm_writecache *wc = data;
1736 
1737 	while (1) {
1738 		struct list_head list;
1739 
1740 		raw_spin_lock_irq(&wc->endio_list_lock);
1741 		if (!list_empty(&wc->endio_list))
1742 			goto pop_from_list;
1743 		set_current_state(TASK_INTERRUPTIBLE);
1744 		raw_spin_unlock_irq(&wc->endio_list_lock);
1745 
1746 		if (unlikely(kthread_should_stop())) {
1747 			set_current_state(TASK_RUNNING);
1748 			break;
1749 		}
1750 
1751 		schedule();
1752 
1753 		continue;
1754 
1755 pop_from_list:
1756 		list = wc->endio_list;
1757 		list.next->prev = list.prev->next = &list;
1758 		INIT_LIST_HEAD(&wc->endio_list);
1759 		raw_spin_unlock_irq(&wc->endio_list_lock);
1760 
1761 		if (!WC_MODE_FUA(wc))
1762 			writecache_disk_flush(wc, wc->dev);
1763 
1764 		wc_lock(wc);
1765 
1766 		if (WC_MODE_PMEM(wc)) {
1767 			__writecache_endio_pmem(wc, &list);
1768 		} else {
1769 			__writecache_endio_ssd(wc, &list);
1770 			writecache_wait_for_ios(wc, READ);
1771 		}
1772 
1773 		writecache_commit_flushed(wc, false);
1774 
1775 		wc_unlock(wc);
1776 	}
1777 
1778 	return 0;
1779 }
1780 
1781 static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
1782 {
1783 	struct dm_writecache *wc = wb->wc;
1784 	unsigned block_size = wc->block_size;
1785 	void *address = memory_data(wc, e);
1786 
1787 	persistent_memory_flush_cache(address, block_size);
1788 
1789 	if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
1790 		return true;
1791 
1792 	return bio_add_page(&wb->bio, persistent_memory_page(address),
1793 			    block_size, persistent_memory_page_offset(address)) != 0;
1794 }
1795 
/*
 * Batch of cache entries selected for writeback: the entries are linked
 * through their lru member on @list, and @size counts them.
 */
struct writeback_list {
	struct list_head list;	/* selected entries, consumed from list.prev */
	size_t size;		/* number of entries currently on the list */
};
1800 
1801 static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
1802 {
1803 	if (unlikely(wc->max_writeback_jobs)) {
1804 		if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
1805 			wc_lock(wc);
1806 			while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
1807 				writecache_wait_on_freelist(wc);
1808 			wc_unlock(wc);
1809 		}
1810 	}
1811 	cond_resched();
1812 }
1813 
1814 static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
1815 {
1816 	struct wc_entry *e, *f;
1817 	struct bio *bio;
1818 	struct writeback_struct *wb;
1819 	unsigned max_pages;
1820 
1821 	while (wbl->size) {
1822 		wbl->size--;
1823 		e = container_of(wbl->list.prev, struct wc_entry, lru);
1824 		list_del(&e->lru);
1825 
1826 		max_pages = e->wc_list_contiguous;
1827 
1828 		bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE,
1829 				       GFP_NOIO, &wc->bio_set);
1830 		wb = container_of(bio, struct writeback_struct, bio);
1831 		wb->wc = wc;
1832 		bio->bi_end_io = writecache_writeback_endio;
1833 		bio->bi_iter.bi_sector = read_original_sector(wc, e);
1834 		if (max_pages <= WB_LIST_INLINE ||
1835 		    unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
1836 							   GFP_NOIO | __GFP_NORETRY |
1837 							   __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
1838 			wb->wc_list = wb->wc_list_inline;
1839 			max_pages = WB_LIST_INLINE;
1840 		}
1841 
1842 		BUG_ON(!wc_add_block(wb, e));
1843 
1844 		wb->wc_list[0] = e;
1845 		wb->wc_list_n = 1;
1846 
1847 		while (wbl->size && wb->wc_list_n < max_pages) {
1848 			f = container_of(wbl->list.prev, struct wc_entry, lru);
1849 			if (read_original_sector(wc, f) !=
1850 			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1851 				break;
1852 			if (!wc_add_block(wb, f))
1853 				break;
1854 			wbl->size--;
1855 			list_del(&f->lru);
1856 			wb->wc_list[wb->wc_list_n++] = f;
1857 			e = f;
1858 		}
1859 		if (WC_MODE_FUA(wc))
1860 			bio->bi_opf |= REQ_FUA;
1861 		if (writecache_has_error(wc)) {
1862 			bio->bi_status = BLK_STS_IOERR;
1863 			bio_endio(bio);
1864 		} else if (unlikely(!bio_sectors(bio))) {
1865 			bio->bi_status = BLK_STS_OK;
1866 			bio_endio(bio);
1867 		} else {
1868 			submit_bio(bio);
1869 		}
1870 
1871 		__writeback_throttle(wc, wbl);
1872 	}
1873 }
1874 
1875 static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
1876 {
1877 	struct wc_entry *e, *f;
1878 	struct dm_io_region from, to;
1879 	struct copy_struct *c;
1880 
1881 	while (wbl->size) {
1882 		unsigned n_sectors;
1883 
1884 		wbl->size--;
1885 		e = container_of(wbl->list.prev, struct wc_entry, lru);
1886 		list_del(&e->lru);
1887 
1888 		n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
1889 
1890 		from.bdev = wc->ssd_dev->bdev;
1891 		from.sector = cache_sector(wc, e);
1892 		from.count = n_sectors;
1893 		to.bdev = wc->dev->bdev;
1894 		to.sector = read_original_sector(wc, e);
1895 		to.count = n_sectors;
1896 
1897 		c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
1898 		c->wc = wc;
1899 		c->e = e;
1900 		c->n_entries = e->wc_list_contiguous;
1901 
1902 		while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
1903 			wbl->size--;
1904 			f = container_of(wbl->list.prev, struct wc_entry, lru);
1905 			BUG_ON(f != e + 1);
1906 			list_del(&f->lru);
1907 			e = f;
1908 		}
1909 
1910 		if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
1911 			if (to.sector >= wc->data_device_sectors) {
1912 				writecache_copy_endio(0, 0, c);
1913 				continue;
1914 			}
1915 			from.count = to.count = wc->data_device_sectors - to.sector;
1916 		}
1917 
1918 		dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
1919 
1920 		__writeback_throttle(wc, wbl);
1921 	}
1922 }
1923 
/*
 * Work function for wc->writeback_work: pick cache entries to write back
 * and hand them to the PMEM or SSD submission helper.
 *
 * Outline:
 *  1. SSD mode: wait for kcopyd work that is still active.
 *  2. If a pause is configured, wait until the I/O tracker reports the
 *     device idle for at least wc->pause (skipped for the cleaner,
 *     writeback_all and a suspended target).
 *  3. Under the cache lock, collect entries into a local writeback list,
 *     grouping runs of consecutive origin blocks.
 *  4. Drop the lock and submit the list inside a block plug.
 *  5. With writeback_all set, wait until all writeback has finished.
 */
static void writecache_writeback(struct work_struct *work)
{
	struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
	struct blk_plug plug;
	struct wc_entry *f, *g, *e = NULL;
	struct rb_node *node, *next_node;
	struct list_head skipped;
	struct writeback_list wbl;
	unsigned long n_walked;

	if (!WC_MODE_PMEM(wc)) {
		/* Wait for any active kcopyd work on behalf of ssd writeback */
		dm_kcopyd_client_flush(wc->dm_kcopyd);
	}

	if (likely(wc->pause != 0)) {
		while (1) {
			unsigned long idle;
			if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
			    unlikely(dm_suspended(wc->ti)))
				break;
			idle = dm_iot_idle_time(&wc->iot);
			if (idle >= wc->pause)
				break;
			/* sleep for the remaining pause time, at most HZ per step */
			idle = wc->pause - idle;
			if (idle > HZ)
				idle = HZ;
			schedule_timeout_idle(idle);
		}
	}

	wc_lock(wc);
restart:
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return;
	}

	if (unlikely(wc->writeback_all)) {
		/* a true return means we waited: re-check state from the top */
		if (writecache_wait_for_writeback(wc))
			goto restart;
	}

	if (wc->overwrote_committed) {
		writecache_wait_for_ios(wc, WRITE);
	}

	n_walked = 0;
	INIT_LIST_HEAD(&skipped);
	INIT_LIST_HEAD(&wbl.list);
	wbl.size = 0;
	/*
	 * Keep selecting while there are entries on the LRU and either
	 * everything must go, free plus in-writeback blocks are at or below
	 * the low watermark, or the entry at the LRU tail is old enough.
	 */
	while (!list_empty(&wc->lru) &&
	       (wc->writeback_all ||
		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
		(jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
		 wc->max_age - wc->max_age / MAX_AGE_DIV))) {

		n_walked++;
		/* bound the time spent under the lock: requeue and stop */
		if (unlikely(n_walked > WRITEBACK_LATENCY) &&
		    likely(!wc->writeback_all)) {
			if (likely(!dm_suspended(wc->ti)))
				queue_work(wc->writeback_wq, &wc->writeback_work);
			break;
		}

		if (unlikely(wc->writeback_all)) {
			/* walk the tree in order, starting at its first entry */
			if (unlikely(!e)) {
				writecache_flush(wc);
				e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
			} else
				/* NOTE(review): g is only assigned inside the inner loop below; confirm it is always set on this path */
				e = g;
		} else
			e = container_of(wc->lru.prev, struct wc_entry, lru);
		BUG_ON(e->write_in_progress);
		if (unlikely(!writecache_entry_is_committed(wc, e))) {
			writecache_flush(wc);
		}
		node = rb_prev(&e->rb_node);
		if (node) {
			f = container_of(node, struct wc_entry, rb_node);
			/*
			 * The preceding tree entry maps the same sector and is
			 * still being written back: set this one aside.
			 */
			if (unlikely(read_original_sector(wc, f) ==
				     read_original_sector(wc, e))) {
				BUG_ON(!f->write_in_progress);
				list_move(&e->lru, &skipped);
				cond_resched();
				continue;
			}
		}
		wc->writeback_size++;
		list_move(&e->lru, &wbl.list);
		wbl.size++;
		e->write_in_progress = true;
		e->wc_list_contiguous = 1;

		f = e;

		/* extend the run starting at e with following tree entries */
		while (1) {
			next_node = rb_next(&f->rb_node);
			if (unlikely(!next_node))
				break;
			g = container_of(next_node, struct wc_entry, rb_node);
			/* same sector as f: step over it without selecting it */
			if (unlikely(read_original_sector(wc, g) ==
			    read_original_sector(wc, f))) {
				f = g;
				continue;
			}
			if (read_original_sector(wc, g) !=
			    read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (unlikely(g->write_in_progress))
				break;
			if (unlikely(!writecache_entry_is_committed(wc, g)))
				break;

			/* SSD mode also needs the entries adjacent in the array */
			if (!WC_MODE_PMEM(wc)) {
				if (g != f + 1)
					break;
			}

			n_walked++;
			//if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
			//	break;

			wc->writeback_size++;
			list_move(&g->lru, &wbl.list);
			wbl.size++;
			g->write_in_progress = true;
			g->wc_list_contiguous = BIO_MAX_VECS;
			f = g;
			e->wc_list_contiguous++;
			/* run is full: in writeback_all mode, remember where to resume */
			if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
				if (unlikely(wc->writeback_all)) {
					next_node = rb_next(&f->rb_node);
					if (likely(next_node))
						g = container_of(next_node, struct wc_entry, rb_node);
				}
				break;
			}
		}
		cond_resched();
	}

	if (!list_empty(&skipped)) {
		list_splice_tail(&skipped, &wc->lru);
		/*
		 * If we didn't do any progress, we must wait until some
		 * writeback finishes to avoid burning CPU in a loop
		 */
		if (unlikely(!wbl.size))
			writecache_wait_for_writeback(wc);
	}

	wc_unlock(wc);

	/* submission happens without the cache lock, batched by a plug */
	blk_start_plug(&plug);

	if (WC_MODE_PMEM(wc))
		__writecache_writeback_pmem(wc, &wbl);
	else
		__writecache_writeback_ssd(wc, &wbl);

	blk_finish_plug(&plug);

	if (unlikely(wc->writeback_all)) {
		wc_lock(wc);
		/* loop until the helper reports there was nothing to wait for */
		while (writecache_wait_for_writeback(wc));
		wc_unlock(wc);
	}
}
2093 
2094 static int calculate_memory_size(uint64_t device_size, unsigned block_size,
2095 				 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
2096 {
2097 	uint64_t n_blocks, offset;
2098 	struct wc_entry e;
2099 
2100 	n_blocks = device_size;
2101 	do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
2102 
2103 	while (1) {
2104 		if (!n_blocks)
2105 			return -ENOSPC;
2106 		/* Verify the following entries[n_blocks] won't overflow */
2107 		if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
2108 				 sizeof(struct wc_memory_entry)))
2109 			return -EFBIG;
2110 		offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
2111 		offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
2112 		if (offset + n_blocks * block_size <= device_size)
2113 			break;
2114 		n_blocks--;
2115 	}
2116 
2117 	/* check if the bit field overflows */
2118 	e.index = n_blocks;
2119 	if (e.index != n_blocks)
2120 		return -EFBIG;
2121 
2122 	if (n_blocks_p)
2123 		*n_blocks_p = n_blocks;
2124 	if (n_metadata_blocks_p)
2125 		*n_metadata_blocks_p = offset >> __ffs(block_size);
2126 	return 0;
2127 }
2128 
2129 static int init_memory(struct dm_writecache *wc)
2130 {
2131 	size_t b;
2132 	int r;
2133 
2134 	r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
2135 	if (r)
2136 		return r;
2137 
2138 	r = writecache_alloc_entries(wc);
2139 	if (r)
2140 		return r;
2141 
2142 	for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
2143 		pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
2144 	pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
2145 	pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
2146 	pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
2147 	pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
2148 
2149 	for (b = 0; b < wc->n_blocks; b++) {
2150 		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
2151 		cond_resched();
2152 	}
2153 
2154 	writecache_flush_all_metadata(wc);
2155 	writecache_commit_flushed(wc, false);
2156 	pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
2157 	writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
2158 	writecache_commit_flushed(wc, false);
2159 
2160 	return 0;
2161 }
2162 
2163 static void writecache_dtr(struct dm_target *ti)
2164 {
2165 	struct dm_writecache *wc = ti->private;
2166 
2167 	if (!wc)
2168 		return;
2169 
2170 	if (wc->endio_thread)
2171 		kthread_stop(wc->endio_thread);
2172 
2173 	if (wc->flush_thread)
2174 		kthread_stop(wc->flush_thread);
2175 
2176 	bioset_exit(&wc->bio_set);
2177 
2178 	mempool_exit(&wc->copy_pool);
2179 
2180 	if (wc->writeback_wq)
2181 		destroy_workqueue(wc->writeback_wq);
2182 
2183 	if (wc->dev)
2184 		dm_put_device(ti, wc->dev);
2185 
2186 	if (wc->ssd_dev)
2187 		dm_put_device(ti, wc->ssd_dev);
2188 
2189 	vfree(wc->entries);
2190 
2191 	if (wc->memory_map) {
2192 		if (WC_MODE_PMEM(wc))
2193 			persistent_memory_release(wc);
2194 		else
2195 			vfree(wc->memory_map);
2196 	}
2197 
2198 	if (wc->dm_kcopyd)
2199 		dm_kcopyd_client_destroy(wc->dm_kcopyd);
2200 
2201 	if (wc->dm_io)
2202 		dm_io_client_destroy(wc->dm_io);
2203 
2204 	vfree(wc->dirty_bitmap);
2205 
2206 	kfree(wc);
2207 }
2208 
2209 static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2210 {
2211 	struct dm_writecache *wc;
2212 	struct dm_arg_set as;
2213 	const char *string;
2214 	unsigned opt_params;
2215 	size_t offset, data_size;
2216 	int i, r;
2217 	char dummy;
2218 	int high_wm_percent = HIGH_WATERMARK;
2219 	int low_wm_percent = LOW_WATERMARK;
2220 	uint64_t x;
2221 	struct wc_memory_superblock s;
2222 
2223 	static struct dm_arg _args[] = {
2224 		{0, 18, "Invalid number of feature args"},
2225 	};
2226 
2227 	as.argc = argc;
2228 	as.argv = argv;
2229 
2230 	wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
2231 	if (!wc) {
2232 		ti->error = "Cannot allocate writecache structure";
2233 		r = -ENOMEM;
2234 		goto bad;
2235 	}
2236 	ti->private = wc;
2237 	wc->ti = ti;
2238 
2239 	mutex_init(&wc->lock);
2240 	wc->max_age = MAX_AGE_UNSPECIFIED;
2241 	writecache_poison_lists(wc);
2242 	init_waitqueue_head(&wc->freelist_wait);
2243 	timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
2244 	timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);
2245 
2246 	for (i = 0; i < 2; i++) {
2247 		atomic_set(&wc->bio_in_progress[i], 0);
2248 		init_waitqueue_head(&wc->bio_in_progress_wait[i]);
2249 	}
2250 
2251 	wc->dm_io = dm_io_client_create();
2252 	if (IS_ERR(wc->dm_io)) {
2253 		r = PTR_ERR(wc->dm_io);
2254 		ti->error = "Unable to allocate dm-io client";
2255 		wc->dm_io = NULL;
2256 		goto bad;
2257 	}
2258 
2259 	wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
2260 	if (!wc->writeback_wq) {
2261 		r = -ENOMEM;
2262 		ti->error = "Could not allocate writeback workqueue";
2263 		goto bad;
2264 	}
2265 	INIT_WORK(&wc->writeback_work, writecache_writeback);
2266 	INIT_WORK(&wc->flush_work, writecache_flush_work);
2267 
2268 	dm_iot_init(&wc->iot);
2269 
2270 	raw_spin_lock_init(&wc->endio_list_lock);
2271 	INIT_LIST_HEAD(&wc->endio_list);
2272 	wc->endio_thread = kthread_run(writecache_endio_thread, wc, "writecache_endio");
2273 	if (IS_ERR(wc->endio_thread)) {
2274 		r = PTR_ERR(wc->endio_thread);
2275 		wc->endio_thread = NULL;
2276 		ti->error = "Couldn't spawn endio thread";
2277 		goto bad;
2278 	}
2279 
2280 	/*
2281 	 * Parse the mode (pmem or ssd)
2282 	 */
2283 	string = dm_shift_arg(&as);
2284 	if (!string)
2285 		goto bad_arguments;
2286 
2287 	if (!strcasecmp(string, "s")) {
2288 		wc->pmem_mode = false;
2289 	} else if (!strcasecmp(string, "p")) {
2290 #ifdef DM_WRITECACHE_HAS_PMEM
2291 		wc->pmem_mode = true;
2292 		wc->writeback_fua = true;
2293 #else
2294 		/*
2295 		 * If the architecture doesn't support persistent memory or
2296 		 * the kernel doesn't support any DAX drivers, this driver can
2297 		 * only be used in SSD-only mode.
2298 		 */
2299 		r = -EOPNOTSUPP;
2300 		ti->error = "Persistent memory or DAX not supported on this system";
2301 		goto bad;
2302 #endif
2303 	} else {
2304 		goto bad_arguments;
2305 	}
2306 
2307 	if (WC_MODE_PMEM(wc)) {
2308 		r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
2309 				offsetof(struct writeback_struct, bio),
2310 				BIOSET_NEED_BVECS);
2311 		if (r) {
2312 			ti->error = "Could not allocate bio set";
2313 			goto bad;
2314 		}
2315 	} else {
2316 		wc->pause = PAUSE_WRITEBACK;
2317 		r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
2318 		if (r) {
2319 			ti->error = "Could not allocate mempool";
2320 			goto bad;
2321 		}
2322 	}
2323 
2324 	/*
2325 	 * Parse the origin data device
2326 	 */
2327 	string = dm_shift_arg(&as);
2328 	if (!string)
2329 		goto bad_arguments;
2330 	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
2331 	if (r) {
2332 		ti->error = "Origin data device lookup failed";
2333 		goto bad;
2334 	}
2335 
2336 	/*
2337 	 * Parse cache data device (be it pmem or ssd)
2338 	 */
2339 	string = dm_shift_arg(&as);
2340 	if (!string)
2341 		goto bad_arguments;
2342 
2343 	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
2344 	if (r) {
2345 		ti->error = "Cache data device lookup failed";
2346 		goto bad;
2347 	}
2348 	wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev);
2349 
2350 	/*
2351 	 * Parse the cache block size
2352 	 */
2353 	string = dm_shift_arg(&as);
2354 	if (!string)
2355 		goto bad_arguments;
2356 	if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
2357 	    wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
2358 	    (wc->block_size & (wc->block_size - 1))) {
2359 		r = -EINVAL;
2360 		ti->error = "Invalid block size";
2361 		goto bad;
2362 	}
2363 	if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) ||
2364 	    wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) {
2365 		r = -EINVAL;
2366 		ti->error = "Block size is smaller than device logical block size";
2367 		goto bad;
2368 	}
2369 	wc->block_size_bits = __ffs(wc->block_size);
2370 
2371 	wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
2372 	wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
2373 	wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
2374 
2375 	/*
2376 	 * Parse optional arguments
2377 	 */
2378 	r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
2379 	if (r)
2380 		goto bad;
2381 
2382 	while (opt_params) {
2383 		string = dm_shift_arg(&as), opt_params--;
2384 		if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
2385 			unsigned long long start_sector;
2386 			string = dm_shift_arg(&as), opt_params--;
2387 			if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
2388 				goto invalid_optional;
2389 			wc->start_sector = start_sector;
2390 			wc->start_sector_set = true;
2391 			if (wc->start_sector != start_sector ||
2392 			    wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
2393 				goto invalid_optional;
2394 		} else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
2395 			string = dm_shift_arg(&as), opt_params--;
2396 			if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
2397 				goto invalid_optional;
2398 			if (high_wm_percent < 0 || high_wm_percent > 100)
2399 				goto invalid_optional;
2400 			wc->high_wm_percent_value = high_wm_percent;
2401 			wc->high_wm_percent_set = true;
2402 		} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
2403 			string = dm_shift_arg(&as), opt_params--;
2404 			if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
2405 				goto invalid_optional;
2406 			if (low_wm_percent < 0 || low_wm_percent > 100)
2407 				goto invalid_optional;
2408 			wc->low_wm_percent_value = low_wm_percent;
2409 			wc->low_wm_percent_set = true;
2410 		} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
2411 			string = dm_shift_arg(&as), opt_params--;
2412 			if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
2413 				goto invalid_optional;
2414 			wc->max_writeback_jobs_set = true;
2415 		} else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
2416 			string = dm_shift_arg(&as), opt_params--;
2417 			if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
2418 				goto invalid_optional;
2419 			wc->autocommit_blocks_set = true;
2420 		} else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
2421 			unsigned autocommit_msecs;
2422 			string = dm_shift_arg(&as), opt_params--;
2423 			if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
2424 				goto invalid_optional;
2425 			if (autocommit_msecs > 3600000)
2426 				goto invalid_optional;
2427 			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
2428 			wc->autocommit_time_value = autocommit_msecs;
2429 			wc->autocommit_time_set = true;
2430 		} else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
2431 			unsigned max_age_msecs;
2432 			string = dm_shift_arg(&as), opt_params--;
2433 			if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
2434 				goto invalid_optional;
2435 			if (max_age_msecs > 86400000)
2436 				goto invalid_optional;
2437 			wc->max_age = msecs_to_jiffies(max_age_msecs);
2438 			wc->max_age_set = true;
2439 			wc->max_age_value = max_age_msecs;
2440 		} else if (!strcasecmp(string, "cleaner")) {
2441 			wc->cleaner_set = true;
2442 			wc->cleaner = true;
2443 		} else if (!strcasecmp(string, "fua")) {
2444 			if (WC_MODE_PMEM(wc)) {
2445 				wc->writeback_fua = true;
2446 				wc->writeback_fua_set = true;
2447 			} else goto invalid_optional;
2448 		} else if (!strcasecmp(string, "nofua")) {
2449 			if (WC_MODE_PMEM(wc)) {
2450 				wc->writeback_fua = false;
2451 				wc->writeback_fua_set = true;
2452 			} else goto invalid_optional;
2453 		} else if (!strcasecmp(string, "metadata_only")) {
2454 			wc->metadata_only = true;
2455 		} else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
2456 			unsigned pause_msecs;
2457 			if (WC_MODE_PMEM(wc))
2458 				goto invalid_optional;
2459 			string = dm_shift_arg(&as), opt_params--;
2460 			if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
2461 				goto invalid_optional;
2462 			if (pause_msecs > 60000)
2463 				goto invalid_optional;
2464 			wc->pause = msecs_to_jiffies(pause_msecs);
2465 			wc->pause_set = true;
2466 			wc->pause_value = pause_msecs;
2467 		} else {
2468 invalid_optional:
2469 			r = -EINVAL;
2470 			ti->error = "Invalid optional argument";
2471 			goto bad;
2472 		}
2473 	}
2474 
2475 	if (high_wm_percent < low_wm_percent) {
2476 		r = -EINVAL;
2477 		ti->error = "High watermark must be greater than or equal to low watermark";
2478 		goto bad;
2479 	}
2480 
2481 	if (WC_MODE_PMEM(wc)) {
2482 		if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
2483 			r = -EOPNOTSUPP;
2484 			ti->error = "Asynchronous persistent memory not supported as pmem cache";
2485 			goto bad;
2486 		}
2487 
2488 		r = persistent_memory_claim(wc);
2489 		if (r) {
2490 			ti->error = "Unable to map persistent memory for cache";
2491 			goto bad;
2492 		}
2493 	} else {
2494 		size_t n_blocks, n_metadata_blocks;
2495 		uint64_t n_bitmap_bits;
2496 
2497 		wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
2498 
2499 		bio_list_init(&wc->flush_list);
2500 		wc->flush_thread = kthread_run(writecache_flush_thread, wc, "dm_writecache_flush");
2501 		if (IS_ERR(wc->flush_thread)) {
2502 			r = PTR_ERR(wc->flush_thread);
2503 			wc->flush_thread = NULL;
2504 			ti->error = "Couldn't spawn flush thread";
2505 			goto bad;
2506 		}
2507 
2508 		r = calculate_memory_size(wc->memory_map_size, wc->block_size,
2509 					  &n_blocks, &n_metadata_blocks);
2510 		if (r) {
2511 			ti->error = "Invalid device size";
2512 			goto bad;
2513 		}
2514 
2515 		n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
2516 				 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
2517 		/* this is limitation of test_bit functions */
2518 		if (n_bitmap_bits > 1U << 31) {
2519 			r = -EFBIG;
2520 			ti->error = "Invalid device size";
2521 			goto bad;
2522 		}
2523 
2524 		wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
2525 		if (!wc->memory_map) {
2526 			r = -ENOMEM;
2527 			ti->error = "Unable to allocate memory for metadata";
2528 			goto bad;
2529 		}
2530 
2531 		wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2532 		if (IS_ERR(wc->dm_kcopyd)) {
2533 			r = PTR_ERR(wc->dm_kcopyd);
2534 			ti->error = "Unable to allocate dm-kcopyd client";
2535 			wc->dm_kcopyd = NULL;
2536 			goto bad;
2537 		}
2538 
2539 		wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
2540 		wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
2541 			BITS_PER_LONG * sizeof(unsigned long);
2542 		wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
2543 		if (!wc->dirty_bitmap) {
2544 			r = -ENOMEM;
2545 			ti->error = "Unable to allocate dirty bitmap";
2546 			goto bad;
2547 		}
2548 
2549 		r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
2550 		if (r) {
2551 			ti->error = "Unable to read first block of metadata";
2552 			goto bad;
2553 		}
2554 	}
2555 
2556 	r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
2557 	if (r) {
2558 		ti->error = "Hardware memory error when reading superblock";
2559 		goto bad;
2560 	}
2561 	if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
2562 		r = init_memory(wc);
2563 		if (r) {
2564 			ti->error = "Unable to initialize device";
2565 			goto bad;
2566 		}
2567 		r = copy_mc_to_kernel(&s, sb(wc),
2568 				      sizeof(struct wc_memory_superblock));
2569 		if (r) {
2570 			ti->error = "Hardware memory error when reading superblock";
2571 			goto bad;
2572 		}
2573 	}
2574 
2575 	if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
2576 		ti->error = "Invalid magic in the superblock";
2577 		r = -EINVAL;
2578 		goto bad;
2579 	}
2580 
2581 	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
2582 		ti->error = "Invalid version in the superblock";
2583 		r = -EINVAL;
2584 		goto bad;
2585 	}
2586 
2587 	if (le32_to_cpu(s.block_size) != wc->block_size) {
2588 		ti->error = "Block size does not match superblock";
2589 		r = -EINVAL;
2590 		goto bad;
2591 	}
2592 
2593 	wc->n_blocks = le64_to_cpu(s.n_blocks);
2594 
2595 	offset = wc->n_blocks * sizeof(struct wc_memory_entry);
2596 	if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
2597 overflow:
2598 		ti->error = "Overflow in size calculation";
2599 		r = -EINVAL;
2600 		goto bad;
2601 	}
2602 	offset += sizeof(struct wc_memory_superblock);
2603 	if (offset < sizeof(struct wc_memory_superblock))
2604 		goto overflow;
2605 	offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
2606 	data_size = wc->n_blocks * (size_t)wc->block_size;
2607 	if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
2608 	    (offset + data_size < offset))
2609 		goto overflow;
2610 	if (offset + data_size > wc->memory_map_size) {
2611 		ti->error = "Memory area is too small";
2612 		r = -EINVAL;
2613 		goto bad;
2614 	}
2615 
2616 	wc->metadata_sectors = offset >> SECTOR_SHIFT;
2617 	wc->block_start = (char *)sb(wc) + offset;
2618 
2619 	x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
2620 	x += 50;
2621 	do_div(x, 100);
2622 	wc->freelist_high_watermark = x;
2623 	x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
2624 	x += 50;
2625 	do_div(x, 100);
2626 	wc->freelist_low_watermark = x;
2627 
2628 	if (wc->cleaner)
2629 		activate_cleaner(wc);
2630 
2631 	r = writecache_alloc_entries(wc);
2632 	if (r) {
2633 		ti->error = "Cannot allocate memory";
2634 		goto bad;
2635 	}
2636 
2637 	ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
2638 	ti->flush_supported = true;
2639 	ti->num_discard_bios = 1;
2640 
2641 	if (WC_MODE_PMEM(wc))
2642 		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
2643 
2644 	return 0;
2645 
2646 bad_arguments:
2647 	r = -EINVAL;
2648 	ti->error = "Bad arguments";
2649 bad:
2650 	writecache_dtr(ti);
2651 	return r;
2652 }
2653 
2654 static void writecache_status(struct dm_target *ti, status_type_t type,
2655 			      unsigned status_flags, char *result, unsigned maxlen)
2656 {
2657 	struct dm_writecache *wc = ti->private;
2658 	unsigned extra_args;
2659 	unsigned sz = 0;
2660 
2661 	switch (type) {
2662 	case STATUSTYPE_INFO:
2663 		DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
2664 		       writecache_has_error(wc),
2665 		       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
2666 		       (unsigned long long)wc->writeback_size,
2667 		       wc->stats.reads,
2668 		       wc->stats.read_hits,
2669 		       wc->stats.writes,
2670 		       wc->stats.write_hits_uncommitted,
2671 		       wc->stats.write_hits_committed,
2672 		       wc->stats.writes_around,
2673 		       wc->stats.writes_allocate,
2674 		       wc->stats.writes_blocked_on_freelist,
2675 		       wc->stats.flushes,
2676 		       wc->stats.discards);
2677 		break;
2678 	case STATUSTYPE_TABLE:
2679 		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
2680 				wc->dev->name, wc->ssd_dev->name, wc->block_size);
2681 		extra_args = 0;
2682 		if (wc->start_sector_set)
2683 			extra_args += 2;
2684 		if (wc->high_wm_percent_set)
2685 			extra_args += 2;
2686 		if (wc->low_wm_percent_set)
2687 			extra_args += 2;
2688 		if (wc->max_writeback_jobs_set)
2689 			extra_args += 2;
2690 		if (wc->autocommit_blocks_set)
2691 			extra_args += 2;
2692 		if (wc->autocommit_time_set)
2693 			extra_args += 2;
2694 		if (wc->max_age_set)
2695 			extra_args += 2;
2696 		if (wc->cleaner_set)
2697 			extra_args++;
2698 		if (wc->writeback_fua_set)
2699 			extra_args++;
2700 		if (wc->metadata_only)
2701 			extra_args++;
2702 		if (wc->pause_set)
2703 			extra_args += 2;
2704 
2705 		DMEMIT("%u", extra_args);
2706 		if (wc->start_sector_set)
2707 			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
2708 		if (wc->high_wm_percent_set)
2709 			DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
2710 		if (wc->low_wm_percent_set)
2711 			DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
2712 		if (wc->max_writeback_jobs_set)
2713 			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
2714 		if (wc->autocommit_blocks_set)
2715 			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
2716 		if (wc->autocommit_time_set)
2717 			DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
2718 		if (wc->max_age_set)
2719 			DMEMIT(" max_age %u", wc->max_age_value);
2720 		if (wc->cleaner_set)
2721 			DMEMIT(" cleaner");
2722 		if (wc->writeback_fua_set)
2723 			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
2724 		if (wc->metadata_only)
2725 			DMEMIT(" metadata_only");
2726 		if (wc->pause_set)
2727 			DMEMIT(" pause_writeback %u", wc->pause_value);
2728 		break;
2729 	case STATUSTYPE_IMA:
2730 		*result = '\0';
2731 		break;
2732 	}
2733 }
2734 
/*
 * Device-mapper target descriptor for "writecache"; registered in
 * dm_writecache_init() and unregistered in dm_writecache_exit().
 */
static struct target_type writecache_target = {
	.name			= "writecache",
	.version		= {1, 6, 0},
	.module			= THIS_MODULE,
	.ctr			= writecache_ctr,
	.dtr			= writecache_dtr,
	.status			= writecache_status,
	.postsuspend		= writecache_suspend,
	.resume			= writecache_resume,
	.message		= writecache_message,
	.map			= writecache_map,
	.end_io			= writecache_end_io,
	.iterate_devices	= writecache_iterate_devices,
	.io_hints		= writecache_io_hints,
};
2750 
2751 static int __init dm_writecache_init(void)
2752 {
2753 	int r;
2754 
2755 	r = dm_register_target(&writecache_target);
2756 	if (r < 0) {
2757 		DMERR("register failed %d", r);
2758 		return r;
2759 	}
2760 
2761 	return 0;
2762 }
2763 
/* Module exit point: unregister the writecache target. */
static void __exit dm_writecache_exit(void)
{
	dm_unregister_target(&writecache_target);
}
2768 
/* Module entry/exit hooks and module metadata. */
module_init(dm_writecache_init);
module_exit(dm_writecache_exit);

MODULE_DESCRIPTION(DM_NAME " writecache target");
MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
2774 MODULE_LICENSE("GPL");
2775