xref: /openbmc/linux/drivers/md/dm-writecache.c (revision b9b77222)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2018 Red Hat. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7 
8 #include <linux/device-mapper.h>
9 #include <linux/module.h>
10 #include <linux/init.h>
11 #include <linux/vmalloc.h>
12 #include <linux/kthread.h>
13 #include <linux/dm-io.h>
14 #include <linux/dm-kcopyd.h>
15 #include <linux/dax.h>
16 #include <linux/pfn_t.h>
17 #include <linux/libnvdimm.h>
18 
19 #define DM_MSG_PREFIX "writecache"
20 
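/*
 * Default tunables: the watermarks are percentages of the cache that may be
 * occupied before background writeback starts (HIGH_WATERMARK) and the level
 * it drains down to (LOW_WATERMARK); MAX_WRITEBACK_JOBS of 0 means no limit.
 * ENDIO_LATENCY and WRITEBACK_LATENCY bound how many entries are processed
 * while holding wc->lock.  The AUTOCOMMIT_* values are the number of
 * uncommitted blocks and the timeout (in milliseconds) that trigger an
 * automatic commit.
 */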
21 #define HIGH_WATERMARK			50
22 #define LOW_WATERMARK			45
23 #define MAX_WRITEBACK_JOBS		0
24 #define ENDIO_LATENCY			16
25 #define WRITEBACK_LATENCY		64
26 #define AUTOCOMMIT_BLOCKS_SSD		65536
27 #define AUTOCOMMIT_BLOCKS_PMEM		64
28 #define AUTOCOMMIT_MSEC			1000
29 
30 #define BITMAP_GRANULARITY	65536
31 #if BITMAP_GRANULARITY < PAGE_SIZE
32 #undef BITMAP_GRANULARITY
33 #define BITMAP_GRANULARITY	PAGE_SIZE
34 #endif
35 
36 #if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
37 #define DM_WRITECACHE_HAS_PMEM
38 #endif
39 
40 #ifdef DM_WRITECACHE_HAS_PMEM
41 #define pmem_assign(dest, src)					\
42 do {								\
43 	typeof(dest) uniq = (src);				\
44 	memcpy_flushcache(&(dest), &uniq, sizeof(dest));	\
45 } while (0)
46 #else
47 #define pmem_assign(dest, src)	((dest) = (src))
48 #endif
49 
50 #if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM)
51 #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
52 #endif
53 
54 #define MEMORY_SUPERBLOCK_MAGIC		0x23489321
55 #define MEMORY_SUPERBLOCK_VERSION	1
56 
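/*
 * On-media metadata layout: a small superblock followed by one
 * wc_memory_entry per cache block, all fields little-endian.  In pmem mode
 * this lives directly in persistent memory; in SSD mode it occupies the
 * first metadata_sectors of the cache device and is written back through
 * wc->dirty_bitmap.
 */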
57 struct wc_memory_entry {
58 	__le64 original_sector;
59 	__le64 seq_count;
60 };
61 
62 struct wc_memory_superblock {
63 	union {
64 		struct {
65 			__le32 magic;
66 			__le32 version;
67 			__le32 block_size;
68 			__le32 pad;
69 			__le64 n_blocks;
70 			__le64 seq_count;
71 		};
72 		__le64 padding[8];
73 	};
74 	struct wc_memory_entry entries[0];
75 };
76 
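/*
 * In-memory descriptor of one cache block.  rb_node links the entry into
 * wc->tree (keyed by original_sector) while in use, or into wc->freetree in
 * SSD mode while free; lru links it into wc->lru or wc->freelist.  index is
 * the block's slot in the cache.  The shadow copies of original_sector and
 * seq_count exist only when hardware-error handling is enabled, so that the
 * persistent metadata is read just once (via memcpy_mcsafe) at resume time.
 */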
77 struct wc_entry {
78 	struct rb_node rb_node;
79 	struct list_head lru;
80 	unsigned short wc_list_contiguous;
81 	bool write_in_progress
82 #if BITS_PER_LONG == 64
83 		:1
84 #endif
85 	;
86 	unsigned long index
87 #if BITS_PER_LONG == 64
88 		:47
89 #endif
90 	;
91 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
92 	uint64_t original_sector;
93 	uint64_t seq_count;
94 #endif
95 };
96 
97 #ifdef DM_WRITECACHE_HAS_PMEM
98 #define WC_MODE_PMEM(wc)			((wc)->pmem_mode)
99 #define WC_MODE_FUA(wc)				((wc)->writeback_fua)
100 #else
101 #define WC_MODE_PMEM(wc)			false
102 #define WC_MODE_FUA(wc)				false
103 #endif
104 #define WC_MODE_SORT_FREELIST(wc)		(!WC_MODE_PMEM(wc))
105 
106 struct dm_writecache {
107 	struct mutex lock;
108 	struct list_head lru;
109 	union {
110 		struct list_head freelist;
111 		struct {
112 			struct rb_root freetree;
113 			struct wc_entry *current_free;
114 		};
115 	};
116 	struct rb_root tree;
117 
118 	size_t freelist_size;
119 	size_t writeback_size;
120 	size_t freelist_high_watermark;
121 	size_t freelist_low_watermark;
122 
123 	unsigned uncommitted_blocks;
124 	unsigned autocommit_blocks;
125 	unsigned max_writeback_jobs;
126 
127 	int error;
128 
129 	unsigned long autocommit_jiffies;
130 	struct timer_list autocommit_timer;
131 	struct wait_queue_head freelist_wait;
132 
133 	atomic_t bio_in_progress[2];
134 	struct wait_queue_head bio_in_progress_wait[2];
135 
136 	struct dm_target *ti;
137 	struct dm_dev *dev;
138 	struct dm_dev *ssd_dev;
139 	sector_t start_sector;
140 	void *memory_map;
141 	uint64_t memory_map_size;
142 	size_t metadata_sectors;
143 	size_t n_blocks;
144 	uint64_t seq_count;
145 	void *block_start;
146 	struct wc_entry *entries;
147 	unsigned block_size;
148 	unsigned char block_size_bits;
149 
150 	bool pmem_mode:1;
151 	bool writeback_fua:1;
152 
153 	bool overwrote_committed:1;
154 	bool memory_vmapped:1;
155 
156 	bool high_wm_percent_set:1;
157 	bool low_wm_percent_set:1;
158 	bool max_writeback_jobs_set:1;
159 	bool autocommit_blocks_set:1;
160 	bool autocommit_time_set:1;
161 	bool writeback_fua_set:1;
162 	bool flush_on_suspend:1;
163 
164 	unsigned writeback_all;
165 	struct workqueue_struct *writeback_wq;
166 	struct work_struct writeback_work;
167 	struct work_struct flush_work;
168 
169 	struct dm_io_client *dm_io;
170 
171 	raw_spinlock_t endio_list_lock;
172 	struct list_head endio_list;
173 	struct task_struct *endio_thread;
174 
175 	struct task_struct *flush_thread;
176 	struct bio_list flush_list;
177 
178 	struct dm_kcopyd_client *dm_kcopyd;
179 	unsigned long *dirty_bitmap;
180 	unsigned dirty_bitmap_size;
181 
182 	struct bio_set bio_set;
183 	mempool_t copy_pool;
184 };
185 
186 #define WB_LIST_INLINE		16
187 
188 struct writeback_struct {
189 	struct list_head endio_entry;
190 	struct dm_writecache *wc;
191 	struct wc_entry **wc_list;
192 	unsigned wc_list_n;
193 	unsigned page_offset;
194 	struct page *page;
195 	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
196 	struct bio bio;
197 };
198 
199 struct copy_struct {
200 	struct list_head endio_entry;
201 	struct dm_writecache *wc;
202 	struct wc_entry *e;
203 	unsigned n_entries;
204 	int error;
205 };
206 
207 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
208 					    "A percentage of time allocated for data copying");
209 
210 static void wc_lock(struct dm_writecache *wc)
211 {
212 	mutex_lock(&wc->lock);
213 }
214 
215 static void wc_unlock(struct dm_writecache *wc)
216 {
217 	mutex_unlock(&wc->lock);
218 }
219 
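/*
 * Map the cache device for direct access.  If dax_direct_access() cannot
 * return the whole range as one contiguous mapping, collect the individual
 * pages and vmap() them instead (memory_vmapped is set in that case so the
 * mapping can later be torn down with vunmap()).
 */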
220 #ifdef DM_WRITECACHE_HAS_PMEM
221 static int persistent_memory_claim(struct dm_writecache *wc)
222 {
223 	int r;
224 	loff_t s;
225 	long p, da;
226 	pfn_t pfn;
227 	int id;
228 	struct page **pages;
229 
230 	wc->memory_vmapped = false;
231 
232 	if (!wc->ssd_dev->dax_dev) {
233 		r = -EOPNOTSUPP;
234 		goto err1;
235 	}
236 	s = wc->memory_map_size;
237 	p = s >> PAGE_SHIFT;
238 	if (!p) {
239 		r = -EINVAL;
240 		goto err1;
241 	}
242 	if (p != s >> PAGE_SHIFT) {
243 		r = -EOVERFLOW;
244 		goto err1;
245 	}
246 
247 	id = dax_read_lock();
248 
249 	da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn);
250 	if (da < 0) {
251 		wc->memory_map = NULL;
252 		r = da;
253 		goto err2;
254 	}
255 	if (!pfn_t_has_page(pfn)) {
256 		wc->memory_map = NULL;
257 		r = -EOPNOTSUPP;
258 		goto err2;
259 	}
260 	if (da != p) {
261 		long i;
262 		wc->memory_map = NULL;
263 		pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
264 		if (!pages) {
265 			r = -ENOMEM;
266 			goto err2;
267 		}
268 		i = 0;
269 		do {
270 			long daa;
271 			void *dummy_addr;
272 			daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i,
273 						&dummy_addr, &pfn);
274 			if (daa <= 0) {
275 				r = daa ? daa : -EINVAL;
276 				goto err3;
277 			}
278 			if (!pfn_t_has_page(pfn)) {
279 				r = -EOPNOTSUPP;
280 				goto err3;
281 			}
282 			while (daa-- && i < p) {
283 				pages[i++] = pfn_t_to_page(pfn);
284 				pfn.val++;
285 			}
286 		} while (i < p);
287 		wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
288 		if (!wc->memory_map) {
289 			r = -ENOMEM;
290 			goto err3;
291 		}
292 		kvfree(pages);
293 		wc->memory_vmapped = true;
294 	}
295 
296 	dax_read_unlock(id);
297 
298 	wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
299 	wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;
300 
301 	return 0;
302 err3:
303 	kvfree(pages);
304 err2:
305 	dax_read_unlock(id);
306 err1:
307 	return r;
308 }
309 #else
310 static int persistent_memory_claim(struct dm_writecache *wc)
311 {
312 	BUG();
313 }
314 #endif
315 
316 static void persistent_memory_release(struct dm_writecache *wc)
317 {
318 	if (wc->memory_vmapped)
319 		vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
320 }
321 
322 static struct page *persistent_memory_page(void *addr)
323 {
324 	if (is_vmalloc_addr(addr))
325 		return vmalloc_to_page(addr);
326 	else
327 		return virt_to_page(addr);
328 }
329 
330 static unsigned persistent_memory_page_offset(void *addr)
331 {
332 	return (unsigned long)addr & (PAGE_SIZE - 1);
333 }
334 
335 static void persistent_memory_flush_cache(void *ptr, size_t size)
336 {
337 	if (is_vmalloc_addr(ptr))
338 		flush_kernel_vmap_range(ptr, size);
339 }
340 
341 static void persistent_memory_invalidate_cache(void *ptr, size_t size)
342 {
343 	if (is_vmalloc_addr(ptr))
344 		invalidate_kernel_vmap_range(ptr, size);
345 }
346 
347 static struct wc_memory_superblock *sb(struct dm_writecache *wc)
348 {
349 	return wc->memory_map;
350 }
351 
352 static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
353 {
354 	if (is_power_of_2(sizeof(struct wc_entry)) && 0)
355 		return &sb(wc)->entries[e - wc->entries];
356 	else
357 		return &sb(wc)->entries[e->index];
358 }
359 
360 static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
361 {
362 	return (char *)wc->block_start + (e->index << wc->block_size_bits);
363 }
364 
365 static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
366 {
367 	return wc->start_sector + wc->metadata_sectors +
368 		((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
369 }
370 
371 static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
372 {
373 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
374 	return e->original_sector;
375 #else
376 	return le64_to_cpu(memory_entry(wc, e)->original_sector);
377 #endif
378 }
379 
380 static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
381 {
382 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
383 	return e->seq_count;
384 #else
385 	return le64_to_cpu(memory_entry(wc, e)->seq_count);
386 #endif
387 }
388 
389 static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
390 {
391 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
392 	e->seq_count = -1;
393 #endif
394 	pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
395 }
396 
397 static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
398 					    uint64_t original_sector, uint64_t seq_count)
399 {
400 	struct wc_memory_entry me;
401 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
402 	e->original_sector = original_sector;
403 	e->seq_count = seq_count;
404 #endif
405 	me.original_sector = cpu_to_le64(original_sector);
406 	me.seq_count = cpu_to_le64(seq_count);
407 	pmem_assign(*memory_entry(wc, e), me);
408 }
409 
410 #define writecache_error(wc, err, msg, arg...)				\
411 do {									\
412 	if (!cmpxchg(&(wc)->error, 0, err))				\
413 		DMERR(msg, ##arg);					\
414 	wake_up(&(wc)->freelist_wait);					\
415 } while (0)
416 
417 #define writecache_has_error(wc)	(unlikely(READ_ONCE((wc)->error)))
418 
419 static void writecache_flush_all_metadata(struct dm_writecache *wc)
420 {
421 	if (!WC_MODE_PMEM(wc))
422 		memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
423 }
424 
425 static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
426 {
427 	if (!WC_MODE_PMEM(wc))
428 		__set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
429 			  wc->dirty_bitmap);
430 }
431 
432 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
433 
434 struct io_notify {
435 	struct dm_writecache *wc;
436 	struct completion c;
437 	atomic_t count;
438 };
439 
440 static void writecache_notify_io(unsigned long error, void *context)
441 {
442 	struct io_notify *endio = context;
443 
444 	if (unlikely(error != 0))
445 		writecache_error(endio->wc, -EIO, "error writing metadata");
446 	BUG_ON(atomic_read(&endio->count) <= 0);
447 	if (atomic_dec_and_test(&endio->count))
448 		complete(&endio->c);
449 }
450 
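/*
 * SSD-mode commit: write every metadata region marked in wc->dirty_bitmap to
 * the cache device with async dm-io, wait for all of them to complete, issue
 * a disk flush and clear the bitmap.
 */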
451 static void ssd_commit_flushed(struct dm_writecache *wc)
452 {
453 	struct dm_io_region region;
454 	struct dm_io_request req;
455 	struct io_notify endio = {
456 		wc,
457 		COMPLETION_INITIALIZER_ONSTACK(endio.c),
458 		ATOMIC_INIT(1),
459 	};
460 	unsigned bitmap_bits = wc->dirty_bitmap_size * BITS_PER_LONG;
461 	unsigned i = 0;
462 
463 	while (1) {
464 		unsigned j;
465 		i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
466 		if (unlikely(i == bitmap_bits))
467 			break;
468 		j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
469 
470 		region.bdev = wc->ssd_dev->bdev;
471 		region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
472 		region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
473 
474 		if (unlikely(region.sector >= wc->metadata_sectors))
475 			break;
476 		if (unlikely(region.sector + region.count > wc->metadata_sectors))
477 			region.count = wc->metadata_sectors - region.sector;
478 
479 		region.sector += wc->start_sector;
480 		atomic_inc(&endio.count);
481 		req.bi_op = REQ_OP_WRITE;
482 		req.bi_op_flags = REQ_SYNC;
483 		req.mem.type = DM_IO_VMA;
484 		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
485 		req.client = wc->dm_io;
486 		req.notify.fn = writecache_notify_io;
487 		req.notify.context = &endio;
488 
489 		/* writing via async dm-io (implied by notify.fn above) won't return an error */
490 		(void) dm_io(&req, 1, &region, NULL);
491 		i = j;
492 	}
493 
494 	writecache_notify_io(0, &endio);
495 	wait_for_completion_io(&endio.c);
496 
497 	writecache_disk_flush(wc, wc->ssd_dev);
498 
499 	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
500 }
501 
502 static void writecache_commit_flushed(struct dm_writecache *wc)
503 {
504 	if (WC_MODE_PMEM(wc))
505 		wmb();
506 	else
507 		ssd_commit_flushed(wc);
508 }
509 
510 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
511 {
512 	int r;
513 	struct dm_io_region region;
514 	struct dm_io_request req;
515 
516 	region.bdev = dev->bdev;
517 	region.sector = 0;
518 	region.count = 0;
519 	req.bi_op = REQ_OP_WRITE;
520 	req.bi_op_flags = REQ_PREFLUSH;
521 	req.mem.type = DM_IO_KMEM;
522 	req.mem.ptr.addr = NULL;
523 	req.client = wc->dm_io;
524 	req.notify.fn = NULL;
525 
526 	r = dm_io(&req, 1, &region, NULL);
527 	if (unlikely(r))
528 		writecache_error(wc, r, "error flushing metadata: %d", r);
529 }
530 
531 static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
532 {
533 	wait_event(wc->bio_in_progress_wait[direction],
534 		   !atomic_read(&wc->bio_in_progress[direction]));
535 }
536 
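/*
 * Lookup flags: WFE_RETURN_FOLLOWING returns the entry with the next higher
 * original sector when there is no exact match; WFE_LOWEST_SEQ picks the
 * oldest (lowest seq_count) of several entries for the same block instead of
 * the newest.
 */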
537 #define WFE_RETURN_FOLLOWING	1
538 #define WFE_LOWEST_SEQ		2
539 
540 static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
541 					      uint64_t block, int flags)
542 {
543 	struct wc_entry *e;
544 	struct rb_node *node = wc->tree.rb_node;
545 
546 	if (unlikely(!node))
547 		return NULL;
548 
549 	while (1) {
550 		e = container_of(node, struct wc_entry, rb_node);
551 		if (read_original_sector(wc, e) == block)
552 			break;
553 		node = (read_original_sector(wc, e) >= block ?
554 			e->rb_node.rb_left : e->rb_node.rb_right);
555 		if (unlikely(!node)) {
556 			if (!(flags & WFE_RETURN_FOLLOWING)) {
557 				return NULL;
558 			}
559 			if (read_original_sector(wc, e) >= block) {
560 				break;
561 			} else {
562 				node = rb_next(&e->rb_node);
563 				if (unlikely(!node)) {
564 					return NULL;
565 				}
566 				e = container_of(node, struct wc_entry, rb_node);
567 				break;
568 			}
569 		}
570 	}
571 
572 	while (1) {
573 		struct wc_entry *e2;
574 		if (flags & WFE_LOWEST_SEQ)
575 			node = rb_prev(&e->rb_node);
576 		else
577 			node = rb_next(&e->rb_node);
578 		if (!node)
579 			return e;
580 		e2 = container_of(node, struct wc_entry, rb_node);
581 		if (read_original_sector(wc, e2) != block)
582 			return e;
583 		e = e2;
584 	}
585 }
586 
587 static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
588 {
589 	struct wc_entry *e;
590 	struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
591 
592 	while (*node) {
593 		e = container_of(*node, struct wc_entry, rb_node);
594 		parent = &e->rb_node;
595 		if (read_original_sector(wc, e) > read_original_sector(wc, ins))
596 			node = &parent->rb_left;
597 		else
598 			node = &parent->rb_right;
599 	}
600 	rb_link_node(&ins->rb_node, parent, node);
601 	rb_insert_color(&ins->rb_node, &wc->tree);
602 	list_add(&ins->lru, &wc->lru);
603 }
604 
605 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
606 {
607 	list_del(&e->lru);
608 	rb_erase(&e->rb_node, &wc->tree);
609 }
610 
611 static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
612 {
613 	if (WC_MODE_SORT_FREELIST(wc)) {
614 		struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
615 		if (unlikely(!*node))
616 			wc->current_free = e;
617 		while (*node) {
618 			parent = *node;
619 			if (&e->rb_node < *node)
620 				node = &parent->rb_left;
621 			else
622 				node = &parent->rb_right;
623 		}
624 		rb_link_node(&e->rb_node, parent, node);
625 		rb_insert_color(&e->rb_node, &wc->freetree);
626 	} else {
627 		list_add_tail(&e->lru, &wc->freelist);
628 	}
629 	wc->freelist_size++;
630 }
631 
632 static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
633 {
634 	struct wc_entry *e;
635 
636 	if (WC_MODE_SORT_FREELIST(wc)) {
637 		struct rb_node *next;
638 		if (unlikely(!wc->current_free))
639 			return NULL;
640 		e = wc->current_free;
641 		next = rb_next(&e->rb_node);
642 		rb_erase(&e->rb_node, &wc->freetree);
643 		if (unlikely(!next))
644 			next = rb_first(&wc->freetree);
645 		wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
646 	} else {
647 		if (unlikely(list_empty(&wc->freelist)))
648 			return NULL;
649 		e = container_of(wc->freelist.next, struct wc_entry, lru);
650 		list_del(&e->lru);
651 	}
652 	wc->freelist_size--;
653 	if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
654 		queue_work(wc->writeback_wq, &wc->writeback_work);
655 
656 	return e;
657 }
658 
659 static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
660 {
661 	writecache_unlink(wc, e);
662 	writecache_add_to_freelist(wc, e);
663 	clear_seq_count(wc, e);
664 	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
665 	if (unlikely(waitqueue_active(&wc->freelist_wait)))
666 		wake_up(&wc->freelist_wait);
667 }
668 
669 static void writecache_wait_on_freelist(struct dm_writecache *wc)
670 {
671 	DEFINE_WAIT(wait);
672 
673 	prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
674 	wc_unlock(wc);
675 	io_schedule();
676 	finish_wait(&wc->freelist_wait, &wait);
677 	wc_lock(wc);
678 }
679 
680 static void writecache_poison_lists(struct dm_writecache *wc)
681 {
682 	/*
683 	 * Catch incorrect access to these values while the device is suspended.
684 	 */
685 	memset(&wc->tree, -1, sizeof wc->tree);
686 	wc->lru.next = LIST_POISON1;
687 	wc->lru.prev = LIST_POISON2;
688 	wc->freelist.next = LIST_POISON1;
689 	wc->freelist.prev = LIST_POISON2;
690 }
691 
692 static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
693 {
694 	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
695 	if (WC_MODE_PMEM(wc))
696 		writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
697 }
698 
699 static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
700 {
701 	return read_seq_count(wc, e) < wc->seq_count;
702 }
703 
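/*
 * Commit all uncommitted entries: flush their metadata (and data in pmem
 * mode), bump and persist seq_count - which is what marks them committed -
 * and then free any older committed copies of the same sectors that they
 * superseded.
 */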
704 static void writecache_flush(struct dm_writecache *wc)
705 {
706 	struct wc_entry *e, *e2;
707 	bool need_flush_after_free;
708 
709 	wc->uncommitted_blocks = 0;
710 	del_timer(&wc->autocommit_timer);
711 
712 	if (list_empty(&wc->lru))
713 		return;
714 
715 	e = container_of(wc->lru.next, struct wc_entry, lru);
716 	if (writecache_entry_is_committed(wc, e)) {
717 		if (wc->overwrote_committed) {
718 			writecache_wait_for_ios(wc, WRITE);
719 			writecache_disk_flush(wc, wc->ssd_dev);
720 			wc->overwrote_committed = false;
721 		}
722 		return;
723 	}
724 	while (1) {
725 		writecache_flush_entry(wc, e);
726 		if (unlikely(e->lru.next == &wc->lru))
727 			break;
728 		e2 = container_of(e->lru.next, struct wc_entry, lru);
729 		if (writecache_entry_is_committed(wc, e2))
730 			break;
731 		e = e2;
732 		cond_resched();
733 	}
734 	writecache_commit_flushed(wc);
735 
736 	writecache_wait_for_ios(wc, WRITE);
737 
738 	wc->seq_count++;
739 	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
740 	writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
741 	writecache_commit_flushed(wc);
742 
743 	wc->overwrote_committed = false;
744 
745 	need_flush_after_free = false;
746 	while (1) {
747 		/* Free another committed entry with lower seq-count */
748 		struct rb_node *rb_node = rb_prev(&e->rb_node);
749 
750 		if (rb_node) {
751 			e2 = container_of(rb_node, struct wc_entry, rb_node);
752 			if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
753 			    likely(!e2->write_in_progress)) {
754 				writecache_free_entry(wc, e2);
755 				need_flush_after_free = true;
756 			}
757 		}
758 		if (unlikely(e->lru.prev == &wc->lru))
759 			break;
760 		e = container_of(e->lru.prev, struct wc_entry, lru);
761 		cond_resched();
762 	}
763 
764 	if (need_flush_after_free)
765 		writecache_commit_flushed(wc);
766 }
767 
768 static void writecache_flush_work(struct work_struct *work)
769 {
770 	struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
771 
772 	wc_lock(wc);
773 	writecache_flush(wc);
774 	wc_unlock(wc);
775 }
776 
777 static void writecache_autocommit_timer(struct timer_list *t)
778 {
779 	struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
780 	if (!writecache_has_error(wc))
781 		queue_work(wc->writeback_wq, &wc->flush_work);
782 }
783 
784 static void writecache_schedule_autocommit(struct dm_writecache *wc)
785 {
786 	if (!timer_pending(&wc->autocommit_timer))
787 		mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
788 }
789 
790 static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
791 {
792 	struct wc_entry *e;
793 	bool discarded_something = false;
794 
795 	e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
796 	if (unlikely(!e))
797 		return;
798 
799 	while (read_original_sector(wc, e) < end) {
800 		struct rb_node *node = rb_next(&e->rb_node);
801 
802 		if (likely(!e->write_in_progress)) {
803 			if (!discarded_something) {
804 				writecache_wait_for_ios(wc, READ);
805 				writecache_wait_for_ios(wc, WRITE);
806 				discarded_something = true;
807 			}
808 			writecache_free_entry(wc, e);
809 		}
810 
811 		if (!node)
812 			break;
813 
814 		e = container_of(node, struct wc_entry, rb_node);
815 	}
816 
817 	if (discarded_something)
818 		writecache_commit_flushed(wc);
819 }
820 
821 static bool writecache_wait_for_writeback(struct dm_writecache *wc)
822 {
823 	if (wc->writeback_size) {
824 		writecache_wait_on_freelist(wc);
825 		return true;
826 	}
827 	return false;
828 }
829 
830 static void writecache_suspend(struct dm_target *ti)
831 {
832 	struct dm_writecache *wc = ti->private;
833 	bool flush_on_suspend;
834 
835 	del_timer_sync(&wc->autocommit_timer);
836 
837 	wc_lock(wc);
838 	writecache_flush(wc);
839 	flush_on_suspend = wc->flush_on_suspend;
840 	if (flush_on_suspend) {
841 		wc->flush_on_suspend = false;
842 		wc->writeback_all++;
843 		queue_work(wc->writeback_wq, &wc->writeback_work);
844 	}
845 	wc_unlock(wc);
846 
847 	flush_workqueue(wc->writeback_wq);
848 
849 	wc_lock(wc);
850 	if (flush_on_suspend)
851 		wc->writeback_all--;
852 	while (writecache_wait_for_writeback(wc));
853 
854 	if (WC_MODE_PMEM(wc))
855 		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
856 
857 	writecache_poison_lists(wc);
858 
859 	wc_unlock(wc);
860 }
861 
862 static int writecache_alloc_entries(struct dm_writecache *wc)
863 {
864 	size_t b;
865 
866 	if (wc->entries)
867 		return 0;
868 	wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
869 	if (!wc->entries)
870 		return -ENOMEM;
871 	for (b = 0; b < wc->n_blocks; b++) {
872 		struct wc_entry *e = &wc->entries[b];
873 		e->index = b;
874 		e->write_in_progress = false;
875 	}
876 
877 	return 0;
878 }
879 
880 static void writecache_resume(struct dm_target *ti)
881 {
882 	struct dm_writecache *wc = ti->private;
883 	size_t b;
884 	bool need_flush = false;
885 	__le64 sb_seq_count;
886 	int r;
887 
888 	wc_lock(wc);
889 
890 	if (WC_MODE_PMEM(wc))
891 		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
892 
893 	wc->tree = RB_ROOT;
894 	INIT_LIST_HEAD(&wc->lru);
895 	if (WC_MODE_SORT_FREELIST(wc)) {
896 		wc->freetree = RB_ROOT;
897 		wc->current_free = NULL;
898 	} else {
899 		INIT_LIST_HEAD(&wc->freelist);
900 	}
901 	wc->freelist_size = 0;
902 
903 	r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
904 	if (r) {
905 		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
906 		sb_seq_count = cpu_to_le64(0);
907 	}
908 	wc->seq_count = le64_to_cpu(sb_seq_count);
909 
910 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
911 	for (b = 0; b < wc->n_blocks; b++) {
912 		struct wc_entry *e = &wc->entries[b];
913 		struct wc_memory_entry wme;
914 		if (writecache_has_error(wc)) {
915 			e->original_sector = -1;
916 			e->seq_count = -1;
917 			continue;
918 		}
919 		r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry));
920 		if (r) {
921 			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
922 					 (unsigned long)b, r);
923 			e->original_sector = -1;
924 			e->seq_count = -1;
925 		} else {
926 			e->original_sector = le64_to_cpu(wme.original_sector);
927 			e->seq_count = le64_to_cpu(wme.seq_count);
928 		}
929 	}
930 #endif
931 	for (b = 0; b < wc->n_blocks; b++) {
932 		struct wc_entry *e = &wc->entries[b];
933 		if (!writecache_entry_is_committed(wc, e)) {
934 			if (read_seq_count(wc, e) != -1) {
935 erase_this:
936 				clear_seq_count(wc, e);
937 				need_flush = true;
938 			}
939 			writecache_add_to_freelist(wc, e);
940 		} else {
941 			struct wc_entry *old;
942 
943 			old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
944 			if (!old) {
945 				writecache_insert_entry(wc, e);
946 			} else {
947 				if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
948 					writecache_error(wc, -EINVAL,
949 						 "two identical entries, position %llu, sector %llu, sequence %llu",
950 						 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
951 						 (unsigned long long)read_seq_count(wc, e));
952 				}
953 				if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
954 					goto erase_this;
955 				} else {
956 					writecache_free_entry(wc, old);
957 					writecache_insert_entry(wc, e);
958 					need_flush = true;
959 				}
960 			}
961 		}
962 		cond_resched();
963 	}
964 
965 	if (need_flush) {
966 		writecache_flush_all_metadata(wc);
967 		writecache_commit_flushed(wc);
968 	}
969 
970 	wc_unlock(wc);
971 }
972 
973 static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
974 {
975 	if (argc != 1)
976 		return -EINVAL;
977 
978 	wc_lock(wc);
979 	if (dm_suspended(wc->ti)) {
980 		wc_unlock(wc);
981 		return -EBUSY;
982 	}
983 	if (writecache_has_error(wc)) {
984 		wc_unlock(wc);
985 		return -EIO;
986 	}
987 
988 	writecache_flush(wc);
989 	wc->writeback_all++;
990 	queue_work(wc->writeback_wq, &wc->writeback_work);
991 	wc_unlock(wc);
992 
993 	flush_workqueue(wc->writeback_wq);
994 
995 	wc_lock(wc);
996 	wc->writeback_all--;
997 	if (writecache_has_error(wc)) {
998 		wc_unlock(wc);
999 		return -EIO;
1000 	}
1001 	wc_unlock(wc);
1002 
1003 	return 0;
1004 }
1005 
1006 static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1007 {
1008 	if (argc != 1)
1009 		return -EINVAL;
1010 
1011 	wc_lock(wc);
1012 	wc->flush_on_suspend = true;
1013 	wc_unlock(wc);
1014 
1015 	return 0;
1016 }
1017 
1018 static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
1019 			      char *result, unsigned maxlen)
1020 {
1021 	int r = -EINVAL;
1022 	struct dm_writecache *wc = ti->private;
1023 
1024 	if (!strcasecmp(argv[0], "flush"))
1025 		r = process_flush_mesg(argc, argv, wc);
1026 	else if (!strcasecmp(argv[0], "flush_on_suspend"))
1027 		r = process_flush_on_suspend_mesg(argc, argv, wc);
1028 	else
1029 		DMERR("unrecognised message received: %s", argv[0]);
1030 
1031 	return r;
1032 }
1033 
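/*
 * Copy one cache block between the bio and persistent memory, one bio_vec at
 * a time.  Reads go through memcpy_mcsafe() so a hardware memory error is
 * reported as an I/O error on the bio; writes use memcpy_flushcache().
 */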
1034 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
1035 {
1036 	void *buf;
1037 	unsigned long flags;
1038 	unsigned size;
1039 	int rw = bio_data_dir(bio);
1040 	unsigned remaining_size = wc->block_size;
1041 
1042 	do {
1043 		struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
1044 		buf = bvec_kmap_irq(&bv, &flags);
1045 		size = bv.bv_len;
1046 		if (unlikely(size > remaining_size))
1047 			size = remaining_size;
1048 
1049 		if (rw == READ) {
1050 			int r;
1051 			r = memcpy_mcsafe(buf, data, size);
1052 			flush_dcache_page(bio_page(bio));
1053 			if (unlikely(r)) {
1054 				writecache_error(wc, r, "hardware memory error when reading data: %d", r);
1055 				bio->bi_status = BLK_STS_IOERR;
1056 			}
1057 		} else {
1058 			flush_dcache_page(bio_page(bio));
1059 			memcpy_flushcache(data, buf, size);
1060 		}
1061 
1062 		bvec_kunmap_irq(buf, &flags);
1063 
1064 		data = (char *)data + size;
1065 		remaining_size -= size;
1066 		bio_advance(bio, size);
1067 	} while (unlikely(remaining_size));
1068 }
1069 
1070 static int writecache_flush_thread(void *data)
1071 {
1072 	struct dm_writecache *wc = data;
1073 
1074 	while (1) {
1075 		struct bio *bio;
1076 
1077 		wc_lock(wc);
1078 		bio = bio_list_pop(&wc->flush_list);
1079 		if (!bio) {
1080 			set_current_state(TASK_INTERRUPTIBLE);
1081 			wc_unlock(wc);
1082 
1083 			if (unlikely(kthread_should_stop())) {
1084 				set_current_state(TASK_RUNNING);
1085 				break;
1086 			}
1087 
1088 			schedule();
1089 			continue;
1090 		}
1091 
1092 		if (bio_op(bio) == REQ_OP_DISCARD) {
1093 			writecache_discard(wc, bio->bi_iter.bi_sector,
1094 					   bio_end_sector(bio));
1095 			wc_unlock(wc);
1096 			bio_set_dev(bio, wc->dev->bdev);
1097 			generic_make_request(bio);
1098 		} else {
1099 			writecache_flush(wc);
1100 			wc_unlock(wc);
1101 			if (writecache_has_error(wc))
1102 				bio->bi_status = BLK_STS_IOERR;
1103 			bio_endio(bio);
1104 		}
1105 	}
1106 
1107 	return 0;
1108 }
1109 
1110 static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
1111 {
1112 	if (bio_list_empty(&wc->flush_list))
1113 		wake_up_process(wc->flush_thread);
1114 	bio_list_add(&wc->flush_list, bio);
1115 }
1116 
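/*
 * The map function.  REQ_PREFLUSH and REQ_OP_DISCARD are handled inline in
 * pmem mode and offloaded to the flush thread in SSD mode.  Reads are served
 * from the cache when the block is present (copied from pmem, or remapped to
 * the SSD), otherwise remapped to the origin, splitting at the next cached
 * block.  Writes reuse an uncommitted entry or allocate a free one, then
 * either copy the data into pmem or remap the bio to the SSD, scheduling a
 * commit when the autocommit limits are reached.
 */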
1117 static int writecache_map(struct dm_target *ti, struct bio *bio)
1118 {
1119 	struct wc_entry *e;
1120 	struct dm_writecache *wc = ti->private;
1121 
1122 	bio->bi_private = NULL;
1123 
1124 	wc_lock(wc);
1125 
1126 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1127 		if (writecache_has_error(wc))
1128 			goto unlock_error;
1129 		if (WC_MODE_PMEM(wc)) {
1130 			writecache_flush(wc);
1131 			if (writecache_has_error(wc))
1132 				goto unlock_error;
1133 			goto unlock_submit;
1134 		} else {
1135 			writecache_offload_bio(wc, bio);
1136 			goto unlock_return;
1137 		}
1138 	}
1139 
1140 	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1141 
1142 	if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
1143 				(wc->block_size / 512 - 1)) != 0)) {
1144 		DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
1145 		      (unsigned long long)bio->bi_iter.bi_sector,
1146 		      bio->bi_iter.bi_size, wc->block_size);
1147 		goto unlock_error;
1148 	}
1149 
1150 	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
1151 		if (writecache_has_error(wc))
1152 			goto unlock_error;
1153 		if (WC_MODE_PMEM(wc)) {
1154 			writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
1155 			goto unlock_remap_origin;
1156 		} else {
1157 			writecache_offload_bio(wc, bio);
1158 			goto unlock_return;
1159 		}
1160 	}
1161 
1162 	if (bio_data_dir(bio) == READ) {
1163 read_next_block:
1164 		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1165 		if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
1166 			if (WC_MODE_PMEM(wc)) {
1167 				bio_copy_block(wc, bio, memory_data(wc, e));
1168 				if (bio->bi_iter.bi_size)
1169 					goto read_next_block;
1170 				goto unlock_submit;
1171 			} else {
1172 				dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1173 				bio_set_dev(bio, wc->ssd_dev->bdev);
1174 				bio->bi_iter.bi_sector = cache_sector(wc, e);
1175 				if (!writecache_entry_is_committed(wc, e))
1176 					writecache_wait_for_ios(wc, WRITE);
1177 				goto unlock_remap;
1178 			}
1179 		} else {
1180 			if (e) {
1181 				sector_t next_boundary =
1182 					read_original_sector(wc, e) - bio->bi_iter.bi_sector;
1183 				if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
1184 					dm_accept_partial_bio(bio, next_boundary);
1185 				}
1186 			}
1187 			goto unlock_remap_origin;
1188 		}
1189 	} else {
1190 		do {
1191 			if (writecache_has_error(wc))
1192 				goto unlock_error;
1193 			e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
1194 			if (e) {
1195 				if (!writecache_entry_is_committed(wc, e))
1196 					goto bio_copy;
1197 				if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
1198 					wc->overwrote_committed = true;
1199 					goto bio_copy;
1200 				}
1201 			}
1202 			e = writecache_pop_from_freelist(wc);
1203 			if (unlikely(!e)) {
1204 				writecache_wait_on_freelist(wc);
1205 				continue;
1206 			}
1207 			write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
1208 			writecache_insert_entry(wc, e);
1209 			wc->uncommitted_blocks++;
1210 bio_copy:
1211 			if (WC_MODE_PMEM(wc)) {
1212 				bio_copy_block(wc, bio, memory_data(wc, e));
1213 			} else {
1214 				dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1215 				bio_set_dev(bio, wc->ssd_dev->bdev);
1216 				bio->bi_iter.bi_sector = cache_sector(wc, e);
1217 				if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
1218 					wc->uncommitted_blocks = 0;
1219 					queue_work(wc->writeback_wq, &wc->flush_work);
1220 				} else {
1221 					writecache_schedule_autocommit(wc);
1222 				}
1223 				goto unlock_remap;
1224 			}
1225 		} while (bio->bi_iter.bi_size);
1226 
1227 		if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks))
1228 			writecache_flush(wc);
1229 		else
1230 			writecache_schedule_autocommit(wc);
1231 		goto unlock_submit;
1232 	}
1233 
1234 unlock_remap_origin:
1235 	bio_set_dev(bio, wc->dev->bdev);
1236 	wc_unlock(wc);
1237 	return DM_MAPIO_REMAPPED;
1238 
1239 unlock_remap:
1240 	/* make sure that writecache_end_io decrements bio_in_progress: */
1241 	bio->bi_private = (void *)1;
1242 	atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
1243 	wc_unlock(wc);
1244 	return DM_MAPIO_REMAPPED;
1245 
1246 unlock_submit:
1247 	wc_unlock(wc);
1248 	bio_endio(bio);
1249 	return DM_MAPIO_SUBMITTED;
1250 
1251 unlock_return:
1252 	wc_unlock(wc);
1253 	return DM_MAPIO_SUBMITTED;
1254 
1255 unlock_error:
1256 	wc_unlock(wc);
1257 	bio_io_error(bio);
1258 	return DM_MAPIO_SUBMITTED;
1259 }
1260 
1261 static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
1262 {
1263 	struct dm_writecache *wc = ti->private;
1264 
1265 	if (bio->bi_private != NULL) {
1266 		int dir = bio_data_dir(bio);
1267 		if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
1268 			if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
1269 				wake_up(&wc->bio_in_progress_wait[dir]);
1270 	}
1271 	return 0;
1272 }
1273 
1274 static int writecache_iterate_devices(struct dm_target *ti,
1275 				      iterate_devices_callout_fn fn, void *data)
1276 {
1277 	struct dm_writecache *wc = ti->private;
1278 
1279 	return fn(ti, wc->dev, 0, ti->len, data);
1280 }
1281 
1282 static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
1283 {
1284 	struct dm_writecache *wc = ti->private;
1285 
1286 	if (limits->logical_block_size < wc->block_size)
1287 		limits->logical_block_size = wc->block_size;
1288 
1289 	if (limits->physical_block_size < wc->block_size)
1290 		limits->physical_block_size = wc->block_size;
1291 
1292 	if (limits->io_min < wc->block_size)
1293 		limits->io_min = wc->block_size;
1294 }
1295 
1296 
1297 static void writecache_writeback_endio(struct bio *bio)
1298 {
1299 	struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
1300 	struct dm_writecache *wc = wb->wc;
1301 	unsigned long flags;
1302 
1303 	raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
1304 	if (unlikely(list_empty(&wc->endio_list)))
1305 		wake_up_process(wc->endio_thread);
1306 	list_add_tail(&wb->endio_entry, &wc->endio_list);
1307 	raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
1308 }
1309 
1310 static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
1311 {
1312 	struct copy_struct *c = ptr;
1313 	struct dm_writecache *wc = c->wc;
1314 
1315 	c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
1316 
1317 	raw_spin_lock_irq(&wc->endio_list_lock);
1318 	if (unlikely(list_empty(&wc->endio_list)))
1319 		wake_up_process(wc->endio_thread);
1320 	list_add_tail(&c->endio_entry, &wc->endio_list);
1321 	raw_spin_unlock_irq(&wc->endio_list_lock);
1322 }
1323 
1324 static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
1325 {
1326 	unsigned i;
1327 	struct writeback_struct *wb;
1328 	struct wc_entry *e;
1329 	unsigned long n_walked = 0;
1330 
1331 	do {
1332 		wb = list_entry(list->next, struct writeback_struct, endio_entry);
1333 		list_del(&wb->endio_entry);
1334 
1335 		if (unlikely(wb->bio.bi_status != BLK_STS_OK))
1336 			writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
1337 					"write error %d", wb->bio.bi_status);
1338 		i = 0;
1339 		do {
1340 			e = wb->wc_list[i];
1341 			BUG_ON(!e->write_in_progress);
1342 			e->write_in_progress = false;
1343 			INIT_LIST_HEAD(&e->lru);
1344 			if (!writecache_has_error(wc))
1345 				writecache_free_entry(wc, e);
1346 			BUG_ON(!wc->writeback_size);
1347 			wc->writeback_size--;
1348 			n_walked++;
1349 			if (unlikely(n_walked >= ENDIO_LATENCY)) {
1350 				writecache_commit_flushed(wc);
1351 				wc_unlock(wc);
1352 				wc_lock(wc);
1353 				n_walked = 0;
1354 			}
1355 		} while (++i < wb->wc_list_n);
1356 
1357 		if (wb->wc_list != wb->wc_list_inline)
1358 			kfree(wb->wc_list);
1359 		bio_put(&wb->bio);
1360 	} while (!list_empty(list));
1361 }
1362 
1363 static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
1364 {
1365 	struct copy_struct *c;
1366 	struct wc_entry *e;
1367 
1368 	do {
1369 		c = list_entry(list->next, struct copy_struct, endio_entry);
1370 		list_del(&c->endio_entry);
1371 
1372 		if (unlikely(c->error))
1373 			writecache_error(wc, c->error, "copy error");
1374 
1375 		e = c->e;
1376 		do {
1377 			BUG_ON(!e->write_in_progress);
1378 			e->write_in_progress = false;
1379 			INIT_LIST_HEAD(&e->lru);
1380 			if (!writecache_has_error(wc))
1381 				writecache_free_entry(wc, e);
1382 
1383 			BUG_ON(!wc->writeback_size);
1384 			wc->writeback_size--;
1385 			e++;
1386 		} while (--c->n_entries);
1387 		mempool_free(c, &wc->copy_pool);
1388 	} while (!list_empty(list));
1389 }
1390 
1391 static int writecache_endio_thread(void *data)
1392 {
1393 	struct dm_writecache *wc = data;
1394 
1395 	while (1) {
1396 		struct list_head list;
1397 
1398 		raw_spin_lock_irq(&wc->endio_list_lock);
1399 		if (!list_empty(&wc->endio_list))
1400 			goto pop_from_list;
1401 		set_current_state(TASK_INTERRUPTIBLE);
1402 		raw_spin_unlock_irq(&wc->endio_list_lock);
1403 
1404 		if (unlikely(kthread_should_stop())) {
1405 			set_current_state(TASK_RUNNING);
1406 			break;
1407 		}
1408 
1409 		schedule();
1410 
1411 		continue;
1412 
1413 pop_from_list:
1414 		list = wc->endio_list;
1415 		list.next->prev = list.prev->next = &list;
1416 		INIT_LIST_HEAD(&wc->endio_list);
1417 		raw_spin_unlock_irq(&wc->endio_list_lock);
1418 
1419 		if (!WC_MODE_FUA(wc))
1420 			writecache_disk_flush(wc, wc->dev);
1421 
1422 		wc_lock(wc);
1423 
1424 		if (WC_MODE_PMEM(wc)) {
1425 			__writecache_endio_pmem(wc, &list);
1426 		} else {
1427 			__writecache_endio_ssd(wc, &list);
1428 			writecache_wait_for_ios(wc, READ);
1429 		}
1430 
1431 		writecache_commit_flushed(wc);
1432 
1433 		wc_unlock(wc);
1434 	}
1435 
1436 	return 0;
1437 }
1438 
1439 static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
1440 {
1441 	struct dm_writecache *wc = wb->wc;
1442 	unsigned block_size = wc->block_size;
1443 	void *address = memory_data(wc, e);
1444 
1445 	persistent_memory_flush_cache(address, block_size);
1446 	return bio_add_page(&wb->bio, persistent_memory_page(address),
1447 			    block_size, persistent_memory_page_offset(address)) != 0;
1448 }
1449 
1450 struct writeback_list {
1451 	struct list_head list;
1452 	size_t size;
1453 };
1454 
1455 static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
1456 {
1457 	if (unlikely(wc->max_writeback_jobs)) {
1458 		if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
1459 			wc_lock(wc);
1460 			while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
1461 				writecache_wait_on_freelist(wc);
1462 			wc_unlock(wc);
1463 		}
1464 	}
1465 	cond_resched();
1466 }
1467 
1468 static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
1469 {
1470 	struct wc_entry *e, *f;
1471 	struct bio *bio;
1472 	struct writeback_struct *wb;
1473 	unsigned max_pages;
1474 
1475 	while (wbl->size) {
1476 		wbl->size--;
1477 		e = container_of(wbl->list.prev, struct wc_entry, lru);
1478 		list_del(&e->lru);
1479 
1480 		max_pages = e->wc_list_contiguous;
1481 
1482 		bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
1483 		wb = container_of(bio, struct writeback_struct, bio);
1484 		wb->wc = wc;
1485 		wb->bio.bi_end_io = writecache_writeback_endio;
1486 		bio_set_dev(&wb->bio, wc->dev->bdev);
1487 		wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
1488 		wb->page_offset = PAGE_SIZE;
1489 		if (max_pages <= WB_LIST_INLINE ||
1490 		    unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
1491 							   GFP_NOIO | __GFP_NORETRY |
1492 							   __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
1493 			wb->wc_list = wb->wc_list_inline;
1494 			max_pages = WB_LIST_INLINE;
1495 		}
1496 
1497 		BUG_ON(!wc_add_block(wb, e, GFP_NOIO));
1498 
1499 		wb->wc_list[0] = e;
1500 		wb->wc_list_n = 1;
1501 
1502 		while (wbl->size && wb->wc_list_n < max_pages) {
1503 			f = container_of(wbl->list.prev, struct wc_entry, lru);
1504 			if (read_original_sector(wc, f) !=
1505 			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1506 				break;
1507 			if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
1508 				break;
1509 			wbl->size--;
1510 			list_del(&f->lru);
1511 			wb->wc_list[wb->wc_list_n++] = f;
1512 			e = f;
1513 		}
1514 		bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
1515 		if (writecache_has_error(wc)) {
1516 			bio->bi_status = BLK_STS_IOERR;
1517 			bio_endio(&wb->bio);
1518 		} else {
1519 			submit_bio(&wb->bio);
1520 		}
1521 
1522 		__writeback_throttle(wc, wbl);
1523 	}
1524 }
1525 
1526 static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
1527 {
1528 	struct wc_entry *e, *f;
1529 	struct dm_io_region from, to;
1530 	struct copy_struct *c;
1531 
1532 	while (wbl->size) {
1533 		unsigned n_sectors;
1534 
1535 		wbl->size--;
1536 		e = container_of(wbl->list.prev, struct wc_entry, lru);
1537 		list_del(&e->lru);
1538 
1539 		n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
1540 
1541 		from.bdev = wc->ssd_dev->bdev;
1542 		from.sector = cache_sector(wc, e);
1543 		from.count = n_sectors;
1544 		to.bdev = wc->dev->bdev;
1545 		to.sector = read_original_sector(wc, e);
1546 		to.count = n_sectors;
1547 
1548 		c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
1549 		c->wc = wc;
1550 		c->e = e;
1551 		c->n_entries = e->wc_list_contiguous;
1552 
1553 		while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
1554 			wbl->size--;
1555 			f = container_of(wbl->list.prev, struct wc_entry, lru);
1556 			BUG_ON(f != e + 1);
1557 			list_del(&f->lru);
1558 			e = f;
1559 		}
1560 
1561 		dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
1562 
1563 		__writeback_throttle(wc, wbl);
1564 	}
1565 }
1566 
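/*
 * Background writeback: take entries from the tail of the LRU until the low
 * watermark is satisfied (or, with writeback_all, until the LRU is empty),
 * skip blocks whose older copy is still being written back, group contiguous
 * blocks together and hand them to the pmem (bio) or SSD (kcopyd) writeback
 * path outside wc->lock.
 */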
1567 static void writecache_writeback(struct work_struct *work)
1568 {
1569 	struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
1570 	struct blk_plug plug;
1571 	struct wc_entry *e, *f, *g;
1572 	struct rb_node *node, *next_node;
1573 	struct list_head skipped;
1574 	struct writeback_list wbl;
1575 	unsigned long n_walked;
1576 
1577 	wc_lock(wc);
1578 restart:
1579 	if (writecache_has_error(wc)) {
1580 		wc_unlock(wc);
1581 		return;
1582 	}
1583 
1584 	if (unlikely(wc->writeback_all)) {
1585 		if (writecache_wait_for_writeback(wc))
1586 			goto restart;
1587 	}
1588 
1589 	if (wc->overwrote_committed) {
1590 		writecache_wait_for_ios(wc, WRITE);
1591 	}
1592 
1593 	n_walked = 0;
1594 	INIT_LIST_HEAD(&skipped);
1595 	INIT_LIST_HEAD(&wbl.list);
1596 	wbl.size = 0;
1597 	while (!list_empty(&wc->lru) &&
1598 	       (wc->writeback_all ||
1599 		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) {
1600 
1601 		n_walked++;
1602 		if (unlikely(n_walked > WRITEBACK_LATENCY) &&
1603 		    likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) {
1604 			queue_work(wc->writeback_wq, &wc->writeback_work);
1605 			break;
1606 		}
1607 
1608 		e = container_of(wc->lru.prev, struct wc_entry, lru);
1609 		BUG_ON(e->write_in_progress);
1610 		if (unlikely(!writecache_entry_is_committed(wc, e))) {
1611 			writecache_flush(wc);
1612 		}
1613 		node = rb_prev(&e->rb_node);
1614 		if (node) {
1615 			f = container_of(node, struct wc_entry, rb_node);
1616 			if (unlikely(read_original_sector(wc, f) ==
1617 				     read_original_sector(wc, e))) {
1618 				BUG_ON(!f->write_in_progress);
1619 				list_del(&e->lru);
1620 				list_add(&e->lru, &skipped);
1621 				cond_resched();
1622 				continue;
1623 			}
1624 		}
1625 		wc->writeback_size++;
1626 		list_del(&e->lru);
1627 		list_add(&e->lru, &wbl.list);
1628 		wbl.size++;
1629 		e->write_in_progress = true;
1630 		e->wc_list_contiguous = 1;
1631 
1632 		f = e;
1633 
1634 		while (1) {
1635 			next_node = rb_next(&f->rb_node);
1636 			if (unlikely(!next_node))
1637 				break;
1638 			g = container_of(next_node, struct wc_entry, rb_node);
1639 			if (read_original_sector(wc, g) ==
1640 			    read_original_sector(wc, f)) {
1641 				f = g;
1642 				continue;
1643 			}
1644 			if (read_original_sector(wc, g) !=
1645 			    read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
1646 				break;
1647 			if (unlikely(g->write_in_progress))
1648 				break;
1649 			if (unlikely(!writecache_entry_is_committed(wc, g)))
1650 				break;
1651 
1652 			if (!WC_MODE_PMEM(wc)) {
1653 				if (g != f + 1)
1654 					break;
1655 			}
1656 
1657 			n_walked++;
1658 			//if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
1659 			//	break;
1660 
1661 			wc->writeback_size++;
1662 			list_del(&g->lru);
1663 			list_add(&g->lru, &wbl.list);
1664 			wbl.size++;
1665 			g->write_in_progress = true;
1666 			g->wc_list_contiguous = BIO_MAX_PAGES;
1667 			f = g;
1668 			e->wc_list_contiguous++;
1669 			if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES))
1670 				break;
1671 		}
1672 		cond_resched();
1673 	}
1674 
1675 	if (!list_empty(&skipped)) {
1676 		list_splice_tail(&skipped, &wc->lru);
1677 		/*
1678 		 * If we didn't do any progress, we must wait until some
1679 		 * writeback finishes to avoid burning CPU in a loop
1680 		 */
1681 		if (unlikely(!wbl.size))
1682 			writecache_wait_for_writeback(wc);
1683 	}
1684 
1685 	wc_unlock(wc);
1686 
1687 	blk_start_plug(&plug);
1688 
1689 	if (WC_MODE_PMEM(wc))
1690 		__writecache_writeback_pmem(wc, &wbl);
1691 	else
1692 		__writecache_writeback_ssd(wc, &wbl);
1693 
1694 	blk_finish_plug(&plug);
1695 
1696 	if (unlikely(wc->writeback_all)) {
1697 		wc_lock(wc);
1698 		while (writecache_wait_for_writeback(wc));
1699 		wc_unlock(wc);
1700 	}
1701 }
1702 
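/*
 * Split the cache device into metadata (superblock plus one wc_memory_entry
 * per block, rounded up to the block size) and data: find the largest
 * n_blocks such that the rounded-up offset of entries[n_blocks] plus
 * n_blocks * block_size fits in device_size, and n_blocks also fits in the
 * wc_entry::index bit field.
 */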
1703 static int calculate_memory_size(uint64_t device_size, unsigned block_size,
1704 				 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
1705 {
1706 	uint64_t n_blocks, offset;
1707 	struct wc_entry e;
1708 
1709 	n_blocks = device_size;
1710 	do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
1711 
1712 	while (1) {
1713 		if (!n_blocks)
1714 			return -ENOSPC;
1715 		/* Verify the following entries[n_blocks] won't overflow */
1716 		if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
1717 				 sizeof(struct wc_memory_entry)))
1718 			return -EFBIG;
1719 		offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
1720 		offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
1721 		if (offset + n_blocks * block_size <= device_size)
1722 			break;
1723 		n_blocks--;
1724 	}
1725 
1726 	/* check if the bit field overflows */
1727 	e.index = n_blocks;
1728 	if (e.index != n_blocks)
1729 		return -EFBIG;
1730 
1731 	if (n_blocks_p)
1732 		*n_blocks_p = n_blocks;
1733 	if (n_metadata_blocks_p)
1734 		*n_metadata_blocks_p = offset >> __ffs(block_size);
1735 	return 0;
1736 }
1737 
1738 static int init_memory(struct dm_writecache *wc)
1739 {
1740 	size_t b;
1741 	int r;
1742 
1743 	r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
1744 	if (r)
1745 		return r;
1746 
1747 	r = writecache_alloc_entries(wc);
1748 	if (r)
1749 		return r;
1750 
1751 	for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
1752 		pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
1753 	pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
1754 	pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
1755 	pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
1756 	pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
1757 
1758 	for (b = 0; b < wc->n_blocks; b++)
1759 		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
1760 
1761 	writecache_flush_all_metadata(wc);
1762 	writecache_commit_flushed(wc);
1763 	pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
1764 	writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
1765 	writecache_commit_flushed(wc);
1766 
1767 	return 0;
1768 }
1769 
1770 static void writecache_dtr(struct dm_target *ti)
1771 {
1772 	struct dm_writecache *wc = ti->private;
1773 
1774 	if (!wc)
1775 		return;
1776 
1777 	if (wc->endio_thread)
1778 		kthread_stop(wc->endio_thread);
1779 
1780 	if (wc->flush_thread)
1781 		kthread_stop(wc->flush_thread);
1782 
1783 	bioset_exit(&wc->bio_set);
1784 
1785 	mempool_exit(&wc->copy_pool);
1786 
1787 	if (wc->writeback_wq)
1788 		destroy_workqueue(wc->writeback_wq);
1789 
1790 	if (wc->dev)
1791 		dm_put_device(ti, wc->dev);
1792 
1793 	if (wc->ssd_dev)
1794 		dm_put_device(ti, wc->ssd_dev);
1795 
1796 	if (wc->entries)
1797 		vfree(wc->entries);
1798 
1799 	if (wc->memory_map) {
1800 		if (WC_MODE_PMEM(wc))
1801 			persistent_memory_release(wc);
1802 		else
1803 			vfree(wc->memory_map);
1804 	}
1805 
1806 	if (wc->dm_kcopyd)
1807 		dm_kcopyd_client_destroy(wc->dm_kcopyd);
1808 
1809 	if (wc->dm_io)
1810 		dm_io_client_destroy(wc->dm_io);
1811 
1812 	if (wc->dirty_bitmap)
1813 		vfree(wc->dirty_bitmap);
1814 
1815 	kfree(wc);
1816 }
1817 
1818 static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
1819 {
1820 	struct dm_writecache *wc;
1821 	struct dm_arg_set as;
1822 	const char *string;
1823 	unsigned opt_params;
1824 	size_t offset, data_size;
1825 	int i, r;
1826 	char dummy;
1827 	int high_wm_percent = HIGH_WATERMARK;
1828 	int low_wm_percent = LOW_WATERMARK;
1829 	uint64_t x;
1830 	struct wc_memory_superblock s;
1831 
1832 	static struct dm_arg _args[] = {
1833 		{0, 10, "Invalid number of feature args"},
1834 	};
1835 
1836 	as.argc = argc;
1837 	as.argv = argv;
1838 
1839 	wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
1840 	if (!wc) {
1841 		ti->error = "Cannot allocate writecache structure";
1842 		r = -ENOMEM;
1843 		goto bad;
1844 	}
1845 	ti->private = wc;
1846 	wc->ti = ti;
1847 
1848 	mutex_init(&wc->lock);
1849 	writecache_poison_lists(wc);
1850 	init_waitqueue_head(&wc->freelist_wait);
1851 	timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
1852 
1853 	for (i = 0; i < 2; i++) {
1854 		atomic_set(&wc->bio_in_progress[i], 0);
1855 		init_waitqueue_head(&wc->bio_in_progress_wait[i]);
1856 	}
1857 
1858 	wc->dm_io = dm_io_client_create();
1859 	if (IS_ERR(wc->dm_io)) {
1860 		r = PTR_ERR(wc->dm_io);
1861 		ti->error = "Unable to allocate dm-io client";
1862 		wc->dm_io = NULL;
1863 		goto bad;
1864 	}
1865 
1866 	wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
1867 	if (!wc->writeback_wq) {
1868 		r = -ENOMEM;
1869 		ti->error = "Could not allocate writeback workqueue";
1870 		goto bad;
1871 	}
1872 	INIT_WORK(&wc->writeback_work, writecache_writeback);
1873 	INIT_WORK(&wc->flush_work, writecache_flush_work);
1874 
1875 	raw_spin_lock_init(&wc->endio_list_lock);
1876 	INIT_LIST_HEAD(&wc->endio_list);
1877 	wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
1878 	if (IS_ERR(wc->endio_thread)) {
1879 		r = PTR_ERR(wc->endio_thread);
1880 		wc->endio_thread = NULL;
1881 		ti->error = "Couldn't spawn endio thread";
1882 		goto bad;
1883 	}
1884 	wake_up_process(wc->endio_thread);
1885 
1886 	/*
1887 	 * Parse the mode (pmem or ssd)
1888 	 */
1889 	string = dm_shift_arg(&as);
1890 	if (!string)
1891 		goto bad_arguments;
1892 
1893 	if (!strcasecmp(string, "s")) {
1894 		wc->pmem_mode = false;
1895 	} else if (!strcasecmp(string, "p")) {
1896 #ifdef DM_WRITECACHE_HAS_PMEM
1897 		wc->pmem_mode = true;
1898 		wc->writeback_fua = true;
1899 #else
1900 		/*
1901 		 * If the architecture doesn't support persistent memory or
1902 		 * the kernel doesn't support any DAX drivers, this driver can
1903 		 * only be used in SSD-only mode.
1904 		 */
1905 		r = -EOPNOTSUPP;
1906 		ti->error = "Persistent memory or DAX not supported on this system";
1907 		goto bad;
1908 #endif
1909 	} else {
1910 		goto bad_arguments;
1911 	}
1912 
1913 	if (WC_MODE_PMEM(wc)) {
1914 		r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
1915 				offsetof(struct writeback_struct, bio),
1916 				BIOSET_NEED_BVECS);
1917 		if (r) {
1918 			ti->error = "Could not allocate bio set";
1919 			goto bad;
1920 		}
1921 	} else {
1922 		r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
1923 		if (r) {
1924 			ti->error = "Could not allocate mempool";
1925 			goto bad;
1926 		}
1927 	}
1928 
1929 	/*
1930 	 * Parse the origin data device
1931 	 */
1932 	string = dm_shift_arg(&as);
1933 	if (!string)
1934 		goto bad_arguments;
1935 	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
1936 	if (r) {
1937 		ti->error = "Origin data device lookup failed";
1938 		goto bad;
1939 	}
1940 
1941 	/*
1942 	 * Parse cache data device (be it pmem or ssd)
1943 	 */
1944 	string = dm_shift_arg(&as);
1945 	if (!string)
1946 		goto bad_arguments;
1947 
1948 	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
1949 	if (r) {
1950 		ti->error = "Cache data device lookup failed";
1951 		goto bad;
1952 	}
1953 	wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
1954 
1955 	/*
1956 	 * Parse the cache block size
1957 	 */
1958 	string = dm_shift_arg(&as);
1959 	if (!string)
1960 		goto bad_arguments;
1961 	if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
1962 	    wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
1963 	    (wc->block_size & (wc->block_size - 1))) {
1964 		r = -EINVAL;
1965 		ti->error = "Invalid block size";
1966 		goto bad;
1967 	}
1968 	wc->block_size_bits = __ffs(wc->block_size);
1969 
1970 	wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
1971 	wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
1972 	wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
1973 
1974 	/*
1975 	 * Parse optional arguments
1976 	 */
1977 	r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
1978 	if (r)
1979 		goto bad;
1980 
1981 	while (opt_params) {
1982 		string = dm_shift_arg(&as), opt_params--;
1983 		if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
1984 			unsigned long long start_sector;
1985 			string = dm_shift_arg(&as), opt_params--;
1986 			if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
1987 				goto invalid_optional;
1988 			wc->start_sector = start_sector;
1989 			if (wc->start_sector != start_sector ||
1990 			    wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
1991 				goto invalid_optional;
1992 		} else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
1993 			string = dm_shift_arg(&as), opt_params--;
1994 			if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
1995 				goto invalid_optional;
1996 			if (high_wm_percent < 0 || high_wm_percent > 100)
1997 				goto invalid_optional;
1998 			wc->high_wm_percent_set = true;
1999 		} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
2000 			string = dm_shift_arg(&as), opt_params--;
2001 			if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
2002 				goto invalid_optional;
2003 			if (low_wm_percent < 0 || low_wm_percent > 100)
2004 				goto invalid_optional;
2005 			wc->low_wm_percent_set = true;
2006 		} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
2007 			string = dm_shift_arg(&as), opt_params--;
2008 			if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
2009 				goto invalid_optional;
2010 			wc->max_writeback_jobs_set = true;
2011 		} else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
2012 			string = dm_shift_arg(&as), opt_params--;
2013 			if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
2014 				goto invalid_optional;
2015 			wc->autocommit_blocks_set = true;
2016 		} else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
2017 			unsigned autocommit_msecs;
2018 			string = dm_shift_arg(&as), opt_params--;
2019 			if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
2020 				goto invalid_optional;
2021 			if (autocommit_msecs > 3600000)
2022 				goto invalid_optional;
2023 			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
2024 			wc->autocommit_time_set = true;
2025 		} else if (!strcasecmp(string, "fua")) {
2026 			if (WC_MODE_PMEM(wc)) {
2027 				wc->writeback_fua = true;
2028 				wc->writeback_fua_set = true;
2029 			} else goto invalid_optional;
2030 		} else if (!strcasecmp(string, "nofua")) {
2031 			if (WC_MODE_PMEM(wc)) {
2032 				wc->writeback_fua = false;
2033 				wc->writeback_fua_set = true;
2034 			} else goto invalid_optional;
2035 		} else {
2036 invalid_optional:
2037 			r = -EINVAL;
2038 			ti->error = "Invalid optional argument";
2039 			goto bad;
2040 		}
2041 	}
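	/*
	 * Illustrative only: the optional-argument group parsed above takes
	 * the form "<count> <key> <value> ...", e.g. a hypothetical tail of
	 * "4 high_watermark 90 low_watermark 80" or "1 nofua" (the latter in
	 * pmem mode only, as enforced above).
	 */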
2042 
2043 	if (high_wm_percent < low_wm_percent) {
2044 		r = -EINVAL;
2045 		ti->error = "High watermark must be greater than or equal to low watermark";
2046 		goto bad;
2047 	}
2048 
2049 	if (WC_MODE_PMEM(wc)) {
2050 		r = persistent_memory_claim(wc);
2051 		if (r) {
2052 			ti->error = "Unable to map persistent memory for cache";
2053 			goto bad;
2054 		}
2055 	} else {
2056 		struct dm_io_region region;
2057 		struct dm_io_request req;
2058 		size_t n_blocks, n_metadata_blocks;
2059 		uint64_t n_bitmap_bits;
2060 
2061 		wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
2062 
2063 		bio_list_init(&wc->flush_list);
2064 		wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
2065 		if (IS_ERR(wc->flush_thread)) {
2066 			r = PTR_ERR(wc->flush_thread);
2067 			wc->flush_thread = NULL;
2068 			ti->error = "Couldn't spawn flush thread";
2069 			goto bad;
2070 		}
2071 		wake_up_process(wc->flush_thread);
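		/*
		 * In ssd mode, flush and discard bios are presumably handed
		 * off to this thread via wc->flush_list rather than being
		 * completed directly in the map path, so it is started here
		 * before any I/O can arrive.
		 */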
2072 
2073 		r = calculate_memory_size(wc->memory_map_size, wc->block_size,
2074 					  &n_blocks, &n_metadata_blocks);
2075 		if (r) {
2076 			ti->error = "Invalid device size";
2077 			goto bad;
2078 		}
2079 
2080 		n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
2081 				 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
2082 		/* this is a limitation of the test_bit functions */
2083 		if (n_bitmap_bits > 1U << 31) {
2084 			r = -EFBIG;
2085 			ti->error = "Invalid device size";
2086 			goto bad;
2087 		}
2088 
2089 		wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
2090 		if (!wc->memory_map) {
2091 			r = -ENOMEM;
2092 			ti->error = "Unable to allocate memory for metadata";
2093 			goto bad;
2094 		}
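		/*
		 * In ssd mode, wc->memory_map is a vmalloc'ed in-core copy of
		 * the on-disk metadata (read in below); in pmem mode the same
		 * pointer instead comes from persistent_memory_claim() above
		 * and maps the cache device directly.
		 */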
2095 
2096 		wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2097 		if (IS_ERR(wc->dm_kcopyd)) {
2098 			r = PTR_ERR(wc->dm_kcopyd);
2099 			ti->error = "Unable to allocate dm-kcopyd client";
2100 			wc->dm_kcopyd = NULL;
2101 			goto bad;
2102 		}
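		/*
		 * The kcopyd client is what later copies dirty cache blocks
		 * from the ssd back to the origin device during writeback
		 * (pmem mode does not need it and never reaches this branch).
		 */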
2103 
2104 		wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
2105 		wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
2106 			BITS_PER_LONG * sizeof(unsigned long);
2107 		wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
2108 		if (!wc->dirty_bitmap) {
2109 			r = -ENOMEM;
2110 			ti->error = "Unable to allocate dirty bitmap";
2111 			goto bad;
2112 		}
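		/*
		 * The dirty bitmap tracks, in BITMAP_GRANULARITY-sized chunks,
		 * which parts of the in-core metadata copy have changed, so a
		 * commit only has to write the touched regions back to the ssd.
		 */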
2113 
2114 		region.bdev = wc->ssd_dev->bdev;
2115 		region.sector = wc->start_sector;
2116 		region.count = wc->metadata_sectors;
2117 		req.bi_op = REQ_OP_READ;
2118 		req.bi_op_flags = REQ_SYNC;
2119 		req.mem.type = DM_IO_VMA;
2120 		req.mem.ptr.vma = (char *)wc->memory_map;
2121 		req.client = wc->dm_io;
2122 		req.notify.fn = NULL;
2123 
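		/* notify.fn == NULL makes this dm_io() call synchronous. */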
2124 		r = dm_io(&req, 1, &region, NULL);
2125 		if (r) {
2126 			ti->error = "Unable to read metadata";
2127 			goto bad;
2128 		}
2129 	}
2130 
2131 	r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
2132 	if (r) {
2133 		ti->error = "Hardware memory error when reading superblock";
2134 		goto bad;
2135 	}
2136 	if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
2137 		r = init_memory(wc);
2138 		if (r) {
2139 			ti->error = "Unable to initialize device";
2140 			goto bad;
2141 		}
2142 		r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
2143 		if (r) {
2144 			ti->error = "Hardware memory error when reading superblock";
2145 			goto bad;
2146 		}
2147 	}
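	/*
	 * A brand-new cache device reads back as all zeros, so the
	 * magic == 0 && version == 0 case above formats it via init_memory()
	 * and then re-reads the freshly written superblock.
	 */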
2148 
2149 	if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
2150 		ti->error = "Invalid magic in the superblock";
2151 		r = -EINVAL;
2152 		goto bad;
2153 	}
2154 
2155 	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
2156 		ti->error = "Invalid version in the superblock";
2157 		r = -EINVAL;
2158 		goto bad;
2159 	}
2160 
2161 	if (le32_to_cpu(s.block_size) != wc->block_size) {
2162 		ti->error = "Block size does not match superblock";
2163 		r = -EINVAL;
2164 		goto bad;
2165 	}
2166 
2167 	wc->n_blocks = le64_to_cpu(s.n_blocks);
2168 
2169 	offset = wc->n_blocks * sizeof(struct wc_memory_entry);
2170 	if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
2171 overflow:
2172 		ti->error = "Overflow in size calculation";
2173 		r = -EINVAL;
2174 		goto bad;
2175 	}
2176 	offset += sizeof(struct wc_memory_superblock);
2177 	if (offset < sizeof(struct wc_memory_superblock))
2178 		goto overflow;
2179 	offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
2180 	data_size = wc->n_blocks * (size_t)wc->block_size;
2181 	if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
2182 	    (offset + data_size < offset))
2183 		goto overflow;
2184 	if (offset + data_size > wc->memory_map_size) {
2185 		ti->error = "Memory area is too small";
2186 		r = -EINVAL;
2187 		goto bad;
2188 	}
2189 
2190 	wc->metadata_sectors = offset >> SECTOR_SHIFT;
2191 	wc->block_start = (char *)sb(wc) + offset;
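	/*
	 * Layout recap: the metadata region is the superblock plus one
	 * wc_memory_entry per block, rounded up to a block_size boundary
	 * ("offset" above, with every step checked for overflow); the cached
	 * data blocks start at block_start immediately after it.
	 */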
2192 
2193 	x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
2194 	x += 50;
2195 	do_div(x, 100);
2196 	wc->freelist_high_watermark = x;
2197 	x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
2198 	x += 50;
2199 	do_div(x, 100);
2200 	wc->freelist_low_watermark = x;
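	/*
	 * The user-visible watermarks are "percent of cache in use"; here they
	 * are converted (with rounding to nearest) into counts of free blocks.
	 * For example, with a hypothetical n_blocks of 1000, high_watermark 90
	 * and low_watermark 80, this yields freelist_high_watermark ~= 100 and
	 * freelist_low_watermark ~= 200 free blocks.
	 */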
2201 
2202 	r = writecache_alloc_entries(wc);
2203 	if (r) {
2204 		ti->error = "Cannot allocate memory";
2205 		goto bad;
2206 	}
2207 
2208 	ti->num_flush_bios = 1;
2209 	ti->flush_supported = true;
2210 	ti->num_discard_bios = 1;
2211 
2212 	if (WC_MODE_PMEM(wc))
2213 		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
2214 
2215 	return 0;
2216 
2217 bad_arguments:
2218 	r = -EINVAL;
2219 	ti->error = "Bad arguments";
2220 bad:
2221 	writecache_dtr(ti);
2222 	return r;
2223 }
2224 
2225 static void writecache_status(struct dm_target *ti, status_type_t type,
2226 			      unsigned status_flags, char *result, unsigned maxlen)
2227 {
2228 	struct dm_writecache *wc = ti->private;
2229 	unsigned extra_args;
2230 	unsigned sz = 0;
2231 	uint64_t x;
2232 
2233 	switch (type) {
2234 	case STATUSTYPE_INFO:
2235 		DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc),
2236 		       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
2237 		       (unsigned long long)wc->writeback_size);
2238 		break;
2239 	case STATUSTYPE_TABLE:
2240 		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
2241 				wc->dev->name, wc->ssd_dev->name, wc->block_size);
2242 		extra_args = 0;
2243 		if (wc->high_wm_percent_set)
2244 			extra_args += 2;
2245 		if (wc->low_wm_percent_set)
2246 			extra_args += 2;
2247 		if (wc->max_writeback_jobs_set)
2248 			extra_args += 2;
2249 		if (wc->autocommit_blocks_set)
2250 			extra_args += 2;
2251 		if (wc->autocommit_time_set)
2252 			extra_args += 2;
2253 		if (wc->writeback_fua_set)
2254 			extra_args++;
2255 
2256 		DMEMIT("%u", extra_args);
2257 		if (wc->high_wm_percent_set) {
2258 			x = (uint64_t)wc->freelist_high_watermark * 100;
2259 			x += wc->n_blocks / 2;
2260 			do_div(x, (size_t)wc->n_blocks);
2261 			DMEMIT(" high_watermark %u", 100 - (unsigned)x);
2262 		}
2263 		if (wc->low_wm_percent_set) {
2264 			x = (uint64_t)wc->freelist_low_watermark * 100;
2265 			x += wc->n_blocks / 2;
2266 			do_div(x, (size_t)wc->n_blocks);
2267 			DMEMIT(" low_watermark %u", 100 - (unsigned)x);
2268 		}
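		/*
		 * The two conversions above invert the constructor's
		 * percent-to-free-blocks calculation (again rounding to
		 * nearest), so the status line reports approximately the same
		 * watermark percentages the user originally supplied.
		 */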
2269 		if (wc->max_writeback_jobs_set)
2270 			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
2271 		if (wc->autocommit_blocks_set)
2272 			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
2273 		if (wc->autocommit_time_set)
2274 			DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
2275 		if (wc->writeback_fua_set)
2276 			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
2277 		break;
2278 	}
2279 }
2280 
2281 static struct target_type writecache_target = {
2282 	.name			= "writecache",
2283 	.version		= {1, 1, 0},
2284 	.module			= THIS_MODULE,
2285 	.ctr			= writecache_ctr,
2286 	.dtr			= writecache_dtr,
2287 	.status			= writecache_status,
2288 	.postsuspend		= writecache_suspend,
2289 	.resume			= writecache_resume,
2290 	.message		= writecache_message,
2291 	.map			= writecache_map,
2292 	.end_io			= writecache_end_io,
2293 	.iterate_devices	= writecache_iterate_devices,
2294 	.io_hints		= writecache_io_hints,
2295 };
2296 
2297 static int __init dm_writecache_init(void)
2298 {
2299 	int r;
2300 
2301 	r = dm_register_target(&writecache_target);
2302 	if (r < 0) {
2303 		DMERR("register failed %d", r);
2304 		return r;
2305 	}
2306 
2307 	return 0;
2308 }
2309 
2310 static void __exit dm_writecache_exit(void)
2311 {
2312 	dm_unregister_target(&writecache_target);
2313 }
2314 
2315 module_init(dm_writecache_init);
2316 module_exit(dm_writecache_exit);
2317 
2318 MODULE_DESCRIPTION(DM_NAME " writecache target");
2319 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2320 MODULE_LICENSE("GPL");
2321