xref: /openbmc/linux/drivers/md/dm-writecache.c (revision d061864b)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2018 Red Hat. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7 
8 #include <linux/device-mapper.h>
9 #include <linux/module.h>
10 #include <linux/init.h>
11 #include <linux/vmalloc.h>
12 #include <linux/kthread.h>
13 #include <linux/dm-io.h>
14 #include <linux/dm-kcopyd.h>
15 #include <linux/dax.h>
16 #include <linux/pfn_t.h>
17 #include <linux/libnvdimm.h>
18 
19 #define DM_MSG_PREFIX "writecache"
20 
21 #define HIGH_WATERMARK			50
22 #define LOW_WATERMARK			45
23 #define MAX_WRITEBACK_JOBS		0
24 #define ENDIO_LATENCY			16
25 #define WRITEBACK_LATENCY		64
26 #define AUTOCOMMIT_BLOCKS_SSD		65536
27 #define AUTOCOMMIT_BLOCKS_PMEM		64
28 #define AUTOCOMMIT_MSEC			1000
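/*
 * Note on the defaults above (interpretation based on how the constructor and
 * the writeback path use them): HIGH_WATERMARK/LOW_WATERMARK are percentages
 * of used cache blocks that start and stop background writeback,
 * MAX_WRITEBACK_JOBS of 0 means the number of in-flight writeback jobs is not
 * limited, and the AUTOCOMMIT_* values bound how many uncommitted blocks or
 * how much time may accumulate before metadata is committed automatically.
 */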
29 
30 #define BITMAP_GRANULARITY	65536
31 #if BITMAP_GRANULARITY < PAGE_SIZE
32 #undef BITMAP_GRANULARITY
33 #define BITMAP_GRANULARITY	PAGE_SIZE
34 #endif
35 
36 #if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
37 #define DM_WRITECACHE_HAS_PMEM
38 #endif
39 
40 #ifdef DM_WRITECACHE_HAS_PMEM
41 #define pmem_assign(dest, src)					\
42 do {								\
43 	typeof(dest) uniq = (src);				\
44 	memcpy_flushcache(&(dest), &uniq, sizeof(dest));	\
45 } while (0)
46 #else
47 #define pmem_assign(dest, src)	((dest) = (src))
48 #endif
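/*
 * In persistent-memory mode metadata assignments go through pmem_assign():
 * the value is staged in a local temporary and written with
 * memcpy_flushcache(), so it is pushed out of the CPU cache and becomes
 * durable at the next commit barrier (wmb() in writecache_commit_flushed()).
 * In SSD mode this degrades to a plain assignment and durability is handled
 * by the dirty bitmap instead.  Rough expansion for illustration ("me" is a
 * hypothetical local):
 *
 *	struct wc_memory_entry me = { ... };
 *	pmem_assign(*memory_entry(wc, e), me);
 *	// roughly: memcpy_flushcache(memory_entry(wc, e), &me, sizeof(me));
 */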
49 
50 #if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM)
51 #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
52 #endif
53 
54 #define MEMORY_SUPERBLOCK_MAGIC		0x23489321
55 #define MEMORY_SUPERBLOCK_VERSION	1
56 
57 struct wc_memory_entry {
58 	__le64 original_sector;
59 	__le64 seq_count;
60 };
61 
62 struct wc_memory_superblock {
63 	union {
64 		struct {
65 			__le32 magic;
66 			__le32 version;
67 			__le32 block_size;
68 			__le32 pad;
69 			__le64 n_blocks;
70 			__le64 seq_count;
71 		};
72 		__le64 padding[8];
73 	};
74 	struct wc_memory_entry entries[];
75 };
76 
77 struct wc_entry {
78 	struct rb_node rb_node;
79 	struct list_head lru;
80 	unsigned short wc_list_contiguous;
81 	bool write_in_progress
82 #if BITS_PER_LONG == 64
83 		:1
84 #endif
85 	;
86 	unsigned long index
87 #if BITS_PER_LONG == 64
88 		:47
89 #endif
90 	;
91 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
92 	uint64_t original_sector;
93 	uint64_t seq_count;
94 #endif
95 };
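/*
 * On 64-bit machines write_in_progress and index are packed into a single
 * word as 1-bit and 47-bit fields, keeping struct wc_entry small even when
 * the cache holds millions of entries.  When
 * DM_WRITECACHE_HANDLE_HARDWARE_ERRORS is enabled, original_sector and
 * seq_count are also shadowed in ordinary RAM so lookups never have to read
 * possibly-bad persistent memory.
 */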
96 
97 #ifdef DM_WRITECACHE_HAS_PMEM
98 #define WC_MODE_PMEM(wc)			((wc)->pmem_mode)
99 #define WC_MODE_FUA(wc)				((wc)->writeback_fua)
100 #else
101 #define WC_MODE_PMEM(wc)			false
102 #define WC_MODE_FUA(wc)				false
103 #endif
104 #define WC_MODE_SORT_FREELIST(wc)		(!WC_MODE_PMEM(wc))
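/*
 * In SSD mode the free list is kept as an rbtree sorted by entry address
 * (WC_MODE_SORT_FREELIST), apparently so that consecutive allocations tend to
 * be adjacent cache blocks and writeback can later merge them into one large
 * copy.  In persistent-memory mode a plain FIFO list is sufficient.
 */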
105 
106 struct dm_writecache {
107 	struct mutex lock;
108 	struct list_head lru;
109 	union {
110 		struct list_head freelist;
111 		struct {
112 			struct rb_root freetree;
113 			struct wc_entry *current_free;
114 		};
115 	};
116 	struct rb_root tree;
117 
118 	size_t freelist_size;
119 	size_t writeback_size;
120 	size_t freelist_high_watermark;
121 	size_t freelist_low_watermark;
122 
123 	unsigned uncommitted_blocks;
124 	unsigned autocommit_blocks;
125 	unsigned max_writeback_jobs;
126 
127 	int error;
128 
129 	unsigned long autocommit_jiffies;
130 	struct timer_list autocommit_timer;
131 	struct wait_queue_head freelist_wait;
132 
133 	atomic_t bio_in_progress[2];
134 	struct wait_queue_head bio_in_progress_wait[2];
135 
136 	struct dm_target *ti;
137 	struct dm_dev *dev;
138 	struct dm_dev *ssd_dev;
139 	sector_t start_sector;
140 	void *memory_map;
141 	uint64_t memory_map_size;
142 	size_t metadata_sectors;
143 	size_t n_blocks;
144 	uint64_t seq_count;
145 	void *block_start;
146 	struct wc_entry *entries;
147 	unsigned block_size;
148 	unsigned char block_size_bits;
149 
150 	bool pmem_mode:1;
151 	bool writeback_fua:1;
152 
153 	bool overwrote_committed:1;
154 	bool memory_vmapped:1;
155 
156 	bool high_wm_percent_set:1;
157 	bool low_wm_percent_set:1;
158 	bool max_writeback_jobs_set:1;
159 	bool autocommit_blocks_set:1;
160 	bool autocommit_time_set:1;
161 	bool writeback_fua_set:1;
162 	bool flush_on_suspend:1;
163 
164 	unsigned writeback_all;
165 	struct workqueue_struct *writeback_wq;
166 	struct work_struct writeback_work;
167 	struct work_struct flush_work;
168 
169 	struct dm_io_client *dm_io;
170 
171 	raw_spinlock_t endio_list_lock;
172 	struct list_head endio_list;
173 	struct task_struct *endio_thread;
174 
175 	struct task_struct *flush_thread;
176 	struct bio_list flush_list;
177 
178 	struct dm_kcopyd_client *dm_kcopyd;
179 	unsigned long *dirty_bitmap;
180 	unsigned dirty_bitmap_size;
181 
182 	struct bio_set bio_set;
183 	mempool_t copy_pool;
184 };
185 
186 #define WB_LIST_INLINE		16
187 
188 struct writeback_struct {
189 	struct list_head endio_entry;
190 	struct dm_writecache *wc;
191 	struct wc_entry **wc_list;
192 	unsigned wc_list_n;
193 	unsigned page_offset;
194 	struct page *page;
195 	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
196 	struct bio bio;
197 };
198 
199 struct copy_struct {
200 	struct list_head endio_entry;
201 	struct dm_writecache *wc;
202 	struct wc_entry *e;
203 	unsigned n_entries;
204 	int error;
205 };
206 
207 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
208 					    "A percentage of time allocated for data copying");
209 
210 static void wc_lock(struct dm_writecache *wc)
211 {
212 	mutex_lock(&wc->lock);
213 }
214 
215 static void wc_unlock(struct dm_writecache *wc)
216 {
217 	mutex_unlock(&wc->lock);
218 }
219 
220 #ifdef DM_WRITECACHE_HAS_PMEM
221 static int persistent_memory_claim(struct dm_writecache *wc)
222 {
223 	int r;
224 	loff_t s;
225 	long p, da;
226 	pfn_t pfn;
227 	int id;
228 	struct page **pages;
229 
230 	wc->memory_vmapped = false;
231 
232 	if (!wc->ssd_dev->dax_dev) {
233 		r = -EOPNOTSUPP;
234 		goto err1;
235 	}
236 	s = wc->memory_map_size;
237 	p = s >> PAGE_SHIFT;
238 	if (!p) {
239 		r = -EINVAL;
240 		goto err1;
241 	}
242 	if (p != s >> PAGE_SHIFT) {
243 		r = -EOVERFLOW;
244 		goto err1;
245 	}
246 
247 	id = dax_read_lock();
248 
249 	da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn);
250 	if (da < 0) {
251 		wc->memory_map = NULL;
252 		r = da;
253 		goto err2;
254 	}
255 	if (!pfn_t_has_page(pfn)) {
256 		wc->memory_map = NULL;
257 		r = -EOPNOTSUPP;
258 		goto err2;
259 	}
260 	if (da != p) {
261 		long i;
262 		wc->memory_map = NULL;
263 		pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
264 		if (!pages) {
265 			r = -ENOMEM;
266 			goto err2;
267 		}
268 		i = 0;
269 		do {
270 			long daa;
271 			daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i,
272 						NULL, &pfn);
273 			if (daa <= 0) {
274 				r = daa ? daa : -EINVAL;
275 				goto err3;
276 			}
277 			if (!pfn_t_has_page(pfn)) {
278 				r = -EOPNOTSUPP;
279 				goto err3;
280 			}
281 			while (daa-- && i < p) {
282 				pages[i++] = pfn_t_to_page(pfn);
283 				pfn.val++;
284 			}
285 		} while (i < p);
286 		wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
287 		if (!wc->memory_map) {
288 			r = -ENOMEM;
289 			goto err3;
290 		}
291 		kvfree(pages);
292 		wc->memory_vmapped = true;
293 	}
294 
295 	dax_read_unlock(id);
296 
297 	wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
298 	wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;
299 
300 	return 0;
301 err3:
302 	kvfree(pages);
303 err2:
304 	dax_read_unlock(id);
305 err1:
306 	return r;
307 }
308 #else
309 static int persistent_memory_claim(struct dm_writecache *wc)
310 {
311 	BUG();
312 }
313 #endif
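/*
 * persistent_memory_claim() above maps the whole cache device through DAX.
 * If dax_direct_access() returns the full range in one call, that mapping is
 * used directly; otherwise the individual pages are collected and vmap()ed,
 * and memory_vmapped is set so the flush/invalidate helpers below know they
 * must operate on the vmap alias.  The non-pmem stub is never called (the
 * constructor rejects pmem mode without DM_WRITECACHE_HAS_PMEM).
 */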
314 
315 static void persistent_memory_release(struct dm_writecache *wc)
316 {
317 	if (wc->memory_vmapped)
318 		vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
319 }
320 
321 static struct page *persistent_memory_page(void *addr)
322 {
323 	if (is_vmalloc_addr(addr))
324 		return vmalloc_to_page(addr);
325 	else
326 		return virt_to_page(addr);
327 }
328 
329 static unsigned persistent_memory_page_offset(void *addr)
330 {
331 	return (unsigned long)addr & (PAGE_SIZE - 1);
332 }
333 
334 static void persistent_memory_flush_cache(void *ptr, size_t size)
335 {
336 	if (is_vmalloc_addr(ptr))
337 		flush_kernel_vmap_range(ptr, size);
338 }
339 
340 static void persistent_memory_invalidate_cache(void *ptr, size_t size)
341 {
342 	if (is_vmalloc_addr(ptr))
343 		invalidate_kernel_vmap_range(ptr, size);
344 }
345 
346 static struct wc_memory_superblock *sb(struct dm_writecache *wc)
347 {
348 	return wc->memory_map;
349 }
350 
351 static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
352 {
353 	if (is_power_of_2(sizeof(struct wc_entry)) && 0)
354 		return &sb(wc)->entries[e - wc->entries];
355 	else
356 		return &sb(wc)->entries[e->index];
357 }
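/*
 * The "&& 0" above deliberately disables the pointer-arithmetic variant, so
 * the metadata slot is always located through e->index; the first branch is
 * presumably kept only as a template for a possible micro-optimization when
 * sizeof(struct wc_entry) happens to be a power of two.
 */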
358 
359 static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
360 {
361 	return (char *)wc->block_start + (e->index << wc->block_size_bits);
362 }
363 
364 static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
365 {
366 	return wc->start_sector + wc->metadata_sectors +
367 		((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
368 }
369 
370 static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
371 {
372 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
373 	return e->original_sector;
374 #else
375 	return le64_to_cpu(memory_entry(wc, e)->original_sector);
376 #endif
377 }
378 
379 static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
380 {
381 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
382 	return e->seq_count;
383 #else
384 	return le64_to_cpu(memory_entry(wc, e)->seq_count);
385 #endif
386 }
387 
388 static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
389 {
390 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
391 	e->seq_count = -1;
392 #endif
393 	pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
394 }
395 
396 static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
397 					    uint64_t original_sector, uint64_t seq_count)
398 {
399 	struct wc_memory_entry me;
400 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
401 	e->original_sector = original_sector;
402 	e->seq_count = seq_count;
403 #endif
404 	me.original_sector = cpu_to_le64(original_sector);
405 	me.seq_count = cpu_to_le64(seq_count);
406 	pmem_assign(*memory_entry(wc, e), me);
407 }
408 
409 #define writecache_error(wc, err, msg, arg...)				\
410 do {									\
411 	if (!cmpxchg(&(wc)->error, 0, err))				\
412 		DMERR(msg, ##arg);					\
413 	wake_up(&(wc)->freelist_wait);					\
414 } while (0)
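/*
 * writecache_error() records only the first error (cmpxchg on wc->error) and
 * wakes anybody sleeping on the freelist, so writers blocked in
 * writecache_wait_on_freelist() notice the failure instead of waiting forever.
 */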
415 
416 #define writecache_has_error(wc)	(unlikely(READ_ONCE((wc)->error)))
417 
418 static void writecache_flush_all_metadata(struct dm_writecache *wc)
419 {
420 	if (!WC_MODE_PMEM(wc))
421 		memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
422 }
423 
424 static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
425 {
426 	if (!WC_MODE_PMEM(wc))
427 		__set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
428 			  wc->dirty_bitmap);
429 }
430 
431 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
432 
433 struct io_notify {
434 	struct dm_writecache *wc;
435 	struct completion c;
436 	atomic_t count;
437 };
438 
439 static void writecache_notify_io(unsigned long error, void *context)
440 {
441 	struct io_notify *endio = context;
442 
443 	if (unlikely(error != 0))
444 		writecache_error(endio->wc, -EIO, "error writing metadata");
445 	BUG_ON(atomic_read(&endio->count) <= 0);
446 	if (atomic_dec_and_test(&endio->count))
447 		complete(&endio->c);
448 }
449 
450 static void ssd_commit_flushed(struct dm_writecache *wc)
451 {
452 	struct dm_io_region region;
453 	struct dm_io_request req;
454 	struct io_notify endio = {
455 		wc,
456 		COMPLETION_INITIALIZER_ONSTACK(endio.c),
457 		ATOMIC_INIT(1),
458 	};
459 	unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
460 	unsigned i = 0;
461 
462 	while (1) {
463 		unsigned j;
464 		i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
465 		if (unlikely(i == bitmap_bits))
466 			break;
467 		j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
468 
469 		region.bdev = wc->ssd_dev->bdev;
470 		region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
471 		region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
472 
473 		if (unlikely(region.sector >= wc->metadata_sectors))
474 			break;
475 		if (unlikely(region.sector + region.count > wc->metadata_sectors))
476 			region.count = wc->metadata_sectors - region.sector;
477 
478 		region.sector += wc->start_sector;
479 		atomic_inc(&endio.count);
480 		req.bi_op = REQ_OP_WRITE;
481 		req.bi_op_flags = REQ_SYNC;
482 		req.mem.type = DM_IO_VMA;
483 		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
484 		req.client = wc->dm_io;
485 		req.notify.fn = writecache_notify_io;
486 		req.notify.context = &endio;
487 
488 		/* writing via async dm-io (implied by notify.fn above) won't return an error */
489 		(void) dm_io(&req, 1, &region, NULL);
490 		i = j;
491 	}
492 
493 	writecache_notify_io(0, &endio);
494 	wait_for_completion_io(&endio.c);
495 
496 	writecache_disk_flush(wc, wc->ssd_dev);
497 
498 	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
499 }
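/*
 * ssd_commit_flushed() walks the dirty bitmap and, for every run of dirty
 * bits, writes the corresponding BITMAP_GRANULARITY-sized chunks of the
 * in-memory metadata copy to the SSD with async dm-io, waits for all of the
 * writes to complete, then issues an empty REQ_PREFLUSH to make them durable.
 * The region.sector clamping keeps the writes inside the metadata area.
 */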
500 
501 static void writecache_commit_flushed(struct dm_writecache *wc)
502 {
503 	if (WC_MODE_PMEM(wc))
504 		wmb();
505 	else
506 		ssd_commit_flushed(wc);
507 }
508 
509 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
510 {
511 	int r;
512 	struct dm_io_region region;
513 	struct dm_io_request req;
514 
515 	region.bdev = dev->bdev;
516 	region.sector = 0;
517 	region.count = 0;
518 	req.bi_op = REQ_OP_WRITE;
519 	req.bi_op_flags = REQ_PREFLUSH;
520 	req.mem.type = DM_IO_KMEM;
521 	req.mem.ptr.addr = NULL;
522 	req.client = wc->dm_io;
523 	req.notify.fn = NULL;
524 
525 	r = dm_io(&req, 1, &region, NULL);
526 	if (unlikely(r))
527 		writecache_error(wc, r, "error flushing metadata: %d", r);
528 }
529 
530 static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
531 {
532 	wait_event(wc->bio_in_progress_wait[direction],
533 		   !atomic_read(&wc->bio_in_progress[direction]));
534 }
535 
536 #define WFE_RETURN_FOLLOWING	1
537 #define WFE_LOWEST_SEQ		2
538 
539 static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
540 					      uint64_t block, int flags)
541 {
542 	struct wc_entry *e;
543 	struct rb_node *node = wc->tree.rb_node;
544 
545 	if (unlikely(!node))
546 		return NULL;
547 
548 	while (1) {
549 		e = container_of(node, struct wc_entry, rb_node);
550 		if (read_original_sector(wc, e) == block)
551 			break;
552 		node = (read_original_sector(wc, e) >= block ?
553 			e->rb_node.rb_left : e->rb_node.rb_right);
554 		if (unlikely(!node)) {
555 			if (!(flags & WFE_RETURN_FOLLOWING)) {
556 				return NULL;
557 			}
558 			if (read_original_sector(wc, e) >= block) {
559 				break;
560 			} else {
561 				node = rb_next(&e->rb_node);
562 				if (unlikely(!node)) {
563 					return NULL;
564 				}
565 				e = container_of(node, struct wc_entry, rb_node);
566 				break;
567 			}
568 		}
569 	}
570 
571 	while (1) {
572 		struct wc_entry *e2;
573 		if (flags & WFE_LOWEST_SEQ)
574 			node = rb_prev(&e->rb_node);
575 		else
576 			node = rb_next(&e->rb_node);
577 		if (!node)
578 			return e;
579 		e2 = container_of(node, struct wc_entry, rb_node);
580 		if (read_original_sector(wc, e2) != block)
581 			return e;
582 		e = e2;
583 	}
584 }
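/*
 * writecache_find_entry() looks an original (origin-device) sector up in the
 * rbtree.  With WFE_RETURN_FOLLOWING it returns the next higher entry when
 * there is no exact match.  Because several entries may exist for the same
 * sector at different sequence counts, the second loop walks the duplicates
 * and returns either the oldest or the newest one depending on WFE_LOWEST_SEQ.
 */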
585 
586 static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
587 {
588 	struct wc_entry *e;
589 	struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
590 
591 	while (*node) {
592 		e = container_of(*node, struct wc_entry, rb_node);
593 		parent = &e->rb_node;
594 		if (read_original_sector(wc, e) > read_original_sector(wc, ins))
595 			node = &parent->rb_left;
596 		else
597 			node = &parent->rb_right;
598 	}
599 	rb_link_node(&ins->rb_node, parent, node);
600 	rb_insert_color(&ins->rb_node, &wc->tree);
601 	list_add(&ins->lru, &wc->lru);
602 }
603 
604 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
605 {
606 	list_del(&e->lru);
607 	rb_erase(&e->rb_node, &wc->tree);
608 }
609 
610 static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
611 {
612 	if (WC_MODE_SORT_FREELIST(wc)) {
613 		struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
614 		if (unlikely(!*node))
615 			wc->current_free = e;
616 		while (*node) {
617 			parent = *node;
618 			if (&e->rb_node < *node)
619 				node = &parent->rb_left;
620 			else
621 				node = &parent->rb_right;
622 		}
623 		rb_link_node(&e->rb_node, parent, node);
624 		rb_insert_color(&e->rb_node, &wc->freetree);
625 	} else {
626 		list_add_tail(&e->lru, &wc->freelist);
627 	}
628 	wc->freelist_size++;
629 }
630 
631 static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
632 {
633 	struct wc_entry *e;
634 
635 	if (WC_MODE_SORT_FREELIST(wc)) {
636 		struct rb_node *next;
637 		if (unlikely(!wc->current_free))
638 			return NULL;
639 		e = wc->current_free;
640 		next = rb_next(&e->rb_node);
641 		rb_erase(&e->rb_node, &wc->freetree);
642 		if (unlikely(!next))
643 			next = rb_first(&wc->freetree);
644 		wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
645 	} else {
646 		if (unlikely(list_empty(&wc->freelist)))
647 			return NULL;
648 		e = container_of(wc->freelist.next, struct wc_entry, lru);
649 		list_del(&e->lru);
650 	}
651 	wc->freelist_size--;
652 	if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
653 		queue_work(wc->writeback_wq, &wc->writeback_work);
654 
655 	return e;
656 }
657 
658 static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
659 {
660 	writecache_unlink(wc, e);
661 	writecache_add_to_freelist(wc, e);
662 	clear_seq_count(wc, e);
663 	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
664 	if (unlikely(waitqueue_active(&wc->freelist_wait)))
665 		wake_up(&wc->freelist_wait);
666 }
667 
668 static void writecache_wait_on_freelist(struct dm_writecache *wc)
669 {
670 	DEFINE_WAIT(wait);
671 
672 	prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
673 	wc_unlock(wc);
674 	io_schedule();
675 	finish_wait(&wc->freelist_wait, &wait);
676 	wc_lock(wc);
677 }
678 
679 static void writecache_poison_lists(struct dm_writecache *wc)
680 {
681 	/*
682 	 * Catch incorrect access to these values while the device is suspended.
683 	 */
684 	memset(&wc->tree, -1, sizeof wc->tree);
685 	wc->lru.next = LIST_POISON1;
686 	wc->lru.prev = LIST_POISON2;
687 	wc->freelist.next = LIST_POISON1;
688 	wc->freelist.prev = LIST_POISON2;
689 }
690 
691 static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
692 {
693 	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
694 	if (WC_MODE_PMEM(wc))
695 		writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
696 }
697 
698 static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
699 {
700 	return read_seq_count(wc, e) < wc->seq_count;
701 }
702 
703 static void writecache_flush(struct dm_writecache *wc)
704 {
705 	struct wc_entry *e, *e2;
706 	bool need_flush_after_free;
707 
708 	wc->uncommitted_blocks = 0;
709 	del_timer(&wc->autocommit_timer);
710 
711 	if (list_empty(&wc->lru))
712 		return;
713 
714 	e = container_of(wc->lru.next, struct wc_entry, lru);
715 	if (writecache_entry_is_committed(wc, e)) {
716 		if (wc->overwrote_committed) {
717 			writecache_wait_for_ios(wc, WRITE);
718 			writecache_disk_flush(wc, wc->ssd_dev);
719 			wc->overwrote_committed = false;
720 		}
721 		return;
722 	}
723 	while (1) {
724 		writecache_flush_entry(wc, e);
725 		if (unlikely(e->lru.next == &wc->lru))
726 			break;
727 		e2 = container_of(e->lru.next, struct wc_entry, lru);
728 		if (writecache_entry_is_committed(wc, e2))
729 			break;
730 		e = e2;
731 		cond_resched();
732 	}
733 	writecache_commit_flushed(wc);
734 
735 	writecache_wait_for_ios(wc, WRITE);
736 
737 	wc->seq_count++;
738 	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
739 	writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
740 	writecache_commit_flushed(wc);
741 
742 	wc->overwrote_committed = false;
743 
744 	need_flush_after_free = false;
745 	while (1) {
746 		/* Free another committed entry with lower seq-count */
747 		struct rb_node *rb_node = rb_prev(&e->rb_node);
748 
749 		if (rb_node) {
750 			e2 = container_of(rb_node, struct wc_entry, rb_node);
751 			if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
752 			    likely(!e2->write_in_progress)) {
753 				writecache_free_entry(wc, e2);
754 				need_flush_after_free = true;
755 			}
756 		}
757 		if (unlikely(e->lru.prev == &wc->lru))
758 			break;
759 		e = container_of(e->lru.prev, struct wc_entry, lru);
760 		cond_resched();
761 	}
762 
763 	if (need_flush_after_free)
764 		writecache_commit_flushed(wc);
765 }
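/*
 * Commit protocol used above: the metadata of every not-yet-committed entry
 * is marked for flushing (a no-op in pmem mode, dirty-bitmap bits in SSD
 * mode), writecache_commit_flushed() makes it durable, and then sb->seq_count
 * is bumped and committed separately.  Since an entry counts as committed
 * only when its seq_count is below the superblock's
 * (writecache_entry_is_committed()), the seq_count update atomically commits
 * the whole batch; superseded older entries are freed afterwards.
 */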
766 
767 static void writecache_flush_work(struct work_struct *work)
768 {
769 	struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
770 
771 	wc_lock(wc);
772 	writecache_flush(wc);
773 	wc_unlock(wc);
774 }
775 
776 static void writecache_autocommit_timer(struct timer_list *t)
777 {
778 	struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
779 	if (!writecache_has_error(wc))
780 		queue_work(wc->writeback_wq, &wc->flush_work);
781 }
782 
783 static void writecache_schedule_autocommit(struct dm_writecache *wc)
784 {
785 	if (!timer_pending(&wc->autocommit_timer))
786 		mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
787 }
788 
789 static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
790 {
791 	struct wc_entry *e;
792 	bool discarded_something = false;
793 
794 	e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
795 	if (unlikely(!e))
796 		return;
797 
798 	while (read_original_sector(wc, e) < end) {
799 		struct rb_node *node = rb_next(&e->rb_node);
800 
801 		if (likely(!e->write_in_progress)) {
802 			if (!discarded_something) {
803 				writecache_wait_for_ios(wc, READ);
804 				writecache_wait_for_ios(wc, WRITE);
805 				discarded_something = true;
806 			}
807 			writecache_free_entry(wc, e);
808 		}
809 
810 		if (!node)
811 			break;
812 
813 		e = container_of(node, struct wc_entry, rb_node);
814 	}
815 
816 	if (discarded_something)
817 		writecache_commit_flushed(wc);
818 }
819 
820 static bool writecache_wait_for_writeback(struct dm_writecache *wc)
821 {
822 	if (wc->writeback_size) {
823 		writecache_wait_on_freelist(wc);
824 		return true;
825 	}
826 	return false;
827 }
828 
829 static void writecache_suspend(struct dm_target *ti)
830 {
831 	struct dm_writecache *wc = ti->private;
832 	bool flush_on_suspend;
833 
834 	del_timer_sync(&wc->autocommit_timer);
835 
836 	wc_lock(wc);
837 	writecache_flush(wc);
838 	flush_on_suspend = wc->flush_on_suspend;
839 	if (flush_on_suspend) {
840 		wc->flush_on_suspend = false;
841 		wc->writeback_all++;
842 		queue_work(wc->writeback_wq, &wc->writeback_work);
843 	}
844 	wc_unlock(wc);
845 
846 	flush_workqueue(wc->writeback_wq);
847 
848 	wc_lock(wc);
849 	if (flush_on_suspend)
850 		wc->writeback_all--;
851 	while (writecache_wait_for_writeback(wc));
852 
853 	if (WC_MODE_PMEM(wc))
854 		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
855 
856 	writecache_poison_lists(wc);
857 
858 	wc_unlock(wc);
859 }
860 
861 static int writecache_alloc_entries(struct dm_writecache *wc)
862 {
863 	size_t b;
864 
865 	if (wc->entries)
866 		return 0;
867 	wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
868 	if (!wc->entries)
869 		return -ENOMEM;
870 	for (b = 0; b < wc->n_blocks; b++) {
871 		struct wc_entry *e = &wc->entries[b];
872 		e->index = b;
873 		e->write_in_progress = false;
874 	}
875 
876 	return 0;
877 }
878 
879 static void writecache_resume(struct dm_target *ti)
880 {
881 	struct dm_writecache *wc = ti->private;
882 	size_t b;
883 	bool need_flush = false;
884 	__le64 sb_seq_count;
885 	int r;
886 
887 	wc_lock(wc);
888 
889 	if (WC_MODE_PMEM(wc))
890 		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
891 
892 	wc->tree = RB_ROOT;
893 	INIT_LIST_HEAD(&wc->lru);
894 	if (WC_MODE_SORT_FREELIST(wc)) {
895 		wc->freetree = RB_ROOT;
896 		wc->current_free = NULL;
897 	} else {
898 		INIT_LIST_HEAD(&wc->freelist);
899 	}
900 	wc->freelist_size = 0;
901 
902 	r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
903 	if (r) {
904 		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
905 		sb_seq_count = cpu_to_le64(0);
906 	}
907 	wc->seq_count = le64_to_cpu(sb_seq_count);
908 
909 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
910 	for (b = 0; b < wc->n_blocks; b++) {
911 		struct wc_entry *e = &wc->entries[b];
912 		struct wc_memory_entry wme;
913 		if (writecache_has_error(wc)) {
914 			e->original_sector = -1;
915 			e->seq_count = -1;
916 			continue;
917 		}
918 		r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry));
919 		if (r) {
920 			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
921 					 (unsigned long)b, r);
922 			e->original_sector = -1;
923 			e->seq_count = -1;
924 		} else {
925 			e->original_sector = le64_to_cpu(wme.original_sector);
926 			e->seq_count = le64_to_cpu(wme.seq_count);
927 		}
928 	}
929 #endif
930 	for (b = 0; b < wc->n_blocks; b++) {
931 		struct wc_entry *e = &wc->entries[b];
932 		if (!writecache_entry_is_committed(wc, e)) {
933 			if (read_seq_count(wc, e) != -1) {
934 erase_this:
935 				clear_seq_count(wc, e);
936 				need_flush = true;
937 			}
938 			writecache_add_to_freelist(wc, e);
939 		} else {
940 			struct wc_entry *old;
941 
942 			old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
943 			if (!old) {
944 				writecache_insert_entry(wc, e);
945 			} else {
946 				if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
947 					writecache_error(wc, -EINVAL,
948 						 "two identical entries, position %llu, sector %llu, sequence %llu",
949 						 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
950 						 (unsigned long long)read_seq_count(wc, e));
951 				}
952 				if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
953 					goto erase_this;
954 				} else {
955 					writecache_free_entry(wc, old);
956 					writecache_insert_entry(wc, e);
957 					need_flush = true;
958 				}
959 			}
960 		}
961 		cond_resched();
962 	}
963 
964 	if (need_flush) {
965 		writecache_flush_all_metadata(wc);
966 		writecache_commit_flushed(wc);
967 	}
968 
969 	wc_unlock(wc);
970 }
971 
972 static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
973 {
974 	if (argc != 1)
975 		return -EINVAL;
976 
977 	wc_lock(wc);
978 	if (dm_suspended(wc->ti)) {
979 		wc_unlock(wc);
980 		return -EBUSY;
981 	}
982 	if (writecache_has_error(wc)) {
983 		wc_unlock(wc);
984 		return -EIO;
985 	}
986 
987 	writecache_flush(wc);
988 	wc->writeback_all++;
989 	queue_work(wc->writeback_wq, &wc->writeback_work);
990 	wc_unlock(wc);
991 
992 	flush_workqueue(wc->writeback_wq);
993 
994 	wc_lock(wc);
995 	wc->writeback_all--;
996 	if (writecache_has_error(wc)) {
997 		wc_unlock(wc);
998 		return -EIO;
999 	}
1000 	wc_unlock(wc);
1001 
1002 	return 0;
1003 }
1004 
1005 static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1006 {
1007 	if (argc != 1)
1008 		return -EINVAL;
1009 
1010 	wc_lock(wc);
1011 	wc->flush_on_suspend = true;
1012 	wc_unlock(wc);
1013 
1014 	return 0;
1015 }
1016 
1017 static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
1018 			      char *result, unsigned maxlen)
1019 {
1020 	int r = -EINVAL;
1021 	struct dm_writecache *wc = ti->private;
1022 
1023 	if (!strcasecmp(argv[0], "flush"))
1024 		r = process_flush_mesg(argc, argv, wc);
1025 	else if (!strcasecmp(argv[0], "flush_on_suspend"))
1026 		r = process_flush_on_suspend_mesg(argc, argv, wc);
1027 	else
1028 		DMERR("unrecognised message received: %s", argv[0]);
1029 
1030 	return r;
1031 }
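/*
 * The two messages are normally sent with dmsetup, e.g. (the device name
 * below is only an example):
 *
 *	dmsetup message wcache 0 flush
 *	dmsetup message wcache 0 flush_on_suspend
 */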
1032 
1033 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
1034 {
1035 	void *buf;
1036 	unsigned long flags;
1037 	unsigned size;
1038 	int rw = bio_data_dir(bio);
1039 	unsigned remaining_size = wc->block_size;
1040 
1041 	do {
1042 		struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
1043 		buf = bvec_kmap_irq(&bv, &flags);
1044 		size = bv.bv_len;
1045 		if (unlikely(size > remaining_size))
1046 			size = remaining_size;
1047 
1048 		if (rw == READ) {
1049 			int r;
1050 			r = memcpy_mcsafe(buf, data, size);
1051 			flush_dcache_page(bio_page(bio));
1052 			if (unlikely(r)) {
1053 				writecache_error(wc, r, "hardware memory error when reading data: %d", r);
1054 				bio->bi_status = BLK_STS_IOERR;
1055 			}
1056 		} else {
1057 			flush_dcache_page(bio_page(bio));
1058 			memcpy_flushcache(data, buf, size);
1059 		}
1060 
1061 		bvec_kunmap_irq(buf, &flags);
1062 
1063 		data = (char *)data + size;
1064 		remaining_size -= size;
1065 		bio_advance(bio, size);
1066 	} while (unlikely(remaining_size));
1067 }
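/*
 * bio_copy_block() moves data between the bio and persistent memory one
 * bio_vec at a time.  Reads go through memcpy_mcsafe(), so a hardware memory
 * error is reported as -EIO on the bio (on architectures without MCSAFE
 * support this falls back to a plain memcpy); writes use memcpy_flushcache()
 * so the data is pushed out of the CPU cache before the next commit.
 */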
1068 
1069 static int writecache_flush_thread(void *data)
1070 {
1071 	struct dm_writecache *wc = data;
1072 
1073 	while (1) {
1074 		struct bio *bio;
1075 
1076 		wc_lock(wc);
1077 		bio = bio_list_pop(&wc->flush_list);
1078 		if (!bio) {
1079 			set_current_state(TASK_INTERRUPTIBLE);
1080 			wc_unlock(wc);
1081 
1082 			if (unlikely(kthread_should_stop())) {
1083 				set_current_state(TASK_RUNNING);
1084 				break;
1085 			}
1086 
1087 			schedule();
1088 			continue;
1089 		}
1090 
1091 		if (bio_op(bio) == REQ_OP_DISCARD) {
1092 			writecache_discard(wc, bio->bi_iter.bi_sector,
1093 					   bio_end_sector(bio));
1094 			wc_unlock(wc);
1095 			bio_set_dev(bio, wc->dev->bdev);
1096 			generic_make_request(bio);
1097 		} else {
1098 			writecache_flush(wc);
1099 			wc_unlock(wc);
1100 			if (writecache_has_error(wc))
1101 				bio->bi_status = BLK_STS_IOERR;
1102 			bio_endio(bio);
1103 		}
1104 	}
1105 
1106 	return 0;
1107 }
1108 
1109 static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
1110 {
1111 	if (bio_list_empty(&wc->flush_list))
1112 		wake_up_process(wc->flush_thread);
1113 	bio_list_add(&wc->flush_list, bio);
1114 }
1115 
1116 static int writecache_map(struct dm_target *ti, struct bio *bio)
1117 {
1118 	struct wc_entry *e;
1119 	struct dm_writecache *wc = ti->private;
1120 
1121 	bio->bi_private = NULL;
1122 
1123 	wc_lock(wc);
1124 
1125 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1126 		if (writecache_has_error(wc))
1127 			goto unlock_error;
1128 		if (WC_MODE_PMEM(wc)) {
1129 			writecache_flush(wc);
1130 			if (writecache_has_error(wc))
1131 				goto unlock_error;
1132 			goto unlock_submit;
1133 		} else {
1134 			writecache_offload_bio(wc, bio);
1135 			goto unlock_return;
1136 		}
1137 	}
1138 
1139 	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1140 
1141 	if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
1142 				(wc->block_size / 512 - 1)) != 0)) {
1143 		DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
1144 		      (unsigned long long)bio->bi_iter.bi_sector,
1145 		      bio->bi_iter.bi_size, wc->block_size);
1146 		goto unlock_error;
1147 	}
1148 
1149 	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
1150 		if (writecache_has_error(wc))
1151 			goto unlock_error;
1152 		if (WC_MODE_PMEM(wc)) {
1153 			writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
1154 			goto unlock_remap_origin;
1155 		} else {
1156 			writecache_offload_bio(wc, bio);
1157 			goto unlock_return;
1158 		}
1159 	}
1160 
1161 	if (bio_data_dir(bio) == READ) {
1162 read_next_block:
1163 		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1164 		if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
1165 			if (WC_MODE_PMEM(wc)) {
1166 				bio_copy_block(wc, bio, memory_data(wc, e));
1167 				if (bio->bi_iter.bi_size)
1168 					goto read_next_block;
1169 				goto unlock_submit;
1170 			} else {
1171 				dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1172 				bio_set_dev(bio, wc->ssd_dev->bdev);
1173 				bio->bi_iter.bi_sector = cache_sector(wc, e);
1174 				if (!writecache_entry_is_committed(wc, e))
1175 					writecache_wait_for_ios(wc, WRITE);
1176 				goto unlock_remap;
1177 			}
1178 		} else {
1179 			if (e) {
1180 				sector_t next_boundary =
1181 					read_original_sector(wc, e) - bio->bi_iter.bi_sector;
1182 				if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
1183 					dm_accept_partial_bio(bio, next_boundary);
1184 				}
1185 			}
1186 			goto unlock_remap_origin;
1187 		}
1188 	} else {
1189 		do {
1190 			if (writecache_has_error(wc))
1191 				goto unlock_error;
1192 			e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
1193 			if (e) {
1194 				if (!writecache_entry_is_committed(wc, e))
1195 					goto bio_copy;
1196 				if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
1197 					wc->overwrote_committed = true;
1198 					goto bio_copy;
1199 				}
1200 			}
1201 			e = writecache_pop_from_freelist(wc);
1202 			if (unlikely(!e)) {
1203 				writecache_wait_on_freelist(wc);
1204 				continue;
1205 			}
1206 			write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
1207 			writecache_insert_entry(wc, e);
1208 			wc->uncommitted_blocks++;
1209 bio_copy:
1210 			if (WC_MODE_PMEM(wc)) {
1211 				bio_copy_block(wc, bio, memory_data(wc, e));
1212 			} else {
1213 				dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1214 				bio_set_dev(bio, wc->ssd_dev->bdev);
1215 				bio->bi_iter.bi_sector = cache_sector(wc, e);
1216 				if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
1217 					wc->uncommitted_blocks = 0;
1218 					queue_work(wc->writeback_wq, &wc->flush_work);
1219 				} else {
1220 					writecache_schedule_autocommit(wc);
1221 				}
1222 				goto unlock_remap;
1223 			}
1224 		} while (bio->bi_iter.bi_size);
1225 
1226 		if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks))
1227 			writecache_flush(wc);
1228 		else
1229 			writecache_schedule_autocommit(wc);
1230 		goto unlock_submit;
1231 	}
1232 
1233 unlock_remap_origin:
1234 	bio_set_dev(bio, wc->dev->bdev);
1235 	wc_unlock(wc);
1236 	return DM_MAPIO_REMAPPED;
1237 
1238 unlock_remap:
1239 	/* make sure that writecache_end_io decrements bio_in_progress: */
1240 	bio->bi_private = (void *)1;
1241 	atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
1242 	wc_unlock(wc);
1243 	return DM_MAPIO_REMAPPED;
1244 
1245 unlock_submit:
1246 	wc_unlock(wc);
1247 	bio_endio(bio);
1248 	return DM_MAPIO_SUBMITTED;
1249 
1250 unlock_return:
1251 	wc_unlock(wc);
1252 	return DM_MAPIO_SUBMITTED;
1253 
1254 unlock_error:
1255 	wc_unlock(wc);
1256 	bio_io_error(bio);
1257 	return DM_MAPIO_SUBMITTED;
1258 }
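/*
 * Summary of the map outcomes above: flushes and discards are handled inline
 * in pmem mode or offloaded to the flush thread in SSD mode; reads and writes
 * that hit the cache are either copied directly (pmem) or remapped to the
 * cache device (SSD), and everything else is remapped to the origin device.
 */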
1259 
1260 static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
1261 {
1262 	struct dm_writecache *wc = ti->private;
1263 
1264 	if (bio->bi_private != NULL) {
1265 		int dir = bio_data_dir(bio);
1266 		if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
1267 			if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
1268 				wake_up(&wc->bio_in_progress_wait[dir]);
1269 	}
1270 	return 0;
1271 }
1272 
1273 static int writecache_iterate_devices(struct dm_target *ti,
1274 				      iterate_devices_callout_fn fn, void *data)
1275 {
1276 	struct dm_writecache *wc = ti->private;
1277 
1278 	return fn(ti, wc->dev, 0, ti->len, data);
1279 }
1280 
1281 static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
1282 {
1283 	struct dm_writecache *wc = ti->private;
1284 
1285 	if (limits->logical_block_size < wc->block_size)
1286 		limits->logical_block_size = wc->block_size;
1287 
1288 	if (limits->physical_block_size < wc->block_size)
1289 		limits->physical_block_size = wc->block_size;
1290 
1291 	if (limits->io_min < wc->block_size)
1292 		limits->io_min = wc->block_size;
1293 }
1294 
1295 
1296 static void writecache_writeback_endio(struct bio *bio)
1297 {
1298 	struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
1299 	struct dm_writecache *wc = wb->wc;
1300 	unsigned long flags;
1301 
1302 	raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
1303 	if (unlikely(list_empty(&wc->endio_list)))
1304 		wake_up_process(wc->endio_thread);
1305 	list_add_tail(&wb->endio_entry, &wc->endio_list);
1306 	raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
1307 }
1308 
1309 static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
1310 {
1311 	struct copy_struct *c = ptr;
1312 	struct dm_writecache *wc = c->wc;
1313 
1314 	c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
1315 
1316 	raw_spin_lock_irq(&wc->endio_list_lock);
1317 	if (unlikely(list_empty(&wc->endio_list)))
1318 		wake_up_process(wc->endio_thread);
1319 	list_add_tail(&c->endio_entry, &wc->endio_list);
1320 	raw_spin_unlock_irq(&wc->endio_list_lock);
1321 }
1322 
1323 static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
1324 {
1325 	unsigned i;
1326 	struct writeback_struct *wb;
1327 	struct wc_entry *e;
1328 	unsigned long n_walked = 0;
1329 
1330 	do {
1331 		wb = list_entry(list->next, struct writeback_struct, endio_entry);
1332 		list_del(&wb->endio_entry);
1333 
1334 		if (unlikely(wb->bio.bi_status != BLK_STS_OK))
1335 			writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
1336 					"write error %d", wb->bio.bi_status);
1337 		i = 0;
1338 		do {
1339 			e = wb->wc_list[i];
1340 			BUG_ON(!e->write_in_progress);
1341 			e->write_in_progress = false;
1342 			INIT_LIST_HEAD(&e->lru);
1343 			if (!writecache_has_error(wc))
1344 				writecache_free_entry(wc, e);
1345 			BUG_ON(!wc->writeback_size);
1346 			wc->writeback_size--;
1347 			n_walked++;
1348 			if (unlikely(n_walked >= ENDIO_LATENCY)) {
1349 				writecache_commit_flushed(wc);
1350 				wc_unlock(wc);
1351 				wc_lock(wc);
1352 				n_walked = 0;
1353 			}
1354 		} while (++i < wb->wc_list_n);
1355 
1356 		if (wb->wc_list != wb->wc_list_inline)
1357 			kfree(wb->wc_list);
1358 		bio_put(&wb->bio);
1359 	} while (!list_empty(list));
1360 }
1361 
1362 static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
1363 {
1364 	struct copy_struct *c;
1365 	struct wc_entry *e;
1366 
1367 	do {
1368 		c = list_entry(list->next, struct copy_struct, endio_entry);
1369 		list_del(&c->endio_entry);
1370 
1371 		if (unlikely(c->error))
1372 			writecache_error(wc, c->error, "copy error");
1373 
1374 		e = c->e;
1375 		do {
1376 			BUG_ON(!e->write_in_progress);
1377 			e->write_in_progress = false;
1378 			INIT_LIST_HEAD(&e->lru);
1379 			if (!writecache_has_error(wc))
1380 				writecache_free_entry(wc, e);
1381 
1382 			BUG_ON(!wc->writeback_size);
1383 			wc->writeback_size--;
1384 			e++;
1385 		} while (--c->n_entries);
1386 		mempool_free(c, &wc->copy_pool);
1387 	} while (!list_empty(list));
1388 }
1389 
1390 static int writecache_endio_thread(void *data)
1391 {
1392 	struct dm_writecache *wc = data;
1393 
1394 	while (1) {
1395 		struct list_head list;
1396 
1397 		raw_spin_lock_irq(&wc->endio_list_lock);
1398 		if (!list_empty(&wc->endio_list))
1399 			goto pop_from_list;
1400 		set_current_state(TASK_INTERRUPTIBLE);
1401 		raw_spin_unlock_irq(&wc->endio_list_lock);
1402 
1403 		if (unlikely(kthread_should_stop())) {
1404 			set_current_state(TASK_RUNNING);
1405 			break;
1406 		}
1407 
1408 		schedule();
1409 
1410 		continue;
1411 
1412 pop_from_list:
1413 		list = wc->endio_list;
1414 		list.next->prev = list.prev->next = &list;
1415 		INIT_LIST_HEAD(&wc->endio_list);
1416 		raw_spin_unlock_irq(&wc->endio_list_lock);
1417 
1418 		if (!WC_MODE_FUA(wc))
1419 			writecache_disk_flush(wc, wc->dev);
1420 
1421 		wc_lock(wc);
1422 
1423 		if (WC_MODE_PMEM(wc)) {
1424 			__writecache_endio_pmem(wc, &list);
1425 		} else {
1426 			__writecache_endio_ssd(wc, &list);
1427 			writecache_wait_for_ios(wc, READ);
1428 		}
1429 
1430 		writecache_commit_flushed(wc);
1431 
1432 		wc_unlock(wc);
1433 	}
1434 
1435 	return 0;
1436 }
1437 
1438 static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
1439 {
1440 	struct dm_writecache *wc = wb->wc;
1441 	unsigned block_size = wc->block_size;
1442 	void *address = memory_data(wc, e);
1443 
1444 	persistent_memory_flush_cache(address, block_size);
1445 	return bio_add_page(&wb->bio, persistent_memory_page(address),
1446 			    block_size, persistent_memory_page_offset(address)) != 0;
1447 }
1448 
1449 struct writeback_list {
1450 	struct list_head list;
1451 	size_t size;
1452 };
1453 
1454 static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
1455 {
1456 	if (unlikely(wc->max_writeback_jobs)) {
1457 		if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
1458 			wc_lock(wc);
1459 			while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
1460 				writecache_wait_on_freelist(wc);
1461 			wc_unlock(wc);
1462 		}
1463 	}
1464 	cond_resched();
1465 }
1466 
1467 static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
1468 {
1469 	struct wc_entry *e, *f;
1470 	struct bio *bio;
1471 	struct writeback_struct *wb;
1472 	unsigned max_pages;
1473 
1474 	while (wbl->size) {
1475 		wbl->size--;
1476 		e = container_of(wbl->list.prev, struct wc_entry, lru);
1477 		list_del(&e->lru);
1478 
1479 		max_pages = e->wc_list_contiguous;
1480 
1481 		bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
1482 		wb = container_of(bio, struct writeback_struct, bio);
1483 		wb->wc = wc;
1484 		wb->bio.bi_end_io = writecache_writeback_endio;
1485 		bio_set_dev(&wb->bio, wc->dev->bdev);
1486 		wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
1487 		wb->page_offset = PAGE_SIZE;
1488 		if (max_pages <= WB_LIST_INLINE ||
1489 		    unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
1490 							   GFP_NOIO | __GFP_NORETRY |
1491 							   __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
1492 			wb->wc_list = wb->wc_list_inline;
1493 			max_pages = WB_LIST_INLINE;
1494 		}
1495 
1496 		BUG_ON(!wc_add_block(wb, e, GFP_NOIO));
1497 
1498 		wb->wc_list[0] = e;
1499 		wb->wc_list_n = 1;
1500 
1501 		while (wbl->size && wb->wc_list_n < max_pages) {
1502 			f = container_of(wbl->list.prev, struct wc_entry, lru);
1503 			if (read_original_sector(wc, f) !=
1504 			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1505 				break;
1506 			if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
1507 				break;
1508 			wbl->size--;
1509 			list_del(&f->lru);
1510 			wb->wc_list[wb->wc_list_n++] = f;
1511 			e = f;
1512 		}
1513 		bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
1514 		if (writecache_has_error(wc)) {
1515 			bio->bi_status = BLK_STS_IOERR;
1516 			bio_endio(&wb->bio);
1517 		} else {
1518 			submit_bio(&wb->bio);
1519 		}
1520 
1521 		__writeback_throttle(wc, wbl);
1522 	}
1523 }
1524 
1525 static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
1526 {
1527 	struct wc_entry *e, *f;
1528 	struct dm_io_region from, to;
1529 	struct copy_struct *c;
1530 
1531 	while (wbl->size) {
1532 		unsigned n_sectors;
1533 
1534 		wbl->size--;
1535 		e = container_of(wbl->list.prev, struct wc_entry, lru);
1536 		list_del(&e->lru);
1537 
1538 		n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
1539 
1540 		from.bdev = wc->ssd_dev->bdev;
1541 		from.sector = cache_sector(wc, e);
1542 		from.count = n_sectors;
1543 		to.bdev = wc->dev->bdev;
1544 		to.sector = read_original_sector(wc, e);
1545 		to.count = n_sectors;
1546 
1547 		c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
1548 		c->wc = wc;
1549 		c->e = e;
1550 		c->n_entries = e->wc_list_contiguous;
1551 
1552 		while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
1553 			wbl->size--;
1554 			f = container_of(wbl->list.prev, struct wc_entry, lru);
1555 			BUG_ON(f != e + 1);
1556 			list_del(&f->lru);
1557 			e = f;
1558 		}
1559 
1560 		dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
1561 
1562 		__writeback_throttle(wc, wbl);
1563 	}
1564 }
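/*
 * SSD writeback uses dm-kcopyd: wc_list_contiguous entries that are adjacent
 * both in the cache and on the origin are collapsed into a single copy of
 * n_sectors, which is why writecache_writeback() only batches entries with
 * g == f + 1 in this mode.
 */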
1565 
1566 static void writecache_writeback(struct work_struct *work)
1567 {
1568 	struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
1569 	struct blk_plug plug;
1570 	struct wc_entry *e, *f, *g;
1571 	struct rb_node *node, *next_node;
1572 	struct list_head skipped;
1573 	struct writeback_list wbl;
1574 	unsigned long n_walked;
1575 
1576 	wc_lock(wc);
1577 restart:
1578 	if (writecache_has_error(wc)) {
1579 		wc_unlock(wc);
1580 		return;
1581 	}
1582 
1583 	if (unlikely(wc->writeback_all)) {
1584 		if (writecache_wait_for_writeback(wc))
1585 			goto restart;
1586 	}
1587 
1588 	if (wc->overwrote_committed) {
1589 		writecache_wait_for_ios(wc, WRITE);
1590 	}
1591 
1592 	n_walked = 0;
1593 	INIT_LIST_HEAD(&skipped);
1594 	INIT_LIST_HEAD(&wbl.list);
1595 	wbl.size = 0;
1596 	while (!list_empty(&wc->lru) &&
1597 	       (wc->writeback_all ||
1598 		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) {
1599 
1600 		n_walked++;
1601 		if (unlikely(n_walked > WRITEBACK_LATENCY) &&
1602 		    likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) {
1603 			queue_work(wc->writeback_wq, &wc->writeback_work);
1604 			break;
1605 		}
1606 
1607 		e = container_of(wc->lru.prev, struct wc_entry, lru);
1608 		BUG_ON(e->write_in_progress);
1609 		if (unlikely(!writecache_entry_is_committed(wc, e))) {
1610 			writecache_flush(wc);
1611 		}
1612 		node = rb_prev(&e->rb_node);
1613 		if (node) {
1614 			f = container_of(node, struct wc_entry, rb_node);
1615 			if (unlikely(read_original_sector(wc, f) ==
1616 				     read_original_sector(wc, e))) {
1617 				BUG_ON(!f->write_in_progress);
1618 				list_del(&e->lru);
1619 				list_add(&e->lru, &skipped);
1620 				cond_resched();
1621 				continue;
1622 			}
1623 		}
1624 		wc->writeback_size++;
1625 		list_del(&e->lru);
1626 		list_add(&e->lru, &wbl.list);
1627 		wbl.size++;
1628 		e->write_in_progress = true;
1629 		e->wc_list_contiguous = 1;
1630 
1631 		f = e;
1632 
1633 		while (1) {
1634 			next_node = rb_next(&f->rb_node);
1635 			if (unlikely(!next_node))
1636 				break;
1637 			g = container_of(next_node, struct wc_entry, rb_node);
1638 			if (read_original_sector(wc, g) ==
1639 			    read_original_sector(wc, f)) {
1640 				f = g;
1641 				continue;
1642 			}
1643 			if (read_original_sector(wc, g) !=
1644 			    read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
1645 				break;
1646 			if (unlikely(g->write_in_progress))
1647 				break;
1648 			if (unlikely(!writecache_entry_is_committed(wc, g)))
1649 				break;
1650 
1651 			if (!WC_MODE_PMEM(wc)) {
1652 				if (g != f + 1)
1653 					break;
1654 			}
1655 
1656 			n_walked++;
1657 			//if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
1658 			//	break;
1659 
1660 			wc->writeback_size++;
1661 			list_del(&g->lru);
1662 			list_add(&g->lru, &wbl.list);
1663 			wbl.size++;
1664 			g->write_in_progress = true;
1665 			g->wc_list_contiguous = BIO_MAX_PAGES;
1666 			f = g;
1667 			e->wc_list_contiguous++;
1668 			if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES))
1669 				break;
1670 		}
1671 		cond_resched();
1672 	}
1673 
1674 	if (!list_empty(&skipped)) {
1675 		list_splice_tail(&skipped, &wc->lru);
1676 		/*
1677 		 * If we didn't do any progress, we must wait until some
1678 		 * writeback finishes to avoid burning CPU in a loop
1679 		 */
1680 		if (unlikely(!wbl.size))
1681 			writecache_wait_for_writeback(wc);
1682 	}
1683 
1684 	wc_unlock(wc);
1685 
1686 	blk_start_plug(&plug);
1687 
1688 	if (WC_MODE_PMEM(wc))
1689 		__writecache_writeback_pmem(wc, &wbl);
1690 	else
1691 		__writecache_writeback_ssd(wc, &wbl);
1692 
1693 	blk_finish_plug(&plug);
1694 
1695 	if (unlikely(wc->writeback_all)) {
1696 		wc_lock(wc);
1697 		while (writecache_wait_for_writeback(wc));
1698 		wc_unlock(wc);
1699 	}
1700 }
1701 
1702 static int calculate_memory_size(uint64_t device_size, unsigned block_size,
1703 				 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
1704 {
1705 	uint64_t n_blocks, offset;
1706 	struct wc_entry e;
1707 
1708 	n_blocks = device_size;
1709 	do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
1710 
1711 	while (1) {
1712 		if (!n_blocks)
1713 			return -ENOSPC;
1714 		/* Verify that entries[n_blocks] in the offsetof() below won't overflow */
1715 		if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
1716 				 sizeof(struct wc_memory_entry)))
1717 			return -EFBIG;
1718 		offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
1719 		offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
1720 		if (offset + n_blocks * block_size <= device_size)
1721 			break;
1722 		n_blocks--;
1723 	}
1724 
1725 	/* check if the bit field overflows */
1726 	e.index = n_blocks;
1727 	if (e.index != n_blocks)
1728 		return -EFBIG;
1729 
1730 	if (n_blocks_p)
1731 		*n_blocks_p = n_blocks;
1732 	if (n_metadata_blocks_p)
1733 		*n_metadata_blocks_p = offset >> __ffs(block_size);
1734 	return 0;
1735 }
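/*
 * Rough worked example (assuming a 1 GiB cache device and 4096-byte blocks):
 * each block costs 4096 bytes of data plus a 16-byte wc_memory_entry, so the
 * initial estimate is 1073741824 / 4112 = 261123 blocks.  The metadata then
 * occupies 64 + 16 * 261123 bytes, rounded up to 4182016 bytes (1021 blocks),
 * and 4182016 + 261123 * 4096 = 1073741824 fits exactly, so the loop accepts
 * 261123 data blocks and 1021 metadata blocks.
 */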
1736 
1737 static int init_memory(struct dm_writecache *wc)
1738 {
1739 	size_t b;
1740 	int r;
1741 
1742 	r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
1743 	if (r)
1744 		return r;
1745 
1746 	r = writecache_alloc_entries(wc);
1747 	if (r)
1748 		return r;
1749 
1750 	for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
1751 		pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
1752 	pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
1753 	pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
1754 	pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
1755 	pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
1756 
1757 	for (b = 0; b < wc->n_blocks; b++)
1758 		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
1759 
1760 	writecache_flush_all_metadata(wc);
1761 	writecache_commit_flushed(wc);
1762 	pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
1763 	writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
1764 	writecache_commit_flushed(wc);
1765 
1766 	return 0;
1767 }
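/*
 * Note the ordering in init_memory(): every field and entry is written and
 * committed before the magic number, and the magic is committed separately
 * afterwards.  If initialization is interrupted, the partially written
 * superblock simply fails the magic check the next time the target is
 * created, instead of being treated as valid.
 */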
1768 
1769 static void writecache_dtr(struct dm_target *ti)
1770 {
1771 	struct dm_writecache *wc = ti->private;
1772 
1773 	if (!wc)
1774 		return;
1775 
1776 	if (wc->endio_thread)
1777 		kthread_stop(wc->endio_thread);
1778 
1779 	if (wc->flush_thread)
1780 		kthread_stop(wc->flush_thread);
1781 
1782 	bioset_exit(&wc->bio_set);
1783 
1784 	mempool_exit(&wc->copy_pool);
1785 
1786 	if (wc->writeback_wq)
1787 		destroy_workqueue(wc->writeback_wq);
1788 
1789 	if (wc->dev)
1790 		dm_put_device(ti, wc->dev);
1791 
1792 	if (wc->ssd_dev)
1793 		dm_put_device(ti, wc->ssd_dev);
1794 
1795 	if (wc->entries)
1796 		vfree(wc->entries);
1797 
1798 	if (wc->memory_map) {
1799 		if (WC_MODE_PMEM(wc))
1800 			persistent_memory_release(wc);
1801 		else
1802 			vfree(wc->memory_map);
1803 	}
1804 
1805 	if (wc->dm_kcopyd)
1806 		dm_kcopyd_client_destroy(wc->dm_kcopyd);
1807 
1808 	if (wc->dm_io)
1809 		dm_io_client_destroy(wc->dm_io);
1810 
1811 	if (wc->dirty_bitmap)
1812 		vfree(wc->dirty_bitmap);
1813 
1814 	kfree(wc);
1815 }
1816 
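/*
 * Constructor arguments: <p|s> <origin device> <cache device> <block size>
 * <number of optional args> [optional args...], parsed below.  Example table
 * line (length and device paths are only illustrative):
 *
 *	0 409600 writecache p /dev/vg/origin /dev/pmem0 4096 2 high_watermark 60
 */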
1817 static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
1818 {
1819 	struct dm_writecache *wc;
1820 	struct dm_arg_set as;
1821 	const char *string;
1822 	unsigned opt_params;
1823 	size_t offset, data_size;
1824 	int i, r;
1825 	char dummy;
1826 	int high_wm_percent = HIGH_WATERMARK;
1827 	int low_wm_percent = LOW_WATERMARK;
1828 	uint64_t x;
1829 	struct wc_memory_superblock s;
1830 
1831 	static struct dm_arg _args[] = {
1832 		{0, 10, "Invalid number of feature args"},
1833 	};
1834 
1835 	as.argc = argc;
1836 	as.argv = argv;
1837 
1838 	wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
1839 	if (!wc) {
1840 		ti->error = "Cannot allocate writecache structure";
1841 		r = -ENOMEM;
1842 		goto bad;
1843 	}
1844 	ti->private = wc;
1845 	wc->ti = ti;
1846 
1847 	mutex_init(&wc->lock);
1848 	writecache_poison_lists(wc);
1849 	init_waitqueue_head(&wc->freelist_wait);
1850 	timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
1851 
1852 	for (i = 0; i < 2; i++) {
1853 		atomic_set(&wc->bio_in_progress[i], 0);
1854 		init_waitqueue_head(&wc->bio_in_progress_wait[i]);
1855 	}
1856 
1857 	wc->dm_io = dm_io_client_create();
1858 	if (IS_ERR(wc->dm_io)) {
1859 		r = PTR_ERR(wc->dm_io);
1860 		ti->error = "Unable to allocate dm-io client";
1861 		wc->dm_io = NULL;
1862 		goto bad;
1863 	}
1864 
1865 	wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
1866 	if (!wc->writeback_wq) {
1867 		r = -ENOMEM;
1868 		ti->error = "Could not allocate writeback workqueue";
1869 		goto bad;
1870 	}
1871 	INIT_WORK(&wc->writeback_work, writecache_writeback);
1872 	INIT_WORK(&wc->flush_work, writecache_flush_work);
1873 
1874 	raw_spin_lock_init(&wc->endio_list_lock);
1875 	INIT_LIST_HEAD(&wc->endio_list);
1876 	wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
1877 	if (IS_ERR(wc->endio_thread)) {
1878 		r = PTR_ERR(wc->endio_thread);
1879 		wc->endio_thread = NULL;
1880 		ti->error = "Couldn't spawn endio thread";
1881 		goto bad;
1882 	}
1883 	wake_up_process(wc->endio_thread);
1884 
1885 	/*
1886 	 * Parse the mode (pmem or ssd)
1887 	 */
1888 	string = dm_shift_arg(&as);
1889 	if (!string)
1890 		goto bad_arguments;
1891 
1892 	if (!strcasecmp(string, "s")) {
1893 		wc->pmem_mode = false;
1894 	} else if (!strcasecmp(string, "p")) {
1895 #ifdef DM_WRITECACHE_HAS_PMEM
1896 		wc->pmem_mode = true;
1897 		wc->writeback_fua = true;
1898 #else
1899 		/*
1900 		 * If the architecture doesn't support persistent memory or
1901 		 * the kernel doesn't support any DAX drivers, this driver can
1902 		 * only be used in SSD-only mode.
1903 		 */
1904 		r = -EOPNOTSUPP;
1905 		ti->error = "Persistent memory or DAX not supported on this system";
1906 		goto bad;
1907 #endif
1908 	} else {
1909 		goto bad_arguments;
1910 	}
1911 
1912 	if (WC_MODE_PMEM(wc)) {
1913 		r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
1914 				offsetof(struct writeback_struct, bio),
1915 				BIOSET_NEED_BVECS);
1916 		if (r) {
1917 			ti->error = "Could not allocate bio set";
1918 			goto bad;
1919 		}
1920 	} else {
1921 		r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
1922 		if (r) {
1923 			ti->error = "Could not allocate mempool";
1924 			goto bad;
1925 		}
1926 	}
1927 
1928 	/*
1929 	 * Parse the origin data device
1930 	 */
1931 	string = dm_shift_arg(&as);
1932 	if (!string)
1933 		goto bad_arguments;
1934 	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
1935 	if (r) {
1936 		ti->error = "Origin data device lookup failed";
1937 		goto bad;
1938 	}
1939 
1940 	/*
1941 	 * Parse cache data device (be it pmem or ssd)
1942 	 */
1943 	string = dm_shift_arg(&as);
1944 	if (!string)
1945 		goto bad_arguments;
1946 
1947 	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
1948 	if (r) {
1949 		ti->error = "Cache data device lookup failed";
1950 		goto bad;
1951 	}
1952 	wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
1953 
1954 	/*
1955 	 * Parse the cache block size
1956 	 */
1957 	string = dm_shift_arg(&as);
1958 	if (!string)
1959 		goto bad_arguments;
1960 	if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
1961 	    wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
1962 	    (wc->block_size & (wc->block_size - 1))) {
1963 		r = -EINVAL;
1964 		ti->error = "Invalid block size";
1965 		goto bad;
1966 	}
1967 	wc->block_size_bits = __ffs(wc->block_size);
1968 
1969 	wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
1970 	wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
1971 	wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
1972 
1973 	/*
1974 	 * Parse optional arguments
1975 	 */
1976 	r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
1977 	if (r)
1978 		goto bad;
1979 
1980 	while (opt_params) {
1981 		string = dm_shift_arg(&as), opt_params--;
1982 		if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
1983 			unsigned long long start_sector;
1984 			string = dm_shift_arg(&as), opt_params--;
1985 			if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
1986 				goto invalid_optional;
1987 			wc->start_sector = start_sector;
1988 			if (wc->start_sector != start_sector ||
1989 			    wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
1990 				goto invalid_optional;
1991 		} else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
1992 			string = dm_shift_arg(&as), opt_params--;
1993 			if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
1994 				goto invalid_optional;
1995 			if (high_wm_percent < 0 || high_wm_percent > 100)
1996 				goto invalid_optional;
1997 			wc->high_wm_percent_set = true;
1998 		} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
1999 			string = dm_shift_arg(&as), opt_params--;
2000 			if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
2001 				goto invalid_optional;
2002 			if (low_wm_percent < 0 || low_wm_percent > 100)
2003 				goto invalid_optional;
2004 			wc->low_wm_percent_set = true;
2005 		} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
2006 			string = dm_shift_arg(&as), opt_params--;
2007 			if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
2008 				goto invalid_optional;
2009 			wc->max_writeback_jobs_set = true;
2010 		} else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
2011 			string = dm_shift_arg(&as), opt_params--;
2012 			if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
2013 				goto invalid_optional;
2014 			wc->autocommit_blocks_set = true;
2015 		} else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
2016 			unsigned autocommit_msecs;
2017 			string = dm_shift_arg(&as), opt_params--;
2018 			if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
2019 				goto invalid_optional;
2020 			if (autocommit_msecs > 3600000)
2021 				goto invalid_optional;
2022 			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
2023 			wc->autocommit_time_set = true;
2024 		} else if (!strcasecmp(string, "fua")) {
2025 			if (WC_MODE_PMEM(wc)) {
2026 				wc->writeback_fua = true;
2027 				wc->writeback_fua_set = true;
2028 			} else goto invalid_optional;
2029 		} else if (!strcasecmp(string, "nofua")) {
2030 			if (WC_MODE_PMEM(wc)) {
2031 				wc->writeback_fua = false;
2032 				wc->writeback_fua_set = true;
2033 			} else goto invalid_optional;
2034 		} else {
2035 invalid_optional:
2036 			r = -EINVAL;
2037 			ti->error = "Invalid optional argument";
2038 			goto bad;
2039 		}
2040 	}
2041 
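	/* Both watermarks are percentages of used cache space. */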
2042 	if (high_wm_percent < low_wm_percent) {
2043 		r = -EINVAL;
2044 		ti->error = "High watermark must be greater than or equal to low watermark";
2045 		goto bad;
2046 	}
2047 
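	/*
	 * Pmem mode maps the whole cache device via DAX in
	 * persistent_memory_claim().  SSD mode keeps an in-core copy of the
	 * on-disk metadata in a vmalloc()ed buffer and uses a dirty bitmap
	 * (one bit per BITMAP_GRANULARITY bytes) to decide which parts need
	 * to be written back on commit.
	 */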
2048 	if (WC_MODE_PMEM(wc)) {
2049 		r = persistent_memory_claim(wc);
2050 		if (r) {
2051 			ti->error = "Unable to map persistent memory for cache";
2052 			goto bad;
2053 		}
2054 	} else {
2055 		struct dm_io_region region;
2056 		struct dm_io_request req;
2057 		size_t n_blocks, n_metadata_blocks;
2058 		uint64_t n_bitmap_bits;
2059 
2060 		wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
2061 
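		/*
		 * In SSD mode, flush and discard bios are deferred to a
		 * separate kernel thread that services flush_list.
		 */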
2062 		bio_list_init(&wc->flush_list);
2063 		wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
2064 		if (IS_ERR(wc->flush_thread)) {
2065 			r = PTR_ERR(wc->flush_thread);
2066 			wc->flush_thread = NULL;
2067 			ti->error = "Couldn't spawn flush thread";
2068 			goto bad;
2069 		}
2070 		wake_up_process(wc->flush_thread);
2071 
2072 		r = calculate_memory_size(wc->memory_map_size, wc->block_size,
2073 					  &n_blocks, &n_metadata_blocks);
2074 		if (r) {
2075 			ti->error = "Invalid device size";
2076 			goto bad;
2077 		}
2078 
2079 		n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
2080 				 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
2081 		/* this is a limitation of the test_bit functions */
2082 		if (n_bitmap_bits > 1U << 31) {
2083 			r = -EFBIG;
2084 			ti->error = "Invalid device size";
2085 			goto bad;
2086 		}
2087 
2088 		wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
2089 		if (!wc->memory_map) {
2090 			r = -ENOMEM;
2091 			ti->error = "Unable to allocate memory for metadata";
2092 			goto bad;
2093 		}
2094 
2095 		wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2096 		if (IS_ERR(wc->dm_kcopyd)) {
2097 			r = PTR_ERR(wc->dm_kcopyd);
2098 			ti->error = "Unable to allocate dm-kcopyd client";
2099 			wc->dm_kcopyd = NULL;
2100 			goto bad;
2101 		}
2102 
2103 		wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
2104 		wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
2105 			BITS_PER_LONG * sizeof(unsigned long);
2106 		wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
2107 		if (!wc->dirty_bitmap) {
2108 			r = -ENOMEM;
2109 			ti->error = "Unable to allocate dirty bitmap";
2110 			goto bad;
2111 		}
2112 
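		/* Synchronously read the on-disk metadata into the in-core copy. */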
2113 		region.bdev = wc->ssd_dev->bdev;
2114 		region.sector = wc->start_sector;
2115 		region.count = wc->metadata_sectors;
2116 		req.bi_op = REQ_OP_READ;
2117 		req.bi_op_flags = REQ_SYNC;
2118 		req.mem.type = DM_IO_VMA;
2119 		req.mem.ptr.vma = (char *)wc->memory_map;
2120 		req.client = wc->dm_io;
2121 		req.notify.fn = NULL;
2122 
2123 		r = dm_io(&req, 1, &region, NULL);
2124 		if (r) {
2125 			ti->error = "Unable to read metadata";
2126 			goto bad;
2127 		}
2128 	}
2129 
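	/*
	 * Read the superblock.  memcpy_mcsafe() reports uncorrectable memory
	 * errors instead of crashing when persistent memory is backing the
	 * cache; without hardware error handling it is a plain memcpy.
	 */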
2130 	r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
2131 	if (r) {
2132 		ti->error = "Hardware memory error when reading superblock";
2133 		goto bad;
2134 	}
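	/* An all-zero superblock means the cache device has not been initialized yet. */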
2135 	if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
2136 		r = init_memory(wc);
2137 		if (r) {
2138 			ti->error = "Unable to initialize device";
2139 			goto bad;
2140 		}
2141 		r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
2142 		if (r) {
2143 			ti->error = "Hardware memory error when reading superblock";
2144 			goto bad;
2145 		}
2146 	}
2147 
2148 	if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
2149 		ti->error = "Invalid magic in the superblock";
2150 		r = -EINVAL;
2151 		goto bad;
2152 	}
2153 
2154 	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
2155 		ti->error = "Invalid version in the superblock";
2156 		r = -EINVAL;
2157 		goto bad;
2158 	}
2159 
2160 	if (le32_to_cpu(s.block_size) != wc->block_size) {
2161 		ti->error = "Block size does not match superblock";
2162 		r = -EINVAL;
2163 		goto bad;
2164 	}
2165 
2166 	wc->n_blocks = le64_to_cpu(s.n_blocks);
2167 
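	/*
	 * Compute where the data blocks start: the superblock plus one
	 * wc_memory_entry per block, rounded up to the block size.  Every
	 * step is checked for arithmetic overflow.
	 */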
2168 	offset = wc->n_blocks * sizeof(struct wc_memory_entry);
2169 	if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
2170 overflow:
2171 		ti->error = "Overflow in size calculation";
2172 		r = -EINVAL;
2173 		goto bad;
2174 	}
2175 	offset += sizeof(struct wc_memory_superblock);
2176 	if (offset < sizeof(struct wc_memory_superblock))
2177 		goto overflow;
2178 	offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
2179 	data_size = wc->n_blocks * (size_t)wc->block_size;
2180 	if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
2181 	    (offset + data_size < offset))
2182 		goto overflow;
2183 	if (offset + data_size > wc->memory_map_size) {
2184 		ti->error = "Memory area is too small";
2185 		r = -EINVAL;
2186 		goto bad;
2187 	}
2188 
2189 	wc->metadata_sectors = offset >> SECTOR_SHIFT;
2190 	wc->block_start = (char *)sb(wc) + offset;
2191 
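	/*
	 * Convert the watermarks from "percent of cache used" to absolute
	 * free-block counts, rounded to the nearest block:
	 *   freelist_watermark = n_blocks * (100 - wm_percent) / 100
	 */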
2192 	x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
2193 	x += 50;
2194 	do_div(x, 100);
2195 	wc->freelist_high_watermark = x;
2196 	x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
2197 	x += 50;
2198 	do_div(x, 100);
2199 	wc->freelist_low_watermark = x;
2200 
2201 	r = writecache_alloc_entries(wc);
2202 	if (r) {
2203 		ti->error = "Cannot allocate memory";
2204 		goto bad;
2205 	}
2206 
2207 	ti->num_flush_bios = 1;
2208 	ti->flush_supported = true;
2209 	ti->num_discard_bios = 1;
2210 
2211 	if (WC_MODE_PMEM(wc))
2212 		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
2213 
2214 	return 0;
2215 
2216 bad_arguments:
2217 	r = -EINVAL;
2218 	ti->error = "Bad arguments";
2219 bad:
2220 	writecache_dtr(ti);
2221 	return r;
2222 }
2223 
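/*
 * STATUSTYPE_INFO reports the error state, the total number of cache blocks,
 * the number of free blocks and the number of blocks under writeback.
 * STATUSTYPE_TABLE reconstructs the constructor arguments, including any
 * optional parameters that were explicitly set.
 */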
2224 static void writecache_status(struct dm_target *ti, status_type_t type,
2225 			      unsigned status_flags, char *result, unsigned maxlen)
2226 {
2227 	struct dm_writecache *wc = ti->private;
2228 	unsigned extra_args;
2229 	unsigned sz = 0;
2230 	uint64_t x;
2231 
2232 	switch (type) {
2233 	case STATUSTYPE_INFO:
2234 		DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc),
2235 		       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
2236 		       (unsigned long long)wc->writeback_size);
2237 		break;
2238 	case STATUSTYPE_TABLE:
2239 		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
2240 				wc->dev->name, wc->ssd_dev->name, wc->block_size);
2241 		extra_args = 0;
2242 		if (wc->start_sector)
2243 			extra_args += 2;
2244 		if (wc->high_wm_percent_set)
2245 			extra_args += 2;
2246 		if (wc->low_wm_percent_set)
2247 			extra_args += 2;
2248 		if (wc->max_writeback_jobs_set)
2249 			extra_args += 2;
2250 		if (wc->autocommit_blocks_set)
2251 			extra_args += 2;
2252 		if (wc->autocommit_time_set)
2253 			extra_args += 2;
2254 		if (wc->writeback_fua_set)
2255 			extra_args++;
2256 
2257 		DMEMIT("%u", extra_args);
2258 		if (wc->start_sector)
2259 			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
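		/*
		 * Convert the free-block watermarks back to the "percent used"
		 * values given to the constructor, rounding to the nearest
		 * percent.
		 */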
2260 		if (wc->high_wm_percent_set) {
2261 			x = (uint64_t)wc->freelist_high_watermark * 100;
2262 			x += wc->n_blocks / 2;
2263 			do_div(x, (size_t)wc->n_blocks);
2264 			DMEMIT(" high_watermark %u", 100 - (unsigned)x);
2265 		}
2266 		if (wc->low_wm_percent_set) {
2267 			x = (uint64_t)wc->freelist_low_watermark * 100;
2268 			x += wc->n_blocks / 2;
2269 			do_div(x, (size_t)wc->n_blocks);
2270 			DMEMIT(" low_watermark %u", 100 - (unsigned)x);
2271 		}
2272 		if (wc->max_writeback_jobs_set)
2273 			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
2274 		if (wc->autocommit_blocks_set)
2275 			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
2276 		if (wc->autocommit_time_set)
2277 			DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
2278 		if (wc->writeback_fua_set)
2279 			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
2280 		break;
2281 	}
2282 }
2283 
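/*
 * Device-mapper target definition wiring the writecache callbacks into the
 * dm core.
 */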
2284 static struct target_type writecache_target = {
2285 	.name			= "writecache",
2286 	.version		= {1, 1, 1},
2287 	.module			= THIS_MODULE,
2288 	.ctr			= writecache_ctr,
2289 	.dtr			= writecache_dtr,
2290 	.status			= writecache_status,
2291 	.postsuspend		= writecache_suspend,
2292 	.resume			= writecache_resume,
2293 	.message		= writecache_message,
2294 	.map			= writecache_map,
2295 	.end_io			= writecache_end_io,
2296 	.iterate_devices	= writecache_iterate_devices,
2297 	.io_hints		= writecache_io_hints,
2298 };
2299 
2300 static int __init dm_writecache_init(void)
2301 {
2302 	int r;
2303 
2304 	r = dm_register_target(&writecache_target);
2305 	if (r < 0) {
2306 		DMERR("register failed %d", r);
2307 		return r;
2308 	}
2309 
2310 	return 0;
2311 }
2312 
2313 static void __exit dm_writecache_exit(void)
2314 {
2315 	dm_unregister_target(&writecache_target);
2316 }
2317 
2318 module_init(dm_writecache_init);
2319 module_exit(dm_writecache_exit);
2320 
2321 MODULE_DESCRIPTION(DM_NAME " writecache target");
2322 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2323 MODULE_LICENSE("GPL");
2324