xref: /openbmc/linux/drivers/md/dm-writecache.c (revision 82df5b73)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2018 Red Hat. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7 
8 #include <linux/device-mapper.h>
9 #include <linux/module.h>
10 #include <linux/init.h>
11 #include <linux/vmalloc.h>
12 #include <linux/kthread.h>
13 #include <linux/dm-io.h>
14 #include <linux/dm-kcopyd.h>
15 #include <linux/dax.h>
16 #include <linux/pfn_t.h>
17 #include <linux/libnvdimm.h>
18 
19 #define DM_MSG_PREFIX "writecache"
20 
21 #define HIGH_WATERMARK			50
22 #define LOW_WATERMARK			45
23 #define MAX_WRITEBACK_JOBS		0
24 #define ENDIO_LATENCY			16
25 #define WRITEBACK_LATENCY		64
26 #define AUTOCOMMIT_BLOCKS_SSD		65536
27 #define AUTOCOMMIT_BLOCKS_PMEM		64
28 #define AUTOCOMMIT_MSEC			1000
29 #define MAX_AGE_DIV			16
30 #define MAX_AGE_UNSPECIFIED		-1UL
31 
32 #define BITMAP_GRANULARITY	65536
33 #if BITMAP_GRANULARITY < PAGE_SIZE
34 #undef BITMAP_GRANULARITY
35 #define BITMAP_GRANULARITY	PAGE_SIZE
36 #endif
37 
38 #if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
39 #define DM_WRITECACHE_HAS_PMEM
40 #endif
41 
42 #ifdef DM_WRITECACHE_HAS_PMEM
43 #define pmem_assign(dest, src)					\
44 do {								\
45 	typeof(dest) uniq = (src);				\
46 	memcpy_flushcache(&(dest), &uniq, sizeof(dest));	\
47 } while (0)
48 #else
49 #define pmem_assign(dest, src)	((dest) = (src))
50 #endif
51 
52 #if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM)
53 #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
54 #endif
55 
56 #define MEMORY_SUPERBLOCK_MAGIC		0x23489321
57 #define MEMORY_SUPERBLOCK_VERSION	1
58 
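/*
 * On-media metadata layout: a struct wc_memory_superblock followed by one
 * struct wc_memory_entry per cache block.  In persistent-memory mode these
 * structures are accessed in place; in SSD mode they are kept in a vmalloc-ed
 * shadow copy and written back to the cache device through the dirty bitmap.
 */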
59 struct wc_memory_entry {
60 	__le64 original_sector;
61 	__le64 seq_count;
62 };
63 
64 struct wc_memory_superblock {
65 	union {
66 		struct {
67 			__le32 magic;
68 			__le32 version;
69 			__le32 block_size;
70 			__le32 pad;
71 			__le64 n_blocks;
72 			__le64 seq_count;
73 		};
74 		__le64 padding[8];
75 	};
76 	struct wc_memory_entry entries[];
77 };
78 
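/*
 * In-core descriptor of one cache block.  Entries in use are linked into the
 * main rb-tree (keyed by the cached original sector) and the lru list; free
 * entries sit on the freelist, or in a second rb-tree when the freelist is
 * kept sorted.  With DM_WRITECACHE_HANDLE_HARDWARE_ERRORS the original sector
 * and sequence count are mirrored here, so the hot paths do not have to read
 * them from possibly failing persistent memory.
 */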
79 struct wc_entry {
80 	struct rb_node rb_node;
81 	struct list_head lru;
82 	unsigned short wc_list_contiguous;
83 	bool write_in_progress
84 #if BITS_PER_LONG == 64
85 		:1
86 #endif
87 	;
88 	unsigned long index
89 #if BITS_PER_LONG == 64
90 		:47
91 #endif
92 	;
93 	unsigned long age;
94 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
95 	uint64_t original_sector;
96 	uint64_t seq_count;
97 #endif
98 };
99 
100 #ifdef DM_WRITECACHE_HAS_PMEM
101 #define WC_MODE_PMEM(wc)			((wc)->pmem_mode)
102 #define WC_MODE_FUA(wc)				((wc)->writeback_fua)
103 #else
104 #define WC_MODE_PMEM(wc)			false
105 #define WC_MODE_FUA(wc)				false
106 #endif
107 #define WC_MODE_SORT_FREELIST(wc)		(!WC_MODE_PMEM(wc))
108 
109 struct dm_writecache {
110 	struct mutex lock;
111 	struct list_head lru;
112 	union {
113 		struct list_head freelist;
114 		struct {
115 			struct rb_root freetree;
116 			struct wc_entry *current_free;
117 		};
118 	};
119 	struct rb_root tree;
120 
121 	size_t freelist_size;
122 	size_t writeback_size;
123 	size_t freelist_high_watermark;
124 	size_t freelist_low_watermark;
125 	unsigned long max_age;
126 
127 	unsigned uncommitted_blocks;
128 	unsigned autocommit_blocks;
129 	unsigned max_writeback_jobs;
130 
131 	int error;
132 
133 	unsigned long autocommit_jiffies;
134 	struct timer_list autocommit_timer;
135 	struct wait_queue_head freelist_wait;
136 
137 	struct timer_list max_age_timer;
138 
139 	atomic_t bio_in_progress[2];
140 	struct wait_queue_head bio_in_progress_wait[2];
141 
142 	struct dm_target *ti;
143 	struct dm_dev *dev;
144 	struct dm_dev *ssd_dev;
145 	sector_t start_sector;
146 	void *memory_map;
147 	uint64_t memory_map_size;
148 	size_t metadata_sectors;
149 	size_t n_blocks;
150 	uint64_t seq_count;
151 	void *block_start;
152 	struct wc_entry *entries;
153 	unsigned block_size;
154 	unsigned char block_size_bits;
155 
156 	bool pmem_mode:1;
157 	bool writeback_fua:1;
158 
159 	bool overwrote_committed:1;
160 	bool memory_vmapped:1;
161 
162 	bool high_wm_percent_set:1;
163 	bool low_wm_percent_set:1;
164 	bool max_writeback_jobs_set:1;
165 	bool autocommit_blocks_set:1;
166 	bool autocommit_time_set:1;
167 	bool writeback_fua_set:1;
168 	bool flush_on_suspend:1;
169 	bool cleaner:1;
170 
171 	unsigned writeback_all;
172 	struct workqueue_struct *writeback_wq;
173 	struct work_struct writeback_work;
174 	struct work_struct flush_work;
175 
176 	struct dm_io_client *dm_io;
177 
178 	raw_spinlock_t endio_list_lock;
179 	struct list_head endio_list;
180 	struct task_struct *endio_thread;
181 
182 	struct task_struct *flush_thread;
183 	struct bio_list flush_list;
184 
185 	struct dm_kcopyd_client *dm_kcopyd;
186 	unsigned long *dirty_bitmap;
187 	unsigned dirty_bitmap_size;
188 
189 	struct bio_set bio_set;
190 	mempool_t copy_pool;
191 };
192 
193 #define WB_LIST_INLINE		16
194 
195 struct writeback_struct {
196 	struct list_head endio_entry;
197 	struct dm_writecache *wc;
198 	struct wc_entry **wc_list;
199 	unsigned wc_list_n;
200 	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
201 	struct bio bio;
202 };
203 
204 struct copy_struct {
205 	struct list_head endio_entry;
206 	struct dm_writecache *wc;
207 	struct wc_entry *e;
208 	unsigned n_entries;
209 	int error;
210 };
211 
212 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
213 					    "A percentage of time allocated for data copying");
214 
215 static void wc_lock(struct dm_writecache *wc)
216 {
217 	mutex_lock(&wc->lock);
218 }
219 
220 static void wc_unlock(struct dm_writecache *wc)
221 {
222 	mutex_unlock(&wc->lock);
223 }
224 
225 #ifdef DM_WRITECACHE_HAS_PMEM
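/*
 * Map the whole cache device through DAX.  If dax_direct_access() cannot
 * return the complete range as one contiguous mapping, collect the individual
 * pages and vmap() them instead.
 */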
226 static int persistent_memory_claim(struct dm_writecache *wc)
227 {
228 	int r;
229 	loff_t s;
230 	long p, da;
231 	pfn_t pfn;
232 	int id;
233 	struct page **pages;
234 
235 	wc->memory_vmapped = false;
236 
237 	s = wc->memory_map_size;
238 	p = s >> PAGE_SHIFT;
239 	if (!p) {
240 		r = -EINVAL;
241 		goto err1;
242 	}
243 	if (p != s >> PAGE_SHIFT) {
244 		r = -EOVERFLOW;
245 		goto err1;
246 	}
247 
248 	id = dax_read_lock();
249 
250 	da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn);
251 	if (da < 0) {
252 		wc->memory_map = NULL;
253 		r = da;
254 		goto err2;
255 	}
256 	if (!pfn_t_has_page(pfn)) {
257 		wc->memory_map = NULL;
258 		r = -EOPNOTSUPP;
259 		goto err2;
260 	}
261 	if (da != p) {
262 		long i;
263 		wc->memory_map = NULL;
264 		pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
265 		if (!pages) {
266 			r = -ENOMEM;
267 			goto err2;
268 		}
269 		i = 0;
270 		do {
271 			long daa;
272 			daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i,
273 						NULL, &pfn);
274 			if (daa <= 0) {
275 				r = daa ? daa : -EINVAL;
276 				goto err3;
277 			}
278 			if (!pfn_t_has_page(pfn)) {
279 				r = -EOPNOTSUPP;
280 				goto err3;
281 			}
282 			while (daa-- && i < p) {
283 				pages[i++] = pfn_t_to_page(pfn);
284 				pfn.val++;
285 			}
286 		} while (i < p);
287 		wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
288 		if (!wc->memory_map) {
289 			r = -ENOMEM;
290 			goto err3;
291 		}
292 		kvfree(pages);
293 		wc->memory_vmapped = true;
294 	}
295 
296 	dax_read_unlock(id);
297 
298 	wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
299 	wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;
300 
301 	return 0;
302 err3:
303 	kvfree(pages);
304 err2:
305 	dax_read_unlock(id);
306 err1:
307 	return r;
308 }
309 #else
310 static int persistent_memory_claim(struct dm_writecache *wc)
311 {
312 	BUG();
313 }
314 #endif
315 
316 static void persistent_memory_release(struct dm_writecache *wc)
317 {
318 	if (wc->memory_vmapped)
319 		vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
320 }
321 
322 static struct page *persistent_memory_page(void *addr)
323 {
324 	if (is_vmalloc_addr(addr))
325 		return vmalloc_to_page(addr);
326 	else
327 		return virt_to_page(addr);
328 }
329 
330 static unsigned persistent_memory_page_offset(void *addr)
331 {
332 	return (unsigned long)addr & (PAGE_SIZE - 1);
333 }
334 
335 static void persistent_memory_flush_cache(void *ptr, size_t size)
336 {
337 	if (is_vmalloc_addr(ptr))
338 		flush_kernel_vmap_range(ptr, size);
339 }
340 
341 static void persistent_memory_invalidate_cache(void *ptr, size_t size)
342 {
343 	if (is_vmalloc_addr(ptr))
344 		invalidate_kernel_vmap_range(ptr, size);
345 }
346 
347 static struct wc_memory_superblock *sb(struct dm_writecache *wc)
348 {
349 	return wc->memory_map;
350 }
351 
352 static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
353 {
354 	return &sb(wc)->entries[e->index];
355 }
356 
357 static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
358 {
359 	return (char *)wc->block_start + (e->index << wc->block_size_bits);
360 }
361 
362 static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
363 {
364 	return wc->start_sector + wc->metadata_sectors +
365 		((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
366 }
367 
368 static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
369 {
370 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
371 	return e->original_sector;
372 #else
373 	return le64_to_cpu(memory_entry(wc, e)->original_sector);
374 #endif
375 }
376 
377 static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
378 {
379 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
380 	return e->seq_count;
381 #else
382 	return le64_to_cpu(memory_entry(wc, e)->seq_count);
383 #endif
384 }
385 
386 static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
387 {
388 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
389 	e->seq_count = -1;
390 #endif
391 	pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
392 }
393 
394 static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
395 					    uint64_t original_sector, uint64_t seq_count)
396 {
397 	struct wc_memory_entry me;
398 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
399 	e->original_sector = original_sector;
400 	e->seq_count = seq_count;
401 #endif
402 	me.original_sector = cpu_to_le64(original_sector);
403 	me.seq_count = cpu_to_le64(seq_count);
404 	pmem_assign(*memory_entry(wc, e), me);
405 }
406 
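/*
 * Record only the first error (later ones are ignored) and wake up anybody
 * sleeping on the freelist so that stalled writers notice the failure.
 */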
407 #define writecache_error(wc, err, msg, arg...)				\
408 do {									\
409 	if (!cmpxchg(&(wc)->error, 0, err))				\
410 		DMERR(msg, ##arg);					\
411 	wake_up(&(wc)->freelist_wait);					\
412 } while (0)
413 
414 #define writecache_has_error(wc)	(unlikely(READ_ONCE((wc)->error)))
415 
416 static void writecache_flush_all_metadata(struct dm_writecache *wc)
417 {
418 	if (!WC_MODE_PMEM(wc))
419 		memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
420 }
421 
422 static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
423 {
424 	if (!WC_MODE_PMEM(wc))
425 		__set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
426 			  wc->dirty_bitmap);
427 }
428 
429 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
430 
431 struct io_notify {
432 	struct dm_writecache *wc;
433 	struct completion c;
434 	atomic_t count;
435 };
436 
437 static void writecache_notify_io(unsigned long error, void *context)
438 {
439 	struct io_notify *endio = context;
440 
441 	if (unlikely(error != 0))
442 		writecache_error(endio->wc, -EIO, "error writing metadata");
443 	BUG_ON(atomic_read(&endio->count) <= 0);
444 	if (atomic_dec_and_test(&endio->count))
445 		complete(&endio->c);
446 }
447 
448 static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
449 {
450 	wait_event(wc->bio_in_progress_wait[direction],
451 		   !atomic_read(&wc->bio_in_progress[direction]));
452 }
453 
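/*
 * SSD mode only: write every metadata region marked in the dirty bitmap back
 * to the cache device, optionally wait for in-flight cache writes and then
 * issue a disk flush.
 */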
454 static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
455 {
456 	struct dm_io_region region;
457 	struct dm_io_request req;
458 	struct io_notify endio = {
459 		wc,
460 		COMPLETION_INITIALIZER_ONSTACK(endio.c),
461 		ATOMIC_INIT(1),
462 	};
463 	unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
464 	unsigned i = 0;
465 
466 	while (1) {
467 		unsigned j;
468 		i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
469 		if (unlikely(i == bitmap_bits))
470 			break;
471 		j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
472 
473 		region.bdev = wc->ssd_dev->bdev;
474 		region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
475 		region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
476 
477 		if (unlikely(region.sector >= wc->metadata_sectors))
478 			break;
479 		if (unlikely(region.sector + region.count > wc->metadata_sectors))
480 			region.count = wc->metadata_sectors - region.sector;
481 
482 		region.sector += wc->start_sector;
483 		atomic_inc(&endio.count);
484 		req.bi_op = REQ_OP_WRITE;
485 		req.bi_op_flags = REQ_SYNC;
486 		req.mem.type = DM_IO_VMA;
487 		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
488 		req.client = wc->dm_io;
489 		req.notify.fn = writecache_notify_io;
490 		req.notify.context = &endio;
491 
492 		/* writing via async dm-io (implied by notify.fn above) won't return an error */
493 		(void) dm_io(&req, 1, &region, NULL);
494 		i = j;
495 	}
496 
497 	writecache_notify_io(0, &endio);
498 	wait_for_completion_io(&endio.c);
499 
500 	if (wait_for_ios)
501 		writecache_wait_for_ios(wc, WRITE);
502 
503 	writecache_disk_flush(wc, wc->ssd_dev);
504 
505 	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
506 }
507 
508 static void ssd_commit_superblock(struct dm_writecache *wc)
509 {
510 	int r;
511 	struct dm_io_region region;
512 	struct dm_io_request req;
513 
514 	region.bdev = wc->ssd_dev->bdev;
515 	region.sector = 0;
516 	region.count = PAGE_SIZE;
517 
518 	if (unlikely(region.sector + region.count > wc->metadata_sectors))
519 		region.count = wc->metadata_sectors - region.sector;
520 
521 	region.sector += wc->start_sector;
522 
523 	req.bi_op = REQ_OP_WRITE;
524 	req.bi_op_flags = REQ_SYNC | REQ_FUA;
525 	req.mem.type = DM_IO_VMA;
526 	req.mem.ptr.vma = (char *)wc->memory_map;
527 	req.client = wc->dm_io;
528 	req.notify.fn = NULL;
529 	req.notify.context = NULL;
530 
531 	r = dm_io(&req, 1, &region, NULL);
532 	if (unlikely(r))
533 		writecache_error(wc, r, "error writing superblock");
534 }
535 
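/*
 * In pmem mode a write barrier is sufficient; in SSD mode the dirty metadata
 * regions have to be written out.
 */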
536 static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
537 {
538 	if (WC_MODE_PMEM(wc))
539 		wmb();
540 	else
541 		ssd_commit_flushed(wc, wait_for_ios);
542 }
543 
544 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
545 {
546 	int r;
547 	struct dm_io_region region;
548 	struct dm_io_request req;
549 
550 	region.bdev = dev->bdev;
551 	region.sector = 0;
552 	region.count = 0;
553 	req.bi_op = REQ_OP_WRITE;
554 	req.bi_op_flags = REQ_PREFLUSH;
555 	req.mem.type = DM_IO_KMEM;
556 	req.mem.ptr.addr = NULL;
557 	req.client = wc->dm_io;
558 	req.notify.fn = NULL;
559 
560 	r = dm_io(&req, 1, &region, NULL);
561 	if (unlikely(r))
562 		writecache_error(wc, r, "error flushing metadata: %d", r);
563 }
564 
565 #define WFE_RETURN_FOLLOWING	1
566 #define WFE_LOWEST_SEQ		2
567 
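/*
 * Look up the entry for "block" in the rb-tree.  With WFE_RETURN_FOLLOWING the
 * next higher entry is returned when there is no exact match.  If several
 * entries exist for the same block, WFE_LOWEST_SEQ selects the oldest copy,
 * otherwise the newest one is returned.
 */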
568 static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
569 					      uint64_t block, int flags)
570 {
571 	struct wc_entry *e;
572 	struct rb_node *node = wc->tree.rb_node;
573 
574 	if (unlikely(!node))
575 		return NULL;
576 
577 	while (1) {
578 		e = container_of(node, struct wc_entry, rb_node);
579 		if (read_original_sector(wc, e) == block)
580 			break;
581 
582 		node = (read_original_sector(wc, e) >= block ?
583 			e->rb_node.rb_left : e->rb_node.rb_right);
584 		if (unlikely(!node)) {
585 			if (!(flags & WFE_RETURN_FOLLOWING))
586 				return NULL;
587 			if (read_original_sector(wc, e) >= block) {
588 				return e;
589 			} else {
590 				node = rb_next(&e->rb_node);
591 				if (unlikely(!node))
592 					return NULL;
593 				e = container_of(node, struct wc_entry, rb_node);
594 				return e;
595 			}
596 		}
597 	}
598 
599 	while (1) {
600 		struct wc_entry *e2;
601 		if (flags & WFE_LOWEST_SEQ)
602 			node = rb_prev(&e->rb_node);
603 		else
604 			node = rb_next(&e->rb_node);
605 		if (unlikely(!node))
606 			return e;
607 		e2 = container_of(node, struct wc_entry, rb_node);
608 		if (read_original_sector(wc, e2) != block)
609 			return e;
610 		e = e2;
611 	}
612 }
613 
614 static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
615 {
616 	struct wc_entry *e;
617 	struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
618 
619 	while (*node) {
620 		e = container_of(*node, struct wc_entry, rb_node);
621 		parent = &e->rb_node;
622 		if (read_original_sector(wc, e) > read_original_sector(wc, ins))
623 			node = &parent->rb_left;
624 		else
625 			node = &parent->rb_right;
626 	}
627 	rb_link_node(&ins->rb_node, parent, node);
628 	rb_insert_color(&ins->rb_node, &wc->tree);
629 	list_add(&ins->lru, &wc->lru);
630 	ins->age = jiffies;
631 }
632 
633 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
634 {
635 	list_del(&e->lru);
636 	rb_erase(&e->rb_node, &wc->tree);
637 }
638 
639 static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
640 {
641 	if (WC_MODE_SORT_FREELIST(wc)) {
642 		struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
643 		if (unlikely(!*node))
644 			wc->current_free = e;
645 		while (*node) {
646 			parent = *node;
647 			if (&e->rb_node < *node)
648 				node = &parent->rb_left;
649 			else
650 				node = &parent->rb_right;
651 		}
652 		rb_link_node(&e->rb_node, parent, node);
653 		rb_insert_color(&e->rb_node, &wc->freetree);
654 	} else {
655 		list_add_tail(&e->lru, &wc->freelist);
656 	}
657 	wc->freelist_size++;
658 }
659 
660 static inline void writecache_verify_watermark(struct dm_writecache *wc)
661 {
662 	if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
663 		queue_work(wc->writeback_wq, &wc->writeback_work);
664 }
665 
666 static void writecache_max_age_timer(struct timer_list *t)
667 {
668 	struct dm_writecache *wc = from_timer(wc, t, max_age_timer);
669 
670 	if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
671 		queue_work(wc->writeback_wq, &wc->writeback_work);
672 		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
673 	}
674 }
675 
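/*
 * Take an entry from the freelist.  In SSD mode the free entries are kept in
 * an rb-tree sorted by entry (and therefore cache block) order, so that
 * consecutive allocations can yield physically contiguous cache blocks; a
 * caller may pass expected_sector to insist on the next contiguous block and
 * gets NULL if that block is not free.
 */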
676 static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
677 {
678 	struct wc_entry *e;
679 
680 	if (WC_MODE_SORT_FREELIST(wc)) {
681 		struct rb_node *next;
682 		if (unlikely(!wc->current_free))
683 			return NULL;
684 		e = wc->current_free;
685 		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
686 			return NULL;
687 		next = rb_next(&e->rb_node);
688 		rb_erase(&e->rb_node, &wc->freetree);
689 		if (unlikely(!next))
690 			next = rb_first(&wc->freetree);
691 		wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
692 	} else {
693 		if (unlikely(list_empty(&wc->freelist)))
694 			return NULL;
695 		e = container_of(wc->freelist.next, struct wc_entry, lru);
696 		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
697 			return NULL;
698 		list_del(&e->lru);
699 	}
700 	wc->freelist_size--;
701 
702 	writecache_verify_watermark(wc);
703 
704 	return e;
705 }
706 
707 static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
708 {
709 	writecache_unlink(wc, e);
710 	writecache_add_to_freelist(wc, e);
711 	clear_seq_count(wc, e);
712 	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
713 	if (unlikely(waitqueue_active(&wc->freelist_wait)))
714 		wake_up(&wc->freelist_wait);
715 }
716 
717 static void writecache_wait_on_freelist(struct dm_writecache *wc)
718 {
719 	DEFINE_WAIT(wait);
720 
721 	prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
722 	wc_unlock(wc);
723 	io_schedule();
724 	finish_wait(&wc->freelist_wait, &wait);
725 	wc_lock(wc);
726 }
727 
728 static void writecache_poison_lists(struct dm_writecache *wc)
729 {
730 	/*
731 	 * Catch incorrect access to these values while the device is suspended.
732 	 */
733 	memset(&wc->tree, -1, sizeof wc->tree);
734 	wc->lru.next = LIST_POISON1;
735 	wc->lru.prev = LIST_POISON2;
736 	wc->freelist.next = LIST_POISON1;
737 	wc->freelist.prev = LIST_POISON2;
738 }
739 
740 static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
741 {
742 	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
743 	if (WC_MODE_PMEM(wc))
744 		writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
745 }
746 
747 static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
748 {
749 	return read_seq_count(wc, e) < wc->seq_count;
750 }
751 
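/*
 * Commit all uncommitted entries: flush their metadata (and their data in
 * pmem mode), increase the sequence counter, persist the superblock and
 * finally free older entries that were superseded by the ones just committed.
 */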
752 static void writecache_flush(struct dm_writecache *wc)
753 {
754 	struct wc_entry *e, *e2;
755 	bool need_flush_after_free;
756 
757 	wc->uncommitted_blocks = 0;
758 	del_timer(&wc->autocommit_timer);
759 
760 	if (list_empty(&wc->lru))
761 		return;
762 
763 	e = container_of(wc->lru.next, struct wc_entry, lru);
764 	if (writecache_entry_is_committed(wc, e)) {
765 		if (wc->overwrote_committed) {
766 			writecache_wait_for_ios(wc, WRITE);
767 			writecache_disk_flush(wc, wc->ssd_dev);
768 			wc->overwrote_committed = false;
769 		}
770 		return;
771 	}
772 	while (1) {
773 		writecache_flush_entry(wc, e);
774 		if (unlikely(e->lru.next == &wc->lru))
775 			break;
776 		e2 = container_of(e->lru.next, struct wc_entry, lru);
777 		if (writecache_entry_is_committed(wc, e2))
778 			break;
779 		e = e2;
780 		cond_resched();
781 	}
782 	writecache_commit_flushed(wc, true);
783 
784 	wc->seq_count++;
785 	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
786 	if (WC_MODE_PMEM(wc))
787 		writecache_commit_flushed(wc, false);
788 	else
789 		ssd_commit_superblock(wc);
790 
791 	wc->overwrote_committed = false;
792 
793 	need_flush_after_free = false;
794 	while (1) {
795 		/* Free another committed entry with lower seq-count */
796 		struct rb_node *rb_node = rb_prev(&e->rb_node);
797 
798 		if (rb_node) {
799 			e2 = container_of(rb_node, struct wc_entry, rb_node);
800 			if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
801 			    likely(!e2->write_in_progress)) {
802 				writecache_free_entry(wc, e2);
803 				need_flush_after_free = true;
804 			}
805 		}
806 		if (unlikely(e->lru.prev == &wc->lru))
807 			break;
808 		e = container_of(e->lru.prev, struct wc_entry, lru);
809 		cond_resched();
810 	}
811 
812 	if (need_flush_after_free)
813 		writecache_commit_flushed(wc, false);
814 }
815 
816 static void writecache_flush_work(struct work_struct *work)
817 {
818 	struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
819 
820 	wc_lock(wc);
821 	writecache_flush(wc);
822 	wc_unlock(wc);
823 }
824 
825 static void writecache_autocommit_timer(struct timer_list *t)
826 {
827 	struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
828 	if (!writecache_has_error(wc))
829 		queue_work(wc->writeback_wq, &wc->flush_work);
830 }
831 
832 static void writecache_schedule_autocommit(struct dm_writecache *wc)
833 {
834 	if (!timer_pending(&wc->autocommit_timer))
835 		mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
836 }
837 
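/*
 * Drop the cache entries that fall inside the discarded range [start, end).
 * In-flight bios are drained first so that a freed block is not still being
 * read or written.
 */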
838 static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
839 {
840 	struct wc_entry *e;
841 	bool discarded_something = false;
842 
843 	e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
844 	if (unlikely(!e))
845 		return;
846 
847 	while (read_original_sector(wc, e) < end) {
848 		struct rb_node *node = rb_next(&e->rb_node);
849 
850 		if (likely(!e->write_in_progress)) {
851 			if (!discarded_something) {
852 				writecache_wait_for_ios(wc, READ);
853 				writecache_wait_for_ios(wc, WRITE);
854 				discarded_something = true;
855 			}
856 			writecache_free_entry(wc, e);
857 		}
858 
859 		if (unlikely(!node))
860 			break;
861 
862 		e = container_of(node, struct wc_entry, rb_node);
863 	}
864 
865 	if (discarded_something)
866 		writecache_commit_flushed(wc, false);
867 }
868 
869 static bool writecache_wait_for_writeback(struct dm_writecache *wc)
870 {
871 	if (wc->writeback_size) {
872 		writecache_wait_on_freelist(wc);
873 		return true;
874 	}
875 	return false;
876 }
877 
878 static void writecache_suspend(struct dm_target *ti)
879 {
880 	struct dm_writecache *wc = ti->private;
881 	bool flush_on_suspend;
882 
883 	del_timer_sync(&wc->autocommit_timer);
884 	del_timer_sync(&wc->max_age_timer);
885 
886 	wc_lock(wc);
887 	writecache_flush(wc);
888 	flush_on_suspend = wc->flush_on_suspend;
889 	if (flush_on_suspend) {
890 		wc->flush_on_suspend = false;
891 		wc->writeback_all++;
892 		queue_work(wc->writeback_wq, &wc->writeback_work);
893 	}
894 	wc_unlock(wc);
895 
896 	drain_workqueue(wc->writeback_wq);
897 
898 	wc_lock(wc);
899 	if (flush_on_suspend)
900 		wc->writeback_all--;
901 	while (writecache_wait_for_writeback(wc));
902 
903 	if (WC_MODE_PMEM(wc))
904 		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
905 
906 	writecache_poison_lists(wc);
907 
908 	wc_unlock(wc);
909 }
910 
911 static int writecache_alloc_entries(struct dm_writecache *wc)
912 {
913 	size_t b;
914 
915 	if (wc->entries)
916 		return 0;
917 	wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
918 	if (!wc->entries)
919 		return -ENOMEM;
920 	for (b = 0; b < wc->n_blocks; b++) {
921 		struct wc_entry *e = &wc->entries[b];
922 		e->index = b;
923 		e->write_in_progress = false;
924 		cond_resched();
925 	}
926 
927 	return 0;
928 }
929 
930 static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors)
931 {
932 	struct dm_io_region region;
933 	struct dm_io_request req;
934 
935 	region.bdev = wc->ssd_dev->bdev;
936 	region.sector = wc->start_sector;
937 	region.count = n_sectors;
938 	req.bi_op = REQ_OP_READ;
939 	req.bi_op_flags = REQ_SYNC;
940 	req.mem.type = DM_IO_VMA;
941 	req.mem.ptr.vma = (char *)wc->memory_map;
942 	req.client = wc->dm_io;
943 	req.notify.fn = NULL;
944 
945 	return dm_io(&req, 1, &region, NULL);
946 }
947 
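/*
 * Rebuild the in-core state (rb-tree, lru list and freelist) from the
 * metadata.  Uncommitted entries are cleared and duplicates are resolved in
 * favour of the copy with the higher sequence count.
 */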
948 static void writecache_resume(struct dm_target *ti)
949 {
950 	struct dm_writecache *wc = ti->private;
951 	size_t b;
952 	bool need_flush = false;
953 	__le64 sb_seq_count;
954 	int r;
955 
956 	wc_lock(wc);
957 
958 	if (WC_MODE_PMEM(wc)) {
959 		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
960 	} else {
961 		r = writecache_read_metadata(wc, wc->metadata_sectors);
962 		if (r) {
963 			size_t sb_entries_offset;
964 			writecache_error(wc, r, "unable to read metadata: %d", r);
965 			sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
966 			memset((char *)wc->memory_map + sb_entries_offset, -1,
967 			       (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
968 		}
969 	}
970 
971 	wc->tree = RB_ROOT;
972 	INIT_LIST_HEAD(&wc->lru);
973 	if (WC_MODE_SORT_FREELIST(wc)) {
974 		wc->freetree = RB_ROOT;
975 		wc->current_free = NULL;
976 	} else {
977 		INIT_LIST_HEAD(&wc->freelist);
978 	}
979 	wc->freelist_size = 0;
980 
981 	r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
982 	if (r) {
983 		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
984 		sb_seq_count = cpu_to_le64(0);
985 	}
986 	wc->seq_count = le64_to_cpu(sb_seq_count);
987 
988 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
989 	for (b = 0; b < wc->n_blocks; b++) {
990 		struct wc_entry *e = &wc->entries[b];
991 		struct wc_memory_entry wme;
992 		if (writecache_has_error(wc)) {
993 			e->original_sector = -1;
994 			e->seq_count = -1;
995 			continue;
996 		}
997 		r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry));
998 		if (r) {
999 			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
1000 					 (unsigned long)b, r);
1001 			e->original_sector = -1;
1002 			e->seq_count = -1;
1003 		} else {
1004 			e->original_sector = le64_to_cpu(wme.original_sector);
1005 			e->seq_count = le64_to_cpu(wme.seq_count);
1006 		}
1007 		cond_resched();
1008 	}
1009 #endif
1010 	for (b = 0; b < wc->n_blocks; b++) {
1011 		struct wc_entry *e = &wc->entries[b];
1012 		if (!writecache_entry_is_committed(wc, e)) {
1013 			if (read_seq_count(wc, e) != -1) {
1014 erase_this:
1015 				clear_seq_count(wc, e);
1016 				need_flush = true;
1017 			}
1018 			writecache_add_to_freelist(wc, e);
1019 		} else {
1020 			struct wc_entry *old;
1021 
1022 			old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
1023 			if (!old) {
1024 				writecache_insert_entry(wc, e);
1025 			} else {
1026 				if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
1027 					writecache_error(wc, -EINVAL,
1028 						 "two identical entries, position %llu, sector %llu, sequence %llu",
1029 						 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
1030 						 (unsigned long long)read_seq_count(wc, e));
1031 				}
1032 				if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
1033 					goto erase_this;
1034 				} else {
1035 					writecache_free_entry(wc, old);
1036 					writecache_insert_entry(wc, e);
1037 					need_flush = true;
1038 				}
1039 			}
1040 		}
1041 		cond_resched();
1042 	}
1043 
1044 	if (need_flush) {
1045 		writecache_flush_all_metadata(wc);
1046 		writecache_commit_flushed(wc, false);
1047 	}
1048 
1049 	writecache_verify_watermark(wc);
1050 
1051 	if (wc->max_age != MAX_AGE_UNSPECIFIED)
1052 		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
1053 
1054 	wc_unlock(wc);
1055 }
1056 
1057 static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1058 {
1059 	if (argc != 1)
1060 		return -EINVAL;
1061 
1062 	wc_lock(wc);
1063 	if (dm_suspended(wc->ti)) {
1064 		wc_unlock(wc);
1065 		return -EBUSY;
1066 	}
1067 	if (writecache_has_error(wc)) {
1068 		wc_unlock(wc);
1069 		return -EIO;
1070 	}
1071 
1072 	writecache_flush(wc);
1073 	wc->writeback_all++;
1074 	queue_work(wc->writeback_wq, &wc->writeback_work);
1075 	wc_unlock(wc);
1076 
1077 	flush_workqueue(wc->writeback_wq);
1078 
1079 	wc_lock(wc);
1080 	wc->writeback_all--;
1081 	if (writecache_has_error(wc)) {
1082 		wc_unlock(wc);
1083 		return -EIO;
1084 	}
1085 	wc_unlock(wc);
1086 
1087 	return 0;
1088 }
1089 
1090 static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1091 {
1092 	if (argc != 1)
1093 		return -EINVAL;
1094 
1095 	wc_lock(wc);
1096 	wc->flush_on_suspend = true;
1097 	wc_unlock(wc);
1098 
1099 	return 0;
1100 }
1101 
1102 static void activate_cleaner(struct dm_writecache *wc)
1103 {
1104 	wc->flush_on_suspend = true;
1105 	wc->cleaner = true;
1106 	wc->freelist_high_watermark = wc->n_blocks;
1107 	wc->freelist_low_watermark = wc->n_blocks;
1108 }
1109 
1110 static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1111 {
1112 	if (argc != 1)
1113 		return -EINVAL;
1114 
1115 	wc_lock(wc);
1116 	activate_cleaner(wc);
1117 	if (!dm_suspended(wc->ti))
1118 		writecache_verify_watermark(wc);
1119 	wc_unlock(wc);
1120 
1121 	return 0;
1122 }
1123 
1124 static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
1125 			      char *result, unsigned maxlen)
1126 {
1127 	int r = -EINVAL;
1128 	struct dm_writecache *wc = ti->private;
1129 
1130 	if (!strcasecmp(argv[0], "flush"))
1131 		r = process_flush_mesg(argc, argv, wc);
1132 	else if (!strcasecmp(argv[0], "flush_on_suspend"))
1133 		r = process_flush_on_suspend_mesg(argc, argv, wc);
1134 	else if (!strcasecmp(argv[0], "cleaner"))
1135 		r = process_cleaner_mesg(argc, argv, wc);
1136 	else
1137 		DMERR("unrecognised message received: %s", argv[0]);
1138 
1139 	return r;
1140 }
1141 
1142 static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
1143 {
1144 	/*
1145 	 * clflushopt performs better with block size 1024, 2048, 4096
1146 	 * non-temporal stores perform better with block size 512
1147 	 *
1148 	 * block size   512             1024            2048            4096
1149 	 * movnti       496 MB/s        642 MB/s        725 MB/s        744 MB/s
1150 	 * clflushopt   373 MB/s        688 MB/s        1.1 GB/s        1.2 GB/s
1151 	 *
1152 	 * We see that movnti performs better for 512-byte blocks, and
1153 	 * clflushopt performs better for 1024-byte and larger blocks. So, we
1154 	 * prefer clflushopt for sizes >= 768.
1155 	 *
1156 	 * NOTE: this happens to be the case now (with dm-writecache's single
1157 	 * threaded model) but re-evaluate this once memcpy_flushcache() is
1158 	 * enabled to use movdir64b which might invalidate this performance
1159 	 * advantage seen with cache-allocating-writes plus flushing.
1160 	 */
1161 #ifdef CONFIG_X86
1162 	if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
1163 	    likely(boot_cpu_data.x86_clflush_size == 64) &&
1164 	    likely(size >= 768)) {
1165 		do {
1166 			memcpy((void *)dest, (void *)source, 64);
1167 			clflushopt((void *)dest);
1168 			dest += 64;
1169 			source += 64;
1170 			size -= 64;
1171 		} while (size >= 64);
1172 		return;
1173 	}
1174 #endif
1175 	memcpy_flushcache(dest, source, size);
1176 }
1177 
1178 static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
1179 {
1180 	void *buf;
1181 	unsigned long flags;
1182 	unsigned size;
1183 	int rw = bio_data_dir(bio);
1184 	unsigned remaining_size = wc->block_size;
1185 
1186 	do {
1187 		struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
1188 		buf = bvec_kmap_irq(&bv, &flags);
1189 		size = bv.bv_len;
1190 		if (unlikely(size > remaining_size))
1191 			size = remaining_size;
1192 
1193 		if (rw == READ) {
1194 			int r;
1195 			r = memcpy_mcsafe(buf, data, size);
1196 			flush_dcache_page(bio_page(bio));
1197 			if (unlikely(r)) {
1198 				writecache_error(wc, r, "hardware memory error when reading data: %d", r);
1199 				bio->bi_status = BLK_STS_IOERR;
1200 			}
1201 		} else {
1202 			flush_dcache_page(bio_page(bio));
1203 			memcpy_flushcache_optimized(data, buf, size);
1204 		}
1205 
1206 		bvec_kunmap_irq(buf, &flags);
1207 
1208 		data = (char *)data + size;
1209 		remaining_size -= size;
1210 		bio_advance(bio, size);
1211 	} while (unlikely(remaining_size));
1212 }
1213 
1214 static int writecache_flush_thread(void *data)
1215 {
1216 	struct dm_writecache *wc = data;
1217 
1218 	while (1) {
1219 		struct bio *bio;
1220 
1221 		wc_lock(wc);
1222 		bio = bio_list_pop(&wc->flush_list);
1223 		if (!bio) {
1224 			set_current_state(TASK_INTERRUPTIBLE);
1225 			wc_unlock(wc);
1226 
1227 			if (unlikely(kthread_should_stop())) {
1228 				set_current_state(TASK_RUNNING);
1229 				break;
1230 			}
1231 
1232 			schedule();
1233 			continue;
1234 		}
1235 
1236 		if (bio_op(bio) == REQ_OP_DISCARD) {
1237 			writecache_discard(wc, bio->bi_iter.bi_sector,
1238 					   bio_end_sector(bio));
1239 			wc_unlock(wc);
1240 			bio_set_dev(bio, wc->dev->bdev);
1241 			generic_make_request(bio);
1242 		} else {
1243 			writecache_flush(wc);
1244 			wc_unlock(wc);
1245 			if (writecache_has_error(wc))
1246 				bio->bi_status = BLK_STS_IOERR;
1247 			bio_endio(bio);
1248 		}
1249 	}
1250 
1251 	return 0;
1252 }
1253 
1254 static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
1255 {
1256 	if (bio_list_empty(&wc->flush_list))
1257 		wake_up_process(wc->flush_thread);
1258 	bio_list_add(&wc->flush_list, bio);
1259 }
1260 
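/*
 * The map function.  Flushes and discards are handled inline in pmem mode and
 * offloaded to the flush thread in SSD mode.  Reads are served from the cache
 * on a hit and remapped to the origin on a miss.  Writes allocate cache
 * blocks from the freelist (copying the data in pmem mode, remapping the bio
 * to the SSD otherwise); if the cache is full and the block is not already
 * cached, the write goes directly to the origin.
 */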
1261 static int writecache_map(struct dm_target *ti, struct bio *bio)
1262 {
1263 	struct wc_entry *e;
1264 	struct dm_writecache *wc = ti->private;
1265 
1266 	bio->bi_private = NULL;
1267 
1268 	wc_lock(wc);
1269 
1270 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1271 		if (writecache_has_error(wc))
1272 			goto unlock_error;
1273 		if (WC_MODE_PMEM(wc)) {
1274 			writecache_flush(wc);
1275 			if (writecache_has_error(wc))
1276 				goto unlock_error;
1277 			goto unlock_submit;
1278 		} else {
1279 			writecache_offload_bio(wc, bio);
1280 			goto unlock_return;
1281 		}
1282 	}
1283 
1284 	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1285 
1286 	if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
1287 				(wc->block_size / 512 - 1)) != 0)) {
1288 		DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
1289 		      (unsigned long long)bio->bi_iter.bi_sector,
1290 		      bio->bi_iter.bi_size, wc->block_size);
1291 		goto unlock_error;
1292 	}
1293 
1294 	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
1295 		if (writecache_has_error(wc))
1296 			goto unlock_error;
1297 		if (WC_MODE_PMEM(wc)) {
1298 			writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
1299 			goto unlock_remap_origin;
1300 		} else {
1301 			writecache_offload_bio(wc, bio);
1302 			goto unlock_return;
1303 		}
1304 	}
1305 
1306 	if (bio_data_dir(bio) == READ) {
1307 read_next_block:
1308 		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1309 		if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
1310 			if (WC_MODE_PMEM(wc)) {
1311 				bio_copy_block(wc, bio, memory_data(wc, e));
1312 				if (bio->bi_iter.bi_size)
1313 					goto read_next_block;
1314 				goto unlock_submit;
1315 			} else {
1316 				dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1317 				bio_set_dev(bio, wc->ssd_dev->bdev);
1318 				bio->bi_iter.bi_sector = cache_sector(wc, e);
1319 				if (!writecache_entry_is_committed(wc, e))
1320 					writecache_wait_for_ios(wc, WRITE);
1321 				goto unlock_remap;
1322 			}
1323 		} else {
1324 			if (e) {
1325 				sector_t next_boundary =
1326 					read_original_sector(wc, e) - bio->bi_iter.bi_sector;
1327 				if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
1328 					dm_accept_partial_bio(bio, next_boundary);
1329 				}
1330 			}
1331 			goto unlock_remap_origin;
1332 		}
1333 	} else {
1334 		do {
1335 			bool found_entry = false;
1336 			if (writecache_has_error(wc))
1337 				goto unlock_error;
1338 			e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
1339 			if (e) {
1340 				if (!writecache_entry_is_committed(wc, e))
1341 					goto bio_copy;
1342 				if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
1343 					wc->overwrote_committed = true;
1344 					goto bio_copy;
1345 				}
1346 				found_entry = true;
1347 			} else {
1348 				if (unlikely(wc->cleaner))
1349 					goto direct_write;
1350 			}
1351 			e = writecache_pop_from_freelist(wc, (sector_t)-1);
1352 			if (unlikely(!e)) {
1353 				if (!found_entry) {
1354 direct_write:
1355 					e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1356 					if (e) {
1357 						sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector;
1358 						BUG_ON(!next_boundary);
1359 						if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
1360 							dm_accept_partial_bio(bio, next_boundary);
1361 						}
1362 					}
1363 					goto unlock_remap_origin;
1364 				}
1365 				writecache_wait_on_freelist(wc);
1366 				continue;
1367 			}
1368 			write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
1369 			writecache_insert_entry(wc, e);
1370 			wc->uncommitted_blocks++;
1371 bio_copy:
1372 			if (WC_MODE_PMEM(wc)) {
1373 				bio_copy_block(wc, bio, memory_data(wc, e));
1374 			} else {
1375 				unsigned bio_size = wc->block_size;
1376 				sector_t start_cache_sec = cache_sector(wc, e);
1377 				sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
1378 
1379 				while (bio_size < bio->bi_iter.bi_size) {
1380 					struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
1381 					if (!f)
1382 						break;
1383 					write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
1384 									(bio_size >> SECTOR_SHIFT), wc->seq_count);
1385 					writecache_insert_entry(wc, f);
1386 					wc->uncommitted_blocks++;
1387 					bio_size += wc->block_size;
1388 					current_cache_sec += wc->block_size >> SECTOR_SHIFT;
1389 				}
1390 
1391 				bio_set_dev(bio, wc->ssd_dev->bdev);
1392 				bio->bi_iter.bi_sector = start_cache_sec;
1393 				dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
1394 
1395 				if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
1396 					wc->uncommitted_blocks = 0;
1397 					queue_work(wc->writeback_wq, &wc->flush_work);
1398 				} else {
1399 					writecache_schedule_autocommit(wc);
1400 				}
1401 				goto unlock_remap;
1402 			}
1403 		} while (bio->bi_iter.bi_size);
1404 
1405 		if (unlikely(bio->bi_opf & REQ_FUA ||
1406 			     wc->uncommitted_blocks >= wc->autocommit_blocks))
1407 			writecache_flush(wc);
1408 		else
1409 			writecache_schedule_autocommit(wc);
1410 		goto unlock_submit;
1411 	}
1412 
1413 unlock_remap_origin:
1414 	bio_set_dev(bio, wc->dev->bdev);
1415 	wc_unlock(wc);
1416 	return DM_MAPIO_REMAPPED;
1417 
1418 unlock_remap:
1419 	/* make sure that writecache_end_io decrements bio_in_progress: */
1420 	bio->bi_private = (void *)1;
1421 	atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
1422 	wc_unlock(wc);
1423 	return DM_MAPIO_REMAPPED;
1424 
1425 unlock_submit:
1426 	wc_unlock(wc);
1427 	bio_endio(bio);
1428 	return DM_MAPIO_SUBMITTED;
1429 
1430 unlock_return:
1431 	wc_unlock(wc);
1432 	return DM_MAPIO_SUBMITTED;
1433 
1434 unlock_error:
1435 	wc_unlock(wc);
1436 	bio_io_error(bio);
1437 	return DM_MAPIO_SUBMITTED;
1438 }
1439 
1440 static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
1441 {
1442 	struct dm_writecache *wc = ti->private;
1443 
1444 	if (bio->bi_private != NULL) {
1445 		int dir = bio_data_dir(bio);
1446 		if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
1447 			if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
1448 				wake_up(&wc->bio_in_progress_wait[dir]);
1449 	}
1450 	return 0;
1451 }
1452 
1453 static int writecache_iterate_devices(struct dm_target *ti,
1454 				      iterate_devices_callout_fn fn, void *data)
1455 {
1456 	struct dm_writecache *wc = ti->private;
1457 
1458 	return fn(ti, wc->dev, 0, ti->len, data);
1459 }
1460 
1461 static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
1462 {
1463 	struct dm_writecache *wc = ti->private;
1464 
1465 	if (limits->logical_block_size < wc->block_size)
1466 		limits->logical_block_size = wc->block_size;
1467 
1468 	if (limits->physical_block_size < wc->block_size)
1469 		limits->physical_block_size = wc->block_size;
1470 
1471 	if (limits->io_min < wc->block_size)
1472 		limits->io_min = wc->block_size;
1473 }
1474 
1475 
1476 static void writecache_writeback_endio(struct bio *bio)
1477 {
1478 	struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
1479 	struct dm_writecache *wc = wb->wc;
1480 	unsigned long flags;
1481 
1482 	raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
1483 	if (unlikely(list_empty(&wc->endio_list)))
1484 		wake_up_process(wc->endio_thread);
1485 	list_add_tail(&wb->endio_entry, &wc->endio_list);
1486 	raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
1487 }
1488 
1489 static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
1490 {
1491 	struct copy_struct *c = ptr;
1492 	struct dm_writecache *wc = c->wc;
1493 
1494 	c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
1495 
1496 	raw_spin_lock_irq(&wc->endio_list_lock);
1497 	if (unlikely(list_empty(&wc->endio_list)))
1498 		wake_up_process(wc->endio_thread);
1499 	list_add_tail(&c->endio_entry, &wc->endio_list);
1500 	raw_spin_unlock_irq(&wc->endio_list_lock);
1501 }
1502 
1503 static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
1504 {
1505 	unsigned i;
1506 	struct writeback_struct *wb;
1507 	struct wc_entry *e;
1508 	unsigned long n_walked = 0;
1509 
1510 	do {
1511 		wb = list_entry(list->next, struct writeback_struct, endio_entry);
1512 		list_del(&wb->endio_entry);
1513 
1514 		if (unlikely(wb->bio.bi_status != BLK_STS_OK))
1515 			writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
1516 					"write error %d", wb->bio.bi_status);
1517 		i = 0;
1518 		do {
1519 			e = wb->wc_list[i];
1520 			BUG_ON(!e->write_in_progress);
1521 			e->write_in_progress = false;
1522 			INIT_LIST_HEAD(&e->lru);
1523 			if (!writecache_has_error(wc))
1524 				writecache_free_entry(wc, e);
1525 			BUG_ON(!wc->writeback_size);
1526 			wc->writeback_size--;
1527 			n_walked++;
1528 			if (unlikely(n_walked >= ENDIO_LATENCY)) {
1529 				writecache_commit_flushed(wc, false);
1530 				wc_unlock(wc);
1531 				wc_lock(wc);
1532 				n_walked = 0;
1533 			}
1534 		} while (++i < wb->wc_list_n);
1535 
1536 		if (wb->wc_list != wb->wc_list_inline)
1537 			kfree(wb->wc_list);
1538 		bio_put(&wb->bio);
1539 	} while (!list_empty(list));
1540 }
1541 
1542 static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
1543 {
1544 	struct copy_struct *c;
1545 	struct wc_entry *e;
1546 
1547 	do {
1548 		c = list_entry(list->next, struct copy_struct, endio_entry);
1549 		list_del(&c->endio_entry);
1550 
1551 		if (unlikely(c->error))
1552 			writecache_error(wc, c->error, "copy error");
1553 
1554 		e = c->e;
1555 		do {
1556 			BUG_ON(!e->write_in_progress);
1557 			e->write_in_progress = false;
1558 			INIT_LIST_HEAD(&e->lru);
1559 			if (!writecache_has_error(wc))
1560 				writecache_free_entry(wc, e);
1561 
1562 			BUG_ON(!wc->writeback_size);
1563 			wc->writeback_size--;
1564 			e++;
1565 		} while (--c->n_entries);
1566 		mempool_free(c, &wc->copy_pool);
1567 	} while (!list_empty(list));
1568 }
1569 
1570 static int writecache_endio_thread(void *data)
1571 {
1572 	struct dm_writecache *wc = data;
1573 
1574 	while (1) {
1575 		struct list_head list;
1576 
1577 		raw_spin_lock_irq(&wc->endio_list_lock);
1578 		if (!list_empty(&wc->endio_list))
1579 			goto pop_from_list;
1580 		set_current_state(TASK_INTERRUPTIBLE);
1581 		raw_spin_unlock_irq(&wc->endio_list_lock);
1582 
1583 		if (unlikely(kthread_should_stop())) {
1584 			set_current_state(TASK_RUNNING);
1585 			break;
1586 		}
1587 
1588 		schedule();
1589 
1590 		continue;
1591 
1592 pop_from_list:
1593 		list = wc->endio_list;
1594 		list.next->prev = list.prev->next = &list;
1595 		INIT_LIST_HEAD(&wc->endio_list);
1596 		raw_spin_unlock_irq(&wc->endio_list_lock);
1597 
1598 		if (!WC_MODE_FUA(wc))
1599 			writecache_disk_flush(wc, wc->dev);
1600 
1601 		wc_lock(wc);
1602 
1603 		if (WC_MODE_PMEM(wc)) {
1604 			__writecache_endio_pmem(wc, &list);
1605 		} else {
1606 			__writecache_endio_ssd(wc, &list);
1607 			writecache_wait_for_ios(wc, READ);
1608 		}
1609 
1610 		writecache_commit_flushed(wc, false);
1611 
1612 		wc_unlock(wc);
1613 	}
1614 
1615 	return 0;
1616 }
1617 
1618 static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
1619 {
1620 	struct dm_writecache *wc = wb->wc;
1621 	unsigned block_size = wc->block_size;
1622 	void *address = memory_data(wc, e);
1623 
1624 	persistent_memory_flush_cache(address, block_size);
1625 	return bio_add_page(&wb->bio, persistent_memory_page(address),
1626 			    block_size, persistent_memory_page_offset(address)) != 0;
1627 }
1628 
1629 struct writeback_list {
1630 	struct list_head list;
1631 	size_t size;
1632 };
1633 
1634 static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
1635 {
1636 	if (unlikely(wc->max_writeback_jobs)) {
1637 		if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
1638 			wc_lock(wc);
1639 			while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
1640 				writecache_wait_on_freelist(wc);
1641 			wc_unlock(wc);
1642 		}
1643 	}
1644 	cond_resched();
1645 }
1646 
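/*
 * Persistent-memory writeback: build bios that point directly at the cache
 * pages and add as many contiguous entries to each bio as possible.
 */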
1647 static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
1648 {
1649 	struct wc_entry *e, *f;
1650 	struct bio *bio;
1651 	struct writeback_struct *wb;
1652 	unsigned max_pages;
1653 
1654 	while (wbl->size) {
1655 		wbl->size--;
1656 		e = container_of(wbl->list.prev, struct wc_entry, lru);
1657 		list_del(&e->lru);
1658 
1659 		max_pages = e->wc_list_contiguous;
1660 
1661 		bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
1662 		wb = container_of(bio, struct writeback_struct, bio);
1663 		wb->wc = wc;
1664 		bio->bi_end_io = writecache_writeback_endio;
1665 		bio_set_dev(bio, wc->dev->bdev);
1666 		bio->bi_iter.bi_sector = read_original_sector(wc, e);
1667 		if (max_pages <= WB_LIST_INLINE ||
1668 		    unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
1669 							   GFP_NOIO | __GFP_NORETRY |
1670 							   __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
1671 			wb->wc_list = wb->wc_list_inline;
1672 			max_pages = WB_LIST_INLINE;
1673 		}
1674 
1675 		BUG_ON(!wc_add_block(wb, e, GFP_NOIO));
1676 
1677 		wb->wc_list[0] = e;
1678 		wb->wc_list_n = 1;
1679 
1680 		while (wbl->size && wb->wc_list_n < max_pages) {
1681 			f = container_of(wbl->list.prev, struct wc_entry, lru);
1682 			if (read_original_sector(wc, f) !=
1683 			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1684 				break;
1685 			if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
1686 				break;
1687 			wbl->size--;
1688 			list_del(&f->lru);
1689 			wb->wc_list[wb->wc_list_n++] = f;
1690 			e = f;
1691 		}
1692 		bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
1693 		if (writecache_has_error(wc)) {
1694 			bio->bi_status = BLK_STS_IOERR;
1695 			bio_endio(bio);
1696 		} else {
1697 			submit_bio(bio);
1698 		}
1699 
1700 		__writeback_throttle(wc, wbl);
1701 	}
1702 }
1703 
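/*
 * SSD writeback: copy contiguous runs of cache blocks to the origin device
 * with dm-kcopyd.
 */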
1704 static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
1705 {
1706 	struct wc_entry *e, *f;
1707 	struct dm_io_region from, to;
1708 	struct copy_struct *c;
1709 
1710 	while (wbl->size) {
1711 		unsigned n_sectors;
1712 
1713 		wbl->size--;
1714 		e = container_of(wbl->list.prev, struct wc_entry, lru);
1715 		list_del(&e->lru);
1716 
1717 		n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
1718 
1719 		from.bdev = wc->ssd_dev->bdev;
1720 		from.sector = cache_sector(wc, e);
1721 		from.count = n_sectors;
1722 		to.bdev = wc->dev->bdev;
1723 		to.sector = read_original_sector(wc, e);
1724 		to.count = n_sectors;
1725 
1726 		c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
1727 		c->wc = wc;
1728 		c->e = e;
1729 		c->n_entries = e->wc_list_contiguous;
1730 
1731 		while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
1732 			wbl->size--;
1733 			f = container_of(wbl->list.prev, struct wc_entry, lru);
1734 			BUG_ON(f != e + 1);
1735 			list_del(&f->lru);
1736 			e = f;
1737 		}
1738 
1739 		dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
1740 
1741 		__writeback_throttle(wc, wbl);
1742 	}
1743 }
1744 
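/*
 * Writeback worker: walk the lru list from the oldest end (or the rb-tree
 * when a complete writeback was requested), collect committed entries, extend
 * each of them with physically contiguous neighbours and hand the resulting
 * list to the pmem or SSD writeback routine.
 */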
1745 static void writecache_writeback(struct work_struct *work)
1746 {
1747 	struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
1748 	struct blk_plug plug;
1749 	struct wc_entry *f, *uninitialized_var(g), *e = NULL;
1750 	struct rb_node *node, *next_node;
1751 	struct list_head skipped;
1752 	struct writeback_list wbl;
1753 	unsigned long n_walked;
1754 
1755 	wc_lock(wc);
1756 restart:
1757 	if (writecache_has_error(wc)) {
1758 		wc_unlock(wc);
1759 		return;
1760 	}
1761 
1762 	if (unlikely(wc->writeback_all)) {
1763 		if (writecache_wait_for_writeback(wc))
1764 			goto restart;
1765 	}
1766 
1767 	if (wc->overwrote_committed) {
1768 		writecache_wait_for_ios(wc, WRITE);
1769 	}
1770 
1771 	n_walked = 0;
1772 	INIT_LIST_HEAD(&skipped);
1773 	INIT_LIST_HEAD(&wbl.list);
1774 	wbl.size = 0;
1775 	while (!list_empty(&wc->lru) &&
1776 	       (wc->writeback_all ||
1777 		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
1778 		(jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
1779 		 wc->max_age - wc->max_age / MAX_AGE_DIV))) {
1780 
1781 		n_walked++;
1782 		if (unlikely(n_walked > WRITEBACK_LATENCY) &&
1783 		    likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) {
1784 			queue_work(wc->writeback_wq, &wc->writeback_work);
1785 			break;
1786 		}
1787 
1788 		if (unlikely(wc->writeback_all)) {
1789 			if (unlikely(!e)) {
1790 				writecache_flush(wc);
1791 				e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
1792 			} else
1793 				e = g;
1794 		} else
1795 			e = container_of(wc->lru.prev, struct wc_entry, lru);
1796 		BUG_ON(e->write_in_progress);
1797 		if (unlikely(!writecache_entry_is_committed(wc, e))) {
1798 			writecache_flush(wc);
1799 		}
1800 		node = rb_prev(&e->rb_node);
1801 		if (node) {
1802 			f = container_of(node, struct wc_entry, rb_node);
1803 			if (unlikely(read_original_sector(wc, f) ==
1804 				     read_original_sector(wc, e))) {
1805 				BUG_ON(!f->write_in_progress);
1806 				list_del(&e->lru);
1807 				list_add(&e->lru, &skipped);
1808 				cond_resched();
1809 				continue;
1810 			}
1811 		}
1812 		wc->writeback_size++;
1813 		list_del(&e->lru);
1814 		list_add(&e->lru, &wbl.list);
1815 		wbl.size++;
1816 		e->write_in_progress = true;
1817 		e->wc_list_contiguous = 1;
1818 
1819 		f = e;
1820 
1821 		while (1) {
1822 			next_node = rb_next(&f->rb_node);
1823 			if (unlikely(!next_node))
1824 				break;
1825 			g = container_of(next_node, struct wc_entry, rb_node);
1826 			if (unlikely(read_original_sector(wc, g) ==
1827 			    read_original_sector(wc, f))) {
1828 				f = g;
1829 				continue;
1830 			}
1831 			if (read_original_sector(wc, g) !=
1832 			    read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
1833 				break;
1834 			if (unlikely(g->write_in_progress))
1835 				break;
1836 			if (unlikely(!writecache_entry_is_committed(wc, g)))
1837 				break;
1838 
1839 			if (!WC_MODE_PMEM(wc)) {
1840 				if (g != f + 1)
1841 					break;
1842 			}
1843 
1844 			n_walked++;
1845 			//if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
1846 			//	break;
1847 
1848 			wc->writeback_size++;
1849 			list_del(&g->lru);
1850 			list_add(&g->lru, &wbl.list);
1851 			wbl.size++;
1852 			g->write_in_progress = true;
1853 			g->wc_list_contiguous = BIO_MAX_PAGES;
1854 			f = g;
1855 			e->wc_list_contiguous++;
1856 			if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) {
1857 				if (unlikely(wc->writeback_all)) {
1858 					next_node = rb_next(&f->rb_node);
1859 					if (likely(next_node))
1860 						g = container_of(next_node, struct wc_entry, rb_node);
1861 				}
1862 				break;
1863 			}
1864 		}
1865 		cond_resched();
1866 	}
1867 
1868 	if (!list_empty(&skipped)) {
1869 		list_splice_tail(&skipped, &wc->lru);
1870 		/*
1871 		 * If we didn't make any progress, we must wait until some
1872 		 * writeback finishes to avoid burning CPU in a loop
1873 		 */
1874 		if (unlikely(!wbl.size))
1875 			writecache_wait_for_writeback(wc);
1876 	}
1877 
1878 	wc_unlock(wc);
1879 
1880 	blk_start_plug(&plug);
1881 
1882 	if (WC_MODE_PMEM(wc))
1883 		__writecache_writeback_pmem(wc, &wbl);
1884 	else
1885 		__writecache_writeback_ssd(wc, &wbl);
1886 
1887 	blk_finish_plug(&plug);
1888 
1889 	if (unlikely(wc->writeback_all)) {
1890 		wc_lock(wc);
1891 		while (writecache_wait_for_writeback(wc));
1892 		wc_unlock(wc);
1893 	}
1894 }
1895 
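/*
 * Split the cache device into metadata (superblock plus entry array) and data
 * blocks.  The metadata size depends on the number of blocks, so start from
 * an upper estimate and decrease n_blocks until both fit into the device.
 */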
1896 static int calculate_memory_size(uint64_t device_size, unsigned block_size,
1897 				 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
1898 {
1899 	uint64_t n_blocks, offset;
1900 	struct wc_entry e;
1901 
1902 	n_blocks = device_size;
1903 	do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
1904 
1905 	while (1) {
1906 		if (!n_blocks)
1907 			return -ENOSPC;
1908 		/* Verify the following entries[n_blocks] won't overflow */
1909 		/* Verify that the entries[n_blocks] offset computed below won't overflow */
1910 				 sizeof(struct wc_memory_entry)))
1911 			return -EFBIG;
1912 		offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
1913 		offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
1914 		if (offset + n_blocks * block_size <= device_size)
1915 			break;
1916 		n_blocks--;
1917 	}
1918 
1919 	/* check if the bit field overflows */
1920 	e.index = n_blocks;
1921 	if (e.index != n_blocks)
1922 		return -EFBIG;
1923 
1924 	if (n_blocks_p)
1925 		*n_blocks_p = n_blocks;
1926 	if (n_metadata_blocks_p)
1927 		*n_metadata_blocks_p = offset >> __ffs(block_size);
1928 	return 0;
1929 }
1930 
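/*
 * Format a new cache: initialize the superblock and mark every entry as free.
 * The magic number is written last, so a partially initialized device is not
 * recognized as a valid cache.
 */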
1931 static int init_memory(struct dm_writecache *wc)
1932 {
1933 	size_t b;
1934 	int r;
1935 
1936 	r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
1937 	if (r)
1938 		return r;
1939 
1940 	r = writecache_alloc_entries(wc);
1941 	if (r)
1942 		return r;
1943 
1944 	for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
1945 		pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
1946 	pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
1947 	pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
1948 	pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
1949 	pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
1950 
1951 	for (b = 0; b < wc->n_blocks; b++) {
1952 		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
1953 		cond_resched();
1954 	}
1955 
1956 	writecache_flush_all_metadata(wc);
1957 	writecache_commit_flushed(wc, false);
1958 	pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
1959 	writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
1960 	writecache_commit_flushed(wc, false);
1961 
1962 	return 0;
1963 }
1964 
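/*
 * Destructor.  It also serves as the error path of the constructor (the
 * "bad:" label in writecache_ctr() below), so every resource is checked
 * before it is released.
 */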
1965 static void writecache_dtr(struct dm_target *ti)
1966 {
1967 	struct dm_writecache *wc = ti->private;
1968 
1969 	if (!wc)
1970 		return;
1971 
1972 	if (wc->endio_thread)
1973 		kthread_stop(wc->endio_thread);
1974 
1975 	if (wc->flush_thread)
1976 		kthread_stop(wc->flush_thread);
1977 
1978 	bioset_exit(&wc->bio_set);
1979 
1980 	mempool_exit(&wc->copy_pool);
1981 
1982 	if (wc->writeback_wq)
1983 		destroy_workqueue(wc->writeback_wq);
1984 
1985 	if (wc->dev)
1986 		dm_put_device(ti, wc->dev);
1987 
1988 	if (wc->ssd_dev)
1989 		dm_put_device(ti, wc->ssd_dev);
1990 
1991 	if (wc->entries)
1992 		vfree(wc->entries);
1993 
1994 	if (wc->memory_map) {
1995 		if (WC_MODE_PMEM(wc))
1996 			persistent_memory_release(wc);
1997 		else
1998 			vfree(wc->memory_map);
1999 	}
2000 
2001 	if (wc->dm_kcopyd)
2002 		dm_kcopyd_client_destroy(wc->dm_kcopyd);
2003 
2004 	if (wc->dm_io)
2005 		dm_io_client_destroy(wc->dm_io);
2006 
2007 	if (wc->dirty_bitmap)
2008 		vfree(wc->dirty_bitmap);
2009 
2010 	kfree(wc);
2011 }
2012 
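/*
 * Construct a writecache mapping.  The table line is:
 *	writecache <p|s> <origin dev> <cache dev> <block size>
 *		   <#optional args> [<optional args>...]
 * where "p" selects persistent-memory mode and "s" selects SSD mode.
 */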
2013 static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2014 {
2015 	struct dm_writecache *wc;
2016 	struct dm_arg_set as;
2017 	const char *string;
2018 	unsigned opt_params;
2019 	size_t offset, data_size;
2020 	int i, r;
2021 	char dummy;
2022 	int high_wm_percent = HIGH_WATERMARK;
2023 	int low_wm_percent = LOW_WATERMARK;
2024 	uint64_t x;
2025 	struct wc_memory_superblock s;
2026 
2027 	static struct dm_arg _args[] = {
2028 		{0, 10, "Invalid number of feature args"},
2029 	};
2030 
2031 	as.argc = argc;
2032 	as.argv = argv;
2033 
2034 	wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
2035 	if (!wc) {
2036 		ti->error = "Cannot allocate writecache structure";
2037 		r = -ENOMEM;
2038 		goto bad;
2039 	}
2040 	ti->private = wc;
2041 	wc->ti = ti;
2042 
2043 	mutex_init(&wc->lock);
2044 	wc->max_age = MAX_AGE_UNSPECIFIED;
2045 	writecache_poison_lists(wc);
2046 	init_waitqueue_head(&wc->freelist_wait);
2047 	timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
2048 	timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);
2049 
2050 	for (i = 0; i < 2; i++) {
2051 		atomic_set(&wc->bio_in_progress[i], 0);
2052 		init_waitqueue_head(&wc->bio_in_progress_wait[i]);
2053 	}
2054 
2055 	wc->dm_io = dm_io_client_create();
2056 	if (IS_ERR(wc->dm_io)) {
2057 		r = PTR_ERR(wc->dm_io);
2058 		ti->error = "Unable to allocate dm-io client";
2059 		wc->dm_io = NULL;
2060 		goto bad;
2061 	}
2062 
2063 	wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
2064 	if (!wc->writeback_wq) {
2065 		r = -ENOMEM;
2066 		ti->error = "Could not allocate writeback workqueue";
2067 		goto bad;
2068 	}
2069 	INIT_WORK(&wc->writeback_work, writecache_writeback);
2070 	INIT_WORK(&wc->flush_work, writecache_flush_work);
2071 
2072 	raw_spin_lock_init(&wc->endio_list_lock);
2073 	INIT_LIST_HEAD(&wc->endio_list);
2074 	wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
2075 	if (IS_ERR(wc->endio_thread)) {
2076 		r = PTR_ERR(wc->endio_thread);
2077 		wc->endio_thread = NULL;
2078 		ti->error = "Couldn't spawn endio thread";
2079 		goto bad;
2080 	}
2081 	wake_up_process(wc->endio_thread);
2082 
2083 	/*
2084 	 * Parse the mode (pmem or ssd)
2085 	 */
2086 	string = dm_shift_arg(&as);
2087 	if (!string)
2088 		goto bad_arguments;
2089 
2090 	if (!strcasecmp(string, "s")) {
2091 		wc->pmem_mode = false;
2092 	} else if (!strcasecmp(string, "p")) {
2093 #ifdef DM_WRITECACHE_HAS_PMEM
2094 		wc->pmem_mode = true;
2095 		wc->writeback_fua = true;
2096 #else
2097 		/*
2098 		 * If the architecture doesn't support persistent memory or
2099 		 * the kernel doesn't support any DAX drivers, this driver can
2100 		 * only be used in SSD-only mode.
2101 		 */
2102 		r = -EOPNOTSUPP;
2103 		ti->error = "Persistent memory or DAX not supported on this system";
2104 		goto bad;
2105 #endif
2106 	} else {
2107 		goto bad_arguments;
2108 	}
2109 
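	/*
	 * In pmem mode writeback builds bios over the cache pages directly,
	 * so a dedicated bio set is needed; in SSD mode the data is moved by
	 * dm-kcopyd and only a small pool of copy job descriptors is used.
	 */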
2110 	if (WC_MODE_PMEM(wc)) {
2111 		r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
2112 				offsetof(struct writeback_struct, bio),
2113 				BIOSET_NEED_BVECS);
2114 		if (r) {
2115 			ti->error = "Could not allocate bio set";
2116 			goto bad;
2117 		}
2118 	} else {
2119 		r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
2120 		if (r) {
2121 			ti->error = "Could not allocate mempool";
2122 			goto bad;
2123 		}
2124 	}
2125 
2126 	/*
2127 	 * Parse the origin data device
2128 	 */
2129 	string = dm_shift_arg(&as);
2130 	if (!string)
2131 		goto bad_arguments;
2132 	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
2133 	if (r) {
2134 		ti->error = "Origin data device lookup failed";
2135 		goto bad;
2136 	}
2137 
2138 	/*
2139 	 * Parse the cache data device (pmem or ssd)
2140 	 */
2141 	string = dm_shift_arg(&as);
2142 	if (!string)
2143 		goto bad_arguments;
2144 
2145 	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
2146 	if (r) {
2147 		ti->error = "Cache data device lookup failed";
2148 		goto bad;
2149 	}
2150 	wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
2151 
2152 	/*
2153 	 * Parse the cache block size
2154 	 */
2155 	string = dm_shift_arg(&as);
2156 	if (!string)
2157 		goto bad_arguments;
2158 	if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
2159 	    wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
2160 	    (wc->block_size & (wc->block_size - 1))) {
2161 		r = -EINVAL;
2162 		ti->error = "Invalid block size";
2163 		goto bad;
2164 	}
2165 	if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) ||
2166 	    wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) {
2167 		r = -EINVAL;
2168 		ti->error = "Block size is smaller than device logical block size";
2169 		goto bad;
2170 	}
2171 	wc->block_size_bits = __ffs(wc->block_size);
2172 
2173 	wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
2174 	wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
2175 	wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
2176 
2177 	/*
2178 	 * Parse optional arguments
2179 	 */
2180 	r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
2181 	if (r)
2182 		goto bad;
2183 
2184 	while (opt_params) {
2185 		string = dm_shift_arg(&as), opt_params--;
2186 		if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
2187 			unsigned long long start_sector;
2188 			string = dm_shift_arg(&as), opt_params--;
2189 			if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
2190 				goto invalid_optional;
2191 			wc->start_sector = start_sector;
2192 			if (wc->start_sector != start_sector ||
2193 			    wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
2194 				goto invalid_optional;
2195 		} else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
2196 			string = dm_shift_arg(&as), opt_params--;
2197 			if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
2198 				goto invalid_optional;
2199 			if (high_wm_percent < 0 || high_wm_percent > 100)
2200 				goto invalid_optional;
2201 			wc->high_wm_percent_set = true;
2202 		} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
2203 			string = dm_shift_arg(&as), opt_params--;
2204 			if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
2205 				goto invalid_optional;
2206 			if (low_wm_percent < 0 || low_wm_percent > 100)
2207 				goto invalid_optional;
2208 			wc->low_wm_percent_set = true;
2209 		} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
2210 			string = dm_shift_arg(&as), opt_params--;
2211 			if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
2212 				goto invalid_optional;
2213 			wc->max_writeback_jobs_set = true;
2214 		} else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
2215 			string = dm_shift_arg(&as), opt_params--;
2216 			if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
2217 				goto invalid_optional;
2218 			wc->autocommit_blocks_set = true;
2219 		} else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
2220 			unsigned autocommit_msecs;
2221 			string = dm_shift_arg(&as), opt_params--;
2222 			if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
2223 				goto invalid_optional;
2224 			if (autocommit_msecs > 3600000)
2225 				goto invalid_optional;
2226 			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
2227 			wc->autocommit_time_set = true;
2228 		} else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
2229 			unsigned max_age_msecs;
2230 			string = dm_shift_arg(&as), opt_params--;
2231 			if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
2232 				goto invalid_optional;
2233 			if (max_age_msecs > 86400000)
2234 				goto invalid_optional;
2235 			wc->max_age = msecs_to_jiffies(max_age_msecs);
2236 		} else if (!strcasecmp(string, "cleaner")) {
2237 			wc->cleaner = true;
2238 		} else if (!strcasecmp(string, "fua")) {
2239 			if (WC_MODE_PMEM(wc)) {
2240 				wc->writeback_fua = true;
2241 				wc->writeback_fua_set = true;
2242 			} else goto invalid_optional;
2243 		} else if (!strcasecmp(string, "nofua")) {
2244 			if (WC_MODE_PMEM(wc)) {
2245 				wc->writeback_fua = false;
2246 				wc->writeback_fua_set = true;
2247 			} else goto invalid_optional;
2248 		} else {
2249 invalid_optional:
2250 			r = -EINVAL;
2251 			ti->error = "Invalid optional argument";
2252 			goto bad;
2253 		}
2254 	}
2255 
2256 	if (high_wm_percent < low_wm_percent) {
2257 		r = -EINVAL;
2258 		ti->error = "High watermark must be greater than or equal to low watermark";
2259 		goto bad;
2260 	}
2261 
2262 	if (WC_MODE_PMEM(wc)) {
2263 		r = persistent_memory_claim(wc);
2264 		if (r) {
2265 			ti->error = "Unable to map persistent memory for cache";
2266 			goto bad;
2267 		}
2268 	} else {
2269 		size_t n_blocks, n_metadata_blocks;
2270 		uint64_t n_bitmap_bits;
2271 
2272 		wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
2273 
2274 		bio_list_init(&wc->flush_list);
2275 		wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
2276 		if (IS_ERR(wc->flush_thread)) {
2277 			r = PTR_ERR(wc->flush_thread);
2278 			wc->flush_thread = NULL;
2279 			ti->error = "Couldn't spawn flush thread";
2280 			goto bad;
2281 		}
2282 		wake_up_process(wc->flush_thread);
2283 
2284 		r = calculate_memory_size(wc->memory_map_size, wc->block_size,
2285 					  &n_blocks, &n_metadata_blocks);
2286 		if (r) {
2287 			ti->error = "Invalid device size";
2288 			goto bad;
2289 		}
2290 
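		/*
		 * In SSD mode the metadata is kept in a vmalloc()ed shadow of
		 * the start of the cache device; the dirty bitmap has one bit
		 * per BITMAP_GRANULARITY bytes of that area and marks which
		 * regions must be written out to the SSD when the metadata is
		 * committed.
		 */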
2291 		n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
2292 				 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
2293 		/* this is a limitation of the test_bit functions */
2294 		if (n_bitmap_bits > 1U << 31) {
2295 			r = -EFBIG;
2296 			ti->error = "Invalid device size";
2297 			goto bad;
2298 		}
2299 
2300 		wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
2301 		if (!wc->memory_map) {
2302 			r = -ENOMEM;
2303 			ti->error = "Unable to allocate memory for metadata";
2304 			goto bad;
2305 		}
2306 
2307 		wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2308 		if (IS_ERR(wc->dm_kcopyd)) {
2309 			r = PTR_ERR(wc->dm_kcopyd);
2310 			ti->error = "Unable to allocate dm-kcopyd client";
2311 			wc->dm_kcopyd = NULL;
2312 			goto bad;
2313 		}
2314 
2315 		wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
2316 		wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
2317 			BITS_PER_LONG * sizeof(unsigned long);
2318 		wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
2319 		if (!wc->dirty_bitmap) {
2320 			r = -ENOMEM;
2321 			ti->error = "Unable to allocate dirty bitmap";
2322 			goto bad;
2323 		}
2324 
2325 		r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
2326 		if (r) {
2327 			ti->error = "Unable to read first block of metadata";
2328 			goto bad;
2329 		}
2330 	}
2331 
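	/*
	 * Read the superblock through memcpy_mcsafe() so that, on persistent
	 * memory, a media error is reported as a failure instead of an
	 * unrecoverable machine check.  A zeroed magic and version is treated
	 * as a never-formatted device and triggers init_memory().
	 */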
2332 	r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
2333 	if (r) {
2334 		ti->error = "Hardware memory error when reading superblock";
2335 		goto bad;
2336 	}
2337 	if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
2338 		r = init_memory(wc);
2339 		if (r) {
2340 			ti->error = "Unable to initialize device";
2341 			goto bad;
2342 		}
2343 		r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
2344 		if (r) {
2345 			ti->error = "Hardware memory error when reading superblock";
2346 			goto bad;
2347 		}
2348 	}
2349 
2350 	if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
2351 		ti->error = "Invalid magic in the superblock";
2352 		r = -EINVAL;
2353 		goto bad;
2354 	}
2355 
2356 	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
2357 		ti->error = "Invalid version in the superblock";
2358 		r = -EINVAL;
2359 		goto bad;
2360 	}
2361 
2362 	if (le32_to_cpu(s.block_size) != wc->block_size) {
2363 		ti->error = "Block size does not match superblock";
2364 		r = -EINVAL;
2365 		goto bad;
2366 	}
2367 
2368 	wc->n_blocks = le64_to_cpu(s.n_blocks);
2369 
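	/*
	 * Recompute the metadata size from the on-media block count and check
	 * every intermediate step for arithmetic overflow before trusting it;
	 * the data area starts at the first block-aligned offset past the
	 * metadata and must fit within the cache device.
	 */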
2370 	offset = wc->n_blocks * sizeof(struct wc_memory_entry);
2371 	if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
2372 overflow:
2373 		ti->error = "Overflow in size calculation";
2374 		r = -EINVAL;
2375 		goto bad;
2376 	}
2377 	offset += sizeof(struct wc_memory_superblock);
2378 	if (offset < sizeof(struct wc_memory_superblock))
2379 		goto overflow;
2380 	offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
2381 	data_size = wc->n_blocks * (size_t)wc->block_size;
2382 	if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
2383 	    (offset + data_size < offset))
2384 		goto overflow;
2385 	if (offset + data_size > wc->memory_map_size) {
2386 		ti->error = "Memory area is too small";
2387 		r = -EINVAL;
2388 		goto bad;
2389 	}
2390 
2391 	wc->metadata_sectors = offset >> SECTOR_SHIFT;
2392 	wc->block_start = (char *)sb(wc) + offset;
2393 
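	/*
	 * Convert the user-visible percentages (cache utilization) into
	 * free-list sizes, rounding to the nearest block:
	 *	freelist_high_watermark = n_blocks * (100 - high_wm_percent) / 100
	 *	freelist_low_watermark  = n_blocks * (100 - low_wm_percent)  / 100
	 */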
2394 	x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
2395 	x += 50;
2396 	do_div(x, 100);
2397 	wc->freelist_high_watermark = x;
2398 	x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
2399 	x += 50;
2400 	do_div(x, 100);
2401 	wc->freelist_low_watermark = x;
2402 
2403 	if (wc->cleaner)
2404 		activate_cleaner(wc);
2405 
2406 	r = writecache_alloc_entries(wc);
2407 	if (r) {
2408 		ti->error = "Cannot allocate memory";
2409 		goto bad;
2410 	}
2411 
2412 	ti->num_flush_bios = 1;
2413 	ti->flush_supported = true;
2414 	ti->num_discard_bios = 1;
2415 
2416 	if (WC_MODE_PMEM(wc))
2417 		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
2418 
2419 	return 0;
2420 
2421 bad_arguments:
2422 	r = -EINVAL;
2423 	ti->error = "Bad arguments";
2424 bad:
2425 	writecache_dtr(ti);
2426 	return r;
2427 }
2428 
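/*
 * STATUSTYPE_INFO reports "<error> <total blocks> <free blocks> <blocks under
 * writeback>".  STATUSTYPE_TABLE reconstructs the constructor arguments,
 * converting the stored free-list watermarks back into percentages.
 */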
2429 static void writecache_status(struct dm_target *ti, status_type_t type,
2430 			      unsigned status_flags, char *result, unsigned maxlen)
2431 {
2432 	struct dm_writecache *wc = ti->private;
2433 	unsigned extra_args;
2434 	unsigned sz = 0;
2435 	uint64_t x;
2436 
2437 	switch (type) {
2438 	case STATUSTYPE_INFO:
2439 		DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc),
2440 		       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
2441 		       (unsigned long long)wc->writeback_size);
2442 		break;
2443 	case STATUSTYPE_TABLE:
2444 		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
2445 				wc->dev->name, wc->ssd_dev->name, wc->block_size);
2446 		extra_args = 0;
2447 		if (wc->start_sector)
2448 			extra_args += 2;
2449 		if (wc->high_wm_percent_set && !wc->cleaner)
2450 			extra_args += 2;
2451 		if (wc->low_wm_percent_set && !wc->cleaner)
2452 			extra_args += 2;
2453 		if (wc->max_writeback_jobs_set)
2454 			extra_args += 2;
2455 		if (wc->autocommit_blocks_set)
2456 			extra_args += 2;
2457 		if (wc->autocommit_time_set)
2458 			extra_args += 2;
2459 		if (wc->cleaner)
2460 			extra_args++;
2461 		if (wc->writeback_fua_set)
2462 			extra_args++;
2463 
2464 		DMEMIT("%u", extra_args);
2465 		if (wc->start_sector)
2466 			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
2467 		if (wc->high_wm_percent_set && !wc->cleaner) {
2468 			x = (uint64_t)wc->freelist_high_watermark * 100;
2469 			x += wc->n_blocks / 2;
2470 			do_div(x, (size_t)wc->n_blocks);
2471 			DMEMIT(" high_watermark %u", 100 - (unsigned)x);
2472 		}
2473 		if (wc->low_wm_percent_set && !wc->cleaner) {
2474 			x = (uint64_t)wc->freelist_low_watermark * 100;
2475 			x += wc->n_blocks / 2;
2476 			do_div(x, (size_t)wc->n_blocks);
2477 			DMEMIT(" low_watermark %u", 100 - (unsigned)x);
2478 		}
2479 		if (wc->max_writeback_jobs_set)
2480 			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
2481 		if (wc->autocommit_blocks_set)
2482 			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
2483 		if (wc->autocommit_time_set)
2484 			DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
2485 		if (wc->max_age != MAX_AGE_UNSPECIFIED)
2486 			DMEMIT(" max_age %u", jiffies_to_msecs(wc->max_age));
2487 		if (wc->cleaner)
2488 			DMEMIT(" cleaner");
2489 		if (wc->writeback_fua_set)
2490 			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
2491 		break;
2492 	}
2493 }
2494 
2495 static struct target_type writecache_target = {
2496 	.name			= "writecache",
2497 	.version		= {1, 3, 0},
2498 	.module			= THIS_MODULE,
2499 	.ctr			= writecache_ctr,
2500 	.dtr			= writecache_dtr,
2501 	.status			= writecache_status,
2502 	.postsuspend		= writecache_suspend,
2503 	.resume			= writecache_resume,
2504 	.message		= writecache_message,
2505 	.map			= writecache_map,
2506 	.end_io			= writecache_end_io,
2507 	.iterate_devices	= writecache_iterate_devices,
2508 	.io_hints		= writecache_io_hints,
2509 };
2510 
2511 static int __init dm_writecache_init(void)
2512 {
2513 	int r;
2514 
2515 	r = dm_register_target(&writecache_target);
2516 	if (r < 0) {
2517 		DMERR("register failed %d", r);
2518 		return r;
2519 	}
2520 
2521 	return 0;
2522 }
2523 
2524 static void __exit dm_writecache_exit(void)
2525 {
2526 	dm_unregister_target(&writecache_target);
2527 }
2528 
2529 module_init(dm_writecache_init);
2530 module_exit(dm_writecache_exit);
2531 
2532 MODULE_DESCRIPTION(DM_NAME " writecache target");
2533 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2534 MODULE_LICENSE("GPL");
2535