xref: /openbmc/linux/drivers/md/dm-cache-target.c (revision d8bcaabe)
1 /*
2  * Copyright (C) 2012 Red Hat. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6 
7 #include "dm.h"
8 #include "dm-bio-prison-v2.h"
9 #include "dm-bio-record.h"
10 #include "dm-cache-metadata.h"
11 
12 #include <linux/dm-io.h>
13 #include <linux/dm-kcopyd.h>
14 #include <linux/jiffies.h>
15 #include <linux/init.h>
16 #include <linux/mempool.h>
17 #include <linux/module.h>
18 #include <linux/rwsem.h>
19 #include <linux/slab.h>
20 #include <linux/vmalloc.h>
21 
22 #define DM_MSG_PREFIX "cache"
23 
24 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
25 	"A percentage of time allocated for copying to and/or from cache");
26 
27 /*----------------------------------------------------------------*/
28 
29 /*
30  * Glossary:
31  *
32  * oblock: index of an origin block
33  * cblock: index of a cache block
34  * promotion: movement of a block from origin to cache
35  * demotion: movement of a block from cache to origin
36  * migration: movement of a block between the origin and cache device,
37  *	      either direction
38  */
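/*
 * For illustration only (the numbers are made up): with 128-sector cache
 * blocks, a read of origin sector 1000 falls in oblock 7.  A promotion
 * copies that block's data into a free cache block, say cblock 5, and
 * records the mapping; subsequent IO to oblock 7 is then remapped to the
 * cache device.  A later writeback or demotion copies the (dirty) data
 * back to the origin.
 */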
39 
40 /*----------------------------------------------------------------*/
41 
42 struct io_tracker {
43 	spinlock_t lock;
44 
45 	/*
46 	 * Sectors of in-flight IO.
47 	 */
48 	sector_t in_flight;
49 
50 	/*
51 	 * The time, in jiffies, when this device became idle (if it is
52 	 * indeed idle).
53 	 */
54 	unsigned long idle_time;
55 	unsigned long last_update_time;
56 };
57 
58 static void iot_init(struct io_tracker *iot)
59 {
60 	spin_lock_init(&iot->lock);
61 	iot->in_flight = 0ul;
62 	iot->idle_time = 0ul;
63 	iot->last_update_time = jiffies;
64 }
65 
66 static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
67 {
68 	if (iot->in_flight)
69 		return false;
70 
71 	return time_after(jiffies, iot->idle_time + jifs);
72 }
73 
74 static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
75 {
76 	bool r;
77 	unsigned long flags;
78 
79 	spin_lock_irqsave(&iot->lock, flags);
80 	r = __iot_idle_for(iot, jifs);
81 	spin_unlock_irqrestore(&iot->lock, flags);
82 
83 	return r;
84 }
85 
86 static void iot_io_begin(struct io_tracker *iot, sector_t len)
87 {
88 	unsigned long flags;
89 
90 	spin_lock_irqsave(&iot->lock, flags);
91 	iot->in_flight += len;
92 	spin_unlock_irqrestore(&iot->lock, flags);
93 }
94 
95 static void __iot_io_end(struct io_tracker *iot, sector_t len)
96 {
97 	if (!len)
98 		return;
99 
100 	iot->in_flight -= len;
101 	if (!iot->in_flight)
102 		iot->idle_time = jiffies;
103 }
104 
105 static void iot_io_end(struct io_tracker *iot, sector_t len)
106 {
107 	unsigned long flags;
108 
109 	spin_lock_irqsave(&iot->lock, flags);
110 	__iot_io_end(iot, len);
111 	spin_unlock_irqrestore(&iot->lock, flags);
112 }
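/*
 * A rough sketch of how the tracker is used (see accounted_begin() and
 * accounted_complete() below): IO start adds the bio's length in sectors
 * via iot_io_begin(), completion subtracts it via iot_io_end(), and
 * idle_time is reset whenever in_flight drops back to zero.  So
 * iot_idle_for(&cache->tracker, HZ) only returns true when no accounted
 * IO is in flight and at least a second has passed since the last
 * completion.
 */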
113 
114 /*----------------------------------------------------------------*/
115 
116 /*
117  * Represents a chunk of future work.  'input' allows continuations to pass
118  * values between themselves, typically error values.
119  */
120 struct continuation {
121 	struct work_struct ws;
122 	blk_status_t input;
123 };
124 
125 static inline void init_continuation(struct continuation *k,
126 				     void (*fn)(struct work_struct *))
127 {
128 	INIT_WORK(&k->ws, fn);
129 	k->input = 0;
130 }
131 
132 static inline void queue_continuation(struct workqueue_struct *wq,
133 				      struct continuation *k)
134 {
135 	queue_work(wq, &k->ws);
136 }
137 
138 /*----------------------------------------------------------------*/
139 
140 /*
141  * The batcher collects together pieces of work that need a particular
142  * operation to occur before they can proceed (typically a commit).
143  */
144 struct batcher {
145 	/*
146 	 * The operation that everyone is waiting for.
147 	 */
148 	blk_status_t (*commit_op)(void *context);
149 	void *commit_context;
150 
151 	/*
152 	 * This is how bios should be issued once the commit op is complete
153 	 * (accounted_request).
154 	 */
155 	void (*issue_op)(struct bio *bio, void *context);
156 	void *issue_context;
157 
158 	/*
159 	 * Queued work gets put on here after commit.
160 	 */
161 	struct workqueue_struct *wq;
162 
163 	spinlock_t lock;
164 	struct list_head work_items;
165 	struct bio_list bios;
166 	struct work_struct commit_work;
167 
168 	bool commit_scheduled;
169 };
170 
171 static void __commit(struct work_struct *_ws)
172 {
173 	struct batcher *b = container_of(_ws, struct batcher, commit_work);
174 	blk_status_t r;
175 	unsigned long flags;
176 	struct list_head work_items;
177 	struct work_struct *ws, *tmp;
178 	struct continuation *k;
179 	struct bio *bio;
180 	struct bio_list bios;
181 
182 	INIT_LIST_HEAD(&work_items);
183 	bio_list_init(&bios);
184 
185 	/*
186 	 * We have to grab these before the commit_op to avoid a race
187 	 * condition.
188 	 */
189 	spin_lock_irqsave(&b->lock, flags);
190 	list_splice_init(&b->work_items, &work_items);
191 	bio_list_merge(&bios, &b->bios);
192 	bio_list_init(&b->bios);
193 	b->commit_scheduled = false;
194 	spin_unlock_irqrestore(&b->lock, flags);
195 
196 	r = b->commit_op(b->commit_context);
197 
198 	list_for_each_entry_safe(ws, tmp, &work_items, entry) {
199 		k = container_of(ws, struct continuation, ws);
200 		k->input = r;
201 		INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
202 		queue_work(b->wq, ws);
203 	}
204 
205 	while ((bio = bio_list_pop(&bios))) {
206 		if (r) {
207 			bio->bi_status = r;
208 			bio_endio(bio);
209 		} else
210 			b->issue_op(bio, b->issue_context);
211 	}
212 }
213 
214 static void batcher_init(struct batcher *b,
215 			 blk_status_t (*commit_op)(void *),
216 			 void *commit_context,
217 			 void (*issue_op)(struct bio *bio, void *),
218 			 void *issue_context,
219 			 struct workqueue_struct *wq)
220 {
221 	b->commit_op = commit_op;
222 	b->commit_context = commit_context;
223 	b->issue_op = issue_op;
224 	b->issue_context = issue_context;
225 	b->wq = wq;
226 
227 	spin_lock_init(&b->lock);
228 	INIT_LIST_HEAD(&b->work_items);
229 	bio_list_init(&b->bios);
230 	INIT_WORK(&b->commit_work, __commit);
231 	b->commit_scheduled = false;
232 }
233 
234 static void async_commit(struct batcher *b)
235 {
236 	queue_work(b->wq, &b->commit_work);
237 }
238 
239 static void continue_after_commit(struct batcher *b, struct continuation *k)
240 {
241 	unsigned long flags;
242 	bool commit_scheduled;
243 
244 	spin_lock_irqsave(&b->lock, flags);
245 	commit_scheduled = b->commit_scheduled;
246 	list_add_tail(&k->ws.entry, &b->work_items);
247 	spin_unlock_irqrestore(&b->lock, flags);
248 
249 	if (commit_scheduled)
250 		async_commit(b);
251 }
252 
253 /*
254  * Bios are errored if commit failed.
255  */
256 static void issue_after_commit(struct batcher *b, struct bio *bio)
257 {
258 	unsigned long flags;
259 	bool commit_scheduled;
260 
261 	spin_lock_irqsave(&b->lock, flags);
262 	commit_scheduled = b->commit_scheduled;
263 	bio_list_add(&b->bios, bio);
264 	spin_unlock_irqrestore(&b->lock, flags);
265 
266 	if (commit_scheduled)
267 		async_commit(b);
268 }
269 
270 /*
271  * Call this if some urgent work is waiting for the commit to complete.
272  */
273 static void schedule_commit(struct batcher *b)
274 {
275 	bool immediate;
276 	unsigned long flags;
277 
278 	spin_lock_irqsave(&b->lock, flags);
279 	immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
280 	b->commit_scheduled = true;
281 	spin_unlock_irqrestore(&b->lock, flags);
282 
283 	if (immediate)
284 		async_commit(b);
285 }
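/*
 * A sketch of how this is wired up for the cache target (the init call
 * itself is in the target constructor, later in the file; commit_op()
 * and issue_op() are defined further down):
 *
 *	batcher_init(&cache->committer, commit_op, cache,
 *		     issue_op, cache, cache->wq);
 *
 * Bios that must not complete before the next commit go through
 * issue_after_commit(), continuations go through continue_after_commit(),
 * and schedule_commit() kicks __commit() once there is queued work.
 */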
286 
287 /*
288  * There are a couple of places where we let a bio run, but want to do some
289  * work before calling its endio function.  We do this by temporarily
290  * changing the endio fn.
291  */
292 struct dm_hook_info {
293 	bio_end_io_t *bi_end_io;
294 };
295 
296 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
297 			bio_end_io_t *bi_end_io, void *bi_private)
298 {
299 	h->bi_end_io = bio->bi_end_io;
300 
301 	bio->bi_end_io = bi_end_io;
302 	bio->bi_private = bi_private;
303 }
304 
305 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
306 {
307 	bio->bi_end_io = h->bi_end_io;
308 }
309 
310 /*----------------------------------------------------------------*/
311 
312 #define MIGRATION_POOL_SIZE 128
313 #define COMMIT_PERIOD HZ
314 #define MIGRATION_COUNT_WINDOW 10
315 
316 /*
317  * The block size of the device holding cache data must be
318  * between 32KB and 1GB.
319  */
320 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
321 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
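/*
 * With 512-byte sectors (SECTOR_SHIFT == 9) these work out to 64 and
 * 2097152 sectors respectively.
 */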
322 
323 enum cache_metadata_mode {
324 	CM_WRITE,		/* metadata may be changed */
325 	CM_READ_ONLY,		/* metadata may not be changed */
326 	CM_FAIL
327 };
328 
329 enum cache_io_mode {
330 	/*
331 	 * Data is written to cached blocks only.  These blocks are marked
332 	 * dirty.  If you lose the cache device you will lose data.
333 	 * Potential performance increase for both reads and writes.
334 	 */
335 	CM_IO_WRITEBACK,
336 
337 	/*
338 	 * Data is written to both cache and origin.  Blocks are never
339 	 * dirty.  Potential performance benefit for reads only.
340 	 */
341 	CM_IO_WRITETHROUGH,
342 
343 	/*
344 	 * A degraded mode useful for various cache coherency situations
345 	 * (eg, rolling back snapshots).  Reads and writes always go to the
346 	 * origin.  If a write goes to a cached oblock, then the cache
347 	 * block is invalidated.
348 	 */
349 	CM_IO_PASSTHROUGH
350 };
351 
352 struct cache_features {
353 	enum cache_metadata_mode mode;
354 	enum cache_io_mode io_mode;
355 	unsigned metadata_version;
356 };
357 
358 struct cache_stats {
359 	atomic_t read_hit;
360 	atomic_t read_miss;
361 	atomic_t write_hit;
362 	atomic_t write_miss;
363 	atomic_t demotion;
364 	atomic_t promotion;
365 	atomic_t writeback;
366 	atomic_t copies_avoided;
367 	atomic_t cache_cell_clash;
368 	atomic_t commit_count;
369 	atomic_t discard_count;
370 };
371 
372 struct cache {
373 	struct dm_target *ti;
374 	struct dm_target_callbacks callbacks;
375 
376 	struct dm_cache_metadata *cmd;
377 
378 	/*
379 	 * Metadata is written to this device.
380 	 */
381 	struct dm_dev *metadata_dev;
382 
383 	/*
384 	 * The slower of the two data devices.  Typically a spindle.
385 	 */
386 	struct dm_dev *origin_dev;
387 
388 	/*
389 	 * The faster of the two data devices.  Typically an SSD.
390 	 */
391 	struct dm_dev *cache_dev;
392 
393 	/*
394 	 * Size of the origin device in _complete_ blocks and native sectors.
395 	 */
396 	dm_oblock_t origin_blocks;
397 	sector_t origin_sectors;
398 
399 	/*
400 	 * Size of the cache device in blocks.
401 	 */
402 	dm_cblock_t cache_size;
403 
404 	/*
405 	 * Fields for converting from sectors to blocks.
406 	 */
407 	sector_t sectors_per_block;
408 	int sectors_per_block_shift;
409 
410 	spinlock_t lock;
411 	struct list_head deferred_cells;
412 	struct bio_list deferred_bios;
413 	struct bio_list deferred_writethrough_bios;
414 	sector_t migration_threshold;
415 	wait_queue_head_t migration_wait;
416 	atomic_t nr_allocated_migrations;
417 
418 	/*
419 	 * The number of in-flight migrations that are performing
420 	 * background io, eg, promotion or writeback.
421 	 */
422 	atomic_t nr_io_migrations;
423 
424 	struct rw_semaphore quiesce_lock;
425 
426 	/*
427 	 * cache_size entries, dirty if set
428 	 */
429 	atomic_t nr_dirty;
430 	unsigned long *dirty_bitset;
431 
432 	/*
433 	 * origin_blocks entries, discarded if set.
434 	 */
435 	dm_dblock_t discard_nr_blocks;
436 	unsigned long *discard_bitset;
437 	uint32_t discard_block_size; /* a power of 2 times sectors per block */
438 
439 	/*
440 	 * Rather than reconstructing the table line for the status we just
441 	 * save it and regurgitate.
442 	 */
443 	unsigned nr_ctr_args;
444 	const char **ctr_args;
445 
446 	struct dm_kcopyd_client *copier;
447 	struct workqueue_struct *wq;
448 	struct work_struct deferred_bio_worker;
449 	struct work_struct deferred_writethrough_worker;
450 	struct work_struct migration_worker;
451 	struct delayed_work waker;
452 	struct dm_bio_prison_v2 *prison;
453 
454 	mempool_t *migration_pool;
455 
456 	struct dm_cache_policy *policy;
457 	unsigned policy_nr_args;
458 
459 	bool need_tick_bio:1;
460 	bool sized:1;
461 	bool invalidate:1;
462 	bool commit_requested:1;
463 	bool loaded_mappings:1;
464 	bool loaded_discards:1;
465 
466 	/*
467 	 * Cache features such as write-through.
468 	 */
469 	struct cache_features features;
470 
471 	struct cache_stats stats;
472 
473 	/*
474 	 * Invalidation fields.
475 	 */
476 	spinlock_t invalidation_lock;
477 	struct list_head invalidation_requests;
478 
479 	struct io_tracker tracker;
480 
481 	struct work_struct commit_ws;
482 	struct batcher committer;
483 
484 	struct rw_semaphore background_work_lock;
485 };
486 
487 struct per_bio_data {
488 	bool tick:1;
489 	unsigned req_nr:2;
490 	struct dm_bio_prison_cell_v2 *cell;
491 	struct dm_hook_info hook_info;
492 	sector_t len;
493 
494 	/*
495 	 * writethrough fields.  These MUST remain at the end of this
496 	 * structure and the 'cache' member must be the first as it
497 	 * is used to determine the offset of the writethrough fields.
498 	 */
499 	struct cache *cache;
500 	dm_cblock_t cblock;
501 	struct dm_bio_details bio_details;
502 };
503 
504 struct dm_cache_migration {
505 	struct continuation k;
506 	struct cache *cache;
507 
508 	struct policy_work *op;
509 	struct bio *overwrite_bio;
510 	struct dm_bio_prison_cell_v2 *cell;
511 
512 	dm_cblock_t invalidate_cblock;
513 	dm_oblock_t invalidate_oblock;
514 };
515 
516 /*----------------------------------------------------------------*/
517 
518 static bool writethrough_mode(struct cache_features *f)
519 {
520 	return f->io_mode == CM_IO_WRITETHROUGH;
521 }
522 
523 static bool writeback_mode(struct cache_features *f)
524 {
525 	return f->io_mode == CM_IO_WRITEBACK;
526 }
527 
528 static inline bool passthrough_mode(struct cache_features *f)
529 {
530 	return unlikely(f->io_mode == CM_IO_PASSTHROUGH);
531 }
532 
533 /*----------------------------------------------------------------*/
534 
535 static void wake_deferred_bio_worker(struct cache *cache)
536 {
537 	queue_work(cache->wq, &cache->deferred_bio_worker);
538 }
539 
540 static void wake_deferred_writethrough_worker(struct cache *cache)
541 {
542 	queue_work(cache->wq, &cache->deferred_writethrough_worker);
543 }
544 
545 static void wake_migration_worker(struct cache *cache)
546 {
547 	if (passthrough_mode(&cache->features))
548 		return;
549 
550 	queue_work(cache->wq, &cache->migration_worker);
551 }
552 
553 /*----------------------------------------------------------------*/
554 
555 static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
556 {
557 	return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT);
558 }
559 
560 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
561 {
562 	dm_bio_prison_free_cell_v2(cache->prison, cell);
563 }
564 
565 static struct dm_cache_migration *alloc_migration(struct cache *cache)
566 {
567 	struct dm_cache_migration *mg;
568 
569 	mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
570 	if (mg) {
571 		mg->cache = cache;
572 		atomic_inc(&mg->cache->nr_allocated_migrations);
573 	}
574 
575 	return mg;
576 }
577 
578 static void free_migration(struct dm_cache_migration *mg)
579 {
580 	struct cache *cache = mg->cache;
581 
582 	if (atomic_dec_and_test(&cache->nr_allocated_migrations))
583 		wake_up(&cache->migration_wait);
584 
585 	mempool_free(mg, cache->migration_pool);
586 }
587 
588 /*----------------------------------------------------------------*/
589 
590 static inline dm_oblock_t oblock_succ(dm_oblock_t b)
591 {
592 	return to_oblock(from_oblock(b) + 1ull);
593 }
594 
595 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
596 {
597 	key->virtual = 0;
598 	key->dev = 0;
599 	key->block_begin = from_oblock(begin);
600 	key->block_end = from_oblock(end);
601 }
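/*
 * Keys cover the half-open range [block_begin, block_end); the callers
 * below always build single-block keys, [oblock, oblock + 1), eg via
 * oblock_succ().
 */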
602 
603 /*
604  * We have two lock levels: level 0, which is used to prevent WRITEs, and
605  * level 1, which prevents *both* READs and WRITEs.
606  */
607 #define WRITE_LOCK_LEVEL 0
608 #define READ_WRITE_LOCK_LEVEL 1
609 
610 static unsigned lock_level(struct bio *bio)
611 {
612 	return bio_data_dir(bio) == WRITE ?
613 		WRITE_LOCK_LEVEL :
614 		READ_WRITE_LOCK_LEVEL;
615 }
616 
617 /*----------------------------------------------------------------
618  * Per bio data
619  *--------------------------------------------------------------*/
620 
621 /*
622  * If using writeback, leave out struct per_bio_data's writethrough fields.
623  */
624 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
625 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
626 
627 static size_t get_per_bio_data_size(struct cache *cache)
628 {
629 	return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
630 }
631 
632 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
633 {
634 	struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
635 	BUG_ON(!pb);
636 	return pb;
637 }
638 
639 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
640 {
641 	struct per_bio_data *pb = get_per_bio_data(bio, data_size);
642 
643 	pb->tick = false;
644 	pb->req_nr = dm_bio_get_target_bio_nr(bio);
645 	pb->cell = NULL;
646 	pb->len = 0;
647 
648 	return pb;
649 }
650 
651 /*----------------------------------------------------------------*/
652 
653 static void defer_bio(struct cache *cache, struct bio *bio)
654 {
655 	unsigned long flags;
656 
657 	spin_lock_irqsave(&cache->lock, flags);
658 	bio_list_add(&cache->deferred_bios, bio);
659 	spin_unlock_irqrestore(&cache->lock, flags);
660 
661 	wake_deferred_bio_worker(cache);
662 }
663 
664 static void defer_bios(struct cache *cache, struct bio_list *bios)
665 {
666 	unsigned long flags;
667 
668 	spin_lock_irqsave(&cache->lock, flags);
669 	bio_list_merge(&cache->deferred_bios, bios);
670 	bio_list_init(bios);
671 	spin_unlock_irqrestore(&cache->lock, flags);
672 
673 	wake_deferred_bio_worker(cache);
674 }
675 
676 /*----------------------------------------------------------------*/
677 
678 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
679 {
680 	bool r;
681 	size_t pb_size;
682 	struct per_bio_data *pb;
683 	struct dm_cell_key_v2 key;
684 	dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
685 	struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
686 
687 	cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
688 	if (!cell_prealloc) {
689 		defer_bio(cache, bio);
690 		return false;
691 	}
692 
693 	build_key(oblock, end, &key);
694 	r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
695 	if (!r) {
696 		/*
697 		 * Failed to get the lock.
698 		 */
699 		free_prison_cell(cache, cell_prealloc);
700 		return r;
701 	}
702 
703 	if (cell != cell_prealloc)
704 		free_prison_cell(cache, cell_prealloc);
705 
706 	pb_size = get_per_bio_data_size(cache);
707 	pb = get_per_bio_data(bio, pb_size);
708 	pb->cell = cell;
709 
710 	return r;
711 }
712 
713 /*----------------------------------------------------------------*/
714 
715 static bool is_dirty(struct cache *cache, dm_cblock_t b)
716 {
717 	return test_bit(from_cblock(b), cache->dirty_bitset);
718 }
719 
720 static void set_dirty(struct cache *cache, dm_cblock_t cblock)
721 {
722 	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
723 		atomic_inc(&cache->nr_dirty);
724 		policy_set_dirty(cache->policy, cblock);
725 	}
726 }
727 
728 /*
729  * These two are called after migrations to force the policy and the
730  * dirty bitset back into sync.
731  */
732 static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
733 {
734 	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
735 		atomic_inc(&cache->nr_dirty);
736 	policy_set_dirty(cache->policy, cblock);
737 }
738 
739 static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
740 {
741 	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
742 		if (atomic_dec_return(&cache->nr_dirty) == 0)
743 			dm_table_event(cache->ti->table);
744 	}
745 
746 	policy_clear_dirty(cache->policy, cblock);
747 }
748 
749 /*----------------------------------------------------------------*/
750 
751 static bool block_size_is_power_of_two(struct cache *cache)
752 {
753 	return cache->sectors_per_block_shift >= 0;
754 }
755 
756 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
757 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
758 __always_inline
759 #endif
760 static dm_block_t block_div(dm_block_t b, uint32_t n)
761 {
762 	do_div(b, n);
763 
764 	return b;
765 }
766 
767 static dm_block_t oblocks_per_dblock(struct cache *cache)
768 {
769 	dm_block_t oblocks = cache->discard_block_size;
770 
771 	if (block_size_is_power_of_two(cache))
772 		oblocks >>= cache->sectors_per_block_shift;
773 	else
774 		oblocks = block_div(oblocks, cache->sectors_per_block);
775 
776 	return oblocks;
777 }
778 
779 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
780 {
781 	return to_dblock(block_div(from_oblock(oblock),
782 				   oblocks_per_dblock(cache)));
783 }
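/*
 * Worked example (values are only illustrative): with 128-sector cache
 * blocks and a 1024-sector discard block, oblocks_per_dblock() is 8, so
 * oblock 20 lands in dblock 2 and oblock 24 starts dblock 3.
 */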
784 
785 static void set_discard(struct cache *cache, dm_dblock_t b)
786 {
787 	unsigned long flags;
788 
789 	BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
790 	atomic_inc(&cache->stats.discard_count);
791 
792 	spin_lock_irqsave(&cache->lock, flags);
793 	set_bit(from_dblock(b), cache->discard_bitset);
794 	spin_unlock_irqrestore(&cache->lock, flags);
795 }
796 
797 static void clear_discard(struct cache *cache, dm_dblock_t b)
798 {
799 	unsigned long flags;
800 
801 	spin_lock_irqsave(&cache->lock, flags);
802 	clear_bit(from_dblock(b), cache->discard_bitset);
803 	spin_unlock_irqrestore(&cache->lock, flags);
804 }
805 
806 static bool is_discarded(struct cache *cache, dm_dblock_t b)
807 {
808 	int r;
809 	unsigned long flags;
810 
811 	spin_lock_irqsave(&cache->lock, flags);
812 	r = test_bit(from_dblock(b), cache->discard_bitset);
813 	spin_unlock_irqrestore(&cache->lock, flags);
814 
815 	return r;
816 }
817 
818 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
819 {
820 	int r;
821 	unsigned long flags;
822 
823 	spin_lock_irqsave(&cache->lock, flags);
824 	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
825 		     cache->discard_bitset);
826 	spin_unlock_irqrestore(&cache->lock, flags);
827 
828 	return r;
829 }
830 
831 /*----------------------------------------------------------------
832  * Remapping
833  *--------------------------------------------------------------*/
834 static void remap_to_origin(struct cache *cache, struct bio *bio)
835 {
836 	bio_set_dev(bio, cache->origin_dev->bdev);
837 }
838 
839 static void remap_to_cache(struct cache *cache, struct bio *bio,
840 			   dm_cblock_t cblock)
841 {
842 	sector_t bi_sector = bio->bi_iter.bi_sector;
843 	sector_t block = from_cblock(cblock);
844 
845 	bio_set_dev(bio, cache->cache_dev->bdev);
846 	if (!block_size_is_power_of_two(cache))
847 		bio->bi_iter.bi_sector =
848 			(block * cache->sectors_per_block) +
849 			sector_div(bi_sector, cache->sectors_per_block);
850 	else
851 		bio->bi_iter.bi_sector =
852 			(block << cache->sectors_per_block_shift) |
853 			(bi_sector & (cache->sectors_per_block - 1));
854 }
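/*
 * Example of the power-of-two branch (illustrative numbers): with
 * sectors_per_block_shift == 7 (128-sector blocks), a bio at sector 1000
 * being remapped to cblock 5 ends up at (5 << 7) | (1000 & 127) == 744
 * on the cache device; the non-power-of-two branch computes the same
 * thing with a multiply and sector_div().
 */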
855 
856 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
857 {
858 	unsigned long flags;
859 	size_t pb_data_size = get_per_bio_data_size(cache);
860 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
861 
862 	spin_lock_irqsave(&cache->lock, flags);
863 	if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
864 	    bio_op(bio) != REQ_OP_DISCARD) {
865 		pb->tick = true;
866 		cache->need_tick_bio = false;
867 	}
868 	spin_unlock_irqrestore(&cache->lock, flags);
869 }
870 
871 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
872 					  dm_oblock_t oblock)
873 {
874 	// FIXME: this is called way too much.
875 	check_if_tick_bio_needed(cache, bio);
876 	remap_to_origin(cache, bio);
877 	if (bio_data_dir(bio) == WRITE)
878 		clear_discard(cache, oblock_to_dblock(cache, oblock));
879 }
880 
881 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
882 				 dm_oblock_t oblock, dm_cblock_t cblock)
883 {
884 	check_if_tick_bio_needed(cache, bio);
885 	remap_to_cache(cache, bio, cblock);
886 	if (bio_data_dir(bio) == WRITE) {
887 		set_dirty(cache, cblock);
888 		clear_discard(cache, oblock_to_dblock(cache, oblock));
889 	}
890 }
891 
892 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
893 {
894 	sector_t block_nr = bio->bi_iter.bi_sector;
895 
896 	if (!block_size_is_power_of_two(cache))
897 		(void) sector_div(block_nr, cache->sectors_per_block);
898 	else
899 		block_nr >>= cache->sectors_per_block_shift;
900 
901 	return to_oblock(block_nr);
902 }
903 
904 static bool accountable_bio(struct cache *cache, struct bio *bio)
905 {
906 	return bio_op(bio) != REQ_OP_DISCARD;
907 }
908 
909 static void accounted_begin(struct cache *cache, struct bio *bio)
910 {
911 	size_t pb_data_size = get_per_bio_data_size(cache);
912 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
913 
914 	if (accountable_bio(cache, bio)) {
915 		pb->len = bio_sectors(bio);
916 		iot_io_begin(&cache->tracker, pb->len);
917 	}
918 }
919 
920 static void accounted_complete(struct cache *cache, struct bio *bio)
921 {
922 	size_t pb_data_size = get_per_bio_data_size(cache);
923 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
924 
925 	iot_io_end(&cache->tracker, pb->len);
926 }
927 
928 static void accounted_request(struct cache *cache, struct bio *bio)
929 {
930 	accounted_begin(cache, bio);
931 	generic_make_request(bio);
932 }
933 
934 static void issue_op(struct bio *bio, void *context)
935 {
936 	struct cache *cache = context;
937 	accounted_request(cache, bio);
938 }
939 
940 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
941 {
942 	unsigned long flags;
943 
944 	spin_lock_irqsave(&cache->lock, flags);
945 	bio_list_add(&cache->deferred_writethrough_bios, bio);
946 	spin_unlock_irqrestore(&cache->lock, flags);
947 
948 	wake_deferred_writethrough_worker(cache);
949 }
950 
951 static void writethrough_endio(struct bio *bio)
952 {
953 	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
954 
955 	dm_unhook_bio(&pb->hook_info, bio);
956 
957 	if (bio->bi_status) {
958 		bio_endio(bio);
959 		return;
960 	}
961 
962 	dm_bio_restore(&pb->bio_details, bio);
963 	remap_to_cache(pb->cache, bio, pb->cblock);
964 
965 	/*
966 	 * We can't issue this bio directly, since we're in interrupt
967 	 * context.  So it gets put on a bio list for processing by the
968 	 * worker thread.
969 	 */
970 	defer_writethrough_bio(pb->cache, bio);
971 }
972 
973 /*
974  * FIXME: send in parallel, huge latency as is.
975  * When running in writethrough mode we need to send writes to clean blocks
976  * to both the cache and origin devices.  In future we'd like to clone the
977  * bio and send them in parallel, but for now we're doing them in
978  * series as this is easier.
979  */
980 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
981 				       dm_oblock_t oblock, dm_cblock_t cblock)
982 {
983 	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
984 
985 	pb->cache = cache;
986 	pb->cblock = cblock;
987 	dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
988 	dm_bio_record(&pb->bio_details, bio);
989 
990 	remap_to_origin_clear_discard(pb->cache, bio, oblock);
991 }
992 
993 /*----------------------------------------------------------------
994  * Failure modes
995  *--------------------------------------------------------------*/
996 static enum cache_metadata_mode get_cache_mode(struct cache *cache)
997 {
998 	return cache->features.mode;
999 }
1000 
1001 static const char *cache_device_name(struct cache *cache)
1002 {
1003 	return dm_device_name(dm_table_get_md(cache->ti->table));
1004 }
1005 
1006 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
1007 {
1008 	const char *descs[] = {
1009 		"write",
1010 		"read-only",
1011 		"fail"
1012 	};
1013 
1014 	dm_table_event(cache->ti->table);
1015 	DMINFO("%s: switching cache to %s mode",
1016 	       cache_device_name(cache), descs[(int)mode]);
1017 }
1018 
1019 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
1020 {
1021 	bool needs_check;
1022 	enum cache_metadata_mode old_mode = get_cache_mode(cache);
1023 
1024 	if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
1025 		DMERR("%s: unable to read needs_check flag, setting failure mode.",
1026 		      cache_device_name(cache));
1027 		new_mode = CM_FAIL;
1028 	}
1029 
1030 	if (new_mode == CM_WRITE && needs_check) {
1031 		DMERR("%s: unable to switch cache to write mode until repaired.",
1032 		      cache_device_name(cache));
1033 		if (old_mode != new_mode)
1034 			new_mode = old_mode;
1035 		else
1036 			new_mode = CM_READ_ONLY;
1037 	}
1038 
1039 	/* Never move out of fail mode */
1040 	if (old_mode == CM_FAIL)
1041 		new_mode = CM_FAIL;
1042 
1043 	switch (new_mode) {
1044 	case CM_FAIL:
1045 	case CM_READ_ONLY:
1046 		dm_cache_metadata_set_read_only(cache->cmd);
1047 		break;
1048 
1049 	case CM_WRITE:
1050 		dm_cache_metadata_set_read_write(cache->cmd);
1051 		break;
1052 	}
1053 
1054 	cache->features.mode = new_mode;
1055 
1056 	if (new_mode != old_mode)
1057 		notify_mode_switch(cache, new_mode);
1058 }
1059 
1060 static void abort_transaction(struct cache *cache)
1061 {
1062 	const char *dev_name = cache_device_name(cache);
1063 
1064 	if (get_cache_mode(cache) >= CM_READ_ONLY)
1065 		return;
1066 
1067 	if (dm_cache_metadata_set_needs_check(cache->cmd)) {
1068 		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
1069 		set_cache_mode(cache, CM_FAIL);
1070 	}
1071 
1072 	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1073 	if (dm_cache_metadata_abort(cache->cmd)) {
1074 		DMERR("%s: failed to abort metadata transaction", dev_name);
1075 		set_cache_mode(cache, CM_FAIL);
1076 	}
1077 }
1078 
1079 static void metadata_operation_failed(struct cache *cache, const char *op, int r)
1080 {
1081 	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1082 		    cache_device_name(cache), op, r);
1083 	abort_transaction(cache);
1084 	set_cache_mode(cache, CM_READ_ONLY);
1085 }
1086 
1087 /*----------------------------------------------------------------*/
1088 
1089 static void load_stats(struct cache *cache)
1090 {
1091 	struct dm_cache_statistics stats;
1092 
1093 	dm_cache_metadata_get_stats(cache->cmd, &stats);
1094 	atomic_set(&cache->stats.read_hit, stats.read_hits);
1095 	atomic_set(&cache->stats.read_miss, stats.read_misses);
1096 	atomic_set(&cache->stats.write_hit, stats.write_hits);
1097 	atomic_set(&cache->stats.write_miss, stats.write_misses);
1098 }
1099 
1100 static void save_stats(struct cache *cache)
1101 {
1102 	struct dm_cache_statistics stats;
1103 
1104 	if (get_cache_mode(cache) >= CM_READ_ONLY)
1105 		return;
1106 
1107 	stats.read_hits = atomic_read(&cache->stats.read_hit);
1108 	stats.read_misses = atomic_read(&cache->stats.read_miss);
1109 	stats.write_hits = atomic_read(&cache->stats.write_hit);
1110 	stats.write_misses = atomic_read(&cache->stats.write_miss);
1111 
1112 	dm_cache_metadata_set_stats(cache->cmd, &stats);
1113 }
1114 
1115 static void update_stats(struct cache_stats *stats, enum policy_operation op)
1116 {
1117 	switch (op) {
1118 	case POLICY_PROMOTE:
1119 		atomic_inc(&stats->promotion);
1120 		break;
1121 
1122 	case POLICY_DEMOTE:
1123 		atomic_inc(&stats->demotion);
1124 		break;
1125 
1126 	case POLICY_WRITEBACK:
1127 		atomic_inc(&stats->writeback);
1128 		break;
1129 	}
1130 }
1131 
1132 /*----------------------------------------------------------------
1133  * Migration processing
1134  *
1135  * Migration covers moving data from the origin device to the cache, or
1136  * vice versa.
1137  *--------------------------------------------------------------*/
1138 
1139 static void inc_io_migrations(struct cache *cache)
1140 {
1141 	atomic_inc(&cache->nr_io_migrations);
1142 }
1143 
1144 static void dec_io_migrations(struct cache *cache)
1145 {
1146 	atomic_dec(&cache->nr_io_migrations);
1147 }
1148 
1149 static bool discard_or_flush(struct bio *bio)
1150 {
1151 	return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
1152 }
1153 
1154 static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1155 				     dm_dblock_t *b, dm_dblock_t *e)
1156 {
1157 	sector_t sb = bio->bi_iter.bi_sector;
1158 	sector_t se = bio_end_sector(bio);
1159 
1160 	*b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1161 
1162 	if (se - sb < cache->discard_block_size)
1163 		*e = *b;
1164 	else
1165 		*e = to_dblock(block_div(se, cache->discard_block_size));
1166 }
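/*
 * Example (illustrative numbers): with a 1024-sector discard block, a
 * discard covering sectors [3000, 8000) gives b = 3 (rounded up) and
 * e = 7 (rounded down), so process_discard_bio() below marks dblocks
 * 3..6, ie only discard blocks wholly contained in the bio.
 */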
1167 
1168 /*----------------------------------------------------------------*/
1169 
1170 static void prevent_background_work(struct cache *cache)
1171 {
1172 	lockdep_off();
1173 	down_write(&cache->background_work_lock);
1174 	lockdep_on();
1175 }
1176 
1177 static void allow_background_work(struct cache *cache)
1178 {
1179 	lockdep_off();
1180 	up_write(&cache->background_work_lock);
1181 	lockdep_on();
1182 }
1183 
1184 static bool background_work_begin(struct cache *cache)
1185 {
1186 	bool r;
1187 
1188 	lockdep_off();
1189 	r = down_read_trylock(&cache->background_work_lock);
1190 	lockdep_on();
1191 
1192 	return r;
1193 }
1194 
1195 static void background_work_end(struct cache *cache)
1196 {
1197 	lockdep_off();
1198 	up_read(&cache->background_work_lock);
1199 	lockdep_on();
1200 }
1201 
1202 /*----------------------------------------------------------------*/
1203 
1204 static void quiesce(struct dm_cache_migration *mg,
1205 		    void (*continuation)(struct work_struct *))
1206 {
1207 	init_continuation(&mg->k, continuation);
1208 	dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
1209 }
1210 
1211 static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
1212 {
1213 	struct continuation *k = container_of(ws, struct continuation, ws);
1214 	return container_of(k, struct dm_cache_migration, k);
1215 }
1216 
1217 static void copy_complete(int read_err, unsigned long write_err, void *context)
1218 {
1219 	struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
1220 
1221 	if (read_err || write_err)
1222 		mg->k.input = BLK_STS_IOERR;
1223 
1224 	queue_continuation(mg->cache->wq, &mg->k);
1225 }
1226 
1227 static int copy(struct dm_cache_migration *mg, bool promote)
1228 {
1229 	int r;
1230 	struct dm_io_region o_region, c_region;
1231 	struct cache *cache = mg->cache;
1232 
1233 	o_region.bdev = cache->origin_dev->bdev;
1234 	o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
1235 	o_region.count = cache->sectors_per_block;
1236 
1237 	c_region.bdev = cache->cache_dev->bdev;
1238 	c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
1239 	c_region.count = cache->sectors_per_block;
1240 
1241 	if (promote)
1242 		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
1243 	else
1244 		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
1245 
1246 	return r;
1247 }
1248 
1249 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
1250 {
1251 	size_t pb_data_size = get_per_bio_data_size(cache);
1252 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1253 
1254 	if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
1255 		free_prison_cell(cache, pb->cell);
1256 	pb->cell = NULL;
1257 }
1258 
1259 static void overwrite_endio(struct bio *bio)
1260 {
1261 	struct dm_cache_migration *mg = bio->bi_private;
1262 	struct cache *cache = mg->cache;
1263 	size_t pb_data_size = get_per_bio_data_size(cache);
1264 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1265 
1266 	dm_unhook_bio(&pb->hook_info, bio);
1267 
1268 	if (bio->bi_status)
1269 		mg->k.input = bio->bi_status;
1270 
1271 	queue_continuation(mg->cache->wq, &mg->k);
1272 }
1273 
1274 static void overwrite(struct dm_cache_migration *mg,
1275 		      void (*continuation)(struct work_struct *))
1276 {
1277 	struct bio *bio = mg->overwrite_bio;
1278 	size_t pb_data_size = get_per_bio_data_size(mg->cache);
1279 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1280 
1281 	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1282 
1283 	/*
1284 	 * The overwrite bio is part of the copy operation, so it does not
1285 	 * set/clear the discard or dirty flags.
1286 	 */
1287 	if (mg->op->op == POLICY_PROMOTE)
1288 		remap_to_cache(mg->cache, bio, mg->op->cblock);
1289 	else
1290 		remap_to_origin(mg->cache, bio);
1291 
1292 	init_continuation(&mg->k, continuation);
1293 	accounted_request(mg->cache, bio);
1294 }
1295 
1296 /*
1297  * Migration steps:
1298  *
1299  * 1) exclusive lock preventing WRITEs
1300  * 2) quiesce
1301  * 3) copy or issue overwrite bio
1302  * 4) upgrade to exclusive lock preventing READs and WRITEs
1303  * 5) quiesce
1304  * 6) update metadata and commit
1305  * 7) unlock
1306  */
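/*
 * Roughly, the functions below map onto those steps as follows:
 * mg_lock_writes() takes the initial lock (1), quiesce() covers (2) and
 * (5), mg_copy()/overwrite() perform (3), mg_upgrade_lock() handles (4),
 * mg_update_metadata() plus the committer handle (6), and mg_complete()
 * unlocks and cleans up (7).
 */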
1307 static void mg_complete(struct dm_cache_migration *mg, bool success)
1308 {
1309 	struct bio_list bios;
1310 	struct cache *cache = mg->cache;
1311 	struct policy_work *op = mg->op;
1312 	dm_cblock_t cblock = op->cblock;
1313 
1314 	if (success)
1315 		update_stats(&cache->stats, op->op);
1316 
1317 	switch (op->op) {
1318 	case POLICY_PROMOTE:
1319 		clear_discard(cache, oblock_to_dblock(cache, op->oblock));
1320 		policy_complete_background_work(cache->policy, op, success);
1321 
1322 		if (mg->overwrite_bio) {
1323 			if (success)
1324 				force_set_dirty(cache, cblock);
1325 			else if (mg->k.input)
1326 				mg->overwrite_bio->bi_status = mg->k.input;
1327 			else
1328 				mg->overwrite_bio->bi_status = BLK_STS_IOERR;
1329 			bio_endio(mg->overwrite_bio);
1330 		} else {
1331 			if (success)
1332 				force_clear_dirty(cache, cblock);
1333 			dec_io_migrations(cache);
1334 		}
1335 		break;
1336 
1337 	case POLICY_DEMOTE:
1338 		/*
1339 		 * We clear dirty here to update the nr_dirty counter.
1340 		 */
1341 		if (success)
1342 			force_clear_dirty(cache, cblock);
1343 		policy_complete_background_work(cache->policy, op, success);
1344 		dec_io_migrations(cache);
1345 		break;
1346 
1347 	case POLICY_WRITEBACK:
1348 		if (success)
1349 			force_clear_dirty(cache, cblock);
1350 		policy_complete_background_work(cache->policy, op, success);
1351 		dec_io_migrations(cache);
1352 		break;
1353 	}
1354 
1355 	bio_list_init(&bios);
1356 	if (mg->cell) {
1357 		if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1358 			free_prison_cell(cache, mg->cell);
1359 	}
1360 
1361 	free_migration(mg);
1362 	defer_bios(cache, &bios);
1363 	wake_migration_worker(cache);
1364 
1365 	background_work_end(cache);
1366 }
1367 
1368 static void mg_success(struct work_struct *ws)
1369 {
1370 	struct dm_cache_migration *mg = ws_to_mg(ws);
1371 	mg_complete(mg, mg->k.input == 0);
1372 }
1373 
1374 static void mg_update_metadata(struct work_struct *ws)
1375 {
1376 	int r;
1377 	struct dm_cache_migration *mg = ws_to_mg(ws);
1378 	struct cache *cache = mg->cache;
1379 	struct policy_work *op = mg->op;
1380 
1381 	switch (op->op) {
1382 	case POLICY_PROMOTE:
1383 		r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
1384 		if (r) {
1385 			DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
1386 				    cache_device_name(cache));
1387 			metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1388 
1389 			mg_complete(mg, false);
1390 			return;
1391 		}
1392 		mg_complete(mg, true);
1393 		break;
1394 
1395 	case POLICY_DEMOTE:
1396 		r = dm_cache_remove_mapping(cache->cmd, op->cblock);
1397 		if (r) {
1398 			DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
1399 				    cache_device_name(cache));
1400 			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1401 
1402 			mg_complete(mg, false);
1403 			return;
1404 		}
1405 
1406 		/*
1407 		 * It would be nice if we only had to commit when a REQ_FLUSH
1408 		 * comes through.  But there's one scenario that we have to
1409 		 * look out for:
1410 		 *
1411 		 * - oblock x in a cache block
1412 		 * - demotion occurs
1413 		 * - cache block gets reallocated and overwritten
1414 		 * - crash
1415 		 *
1416 		 * When we recover, because there was no commit the cache will
1417 		 * roll back to having the data for oblock x in the cache block.
1418 		 * But the cache block has since been overwritten, so it'll end
1419 		 * up pointing to data that was never in 'x' during the history
1420 		 * of the device.
1421 		 *
1422 		 * To avoid this issue we require a commit as part of the
1423 		 * demotion operation.
1424 		 */
1425 		init_continuation(&mg->k, mg_success);
1426 		continue_after_commit(&cache->committer, &mg->k);
1427 		schedule_commit(&cache->committer);
1428 		break;
1429 
1430 	case POLICY_WRITEBACK:
1431 		mg_complete(mg, true);
1432 		break;
1433 	}
1434 }
1435 
1436 static void mg_update_metadata_after_copy(struct work_struct *ws)
1437 {
1438 	struct dm_cache_migration *mg = ws_to_mg(ws);
1439 
1440 	/*
1441 	 * Did the copy succeed?
1442 	 */
1443 	if (mg->k.input)
1444 		mg_complete(mg, false);
1445 	else
1446 		mg_update_metadata(ws);
1447 }
1448 
1449 static void mg_upgrade_lock(struct work_struct *ws)
1450 {
1451 	int r;
1452 	struct dm_cache_migration *mg = ws_to_mg(ws);
1453 
1454 	/*
1455 	 * Did the copy succeed?
1456 	 */
1457 	if (mg->k.input)
1458 		mg_complete(mg, false);
1459 
1460 	else {
1461 		/*
1462 		 * Now we want the lock to prevent both reads and writes.
1463 		 */
1464 		r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
1465 					    READ_WRITE_LOCK_LEVEL);
1466 		if (r < 0)
1467 			mg_complete(mg, false);
1468 
1469 		else if (r)
1470 			quiesce(mg, mg_update_metadata);
1471 
1472 		else
1473 			mg_update_metadata(ws);
1474 	}
1475 }
1476 
1477 static void mg_copy(struct work_struct *ws)
1478 {
1479 	int r;
1480 	struct dm_cache_migration *mg = ws_to_mg(ws);
1481 
1482 	if (mg->overwrite_bio) {
1483 		/*
1484 		 * It's safe to do this here, even though it's new data
1485 		 * because all IO has been locked out of the block.
1486 		 *
1487 		 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
1488 		 * so _not_ using mg_upgrade_lock() as the continuation.
1489 		 */
1490 		overwrite(mg, mg_update_metadata_after_copy);
1491 
1492 	} else {
1493 		struct cache *cache = mg->cache;
1494 		struct policy_work *op = mg->op;
1495 		bool is_policy_promote = (op->op == POLICY_PROMOTE);
1496 
1497 		if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
1498 		    is_discarded_oblock(cache, op->oblock)) {
1499 			mg_upgrade_lock(ws);
1500 			return;
1501 		}
1502 
1503 		init_continuation(&mg->k, mg_upgrade_lock);
1504 
1505 		r = copy(mg, is_policy_promote);
1506 		if (r) {
1507 			DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
1508 			mg->k.input = BLK_STS_IOERR;
1509 			mg_complete(mg, false);
1510 		}
1511 	}
1512 }
1513 
1514 static int mg_lock_writes(struct dm_cache_migration *mg)
1515 {
1516 	int r;
1517 	struct dm_cell_key_v2 key;
1518 	struct cache *cache = mg->cache;
1519 	struct dm_bio_prison_cell_v2 *prealloc;
1520 
1521 	prealloc = alloc_prison_cell(cache);
1522 	if (!prealloc) {
1523 		DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
1524 		mg_complete(mg, false);
1525 		return -ENOMEM;
1526 	}
1527 
1528 	/*
1529 	 * Prevent writes to the block, but allow reads to continue.
1530 	 * Unless we're using an overwrite bio, in which case we lock
1531 	 * everything.
1532 	 */
1533 	build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
1534 	r = dm_cell_lock_v2(cache->prison, &key,
1535 			    mg->overwrite_bio ?  READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
1536 			    prealloc, &mg->cell);
1537 	if (r < 0) {
1538 		free_prison_cell(cache, prealloc);
1539 		mg_complete(mg, false);
1540 		return r;
1541 	}
1542 
1543 	if (mg->cell != prealloc)
1544 		free_prison_cell(cache, prealloc);
1545 
1546 	if (r == 0)
1547 		mg_copy(&mg->k.ws);
1548 	else
1549 		quiesce(mg, mg_copy);
1550 
1551 	return 0;
1552 }
1553 
1554 static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
1555 {
1556 	struct dm_cache_migration *mg;
1557 
1558 	if (!background_work_begin(cache)) {
1559 		policy_complete_background_work(cache->policy, op, false);
1560 		return -EPERM;
1561 	}
1562 
1563 	mg = alloc_migration(cache);
1564 	if (!mg) {
1565 		policy_complete_background_work(cache->policy, op, false);
1566 		background_work_end(cache);
1567 		return -ENOMEM;
1568 	}
1569 
1570 	memset(mg, 0, sizeof(*mg));
1571 
1572 	mg->cache = cache;
1573 	mg->op = op;
1574 	mg->overwrite_bio = bio;
1575 
1576 	if (!bio)
1577 		inc_io_migrations(cache);
1578 
1579 	return mg_lock_writes(mg);
1580 }
1581 
1582 /*----------------------------------------------------------------
1583  * invalidation processing
1584  *--------------------------------------------------------------*/
1585 
1586 static void invalidate_complete(struct dm_cache_migration *mg, bool success)
1587 {
1588 	struct bio_list bios;
1589 	struct cache *cache = mg->cache;
1590 
1591 	bio_list_init(&bios);
1592 	if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1593 		free_prison_cell(cache, mg->cell);
1594 
1595 	if (!success && mg->overwrite_bio)
1596 		bio_io_error(mg->overwrite_bio);
1597 
1598 	free_migration(mg);
1599 	defer_bios(cache, &bios);
1600 
1601 	background_work_end(cache);
1602 }
1603 
1604 static void invalidate_completed(struct work_struct *ws)
1605 {
1606 	struct dm_cache_migration *mg = ws_to_mg(ws);
1607 	invalidate_complete(mg, !mg->k.input);
1608 }
1609 
1610 static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
1611 {
1612 	int r = policy_invalidate_mapping(cache->policy, cblock);
1613 	if (!r) {
1614 		r = dm_cache_remove_mapping(cache->cmd, cblock);
1615 		if (r) {
1616 			DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
1617 				    cache_device_name(cache));
1618 			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1619 		}
1620 
1621 	} else if (r == -ENODATA) {
1622 		/*
1623 		 * Harmless, already unmapped.
1624 		 */
1625 		r = 0;
1626 
1627 	} else
1628 		DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
1629 
1630 	return r;
1631 }
1632 
1633 static void invalidate_remove(struct work_struct *ws)
1634 {
1635 	int r;
1636 	struct dm_cache_migration *mg = ws_to_mg(ws);
1637 	struct cache *cache = mg->cache;
1638 
1639 	r = invalidate_cblock(cache, mg->invalidate_cblock);
1640 	if (r) {
1641 		invalidate_complete(mg, false);
1642 		return;
1643 	}
1644 
1645 	init_continuation(&mg->k, invalidate_completed);
1646 	continue_after_commit(&cache->committer, &mg->k);
1647 	remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
1648 	mg->overwrite_bio = NULL;
1649 	schedule_commit(&cache->committer);
1650 }
1651 
1652 static int invalidate_lock(struct dm_cache_migration *mg)
1653 {
1654 	int r;
1655 	struct dm_cell_key_v2 key;
1656 	struct cache *cache = mg->cache;
1657 	struct dm_bio_prison_cell_v2 *prealloc;
1658 
1659 	prealloc = alloc_prison_cell(cache);
1660 	if (!prealloc) {
1661 		invalidate_complete(mg, false);
1662 		return -ENOMEM;
1663 	}
1664 
1665 	build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
1666 	r = dm_cell_lock_v2(cache->prison, &key,
1667 			    READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
1668 	if (r < 0) {
1669 		free_prison_cell(cache, prealloc);
1670 		invalidate_complete(mg, false);
1671 		return r;
1672 	}
1673 
1674 	if (mg->cell != prealloc)
1675 		free_prison_cell(cache, prealloc);
1676 
1677 	if (r)
1678 		quiesce(mg, invalidate_remove);
1679 
1680 	else {
1681 		/*
1682 		 * We can't call invalidate_remove() directly here because we
1683 		 * might still be in request context.
1684 		 */
1685 		init_continuation(&mg->k, invalidate_remove);
1686 		queue_work(cache->wq, &mg->k.ws);
1687 	}
1688 
1689 	return 0;
1690 }
1691 
1692 static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
1693 			    dm_oblock_t oblock, struct bio *bio)
1694 {
1695 	struct dm_cache_migration *mg;
1696 
1697 	if (!background_work_begin(cache))
1698 		return -EPERM;
1699 
1700 	mg = alloc_migration(cache);
1701 	if (!mg) {
1702 		background_work_end(cache);
1703 		return -ENOMEM;
1704 	}
1705 
1706 	memset(mg, 0, sizeof(*mg));
1707 
1708 	mg->cache = cache;
1709 	mg->overwrite_bio = bio;
1710 	mg->invalidate_cblock = cblock;
1711 	mg->invalidate_oblock = oblock;
1712 
1713 	return invalidate_lock(mg);
1714 }
1715 
1716 /*----------------------------------------------------------------
1717  * bio processing
1718  *--------------------------------------------------------------*/
1719 
1720 enum busy {
1721 	IDLE,
1722 	BUSY
1723 };
1724 
1725 static enum busy spare_migration_bandwidth(struct cache *cache)
1726 {
1727 	bool idle = iot_idle_for(&cache->tracker, HZ);
1728 	sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1729 		cache->sectors_per_block;
1730 
1731 	if (idle && current_volume <= cache->migration_threshold)
1732 		return IDLE;
1733 	else
1734 		return BUSY;
1735 }
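/*
 * Example (the threshold value is only illustrative): with 128-sector
 * blocks and a migration_threshold of 2048 sectors, IDLE is only
 * returned while the tracker has seen no accounted IO for a second and
 * at most 16 blocks' worth of background copies (counting the one about
 * to start) would be in flight.
 */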
1736 
1737 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1738 {
1739 	atomic_inc(bio_data_dir(bio) == READ ?
1740 		   &cache->stats.read_hit : &cache->stats.write_hit);
1741 }
1742 
1743 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1744 {
1745 	atomic_inc(bio_data_dir(bio) == READ ?
1746 		   &cache->stats.read_miss : &cache->stats.write_miss);
1747 }
1748 
1749 /*----------------------------------------------------------------*/
1750 
1751 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1752 {
1753 	return (bio_data_dir(bio) == WRITE) &&
1754 		(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1755 }
1756 
1757 static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
1758 {
1759 	return writeback_mode(&cache->features) &&
1760 		(is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
1761 }
1762 
1763 static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
1764 		   bool *commit_needed)
1765 {
1766 	int r, data_dir;
1767 	bool rb, background_queued;
1768 	dm_cblock_t cblock;
1769 	size_t pb_data_size = get_per_bio_data_size(cache);
1770 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1771 
1772 	*commit_needed = false;
1773 
1774 	rb = bio_detain_shared(cache, block, bio);
1775 	if (!rb) {
1776 		/*
1777 		 * An exclusive lock is held for this block, so we have to
1778 		 * wait.  We set the commit_needed flag so the current
1779 		 * transaction will be committed asap, allowing this lock
1780 		 * to be dropped.
1781 		 */
1782 		*commit_needed = true;
1783 		return DM_MAPIO_SUBMITTED;
1784 	}
1785 
1786 	data_dir = bio_data_dir(bio);
1787 
1788 	if (optimisable_bio(cache, bio, block)) {
1789 		struct policy_work *op = NULL;
1790 
1791 		r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
1792 		if (unlikely(r && r != -ENOENT)) {
1793 			DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
1794 				    cache_device_name(cache), r);
1795 			bio_io_error(bio);
1796 			return DM_MAPIO_SUBMITTED;
1797 		}
1798 
1799 		if (r == -ENOENT && op) {
1800 			bio_drop_shared_lock(cache, bio);
1801 			BUG_ON(op->op != POLICY_PROMOTE);
1802 			mg_start(cache, op, bio);
1803 			return DM_MAPIO_SUBMITTED;
1804 		}
1805 	} else {
1806 		r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
1807 		if (unlikely(r && r != -ENOENT)) {
1808 			DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
1809 				    cache_device_name(cache), r);
1810 			bio_io_error(bio);
1811 			return DM_MAPIO_SUBMITTED;
1812 		}
1813 
1814 		if (background_queued)
1815 			wake_migration_worker(cache);
1816 	}
1817 
1818 	if (r == -ENOENT) {
1819 		/*
1820 		 * Miss.
1821 		 */
1822 		inc_miss_counter(cache, bio);
1823 		if (pb->req_nr == 0) {
1824 			accounted_begin(cache, bio);
1825 			remap_to_origin_clear_discard(cache, bio, block);
1826 
1827 		} else {
1828 			/*
1829 			 * This is a duplicate writethrough io that is no
1830 			 * longer needed because the block has been demoted.
1831 			 */
1832 			bio_endio(bio);
1833 			return DM_MAPIO_SUBMITTED;
1834 		}
1835 	} else {
1836 		/*
1837 		 * Hit.
1838 		 */
1839 		inc_hit_counter(cache, bio);
1840 
1841 		/*
1842 		 * Passthrough always maps to the origin, invalidating any
1843 		 * cache blocks that are written to.
1844 		 */
1845 		if (passthrough_mode(&cache->features)) {
1846 			if (bio_data_dir(bio) == WRITE) {
1847 				bio_drop_shared_lock(cache, bio);
1848 				atomic_inc(&cache->stats.demotion);
1849 				invalidate_start(cache, cblock, block, bio);
1850 			} else
1851 				remap_to_origin_clear_discard(cache, bio, block);
1852 
1853 		} else {
1854 			if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
1855 			    !is_dirty(cache, cblock)) {
1856 				remap_to_origin_then_cache(cache, bio, block, cblock);
1857 				accounted_begin(cache, bio);
1858 			} else
1859 				remap_to_cache_dirty(cache, bio, block, cblock);
1860 		}
1861 	}
1862 
1863 	/*
1864 	 * dm core turns FUA requests into a separate payload and FLUSH req.
1865 	 */
1866 	if (bio->bi_opf & REQ_FUA) {
1867 		/*
1868 		 * issue_after_commit will call accounted_begin a second time.  So
1869 		 * we call accounted_complete() to avoid double accounting.
1870 		 */
1871 		accounted_complete(cache, bio);
1872 		issue_after_commit(&cache->committer, bio);
1873 		*commit_needed = true;
1874 		return DM_MAPIO_SUBMITTED;
1875 	}
1876 
1877 	return DM_MAPIO_REMAPPED;
1878 }
1879 
1880 static bool process_bio(struct cache *cache, struct bio *bio)
1881 {
1882 	bool commit_needed;
1883 
1884 	if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
1885 		generic_make_request(bio);
1886 
1887 	return commit_needed;
1888 }
1889 
1890 /*
1891  * A non-zero return indicates read_only or fail_io mode.
1892  */
1893 static int commit(struct cache *cache, bool clean_shutdown)
1894 {
1895 	int r;
1896 
1897 	if (get_cache_mode(cache) >= CM_READ_ONLY)
1898 		return -EINVAL;
1899 
1900 	atomic_inc(&cache->stats.commit_count);
1901 	r = dm_cache_commit(cache->cmd, clean_shutdown);
1902 	if (r)
1903 		metadata_operation_failed(cache, "dm_cache_commit", r);
1904 
1905 	return r;
1906 }
1907 
1908 /*
1909  * Used by the batcher.
1910  */
1911 static blk_status_t commit_op(void *context)
1912 {
1913 	struct cache *cache = context;
1914 
1915 	if (dm_cache_changed_this_transaction(cache->cmd))
1916 		return errno_to_blk_status(commit(cache, false));
1917 
1918 	return 0;
1919 }
1920 
1921 /*----------------------------------------------------------------*/
1922 
1923 static bool process_flush_bio(struct cache *cache, struct bio *bio)
1924 {
1925 	size_t pb_data_size = get_per_bio_data_size(cache);
1926 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1927 
1928 	if (!pb->req_nr)
1929 		remap_to_origin(cache, bio);
1930 	else
1931 		remap_to_cache(cache, bio, 0);
1932 
1933 	issue_after_commit(&cache->committer, bio);
1934 	return true;
1935 }
1936 
1937 static bool process_discard_bio(struct cache *cache, struct bio *bio)
1938 {
1939 	dm_dblock_t b, e;
1940 
1941 	// FIXME: do we need to lock the region?  Or can we just assume the
1942 	// user won't be so foolish as to issue discard concurrently with
1943 	// other IO?
1944 	calc_discard_block_range(cache, bio, &b, &e);
1945 	while (b != e) {
1946 		set_discard(cache, b);
1947 		b = to_dblock(from_dblock(b) + 1);
1948 	}
1949 
1950 	bio_endio(bio);
1951 
1952 	return false;
1953 }
1954 
1955 static void process_deferred_bios(struct work_struct *ws)
1956 {
1957 	struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
1958 
1959 	unsigned long flags;
1960 	bool commit_needed = false;
1961 	struct bio_list bios;
1962 	struct bio *bio;
1963 
1964 	bio_list_init(&bios);
1965 
1966 	spin_lock_irqsave(&cache->lock, flags);
1967 	bio_list_merge(&bios, &cache->deferred_bios);
1968 	bio_list_init(&cache->deferred_bios);
1969 	spin_unlock_irqrestore(&cache->lock, flags);
1970 
1971 	while ((bio = bio_list_pop(&bios))) {
1972 		if (bio->bi_opf & REQ_PREFLUSH)
1973 			commit_needed = process_flush_bio(cache, bio) || commit_needed;
1974 
1975 		else if (bio_op(bio) == REQ_OP_DISCARD)
1976 			commit_needed = process_discard_bio(cache, bio) || commit_needed;
1977 
1978 		else
1979 			commit_needed = process_bio(cache, bio) || commit_needed;
1980 	}
1981 
1982 	if (commit_needed)
1983 		schedule_commit(&cache->committer);
1984 }
1985 
1986 static void process_deferred_writethrough_bios(struct work_struct *ws)
1987 {
1988 	struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker);
1989 
1990 	unsigned long flags;
1991 	struct bio_list bios;
1992 	struct bio *bio;
1993 
1994 	bio_list_init(&bios);
1995 
1996 	spin_lock_irqsave(&cache->lock, flags);
1997 	bio_list_merge(&bios, &cache->deferred_writethrough_bios);
1998 	bio_list_init(&cache->deferred_writethrough_bios);
1999 	spin_unlock_irqrestore(&cache->lock, flags);
2000 
2001 	/*
2002 	 * These bios have already been through accounted_begin()
2003 	 */
2004 	while ((bio = bio_list_pop(&bios)))
2005 		generic_make_request(bio);
2006 }
2007 
2008 /*----------------------------------------------------------------
2009  * Main worker loop
2010  *--------------------------------------------------------------*/
2011 
2012 static void requeue_deferred_bios(struct cache *cache)
2013 {
2014 	struct bio *bio;
2015 	struct bio_list bios;
2016 
2017 	bio_list_init(&bios);
2018 	bio_list_merge(&bios, &cache->deferred_bios);
2019 	bio_list_init(&cache->deferred_bios);
2020 
2021 	while ((bio = bio_list_pop(&bios))) {
2022 		bio->bi_status = BLK_STS_DM_REQUEUE;
2023 		bio_endio(bio);
2024 	}
2025 }
2026 
2027 /*
2028  * We want to commit periodically so that not too much
2029  * unwritten metadata builds up.
2030  */
2031 static void do_waker(struct work_struct *ws)
2032 {
2033 	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
2034 
2035 	policy_tick(cache->policy, true);
2036 	wake_migration_worker(cache);
2037 	schedule_commit(&cache->committer);
2038 	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
2039 }
2040 
2041 static void check_migrations(struct work_struct *ws)
2042 {
2043 	int r;
2044 	struct policy_work *op;
2045 	struct cache *cache = container_of(ws, struct cache, migration_worker);
2046 	enum busy b;
2047 
2048 	for (;;) {
2049 		b = spare_migration_bandwidth(cache);
2050 
2051 		r = policy_get_background_work(cache->policy, b == IDLE, &op);
2052 		if (r == -ENODATA)
2053 			break;
2054 
2055 		if (r) {
2056 			DMERR_LIMIT("%s: policy_background_work failed",
2057 				    cache_device_name(cache));
2058 			break;
2059 		}
2060 
2061 		r = mg_start(cache, op, NULL);
2062 		if (r)
2063 			break;
2064 	}
2065 }
2066 
2067 /*----------------------------------------------------------------
2068  * Target methods
2069  *--------------------------------------------------------------*/
2070 
2071 /*
2072  * This function gets called on the error paths of the constructor, so we
2073  * have to cope with a partially initialised struct.
2074  */
2075 static void destroy(struct cache *cache)
2076 {
2077 	unsigned i;
2078 
2079 	mempool_destroy(cache->migration_pool);
2080 
2081 	if (cache->prison)
2082 		dm_bio_prison_destroy_v2(cache->prison);
2083 
2084 	if (cache->wq)
2085 		destroy_workqueue(cache->wq);
2086 
2087 	if (cache->dirty_bitset)
2088 		free_bitset(cache->dirty_bitset);
2089 
2090 	if (cache->discard_bitset)
2091 		free_bitset(cache->discard_bitset);
2092 
2093 	if (cache->copier)
2094 		dm_kcopyd_client_destroy(cache->copier);
2095 
2096 	if (cache->cmd)
2097 		dm_cache_metadata_close(cache->cmd);
2098 
2099 	if (cache->metadata_dev)
2100 		dm_put_device(cache->ti, cache->metadata_dev);
2101 
2102 	if (cache->origin_dev)
2103 		dm_put_device(cache->ti, cache->origin_dev);
2104 
2105 	if (cache->cache_dev)
2106 		dm_put_device(cache->ti, cache->cache_dev);
2107 
2108 	if (cache->policy)
2109 		dm_cache_policy_destroy(cache->policy);
2110 
2111 	for (i = 0; i < cache->nr_ctr_args ; i++)
2112 		kfree(cache->ctr_args[i]);
2113 	kfree(cache->ctr_args);
2114 
2115 	kfree(cache);
2116 }
2117 
2118 static void cache_dtr(struct dm_target *ti)
2119 {
2120 	struct cache *cache = ti->private;
2121 
2122 	destroy(cache);
2123 }
2124 
2125 static sector_t get_dev_size(struct dm_dev *dev)
2126 {
2127 	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
2128 }
2129 
2130 /*----------------------------------------------------------------*/
2131 
2132 /*
2133  * Construct a cache device mapping.
2134  *
2135  * cache <metadata dev> <cache dev> <origin dev> <block size>
2136  *       <#feature args> [<feature arg>]*
2137  *       <policy> <#policy args> [<policy arg>]*
2138  *
2139  * metadata dev    : fast device holding the persistent metadata
2140  * cache dev	   : fast device holding cached data blocks
2141  * origin dev	   : slow device holding original data blocks
2142  * block size	   : cache unit size in sectors
2143  *
2144  * #feature args   : number of feature arguments passed
2145  * feature args    : writethrough, passthrough or metadata2.  (The default is writeback.)
2146  *
2147  * policy	   : the replacement policy to use
2148  * #policy args    : an even number of policy arguments corresponding
2149  *		     to key/value pairs passed to the policy
2150  * policy args	   : key/value pairs passed to the policy
2151  *		     E.g. 'sequential_threshold 1024'
2152  *		     See cache-policies.txt for details.
2153  *
2154  * Optional feature arguments are:
2155  *   writethrough  : write through caching that prohibits cache block
2156  *		     content from being different from origin block content.
2157  *		     Without this argument, the default behaviour is to write
2158  *		     back cache block contents later for performance reasons,
2159  *		     so they may differ from the corresponding origin blocks.
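 *   passthrough   : bypass the cache; all io is remapped to the origin
 *		     device and any cache block that is written to is
 *		     invalidated.  The cache must contain no dirty blocks
 *		     before it can be activated in this mode.
 *   metadata2     : use version 2 of the on-disk metadata format.
 *
 * An illustrative table line (device names and sizes are hypothetical):
 *   cache /dev/mapper/fast-meta /dev/mapper/fast /dev/mapper/slow 512 1 writethrough smq 0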
2160  */
2161 struct cache_args {
2162 	struct dm_target *ti;
2163 
2164 	struct dm_dev *metadata_dev;
2165 
2166 	struct dm_dev *cache_dev;
2167 	sector_t cache_sectors;
2168 
2169 	struct dm_dev *origin_dev;
2170 	sector_t origin_sectors;
2171 
2172 	uint32_t block_size;
2173 
2174 	const char *policy_name;
2175 	int policy_argc;
2176 	const char **policy_argv;
2177 
2178 	struct cache_features features;
2179 };
2180 
2181 static void destroy_cache_args(struct cache_args *ca)
2182 {
2183 	if (ca->metadata_dev)
2184 		dm_put_device(ca->ti, ca->metadata_dev);
2185 
2186 	if (ca->cache_dev)
2187 		dm_put_device(ca->ti, ca->cache_dev);
2188 
2189 	if (ca->origin_dev)
2190 		dm_put_device(ca->ti, ca->origin_dev);
2191 
2192 	kfree(ca);
2193 }
2194 
2195 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
2196 {
2197 	if (!as->argc) {
2198 		*error = "Insufficient args";
2199 		return false;
2200 	}
2201 
2202 	return true;
2203 }
2204 
2205 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
2206 			      char **error)
2207 {
2208 	int r;
2209 	sector_t metadata_dev_size;
2210 	char b[BDEVNAME_SIZE];
2211 
2212 	if (!at_least_one_arg(as, error))
2213 		return -EINVAL;
2214 
2215 	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2216 			  &ca->metadata_dev);
2217 	if (r) {
2218 		*error = "Error opening metadata device";
2219 		return r;
2220 	}
2221 
2222 	metadata_dev_size = get_dev_size(ca->metadata_dev);
2223 	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
2224 		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2225 		       bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS_WARNING);
2226 
2227 	return 0;
2228 }
2229 
2230 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
2231 			   char **error)
2232 {
2233 	int r;
2234 
2235 	if (!at_least_one_arg(as, error))
2236 		return -EINVAL;
2237 
2238 	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2239 			  &ca->cache_dev);
2240 	if (r) {
2241 		*error = "Error opening cache device";
2242 		return r;
2243 	}
2244 	ca->cache_sectors = get_dev_size(ca->cache_dev);
2245 
2246 	return 0;
2247 }
2248 
2249 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
2250 			    char **error)
2251 {
2252 	int r;
2253 
2254 	if (!at_least_one_arg(as, error))
2255 		return -EINVAL;
2256 
2257 	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2258 			  &ca->origin_dev);
2259 	if (r) {
2260 		*error = "Error opening origin device";
2261 		return r;
2262 	}
2263 
2264 	ca->origin_sectors = get_dev_size(ca->origin_dev);
2265 	if (ca->ti->len > ca->origin_sectors) {
2266 		*error = "Device size larger than cached device";
2267 		return -EINVAL;
2268 	}
2269 
2270 	return 0;
2271 }
2272 
2273 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
2274 			    char **error)
2275 {
2276 	unsigned long block_size;
2277 
2278 	if (!at_least_one_arg(as, error))
2279 		return -EINVAL;
2280 
2281 	if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
2282 	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2283 	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2284 	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2285 		*error = "Invalid data block size";
2286 		return -EINVAL;
2287 	}
2288 
2289 	if (block_size > ca->cache_sectors) {
2290 		*error = "Data block size is larger than the cache device";
2291 		return -EINVAL;
2292 	}
2293 
2294 	ca->block_size = block_size;
2295 
2296 	return 0;
2297 }
2298 
2299 static void init_features(struct cache_features *cf)
2300 {
2301 	cf->mode = CM_WRITE;
2302 	cf->io_mode = CM_IO_WRITEBACK;
2303 	cf->metadata_version = 1;
2304 }
2305 
2306 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2307 			  char **error)
2308 {
2309 	static const struct dm_arg _args[] = {
2310 		{0, 2, "Invalid number of cache feature arguments"},
2311 	};
2312 
2313 	int r;
2314 	unsigned argc;
2315 	const char *arg;
2316 	struct cache_features *cf = &ca->features;
2317 
2318 	init_features(cf);
2319 
2320 	r = dm_read_arg_group(_args, as, &argc, error);
2321 	if (r)
2322 		return -EINVAL;
2323 
2324 	while (argc--) {
2325 		arg = dm_shift_arg(as);
2326 
2327 		if (!strcasecmp(arg, "writeback"))
2328 			cf->io_mode = CM_IO_WRITEBACK;
2329 
2330 		else if (!strcasecmp(arg, "writethrough"))
2331 			cf->io_mode = CM_IO_WRITETHROUGH;
2332 
2333 		else if (!strcasecmp(arg, "passthrough"))
2334 			cf->io_mode = CM_IO_PASSTHROUGH;
2335 
2336 		else if (!strcasecmp(arg, "metadata2"))
2337 			cf->metadata_version = 2;
2338 
2339 		else {
2340 			*error = "Unrecognised cache feature requested";
2341 			return -EINVAL;
2342 		}
2343 	}
2344 
2345 	return 0;
2346 }
2347 
2348 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
2349 			char **error)
2350 {
2351 	static const struct dm_arg _args[] = {
2352 		{0, 1024, "Invalid number of policy arguments"},
2353 	};
2354 
2355 	int r;
2356 
2357 	if (!at_least_one_arg(as, error))
2358 		return -EINVAL;
2359 
2360 	ca->policy_name = dm_shift_arg(as);
2361 
2362 	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
2363 	if (r)
2364 		return -EINVAL;
2365 
2366 	ca->policy_argv = (const char **)as->argv;
2367 	dm_consume_args(as, ca->policy_argc);
2368 
2369 	return 0;
2370 }
2371 
2372 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
2373 			    char **error)
2374 {
2375 	int r;
2376 	struct dm_arg_set as;
2377 
2378 	as.argc = argc;
2379 	as.argv = argv;
2380 
2381 	r = parse_metadata_dev(ca, &as, error);
2382 	if (r)
2383 		return r;
2384 
2385 	r = parse_cache_dev(ca, &as, error);
2386 	if (r)
2387 		return r;
2388 
2389 	r = parse_origin_dev(ca, &as, error);
2390 	if (r)
2391 		return r;
2392 
2393 	r = parse_block_size(ca, &as, error);
2394 	if (r)
2395 		return r;
2396 
2397 	r = parse_features(ca, &as, error);
2398 	if (r)
2399 		return r;
2400 
2401 	r = parse_policy(ca, &as, error);
2402 	if (r)
2403 		return r;
2404 
2405 	return 0;
2406 }
2407 
2408 /*----------------------------------------------------------------*/
2409 
2410 static struct kmem_cache *migration_cache;
2411 
2412 #define NOT_CORE_OPTION 1
2413 
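/*
 * Handle config keys owned by the cache core itself; currently just
 * migration_threshold (a sector count).  Returns NOT_CORE_OPTION for
 * anything else so the caller can hand the key to the policy instead.
 */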
2414 static int process_config_option(struct cache *cache, const char *key, const char *value)
2415 {
2416 	unsigned long tmp;
2417 
2418 	if (!strcasecmp(key, "migration_threshold")) {
2419 		if (kstrtoul(value, 10, &tmp))
2420 			return -EINVAL;
2421 
2422 		cache->migration_threshold = tmp;
2423 		return 0;
2424 	}
2425 
2426 	return NOT_CORE_OPTION;
2427 }
2428 
2429 static int set_config_value(struct cache *cache, const char *key, const char *value)
2430 {
2431 	int r = process_config_option(cache, key, value);
2432 
2433 	if (r == NOT_CORE_OPTION)
2434 		r = policy_set_config_value(cache->policy, key, value);
2435 
2436 	if (r)
2437 		DMWARN("bad config value for %s: %s", key, value);
2438 
2439 	return r;
2440 }
2441 
2442 static int set_config_values(struct cache *cache, int argc, const char **argv)
2443 {
2444 	int r = 0;
2445 
2446 	if (argc & 1) {
2447 		DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
2448 		return -EINVAL;
2449 	}
2450 
2451 	while (argc) {
2452 		r = set_config_value(cache, argv[0], argv[1]);
2453 		if (r)
2454 			break;
2455 
2456 		argc -= 2;
2457 		argv += 2;
2458 	}
2459 
2460 	return r;
2461 }
2462 
2463 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2464 			       char **error)
2465 {
2466 	struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
2467 							   cache->cache_size,
2468 							   cache->origin_sectors,
2469 							   cache->sectors_per_block);
2470 	if (IS_ERR(p)) {
2471 		*error = "Error creating cache's policy";
2472 		return PTR_ERR(p);
2473 	}
2474 	cache->policy = p;
2475 	BUG_ON(!cache->policy);
2476 
2477 	return 0;
2478 }
2479 
2480 /*
2481  * We want the discard block size to be at least as large as the cache
2482  * block size and have no more than 2^14 discard blocks across the origin.
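 *
 * e.g. (illustrative sizes): with 128 sector cache blocks and a 2^31 sector
 * (1TiB) origin, the discard block size doubles from 128 up to 131072
 * sectors before the origin spans no more than 2^14 discard blocks.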
2483  */
2484 #define MAX_DISCARD_BLOCKS (1 << 14)
2485 
2486 static bool too_many_discard_blocks(sector_t discard_block_size,
2487 				    sector_t origin_size)
2488 {
2489 	(void) sector_div(origin_size, discard_block_size);
2490 
2491 	return origin_size > MAX_DISCARD_BLOCKS;
2492 }
2493 
2494 static sector_t calculate_discard_block_size(sector_t cache_block_size,
2495 					     sector_t origin_size)
2496 {
2497 	sector_t discard_block_size = cache_block_size;
2498 
2499 	if (origin_size)
2500 		while (too_many_discard_blocks(discard_block_size, origin_size))
2501 			discard_block_size *= 2;
2502 
2503 	return discard_block_size;
2504 }
2505 
2506 static void set_cache_size(struct cache *cache, dm_cblock_t size)
2507 {
2508 	dm_block_t nr_blocks = from_cblock(size);
2509 
2510 	if (nr_blocks > (1 << 20) && cache->cache_size != size)
2511 		DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2512 			     "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2513 			     "Please consider increasing the cache block size to reduce the overall cache block count.",
2514 			     (unsigned long long) nr_blocks);
2515 
2516 	cache->cache_size = size;
2517 }
2518 
2519 static int is_congested(struct dm_dev *dev, int bdi_bits)
2520 {
2521 	struct request_queue *q = bdev_get_queue(dev->bdev);
2522 	return bdi_congested(q->backing_dev_info, bdi_bits);
2523 }
2524 
2525 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
2526 {
2527 	struct cache *cache = container_of(cb, struct cache, callbacks);
2528 
2529 	return is_congested(cache->origin_dev, bdi_bits) ||
2530 		is_congested(cache->cache_dev, bdi_bits);
2531 }
2532 
2533 #define DEFAULT_MIGRATION_THRESHOLD 2048
2534 
2535 static int cache_create(struct cache_args *ca, struct cache **result)
2536 {
2537 	int r = 0;
2538 	char **error = &ca->ti->error;
2539 	struct cache *cache;
2540 	struct dm_target *ti = ca->ti;
2541 	dm_block_t origin_blocks;
2542 	struct dm_cache_metadata *cmd;
2543 	bool may_format = ca->features.mode == CM_WRITE;
2544 
2545 	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
2546 	if (!cache)
2547 		return -ENOMEM;
2548 
2549 	cache->ti = ca->ti;
2550 	ti->private = cache;
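	/*
	 * Two bios are submitted per flush: req_nr 0 is remapped to the
	 * origin and req_nr 1 to the cache device (see process_flush_bio()).
	 */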
2551 	ti->num_flush_bios = 2;
2552 	ti->flush_supported = true;
2553 
2554 	ti->num_discard_bios = 1;
2555 	ti->discards_supported = true;
2556 	ti->split_discard_bios = false;
2557 
2558 	cache->features = ca->features;
2559 	ti->per_io_data_size = get_per_bio_data_size(cache);
2560 
2561 	cache->callbacks.congested_fn = cache_is_congested;
2562 	dm_table_add_target_callbacks(ti->table, &cache->callbacks);
2563 
2564 	cache->metadata_dev = ca->metadata_dev;
2565 	cache->origin_dev = ca->origin_dev;
2566 	cache->cache_dev = ca->cache_dev;
2567 
2568 	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2569 
2570 	origin_blocks = cache->origin_sectors = ca->origin_sectors;
2571 	origin_blocks = block_div(origin_blocks, ca->block_size);
2572 	cache->origin_blocks = to_oblock(origin_blocks);
2573 
2574 	cache->sectors_per_block = ca->block_size;
2575 	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
2576 		r = -EINVAL;
2577 		goto bad;
2578 	}
2579 
2580 	if (ca->block_size & (ca->block_size - 1)) {
2581 		dm_block_t cache_size = ca->cache_sectors;
2582 
2583 		cache->sectors_per_block_shift = -1;
2584 		cache_size = block_div(cache_size, ca->block_size);
2585 		set_cache_size(cache, to_cblock(cache_size));
2586 	} else {
2587 		cache->sectors_per_block_shift = __ffs(ca->block_size);
2588 		set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
2589 	}
2590 
2591 	r = create_cache_policy(cache, ca, error);
2592 	if (r)
2593 		goto bad;
2594 
2595 	cache->policy_nr_args = ca->policy_argc;
2596 	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2597 
2598 	r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
2599 	if (r) {
2600 		*error = "Error setting cache policy's config values";
2601 		goto bad;
2602 	}
2603 
2604 	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2605 				     ca->block_size, may_format,
2606 				     dm_cache_policy_get_hint_size(cache->policy),
2607 				     ca->features.metadata_version);
2608 	if (IS_ERR(cmd)) {
2609 		*error = "Error creating metadata object";
2610 		r = PTR_ERR(cmd);
2611 		goto bad;
2612 	}
2613 	cache->cmd = cmd;
2614 	set_cache_mode(cache, CM_WRITE);
2615 	if (get_cache_mode(cache) != CM_WRITE) {
2616 		*error = "Unable to get write access to metadata, please check/repair metadata.";
2617 		r = -EINVAL;
2618 		goto bad;
2619 	}
2620 
2621 	if (passthrough_mode(&cache->features)) {
2622 		bool all_clean;
2623 
2624 		r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2625 		if (r) {
2626 			*error = "dm_cache_metadata_all_clean() failed";
2627 			goto bad;
2628 		}
2629 
2630 		if (!all_clean) {
2631 			*error = "Cannot enter passthrough mode unless all blocks are clean";
2632 			r = -EINVAL;
2633 			goto bad;
2634 		}
2635 
2636 		policy_allow_migrations(cache->policy, false);
2637 	}
2638 
2639 	spin_lock_init(&cache->lock);
2640 	INIT_LIST_HEAD(&cache->deferred_cells);
2641 	bio_list_init(&cache->deferred_bios);
2642 	bio_list_init(&cache->deferred_writethrough_bios);
2643 	atomic_set(&cache->nr_allocated_migrations, 0);
2644 	atomic_set(&cache->nr_io_migrations, 0);
2645 	init_waitqueue_head(&cache->migration_wait);
2646 
2647 	r = -ENOMEM;
2648 	atomic_set(&cache->nr_dirty, 0);
2649 	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2650 	if (!cache->dirty_bitset) {
2651 		*error = "could not allocate dirty bitset";
2652 		goto bad;
2653 	}
2654 	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2655 
2656 	cache->discard_block_size =
2657 		calculate_discard_block_size(cache->sectors_per_block,
2658 					     cache->origin_sectors);
2659 	cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2660 							      cache->discard_block_size));
2661 	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2662 	if (!cache->discard_bitset) {
2663 		*error = "could not allocate discard bitset";
2664 		goto bad;
2665 	}
2666 	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2667 
2668 	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2669 	if (IS_ERR(cache->copier)) {
2670 		*error = "could not create kcopyd client";
2671 		r = PTR_ERR(cache->copier);
2672 		goto bad;
2673 	}
2674 
2675 	cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
2676 	if (!cache->wq) {
2677 		*error = "could not create workqueue for metadata object";
2678 		goto bad;
2679 	}
2680 	INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
2681 	INIT_WORK(&cache->deferred_writethrough_worker,
2682 		  process_deferred_writethrough_bios);
2683 	INIT_WORK(&cache->migration_worker, check_migrations);
2684 	INIT_DELAYED_WORK(&cache->waker, do_waker);
2685 
2686 	cache->prison = dm_bio_prison_create_v2(cache->wq);
2687 	if (!cache->prison) {
2688 		*error = "could not create bio prison";
2689 		goto bad;
2690 	}
2691 
2692 	cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2693 							 migration_cache);
2694 	if (!cache->migration_pool) {
2695 		*error = "Error creating cache's migration mempool";
2696 		goto bad;
2697 	}
2698 
2699 	cache->need_tick_bio = true;
2700 	cache->sized = false;
2701 	cache->invalidate = false;
2702 	cache->commit_requested = false;
2703 	cache->loaded_mappings = false;
2704 	cache->loaded_discards = false;
2705 
2706 	load_stats(cache);
2707 
2708 	atomic_set(&cache->stats.demotion, 0);
2709 	atomic_set(&cache->stats.promotion, 0);
2710 	atomic_set(&cache->stats.copies_avoided, 0);
2711 	atomic_set(&cache->stats.cache_cell_clash, 0);
2712 	atomic_set(&cache->stats.commit_count, 0);
2713 	atomic_set(&cache->stats.discard_count, 0);
2714 
2715 	spin_lock_init(&cache->invalidation_lock);
2716 	INIT_LIST_HEAD(&cache->invalidation_requests);
2717 
2718 	batcher_init(&cache->committer, commit_op, cache,
2719 		     issue_op, cache, cache->wq);
2720 	iot_init(&cache->tracker);
2721 
2722 	init_rwsem(&cache->background_work_lock);
2723 	prevent_background_work(cache);
2724 
2725 	*result = cache;
2726 	return 0;
2727 bad:
2728 	destroy(cache);
2729 	return r;
2730 }
2731 
2732 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2733 {
2734 	unsigned i;
2735 	const char **copy;
2736 
2737 	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2738 	if (!copy)
2739 		return -ENOMEM;
2740 	for (i = 0; i < argc; i++) {
2741 		copy[i] = kstrdup(argv[i], GFP_KERNEL);
2742 		if (!copy[i]) {
2743 			while (i--)
2744 				kfree(copy[i]);
2745 			kfree(copy);
2746 			return -ENOMEM;
2747 		}
2748 	}
2749 
2750 	cache->nr_ctr_args = argc;
2751 	cache->ctr_args = copy;
2752 
2753 	return 0;
2754 }
2755 
2756 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2757 {
2758 	int r = -EINVAL;
2759 	struct cache_args *ca;
2760 	struct cache *cache = NULL;
2761 
2762 	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2763 	if (!ca) {
2764 		ti->error = "Error allocating memory for cache";
2765 		return -ENOMEM;
2766 	}
2767 	ca->ti = ti;
2768 
2769 	r = parse_cache_args(ca, argc, argv, &ti->error);
2770 	if (r)
2771 		goto out;
2772 
2773 	r = cache_create(ca, &cache);
2774 	if (r)
2775 		goto out;
2776 
2777 	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
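	/*
	 * Keep the ctr args from <block size> onwards; cache_status() re-emits
	 * the three device paths itself from the dm_dev handles.
	 */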
2778 	if (r) {
2779 		destroy(cache);
2780 		goto out;
2781 	}
2782 
2783 	ti->private = cache;
2784 out:
2785 	destroy_cache_args(ca);
2786 	return r;
2787 }
2788 
2789 /*----------------------------------------------------------------*/
2790 
2791 static int cache_map(struct dm_target *ti, struct bio *bio)
2792 {
2793 	struct cache *cache = ti->private;
2794 
2795 	int r;
2796 	bool commit_needed;
2797 	dm_oblock_t block = get_bio_block(cache, bio);
2798 	size_t pb_data_size = get_per_bio_data_size(cache);
2799 
2800 	init_per_bio_data(bio, pb_data_size);
2801 	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2802 		/*
2803 		 * This can only occur if the io goes to a partial block at
2804 		 * the end of the origin device.  We don't cache these.
2805 		 * Just remap to the origin and carry on.
2806 		 */
2807 		remap_to_origin(cache, bio);
2808 		accounted_begin(cache, bio);
2809 		return DM_MAPIO_REMAPPED;
2810 	}
2811 
2812 	if (discard_or_flush(bio)) {
2813 		defer_bio(cache, bio);
2814 		return DM_MAPIO_SUBMITTED;
2815 	}
2816 
2817 	r = map_bio(cache, bio, block, &commit_needed);
2818 	if (commit_needed)
2819 		schedule_commit(&cache->committer);
2820 
2821 	return r;
2822 }
2823 
2824 static int cache_end_io(struct dm_target *ti, struct bio *bio,
2825 		blk_status_t *error)
2826 {
2827 	struct cache *cache = ti->private;
2828 	unsigned long flags;
2829 	size_t pb_data_size = get_per_bio_data_size(cache);
2830 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
2831 
2832 	if (pb->tick) {
2833 		policy_tick(cache->policy, false);
2834 
2835 		spin_lock_irqsave(&cache->lock, flags);
2836 		cache->need_tick_bio = true;
2837 		spin_unlock_irqrestore(&cache->lock, flags);
2838 	}
2839 
2840 	bio_drop_shared_lock(cache, bio);
2841 	accounted_complete(cache, bio);
2842 
2843 	return DM_ENDIO_DONE;
2844 }
2845 
2846 static int write_dirty_bitset(struct cache *cache)
2847 {
2848 	int r;
2849 
2850 	if (get_cache_mode(cache) >= CM_READ_ONLY)
2851 		return -EINVAL;
2852 
2853 	r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
2854 	if (r)
2855 		metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
2856 
2857 	return r;
2858 }
2859 
2860 static int write_discard_bitset(struct cache *cache)
2861 {
2862 	unsigned i, r;
2863 
2864 	if (get_cache_mode(cache) >= CM_READ_ONLY)
2865 		return -EINVAL;
2866 
2867 	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2868 					   cache->discard_nr_blocks);
2869 	if (r) {
2870 		DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
2871 		metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
2872 		return r;
2873 	}
2874 
2875 	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2876 		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2877 					 is_discarded(cache, to_dblock(i)));
2878 		if (r) {
2879 			metadata_operation_failed(cache, "dm_cache_set_discard", r);
2880 			return r;
2881 		}
2882 	}
2883 
2884 	return 0;
2885 }
2886 
2887 static int write_hints(struct cache *cache)
2888 {
2889 	int r;
2890 
2891 	if (get_cache_mode(cache) >= CM_READ_ONLY)
2892 		return -EINVAL;
2893 
2894 	r = dm_cache_write_hints(cache->cmd, cache->policy);
2895 	if (r) {
2896 		metadata_operation_failed(cache, "dm_cache_write_hints", r);
2897 		return r;
2898 	}
2899 
2900 	return 0;
2901 }
2902 
2903 /*
2904  * returns true on success
2905  */
2906 static bool sync_metadata(struct cache *cache)
2907 {
2908 	int r1, r2, r3, r4;
2909 
2910 	r1 = write_dirty_bitset(cache);
2911 	if (r1)
2912 		DMERR("%s: could not write dirty bitset", cache_device_name(cache));
2913 
2914 	r2 = write_discard_bitset(cache);
2915 	if (r2)
2916 		DMERR("%s: could not write discard bitset", cache_device_name(cache));
2917 
2918 	save_stats(cache);
2919 
2920 	r3 = write_hints(cache);
2921 	if (r3)
2922 		DMERR("%s: could not write hints", cache_device_name(cache));
2923 
2924 	/*
2925 	 * If writing the above metadata failed, we still commit, but don't
2926 	 * set the clean shutdown flag.  This will effectively force every
2927 	 * dirty bit to be set on reload.
2928 	 */
2929 	r4 = commit(cache, !r1 && !r2 && !r3);
2930 	if (r4)
2931 		DMERR("%s: could not write cache metadata", cache_device_name(cache));
2932 
2933 	return !r1 && !r2 && !r3 && !r4;
2934 }
2935 
2936 static void cache_postsuspend(struct dm_target *ti)
2937 {
2938 	struct cache *cache = ti->private;
2939 
2940 	prevent_background_work(cache);
2941 	BUG_ON(atomic_read(&cache->nr_io_migrations));
2942 
2943 	cancel_delayed_work(&cache->waker);
2944 	flush_workqueue(cache->wq);
2945 	WARN_ON(cache->tracker.in_flight);
2946 
2947 	/*
2948 	 * If it's a flush suspend there won't be any deferred bios, so this
2949 	 * call is harmless.
2950 	 */
2951 	requeue_deferred_bios(cache);
2952 
2953 	if (get_cache_mode(cache) == CM_WRITE)
2954 		(void) sync_metadata(cache);
2955 }
2956 
2957 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2958 			bool dirty, uint32_t hint, bool hint_valid)
2959 {
2960 	int r;
2961 	struct cache *cache = context;
2962 
2963 	if (dirty) {
2964 		set_bit(from_cblock(cblock), cache->dirty_bitset);
2965 		atomic_inc(&cache->nr_dirty);
2966 	} else
2967 		clear_bit(from_cblock(cblock), cache->dirty_bitset);
2968 
2969 	r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
2970 	if (r)
2971 		return r;
2972 
2973 	return 0;
2974 }
2975 
2976 /*
2977  * The discard block size in the on disk metadata is not
2978  * necessarily the same as we're currently using.  So we have to
2979  * be careful to only set the discarded attribute if we know it
2980  * covers a complete block of the new size.
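 *
 * e.g. (illustrative numbers): if discards were recorded with a 512 sector
 * dblock size covering dblocks 3-6 (sectors 1536-3583), and the current
 * discard_block_size is 1024 sectors, only the fully covered dblock 2
 * (sectors 2048-3071) is marked discarded.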
2981  */
2982 struct discard_load_info {
2983 	struct cache *cache;
2984 
2985 	/*
2986 	 * These blocks are sized using the on disk dblock size, rather
2987 	 * than the current one.
2988 	 */
2989 	dm_block_t block_size;
2990 	dm_block_t discard_begin, discard_end;
2991 };
2992 
2993 static void discard_load_info_init(struct cache *cache,
2994 				   struct discard_load_info *li)
2995 {
2996 	li->cache = cache;
2997 	li->discard_begin = li->discard_end = 0;
2998 }
2999 
3000 static void set_discard_range(struct discard_load_info *li)
3001 {
3002 	sector_t b, e;
3003 
3004 	if (li->discard_begin == li->discard_end)
3005 		return;
3006 
3007 	/*
3008 	 * Convert to sectors.
3009 	 */
3010 	b = li->discard_begin * li->block_size;
3011 	e = li->discard_end * li->block_size;
3012 
3013 	/*
3014 	 * Then convert back to the current dblock size.
3015 	 */
3016 	b = dm_sector_div_up(b, li->cache->discard_block_size);
3017 	sector_div(e, li->cache->discard_block_size);
3018 
3019 	/*
3020 	 * The origin may have shrunk, so we need to check we're still in
3021 	 * bounds.
3022 	 */
3023 	if (e > from_dblock(li->cache->discard_nr_blocks))
3024 		e = from_dblock(li->cache->discard_nr_blocks);
3025 
3026 	for (; b < e; b++)
3027 		set_discard(li->cache, to_dblock(b));
3028 }
3029 
3030 static int load_discard(void *context, sector_t discard_block_size,
3031 			dm_dblock_t dblock, bool discard)
3032 {
3033 	struct discard_load_info *li = context;
3034 
3035 	li->block_size = discard_block_size;
3036 
3037 	if (discard) {
3038 		if (from_dblock(dblock) == li->discard_end)
3039 			/*
3040 			 * We're already in a discard range, just extend it.
3041 			 */
3042 			li->discard_end = li->discard_end + 1ULL;
3043 
3044 		else {
3045 			/*
3046 			 * Emit the old range and start a new one.
3047 			 */
3048 			set_discard_range(li);
3049 			li->discard_begin = from_dblock(dblock);
3050 			li->discard_end = li->discard_begin + 1ULL;
3051 		}
3052 	} else {
3053 		set_discard_range(li);
3054 		li->discard_begin = li->discard_end = 0;
3055 	}
3056 
3057 	return 0;
3058 }
3059 
3060 static dm_cblock_t get_cache_dev_size(struct cache *cache)
3061 {
3062 	sector_t size = get_dev_size(cache->cache_dev);
3063 	(void) sector_div(size, cache->sectors_per_block);
3064 	return to_cblock(size);
3065 }
3066 
3067 static bool can_resize(struct cache *cache, dm_cblock_t new_size)
3068 {
3069 	if (from_cblock(new_size) > from_cblock(cache->cache_size))
3070 		return true;
3071 
3072 	/*
3073 	 * We can't drop a dirty block when shrinking the cache.
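	 * A dirty block still holds data that has not been written back to
	 * the origin, so dropping its mapping would lose those writes.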
3074 	 */
3075 	while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
3076 		new_size = to_cblock(from_cblock(new_size) + 1);
3077 		if (is_dirty(cache, new_size)) {
3078 			DMERR("%s: unable to shrink cache; cache block %llu is dirty",
3079 			      cache_device_name(cache),
3080 			      (unsigned long long) from_cblock(new_size));
3081 			return false;
3082 		}
3083 	}
3084 
3085 	return true;
3086 }
3087 
3088 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
3089 {
3090 	int r;
3091 
3092 	r = dm_cache_resize(cache->cmd, new_size);
3093 	if (r) {
3094 		DMERR("%s: could not resize cache metadata", cache_device_name(cache));
3095 		metadata_operation_failed(cache, "dm_cache_resize", r);
3096 		return r;
3097 	}
3098 
3099 	set_cache_size(cache, new_size);
3100 
3101 	return 0;
3102 }
3103 
3104 static int cache_preresume(struct dm_target *ti)
3105 {
3106 	int r = 0;
3107 	struct cache *cache = ti->private;
3108 	dm_cblock_t csize = get_cache_dev_size(cache);
3109 
3110 	/*
3111 	 * Check to see if the cache has resized.
3112 	 */
3113 	if (!cache->sized) {
3114 		r = resize_cache_dev(cache, csize);
3115 		if (r)
3116 			return r;
3117 
3118 		cache->sized = true;
3119 
3120 	} else if (csize != cache->cache_size) {
3121 		if (!can_resize(cache, csize))
3122 			return -EINVAL;
3123 
3124 		r = resize_cache_dev(cache, csize);
3125 		if (r)
3126 			return r;
3127 	}
3128 
3129 	if (!cache->loaded_mappings) {
3130 		r = dm_cache_load_mappings(cache->cmd, cache->policy,
3131 					   load_mapping, cache);
3132 		if (r) {
3133 			DMERR("%s: could not load cache mappings", cache_device_name(cache));
3134 			metadata_operation_failed(cache, "dm_cache_load_mappings", r);
3135 			return r;
3136 		}
3137 
3138 		cache->loaded_mappings = true;
3139 	}
3140 
3141 	if (!cache->loaded_discards) {
3142 		struct discard_load_info li;
3143 
3144 		/*
3145 		 * The discard bitset could have been resized, or the
3146 		 * discard block size changed.  To be safe we start by
3147 		 * setting every dblock to not discarded.
3148 		 */
3149 		clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
3150 
3151 		discard_load_info_init(cache, &li);
3152 		r = dm_cache_load_discards(cache->cmd, load_discard, &li);
3153 		if (r) {
3154 			DMERR("%s: could not load origin discards", cache_device_name(cache));
3155 			metadata_operation_failed(cache, "dm_cache_load_discards", r);
3156 			return r;
3157 		}
3158 		set_discard_range(&li);
3159 
3160 		cache->loaded_discards = true;
3161 	}
3162 
3163 	return r;
3164 }
3165 
3166 static void cache_resume(struct dm_target *ti)
3167 {
3168 	struct cache *cache = ti->private;
3169 
3170 	cache->need_tick_bio = true;
3171 	allow_background_work(cache);
3172 	do_waker(&cache->waker.work);
3173 }
3174 
3175 /*
3176  * Status format:
3177  *
3178  * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
3179  * <cache block size> <#used cache blocks>/<#total cache blocks>
3180  * <#read hits> <#read misses> <#write hits> <#write misses>
3181  * <#demotions> <#promotions> <#dirty>
3182  * <#features> <features>*
3183  * <#core args> <core args>
3184  * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
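 *
 * e.g. (illustrative values only):
 *   8 72/1024 128 543/16384 29 126 512 120 13 27 33 1 writeback 2 migration_threshold 2048 smq 0 rw -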
3185  */
3186 static void cache_status(struct dm_target *ti, status_type_t type,
3187 			 unsigned status_flags, char *result, unsigned maxlen)
3188 {
3189 	int r = 0;
3190 	unsigned i;
3191 	ssize_t sz = 0;
3192 	dm_block_t nr_free_blocks_metadata = 0;
3193 	dm_block_t nr_blocks_metadata = 0;
3194 	char buf[BDEVNAME_SIZE];
3195 	struct cache *cache = ti->private;
3196 	dm_cblock_t residency;
3197 	bool needs_check;
3198 
3199 	switch (type) {
3200 	case STATUSTYPE_INFO:
3201 		if (get_cache_mode(cache) == CM_FAIL) {
3202 			DMEMIT("Fail");
3203 			break;
3204 		}
3205 
3206 		/* Commit to ensure statistics aren't out-of-date */
3207 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3208 			(void) commit(cache, false);
3209 
3210 		r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
3211 		if (r) {
3212 			DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
3213 			      cache_device_name(cache), r);
3214 			goto err;
3215 		}
3216 
3217 		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
3218 		if (r) {
3219 			DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
3220 			      cache_device_name(cache), r);
3221 			goto err;
3222 		}
3223 
3224 		residency = policy_residency(cache->policy);
3225 
3226 		DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
3227 		       (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
3228 		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3229 		       (unsigned long long)nr_blocks_metadata,
3230 		       (unsigned long long)cache->sectors_per_block,
3231 		       (unsigned long long) from_cblock(residency),
3232 		       (unsigned long long) from_cblock(cache->cache_size),
3233 		       (unsigned) atomic_read(&cache->stats.read_hit),
3234 		       (unsigned) atomic_read(&cache->stats.read_miss),
3235 		       (unsigned) atomic_read(&cache->stats.write_hit),
3236 		       (unsigned) atomic_read(&cache->stats.write_miss),
3237 		       (unsigned) atomic_read(&cache->stats.demotion),
3238 		       (unsigned) atomic_read(&cache->stats.promotion),
3239 		       (unsigned long) atomic_read(&cache->nr_dirty));
3240 
3241 		if (cache->features.metadata_version == 2)
3242 			DMEMIT("2 metadata2 ");
3243 		else
3244 			DMEMIT("1 ");
3245 
3246 		if (writethrough_mode(&cache->features))
3247 			DMEMIT("writethrough ");
3248 
3249 		else if (passthrough_mode(&cache->features))
3250 			DMEMIT("passthrough ");
3251 
3252 		else if (writeback_mode(&cache->features))
3253 			DMEMIT("writeback ");
3254 
3255 		else {
3256 			DMERR("%s: internal error: unknown io mode: %d",
3257 			      cache_device_name(cache), (int) cache->features.io_mode);
3258 			goto err;
3259 		}
3260 
3261 		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
3262 
3263 		DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
3264 		if (sz < maxlen) {
3265 			r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
3266 			if (r)
3267 				DMERR("%s: policy_emit_config_values returned %d",
3268 				      cache_device_name(cache), r);
3269 		}
3270 
3271 		if (get_cache_mode(cache) == CM_READ_ONLY)
3272 			DMEMIT("ro ");
3273 		else
3274 			DMEMIT("rw ");
3275 
3276 		r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);
3277 
3278 		if (r || needs_check)
3279 			DMEMIT("needs_check ");
3280 		else
3281 			DMEMIT("- ");
3282 
3283 		break;
3284 
3285 	case STATUSTYPE_TABLE:
3286 		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3287 		DMEMIT("%s ", buf);
3288 		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3289 		DMEMIT("%s ", buf);
3290 		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3291 		DMEMIT("%s", buf);
3292 
3293 		for (i = 0; i < cache->nr_ctr_args - 1; i++)
3294 			DMEMIT(" %s", cache->ctr_args[i]);
3295 		if (cache->nr_ctr_args)
3296 			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
3297 	}
3298 
3299 	return;
3300 
3301 err:
3302 	DMEMIT("Error");
3303 }
3304 
3305 /*
3306  * Defines a range of cblocks, begin to (end - 1) are in the range.  end is
3307  * the one-past-the-end value.
3308  */
3309 struct cblock_range {
3310 	dm_cblock_t begin;
3311 	dm_cblock_t end;
3312 };
3313 
3314 /*
3315  * A cache block range can take two forms:
3316  *
3317  * i) A single cblock, eg. '3456'
3318  * ii) A begin and end cblock with a dash between, eg. 123-234
3319  */
3320 static int parse_cblock_range(struct cache *cache, const char *str,
3321 			      struct cblock_range *result)
3322 {
3323 	char dummy;
3324 	uint64_t b, e;
3325 	int r;
3326 
3327 	/*
3328 	 * Try and parse form (ii) first.
3329 	 */
3330 	r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
3331 	if (r < 0)
3332 		return r;
3333 
3334 	if (r == 2) {
3335 		result->begin = to_cblock(b);
3336 		result->end = to_cblock(e);
3337 		return 0;
3338 	}
3339 
3340 	/*
3341 	 * That didn't work, try form (i).
3342 	 */
3343 	r = sscanf(str, "%llu%c", &b, &dummy);
3344 	if (r < 0)
3345 		return r;
3346 
3347 	if (r == 1) {
3348 		result->begin = to_cblock(b);
3349 		result->end = to_cblock(from_cblock(result->begin) + 1u);
3350 		return 0;
3351 	}
3352 
3353 	DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
3354 	return -EINVAL;
3355 }
3356 
3357 static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
3358 {
3359 	uint64_t b = from_cblock(range->begin);
3360 	uint64_t e = from_cblock(range->end);
3361 	uint64_t n = from_cblock(cache->cache_size);
3362 
3363 	if (b >= n) {
3364 		DMERR("%s: begin cblock out of range: %llu >= %llu",
3365 		      cache_device_name(cache), b, n);
3366 		return -EINVAL;
3367 	}
3368 
3369 	if (e > n) {
3370 		DMERR("%s: end cblock out of range: %llu > %llu",
3371 		      cache_device_name(cache), e, n);
3372 		return -EINVAL;
3373 	}
3374 
3375 	if (b >= e) {
3376 		DMERR("%s: invalid cblock range: %llu >= %llu",
3377 		      cache_device_name(cache), b, e);
3378 		return -EINVAL;
3379 	}
3380 
3381 	return 0;
3382 }
3383 
3384 static inline dm_cblock_t cblock_succ(dm_cblock_t b)
3385 {
3386 	return to_cblock(from_cblock(b) + 1);
3387 }
3388 
3389 static int request_invalidation(struct cache *cache, struct cblock_range *range)
3390 {
3391 	int r = 0;
3392 
3393 	/*
3394 	 * We don't need to do any locking here because we know we're in
3395 	 * passthrough mode.  There is the potential for a race between an
3396 	 * invalidation triggered by an io and an invalidation message.  This
3397 	 * is harmless; we need not worry if the policy call fails.
3398 	 */
3399 	while (range->begin != range->end) {
3400 		r = invalidate_cblock(cache, range->begin);
3401 		if (r)
3402 			return r;
3403 
3404 		range->begin = cblock_succ(range->begin);
3405 	}
3406 
3407 	cache->commit_requested = true;
3408 	return r;
3409 }
3410 
3411 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
3412 					      const char **cblock_ranges)
3413 {
3414 	int r = 0;
3415 	unsigned i;
3416 	struct cblock_range range;
3417 
3418 	if (!passthrough_mode(&cache->features)) {
3419 		DMERR("%s: cache has to be in passthrough mode for invalidation",
3420 		      cache_device_name(cache));
3421 		return -EPERM;
3422 	}
3423 
3424 	for (i = 0; i < count; i++) {
3425 		r = parse_cblock_range(cache, cblock_ranges[i], &range);
3426 		if (r)
3427 			break;
3428 
3429 		r = validate_cblock_range(cache, &range);
3430 		if (r)
3431 			break;
3432 
3433 		/*
3434 		 * Pass begin and end origin blocks to the worker and wake it.
3435 		 */
3436 		r = request_invalidation(cache, &range);
3437 		if (r)
3438 			break;
3439 	}
3440 
3441 	return r;
3442 }
3443 
3444 /*
3445  * Supports
3446  *	"<key> <value>"
3447  * and
3448  *     "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
3449  *
3450  * The key migration_threshold is supported by the cache target core.
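 *
 * e.g. (device name illustrative):
 *     dmsetup message my-cache 0 invalidate_cblocks 2345 3400-3600
 *     dmsetup message my-cache 0 migration_threshold 4096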
3451  */
3452 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
3453 {
3454 	struct cache *cache = ti->private;
3455 
3456 	if (!argc)
3457 		return -EINVAL;
3458 
3459 	if (get_cache_mode(cache) >= CM_READ_ONLY) {
3460 		DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
3461 		      cache_device_name(cache));
3462 		return -EOPNOTSUPP;
3463 	}
3464 
3465 	if (!strcasecmp(argv[0], "invalidate_cblocks"))
3466 		return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3467 
3468 	if (argc != 2)
3469 		return -EINVAL;
3470 
3471 	return set_config_value(cache, argv[0], argv[1]);
3472 }
3473 
3474 static int cache_iterate_devices(struct dm_target *ti,
3475 				 iterate_devices_callout_fn fn, void *data)
3476 {
3477 	int r = 0;
3478 	struct cache *cache = ti->private;
3479 
3480 	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
3481 	if (!r)
3482 		r = fn(ti, cache->origin_dev, 0, ti->len, data);
3483 
3484 	return r;
3485 }
3486 
3487 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3488 {
3489 	/*
3490 	 * FIXME: these limits may be incompatible with the cache device
3491 	 */
3492 	limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
3493 					    cache->origin_sectors);
3494 	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3495 }
3496 
3497 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3498 {
3499 	struct cache *cache = ti->private;
3500 	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3501 
3502 	/*
3503 	 * If the system-determined stacked limits are compatible with the
3504 	 * cache's blocksize (io_opt is a factor) do not override them.
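	 * e.g. (illustrative): with 512 sector cache blocks an io_opt of 2048
	 * sectors is a multiple and is left alone, whereas 768 sectors would
	 * be overridden.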
3505 	 */
3506 	if (io_opt_sectors < cache->sectors_per_block ||
3507 	    do_div(io_opt_sectors, cache->sectors_per_block)) {
3508 		blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
3509 		blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
3510 	}
3511 	set_discard_limits(cache, limits);
3512 }
3513 
3514 /*----------------------------------------------------------------*/
3515 
3516 static struct target_type cache_target = {
3517 	.name = "cache",
3518 	.version = {2, 0, 0},
3519 	.module = THIS_MODULE,
3520 	.ctr = cache_ctr,
3521 	.dtr = cache_dtr,
3522 	.map = cache_map,
3523 	.end_io = cache_end_io,
3524 	.postsuspend = cache_postsuspend,
3525 	.preresume = cache_preresume,
3526 	.resume = cache_resume,
3527 	.status = cache_status,
3528 	.message = cache_message,
3529 	.iterate_devices = cache_iterate_devices,
3530 	.io_hints = cache_io_hints,
3531 };
3532 
3533 static int __init dm_cache_init(void)
3534 {
3535 	int r;
3536 
3537 	r = dm_register_target(&cache_target);
3538 	if (r) {
3539 		DMERR("cache target registration failed: %d", r);
3540 		return r;
3541 	}
3542 
3543 	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
3544 	if (!migration_cache) {
3545 		dm_unregister_target(&cache_target);
3546 		return -ENOMEM;
3547 	}
3548 
3549 	return 0;
3550 }
3551 
3552 static void __exit dm_cache_exit(void)
3553 {
3554 	dm_unregister_target(&cache_target);
3555 	kmem_cache_destroy(migration_cache);
3556 }
3557 
3558 module_init(dm_cache_init);
3559 module_exit(dm_cache_exit);
3560 
3561 MODULE_DESCRIPTION(DM_NAME " cache target");
3562 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
3563 MODULE_LICENSE("GPL");
3564