xref: /openbmc/linux/drivers/md/dm-cache-target.c (revision 840ef8b7cc584a23c4f9d05352f4dbaf8e56e5ab)
1 /*
2  * Copyright (C) 2012 Red Hat. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6 
7 #include "dm.h"
8 #include "dm-bio-prison.h"
9 #include "dm-cache-metadata.h"
10 
11 #include <linux/dm-io.h>
12 #include <linux/dm-kcopyd.h>
13 #include <linux/init.h>
14 #include <linux/mempool.h>
15 #include <linux/module.h>
16 #include <linux/slab.h>
17 #include <linux/vmalloc.h>
18 
19 #define DM_MSG_PREFIX "cache"
20 
21 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
22 	"A percentage of time allocated for copying to and/or from cache");
23 
24 /*----------------------------------------------------------------*/
25 
26 /*
27  * Glossary:
28  *
29  * oblock: index of an origin block
30  * cblock: index of a cache block
31  * promotion: movement of a block from origin to cache
32  * demotion: movement of a block from cache to origin
33  * migration: movement of a block between the origin and cache device,
34  *	      either direction
35  */
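
/*
 * For example, assuming a 64KB (128 sector) block size: oblock 10 covers
 * origin sectors 1280-1407, and promoting it to cblock 3 copies that data
 * to sectors 384-511 of the cache device.
 */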
36 
37 /*----------------------------------------------------------------*/
38 
39 static size_t bitset_size_in_bytes(unsigned nr_entries)
40 {
41 	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
42 }
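
/*
 * For example, with 64-bit longs bitset_size_in_bytes(10240) is
 * dm_div_up(10240, 64) * 8 = 160 * 8 = 1280 bytes.
 */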
43 
44 static unsigned long *alloc_bitset(unsigned nr_entries)
45 {
46 	size_t s = bitset_size_in_bytes(nr_entries);
47 	return vzalloc(s);
48 }
49 
50 static void clear_bitset(void *bitset, unsigned nr_entries)
51 {
52 	size_t s = bitset_size_in_bytes(nr_entries);
53 	memset(bitset, 0, s);
54 }
55 
56 static void free_bitset(unsigned long *bits)
57 {
58 	vfree(bits);
59 }
60 
61 /*----------------------------------------------------------------*/
62 
63 #define PRISON_CELLS 1024
64 #define MIGRATION_POOL_SIZE 128
65 #define COMMIT_PERIOD HZ
66 #define MIGRATION_COUNT_WINDOW 10
67 
68 /*
69  * The block size of the device holding cache data must be >= 32KB
70  */
71 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
72 
73 /*
74  * FIXME: the cache is read/write for the time being.
75  */
76 enum cache_mode {
77 	CM_WRITE,		/* metadata may be changed */
78 	CM_READ_ONLY,		/* metadata may not be changed */
79 };
80 
81 struct cache_features {
82 	enum cache_mode mode;
83 	bool write_through:1;
84 };
85 
86 struct cache_stats {
87 	atomic_t read_hit;
88 	atomic_t read_miss;
89 	atomic_t write_hit;
90 	atomic_t write_miss;
91 	atomic_t demotion;
92 	atomic_t promotion;
93 	atomic_t copies_avoided;
94 	atomic_t cache_cell_clash;
95 	atomic_t commit_count;
96 	atomic_t discard_count;
97 };
98 
99 struct cache {
100 	struct dm_target *ti;
101 	struct dm_target_callbacks callbacks;
102 
103 	/*
104 	 * Metadata is written to this device.
105 	 */
106 	struct dm_dev *metadata_dev;
107 
108 	/*
109 	 * The slower of the two data devices.  Typically a spindle.
110 	 */
111 	struct dm_dev *origin_dev;
112 
113 	/*
114 	 * The faster of the two data devices.  Typically an SSD.
115 	 */
116 	struct dm_dev *cache_dev;
117 
118 	/*
119 	 * Cache features such as write-through.
120 	 */
121 	struct cache_features features;
122 
123 	/*
124 	 * Size of the origin device in _complete_ blocks and native sectors.
125 	 */
126 	dm_oblock_t origin_blocks;
127 	sector_t origin_sectors;
128 
129 	/*
130 	 * Size of the cache device in blocks.
131 	 */
132 	dm_cblock_t cache_size;
133 
134 	/*
135 	 * Fields for converting from sectors to blocks.
136 	 */
137 	uint32_t sectors_per_block;
138 	int sectors_per_block_shift;
139 
140 	struct dm_cache_metadata *cmd;
141 
142 	spinlock_t lock;
143 	struct bio_list deferred_bios;
144 	struct bio_list deferred_flush_bios;
145 	struct list_head quiesced_migrations;
146 	struct list_head completed_migrations;
147 	struct list_head need_commit_migrations;
148 	sector_t migration_threshold;
149 	atomic_t nr_migrations;
150 	wait_queue_head_t migration_wait;
151 
152 	/*
153 	 * cache_size entries, dirty if set
154 	 */
155 	dm_cblock_t nr_dirty;
156 	unsigned long *dirty_bitset;
157 
158 	/*
159 	 * origin_blocks entries, discarded if set.
160 	 */
161 	sector_t discard_block_size; /* a power of 2 times sectors per block */
162 	dm_dblock_t discard_nr_blocks;
163 	unsigned long *discard_bitset;
164 
165 	struct dm_kcopyd_client *copier;
166 	struct workqueue_struct *wq;
167 	struct work_struct worker;
168 
169 	struct delayed_work waker;
170 	unsigned long last_commit_jiffies;
171 
172 	struct dm_bio_prison *prison;
173 	struct dm_deferred_set *all_io_ds;
174 
175 	mempool_t *migration_pool;
176 	struct dm_cache_migration *next_migration;
177 
178 	struct dm_cache_policy *policy;
179 	unsigned policy_nr_args;
180 
181 	bool need_tick_bio:1;
182 	bool sized:1;
183 	bool quiescing:1;
184 	bool commit_requested:1;
185 	bool loaded_mappings:1;
186 	bool loaded_discards:1;
187 
188 	struct cache_stats stats;
189 
190 	/*
191 	 * Rather than reconstructing the table line for the status we just
192 	 * save it and regurgitate.
193 	 */
194 	unsigned nr_ctr_args;
195 	const char **ctr_args;
196 };
197 
198 struct per_bio_data {
199 	bool tick:1;
200 	unsigned req_nr:2;
201 	struct dm_deferred_entry *all_io_entry;
202 };
203 
204 struct dm_cache_migration {
205 	struct list_head list;
206 	struct cache *cache;
207 
208 	unsigned long start_jiffies;
209 	dm_oblock_t old_oblock;
210 	dm_oblock_t new_oblock;
211 	dm_cblock_t cblock;
212 
213 	bool err:1;
214 	bool writeback:1;
215 	bool demote:1;
216 	bool promote:1;
217 
218 	struct dm_bio_prison_cell *old_ocell;
219 	struct dm_bio_prison_cell *new_ocell;
220 };
221 
222 /*
223  * Processing a bio in the worker thread may require these memory
224  * allocations.  We prealloc to avoid deadlocks (the same worker thread
225  * frees them back to the mempool).
226  */
227 struct prealloc {
228 	struct dm_cache_migration *mg;
229 	struct dm_bio_prison_cell *cell1;
230 	struct dm_bio_prison_cell *cell2;
231 };
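
/*
 * A minimal sketch of how the worker uses this, mirroring
 * process_deferred_bios() below ("bios" stands in for the deferred list;
 * a failing prealloc_data_structs() means no memory, so the worker backs
 * off and retries later):
 *
 *	struct prealloc structs;
 *
 *	memset(&structs, 0, sizeof(structs));
 *	while (!bio_list_empty(&bios)) {
 *		if (prealloc_data_structs(cache, &structs))
 *			break;
 *		process_bio(cache, &structs, bio_list_pop(&bios));
 *	}
 *	prealloc_free_structs(cache, &structs);
 */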
232 
233 static void wake_worker(struct cache *cache)
234 {
235 	queue_work(cache->wq, &cache->worker);
236 }
237 
238 /*----------------------------------------------------------------*/
239 
240 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
241 {
242 	/* FIXME: change to use a local slab. */
243 	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
244 }
245 
246 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
247 {
248 	dm_bio_prison_free_cell(cache->prison, cell);
249 }
250 
251 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
252 {
253 	if (!p->mg) {
254 		p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
255 		if (!p->mg)
256 			return -ENOMEM;
257 	}
258 
259 	if (!p->cell1) {
260 		p->cell1 = alloc_prison_cell(cache);
261 		if (!p->cell1)
262 			return -ENOMEM;
263 	}
264 
265 	if (!p->cell2) {
266 		p->cell2 = alloc_prison_cell(cache);
267 		if (!p->cell2)
268 			return -ENOMEM;
269 	}
270 
271 	return 0;
272 }
273 
274 static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
275 {
276 	if (p->cell2)
277 		free_prison_cell(cache, p->cell2);
278 
279 	if (p->cell1)
280 		free_prison_cell(cache, p->cell1);
281 
282 	if (p->mg)
283 		mempool_free(p->mg, cache->migration_pool);
284 }
285 
286 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
287 {
288 	struct dm_cache_migration *mg = p->mg;
289 
290 	BUG_ON(!mg);
291 	p->mg = NULL;
292 
293 	return mg;
294 }
295 
296 /*
297  * You must have a cell within the prealloc struct to return.  If not, this
298  * function will BUG() rather than return NULL.
299  */
300 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
301 {
302 	struct dm_bio_prison_cell *r = NULL;
303 
304 	if (p->cell1) {
305 		r = p->cell1;
306 		p->cell1 = NULL;
307 
308 	} else if (p->cell2) {
309 		r = p->cell2;
310 		p->cell2 = NULL;
311 	} else
312 		BUG();
313 
314 	return r;
315 }
316 
317 /*
318  * You can't have more than two cells in a prealloc struct.  BUG() will be
319  * called if you try to overfill.
320  */
321 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
322 {
323 	if (!p->cell2)
324 		p->cell2 = cell;
325 
326 	else if (!p->cell1)
327 		p->cell1 = cell;
328 
329 	else
330 		BUG();
331 }
332 
333 /*----------------------------------------------------------------*/
334 
335 static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
336 {
337 	key->virtual = 0;
338 	key->dev = 0;
339 	key->block = from_oblock(oblock);
340 }
341 
342 /*
343  * The caller hands in a preallocated cell, and a free function for it.
344  * The cell will be freed if there's an error, or if it wasn't used because
345  * a cell with that key already exists.
346  */
347 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
348 
349 static int bio_detain(struct cache *cache, dm_oblock_t oblock,
350 		      struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
351 		      cell_free_fn free_fn, void *free_context,
352 		      struct dm_bio_prison_cell **cell_result)
353 {
354 	int r;
355 	struct dm_cell_key key;
356 
357 	build_key(oblock, &key);
358 	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
359 	if (r)
360 		free_fn(free_context, cell_prealloc);
361 
362 	return r;
363 }
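
/*
 * For example, process_bio() below calls this with prealloc_put_cell() as
 * the free_fn, so a preallocated cell that loses the race simply goes back
 * into its struct prealloc.
 */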
364 
365 static int get_cell(struct cache *cache,
366 		    dm_oblock_t oblock,
367 		    struct prealloc *structs,
368 		    struct dm_bio_prison_cell **cell_result)
369 {
370 	int r;
371 	struct dm_cell_key key;
372 	struct dm_bio_prison_cell *cell_prealloc;
373 
374 	cell_prealloc = prealloc_get_cell(structs);
375 
376 	build_key(oblock, &key);
377 	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
378 	if (r)
379 		prealloc_put_cell(structs, cell_prealloc);
380 
381 	return r;
382 }
383 
384 /*----------------------------------------------------------------*/
385 
386 static bool is_dirty(struct cache *cache, dm_cblock_t b)
387 {
388 	return test_bit(from_cblock(b), cache->dirty_bitset);
389 }
390 
391 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
392 {
393 	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
394 		cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
395 		policy_set_dirty(cache->policy, oblock);
396 	}
397 }
398 
399 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
400 {
401 	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
402 		policy_clear_dirty(cache->policy, oblock);
403 		cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
404 		if (!from_cblock(cache->nr_dirty))
405 			dm_table_event(cache->ti->table);
406 	}
407 }
408 
409 /*----------------------------------------------------------------*/
410 static bool block_size_is_power_of_two(struct cache *cache)
411 {
412 	return cache->sectors_per_block_shift >= 0;
413 }
414 
415 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
416 {
417 	sector_t discard_blocks = cache->discard_block_size;
418 	dm_block_t b = from_oblock(oblock);
419 
420 	if (!block_size_is_power_of_two(cache))
421 		(void) sector_div(discard_blocks, cache->sectors_per_block);
422 	else
423 		discard_blocks >>= cache->sectors_per_block_shift;
424 
425 	(void) sector_div(b, discard_blocks);
426 
427 	return to_dblock(b);
428 }
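
/*
 * For example, with 128 sectors per block and a 1024 sector discard block
 * size there are 8 origin blocks per discard block, so oblock 21 maps to
 * dblock 2.
 */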
429 
430 static void set_discard(struct cache *cache, dm_dblock_t b)
431 {
432 	unsigned long flags;
433 
434 	atomic_inc(&cache->stats.discard_count);
435 
436 	spin_lock_irqsave(&cache->lock, flags);
437 	set_bit(from_dblock(b), cache->discard_bitset);
438 	spin_unlock_irqrestore(&cache->lock, flags);
439 }
440 
441 static void clear_discard(struct cache *cache, dm_dblock_t b)
442 {
443 	unsigned long flags;
444 
445 	spin_lock_irqsave(&cache->lock, flags);
446 	clear_bit(from_dblock(b), cache->discard_bitset);
447 	spin_unlock_irqrestore(&cache->lock, flags);
448 }
449 
450 static bool is_discarded(struct cache *cache, dm_dblock_t b)
451 {
452 	int r;
453 	unsigned long flags;
454 
455 	spin_lock_irqsave(&cache->lock, flags);
456 	r = test_bit(from_dblock(b), cache->discard_bitset);
457 	spin_unlock_irqrestore(&cache->lock, flags);
458 
459 	return r;
460 }
461 
462 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
463 {
464 	int r;
465 	unsigned long flags;
466 
467 	spin_lock_irqsave(&cache->lock, flags);
468 	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
469 		     cache->discard_bitset);
470 	spin_unlock_irqrestore(&cache->lock, flags);
471 
472 	return r;
473 }
474 
475 /*----------------------------------------------------------------*/
476 
477 static void load_stats(struct cache *cache)
478 {
479 	struct dm_cache_statistics stats;
480 
481 	dm_cache_metadata_get_stats(cache->cmd, &stats);
482 	atomic_set(&cache->stats.read_hit, stats.read_hits);
483 	atomic_set(&cache->stats.read_miss, stats.read_misses);
484 	atomic_set(&cache->stats.write_hit, stats.write_hits);
485 	atomic_set(&cache->stats.write_miss, stats.write_misses);
486 }
487 
488 static void save_stats(struct cache *cache)
489 {
490 	struct dm_cache_statistics stats;
491 
492 	stats.read_hits = atomic_read(&cache->stats.read_hit);
493 	stats.read_misses = atomic_read(&cache->stats.read_miss);
494 	stats.write_hits = atomic_read(&cache->stats.write_hit);
495 	stats.write_misses = atomic_read(&cache->stats.write_miss);
496 
497 	dm_cache_metadata_set_stats(cache->cmd, &stats);
498 }
499 
500 /*----------------------------------------------------------------
501  * Per bio data
502  *--------------------------------------------------------------*/
503 static struct per_bio_data *get_per_bio_data(struct bio *bio)
504 {
505 	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
506 	BUG_ON(!pb);
507 	return pb;
508 }
509 
510 static struct per_bio_data *init_per_bio_data(struct bio *bio)
511 {
512 	struct per_bio_data *pb = get_per_bio_data(bio);
513 
514 	pb->tick = false;
515 	pb->req_nr = dm_bio_get_target_bio_nr(bio);
516 	pb->all_io_entry = NULL;
517 
518 	return pb;
519 }
520 
521 /*----------------------------------------------------------------
522  * Remapping
523  *--------------------------------------------------------------*/
524 static void remap_to_origin(struct cache *cache, struct bio *bio)
525 {
526 	bio->bi_bdev = cache->origin_dev->bdev;
527 }
528 
529 static void remap_to_cache(struct cache *cache, struct bio *bio,
530 			   dm_cblock_t cblock)
531 {
532 	sector_t bi_sector = bio->bi_sector;
533 
534 	bio->bi_bdev = cache->cache_dev->bdev;
535 	if (!block_size_is_power_of_two(cache))
536 		bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
537 				sector_div(bi_sector, cache->sectors_per_block);
538 	else
539 		bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
540 				(bi_sector & (cache->sectors_per_block - 1));
541 }
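
/*
 * For example, with a power-of-two block size of 128 sectors, a bio at
 * origin sector 700 (offset 700 & 127 = 60 within its block) remapped to
 * cblock 3 lands at cache device sector (3 << 7) | 60 = 444.
 */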
542 
543 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
544 {
545 	unsigned long flags;
546 	struct per_bio_data *pb = get_per_bio_data(bio);
547 
548 	spin_lock_irqsave(&cache->lock, flags);
549 	if (cache->need_tick_bio &&
550 	    !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
551 		pb->tick = true;
552 		cache->need_tick_bio = false;
553 	}
554 	spin_unlock_irqrestore(&cache->lock, flags);
555 }
556 
557 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
558 				  dm_oblock_t oblock)
559 {
560 	check_if_tick_bio_needed(cache, bio);
561 	remap_to_origin(cache, bio);
562 	if (bio_data_dir(bio) == WRITE)
563 		clear_discard(cache, oblock_to_dblock(cache, oblock));
564 }
565 
566 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
567 				 dm_oblock_t oblock, dm_cblock_t cblock)
568 {
569 	remap_to_cache(cache, bio, cblock);
570 	if (bio_data_dir(bio) == WRITE) {
571 		set_dirty(cache, oblock, cblock);
572 		clear_discard(cache, oblock_to_dblock(cache, oblock));
573 	}
574 }
575 
576 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
577 {
578 	sector_t block_nr = bio->bi_sector;
579 
580 	if (!block_size_is_power_of_two(cache))
581 		(void) sector_div(block_nr, cache->sectors_per_block);
582 	else
583 		block_nr >>= cache->sectors_per_block_shift;
584 
585 	return to_oblock(block_nr);
586 }
587 
588 static int bio_triggers_commit(struct cache *cache, struct bio *bio)
589 {
590 	return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
591 }
592 
593 static void issue(struct cache *cache, struct bio *bio)
594 {
595 	unsigned long flags;
596 
597 	if (!bio_triggers_commit(cache, bio)) {
598 		generic_make_request(bio);
599 		return;
600 	}
601 
602 	/*
603 	 * Batch together any bios that trigger commits and then issue a
604 	 * single commit for them in do_worker().
605 	 */
606 	spin_lock_irqsave(&cache->lock, flags);
607 	cache->commit_requested = true;
608 	bio_list_add(&cache->deferred_flush_bios, bio);
609 	spin_unlock_irqrestore(&cache->lock, flags);
610 }
611 
612 /*----------------------------------------------------------------
613  * Migration processing
614  *
615  * Migration covers moving data from the origin device to the cache, or
616  * vice versa.
617  *--------------------------------------------------------------*/
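
/*
 * For example, a promotion started by promote() flows through:
 *
 *	quiesce_migration() -> (all_io_ds drains) -> issue_copy() ->
 *	copy_complete() -> migration_success_pre_commit() ->
 *	metadata commit -> migration_success_post_commit()
 */
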
618 static void free_migration(struct dm_cache_migration *mg)
619 {
620 	mempool_free(mg, mg->cache->migration_pool);
621 }
622 
623 static void inc_nr_migrations(struct cache *cache)
624 {
625 	atomic_inc(&cache->nr_migrations);
626 }
627 
628 static void dec_nr_migrations(struct cache *cache)
629 {
630 	atomic_dec(&cache->nr_migrations);
631 
632 	/*
633 	 * Wake the worker in case we're suspending the target.
634 	 */
635 	wake_up(&cache->migration_wait);
636 }
637 
638 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
639 			 bool holder)
640 {
641 	(holder ? dm_cell_release : dm_cell_release_no_holder)
642 		(cache->prison, cell, &cache->deferred_bios);
643 	free_prison_cell(cache, cell);
644 }
645 
646 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
647 		       bool holder)
648 {
649 	unsigned long flags;
650 
651 	spin_lock_irqsave(&cache->lock, flags);
652 	__cell_defer(cache, cell, holder);
653 	spin_unlock_irqrestore(&cache->lock, flags);
654 
655 	wake_worker(cache);
656 }
657 
658 static void cleanup_migration(struct dm_cache_migration *mg)
659 {
660 	dec_nr_migrations(mg->cache);
661 	free_migration(mg);
662 }
663 
664 static void migration_failure(struct dm_cache_migration *mg)
665 {
666 	struct cache *cache = mg->cache;
667 
668 	if (mg->writeback) {
669 		DMWARN_LIMIT("writeback failed; couldn't copy block");
670 		set_dirty(cache, mg->old_oblock, mg->cblock);
671 		cell_defer(cache, mg->old_ocell, false);
672 
673 	} else if (mg->demote) {
674 		DMWARN_LIMIT("demotion failed; couldn't copy block");
675 		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
676 
677 		cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
678 		if (mg->promote)
679 			cell_defer(cache, mg->new_ocell, 1);
680 	} else {
681 		DMWARN_LIMIT("promotion failed; couldn't copy block");
682 		policy_remove_mapping(cache->policy, mg->new_oblock);
683 		cell_defer(cache, mg->new_ocell, 1);
684 	}
685 
686 	cleanup_migration(mg);
687 }
688 
689 static void migration_success_pre_commit(struct dm_cache_migration *mg)
690 {
691 	unsigned long flags;
692 	struct cache *cache = mg->cache;
693 
694 	if (mg->writeback) {
695 		cell_defer(cache, mg->old_ocell, false);
696 		clear_dirty(cache, mg->old_oblock, mg->cblock);
697 		cleanup_migration(mg);
698 		return;
699 
700 	} else if (mg->demote) {
701 		if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
702 			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
703 			policy_force_mapping(cache->policy, mg->new_oblock,
704 					     mg->old_oblock);
705 			if (mg->promote)
706 				cell_defer(cache, mg->new_ocell, true);
707 			cleanup_migration(mg);
708 			return;
709 		}
710 	} else {
711 		if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
712 			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
713 			policy_remove_mapping(cache->policy, mg->new_oblock);
714 			cleanup_migration(mg);
715 			return;
716 		}
717 	}
718 
719 	spin_lock_irqsave(&cache->lock, flags);
720 	list_add_tail(&mg->list, &cache->need_commit_migrations);
721 	cache->commit_requested = true;
722 	spin_unlock_irqrestore(&cache->lock, flags);
723 }
724 
725 static void migration_success_post_commit(struct dm_cache_migration *mg)
726 {
727 	unsigned long flags;
728 	struct cache *cache = mg->cache;
729 
730 	if (mg->writeback) {
731 		DMWARN("writeback unexpectedly triggered commit");
732 		return;
733 
734 	} else if (mg->demote) {
735 		cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
736 
737 		if (mg->promote) {
738 			mg->demote = false;
739 
740 			spin_lock_irqsave(&cache->lock, flags);
741 			list_add_tail(&mg->list, &cache->quiesced_migrations);
742 			spin_unlock_irqrestore(&cache->lock, flags);
743 
744 		} else
745 			cleanup_migration(mg);
746 
747 	} else {
748 		cell_defer(cache, mg->new_ocell, true);
749 		clear_dirty(cache, mg->new_oblock, mg->cblock);
750 		cleanup_migration(mg);
751 	}
752 }
753 
754 static void copy_complete(int read_err, unsigned long write_err, void *context)
755 {
756 	unsigned long flags;
757 	struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
758 	struct cache *cache = mg->cache;
759 
760 	if (read_err || write_err)
761 		mg->err = true;
762 
763 	spin_lock_irqsave(&cache->lock, flags);
764 	list_add_tail(&mg->list, &cache->completed_migrations);
765 	spin_unlock_irqrestore(&cache->lock, flags);
766 
767 	wake_worker(cache);
768 }
769 
770 static void issue_copy_real(struct dm_cache_migration *mg)
771 {
772 	int r;
773 	struct dm_io_region o_region, c_region;
774 	struct cache *cache = mg->cache;
775 
776 	o_region.bdev = cache->origin_dev->bdev;
777 	o_region.count = cache->sectors_per_block;
778 
779 	c_region.bdev = cache->cache_dev->bdev;
780 	c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
781 	c_region.count = cache->sectors_per_block;
782 
783 	if (mg->writeback || mg->demote) {
784 		/* demote */
785 		o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
786 		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
787 	} else {
788 		/* promote */
789 		o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
790 		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
791 	}
792 
793 	if (r < 0)
794 		migration_failure(mg);
795 }
796 
797 static void avoid_copy(struct dm_cache_migration *mg)
798 {
799 	atomic_inc(&mg->cache->stats.copies_avoided);
800 	migration_success_pre_commit(mg);
801 }
802 
803 static void issue_copy(struct dm_cache_migration *mg)
804 {
805 	bool avoid;
806 	struct cache *cache = mg->cache;
807 
808 	if (mg->writeback || mg->demote)
809 		avoid = !is_dirty(cache, mg->cblock) ||
810 			is_discarded_oblock(cache, mg->old_oblock);
811 	else
812 		avoid = is_discarded_oblock(cache, mg->new_oblock);
813 
814 	avoid ? avoid_copy(mg) : issue_copy_real(mg);
815 }
816 
817 static void complete_migration(struct dm_cache_migration *mg)
818 {
819 	if (mg->err)
820 		migration_failure(mg);
821 	else
822 		migration_success_pre_commit(mg);
823 }
824 
825 static void process_migrations(struct cache *cache, struct list_head *head,
826 			       void (*fn)(struct dm_cache_migration *))
827 {
828 	unsigned long flags;
829 	struct list_head list;
830 	struct dm_cache_migration *mg, *tmp;
831 
832 	INIT_LIST_HEAD(&list);
833 	spin_lock_irqsave(&cache->lock, flags);
834 	list_splice_init(head, &list);
835 	spin_unlock_irqrestore(&cache->lock, flags);
836 
837 	list_for_each_entry_safe(mg, tmp, &list, list)
838 		fn(mg);
839 }
840 
841 static void __queue_quiesced_migration(struct dm_cache_migration *mg)
842 {
843 	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
844 }
845 
846 static void queue_quiesced_migration(struct dm_cache_migration *mg)
847 {
848 	unsigned long flags;
849 	struct cache *cache = mg->cache;
850 
851 	spin_lock_irqsave(&cache->lock, flags);
852 	__queue_quiesced_migration(mg);
853 	spin_unlock_irqrestore(&cache->lock, flags);
854 
855 	wake_worker(cache);
856 }
857 
858 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
859 {
860 	unsigned long flags;
861 	struct dm_cache_migration *mg, *tmp;
862 
863 	spin_lock_irqsave(&cache->lock, flags);
864 	list_for_each_entry_safe(mg, tmp, work, list)
865 		__queue_quiesced_migration(mg);
866 	spin_unlock_irqrestore(&cache->lock, flags);
867 
868 	wake_worker(cache);
869 }
870 
871 static void check_for_quiesced_migrations(struct cache *cache,
872 					  struct per_bio_data *pb)
873 {
874 	struct list_head work;
875 
876 	if (!pb->all_io_entry)
877 		return;
878 
879 	INIT_LIST_HEAD(&work);
880 	if (pb->all_io_entry)
881 		dm_deferred_entry_dec(pb->all_io_entry, &work);
882 
883 	if (!list_empty(&work))
884 		queue_quiesced_migrations(cache, &work);
885 }
886 
887 static void quiesce_migration(struct dm_cache_migration *mg)
888 {
889 	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
890 		queue_quiesced_migration(mg);
891 }
892 
893 static void promote(struct cache *cache, struct prealloc *structs,
894 		    dm_oblock_t oblock, dm_cblock_t cblock,
895 		    struct dm_bio_prison_cell *cell)
896 {
897 	struct dm_cache_migration *mg = prealloc_get_migration(structs);
898 
899 	mg->err = false;
900 	mg->writeback = false;
901 	mg->demote = false;
902 	mg->promote = true;
903 	mg->cache = cache;
904 	mg->new_oblock = oblock;
905 	mg->cblock = cblock;
906 	mg->old_ocell = NULL;
907 	mg->new_ocell = cell;
908 	mg->start_jiffies = jiffies;
909 
910 	inc_nr_migrations(cache);
911 	quiesce_migration(mg);
912 }
913 
914 static void writeback(struct cache *cache, struct prealloc *structs,
915 		      dm_oblock_t oblock, dm_cblock_t cblock,
916 		      struct dm_bio_prison_cell *cell)
917 {
918 	struct dm_cache_migration *mg = prealloc_get_migration(structs);
919 
920 	mg->err = false;
921 	mg->writeback = true;
922 	mg->demote = false;
923 	mg->promote = false;
924 	mg->cache = cache;
925 	mg->old_oblock = oblock;
926 	mg->cblock = cblock;
927 	mg->old_ocell = cell;
928 	mg->new_ocell = NULL;
929 	mg->start_jiffies = jiffies;
930 
931 	inc_nr_migrations(cache);
932 	quiesce_migration(mg);
933 }
934 
935 static void demote_then_promote(struct cache *cache, struct prealloc *structs,
936 				dm_oblock_t old_oblock, dm_oblock_t new_oblock,
937 				dm_cblock_t cblock,
938 				struct dm_bio_prison_cell *old_ocell,
939 				struct dm_bio_prison_cell *new_ocell)
940 {
941 	struct dm_cache_migration *mg = prealloc_get_migration(structs);
942 
943 	mg->err = false;
944 	mg->writeback = false;
945 	mg->demote = true;
946 	mg->promote = true;
947 	mg->cache = cache;
948 	mg->old_oblock = old_oblock;
949 	mg->new_oblock = new_oblock;
950 	mg->cblock = cblock;
951 	mg->old_ocell = old_ocell;
952 	mg->new_ocell = new_ocell;
953 	mg->start_jiffies = jiffies;
954 
955 	inc_nr_migrations(cache);
956 	quiesce_migration(mg);
957 }
958 
959 /*----------------------------------------------------------------
960  * bio processing
961  *--------------------------------------------------------------*/
962 static void defer_bio(struct cache *cache, struct bio *bio)
963 {
964 	unsigned long flags;
965 
966 	spin_lock_irqsave(&cache->lock, flags);
967 	bio_list_add(&cache->deferred_bios, bio);
968 	spin_unlock_irqrestore(&cache->lock, flags);
969 
970 	wake_worker(cache);
971 }
972 
973 static void process_flush_bio(struct cache *cache, struct bio *bio)
974 {
975 	struct per_bio_data *pb = get_per_bio_data(bio);
976 
977 	BUG_ON(bio->bi_size);
978 	if (!pb->req_nr)
979 		remap_to_origin(cache, bio);
980 	else
981 		remap_to_cache(cache, bio, 0);
982 
983 	issue(cache, bio);
984 }
985 
986 /*
987  * People generally discard large parts of a device, e.g. the whole device
988  * when formatting.  Splitting these large discards up into cache block
989  * sized ios and then quiescing (always necessary for discard) takes too
990  * long.
991  *
992  * We keep it simple, and allow any size of discard to come in, and just
993  * mark off blocks on the discard bitset.  No passdown occurs!
994  *
995  * To implement passdown we need to change the bio_prison such that a cell
996  * can have a key that spans many blocks.
997  */
998 static void process_discard_bio(struct cache *cache, struct bio *bio)
999 {
1000 	dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
1001 						  cache->discard_block_size);
1002 	dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
1003 	dm_block_t b;
1004 
1005 	(void) sector_div(end_block, cache->discard_block_size);
1006 
1007 	for (b = start_block; b < end_block; b++)
1008 		set_discard(cache, to_dblock(b));
1009 
1010 	bio_endio(bio, 0);
1011 }
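
/*
 * For example, with a 1024 sector discard block size a discard of sectors
 * 3000-9999 sets dblocks 3 to 8: only discard blocks lying entirely inside
 * the discarded range are marked.
 */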
1012 
1013 static bool spare_migration_bandwidth(struct cache *cache)
1014 {
1015 	sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
1016 		cache->sectors_per_block;
1017 	return current_volume < cache->migration_threshold;
1018 }
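
/*
 * For example, with the default migration_threshold of 204800 sectors and
 * 128 sector blocks, new migrations are started only while fewer than
 * about 1600 are already in flight.
 */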
1019 
1020 static bool is_writethrough_io(struct cache *cache, struct bio *bio,
1021 			       dm_cblock_t cblock)
1022 {
1023 	return bio_data_dir(bio) == WRITE &&
1024 		cache->features.write_through && !is_dirty(cache, cblock);
1025 }
1026 
1027 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1028 {
1029 	atomic_inc(bio_data_dir(bio) == READ ?
1030 		   &cache->stats.read_hit : &cache->stats.write_hit);
1031 }
1032 
1033 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1034 {
1035 	atomic_inc(bio_data_dir(bio) == READ ?
1036 		   &cache->stats.read_miss : &cache->stats.write_miss);
1037 }
1038 
1039 static void process_bio(struct cache *cache, struct prealloc *structs,
1040 			struct bio *bio)
1041 {
1042 	int r;
1043 	bool release_cell = true;
1044 	dm_oblock_t block = get_bio_block(cache, bio);
1045 	struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1046 	struct policy_result lookup_result;
1047 	struct per_bio_data *pb = get_per_bio_data(bio);
1048 	bool discarded_block = is_discarded_oblock(cache, block);
1049 	bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
1050 
1051 	/*
1052 	 * Check to see if that block is currently migrating.
1053 	 */
1054 	cell_prealloc = prealloc_get_cell(structs);
1055 	r = bio_detain(cache, block, bio, cell_prealloc,
1056 		       (cell_free_fn) prealloc_put_cell,
1057 		       structs, &new_ocell);
1058 	if (r > 0)
1059 		return;
1060 
1061 	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1062 		       bio, &lookup_result);
1063 
1064 	if (r == -EWOULDBLOCK)
1065 		/* migration has been denied */
1066 		lookup_result.op = POLICY_MISS;
1067 
1068 	switch (lookup_result.op) {
1069 	case POLICY_HIT:
1070 		inc_hit_counter(cache, bio);
1071 		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1072 
1073 		if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
1074 			/*
1075 			 * No need to mark anything dirty in write through mode.
1076 			 */
1077 			pb->req_nr == 0 ?
1078 				remap_to_cache(cache, bio, lookup_result.cblock) :
1079 				remap_to_origin_clear_discard(cache, bio, block);
1080 		} else
1081 			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1082 
1083 		issue(cache, bio);
1084 		break;
1085 
1086 	case POLICY_MISS:
1087 		inc_miss_counter(cache, bio);
1088 		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1089 
1090 		if (pb->req_nr != 0) {
1091 			/*
1092 			 * This is a duplicate writethrough io that is no
1093 			 * longer needed because the block has been demoted.
1094 			 */
1095 			bio_endio(bio, 0);
1096 		} else {
1097 			remap_to_origin_clear_discard(cache, bio, block);
1098 			issue(cache, bio);
1099 		}
1100 		break;
1101 
1102 	case POLICY_NEW:
1103 		atomic_inc(&cache->stats.promotion);
1104 		promote(cache, structs, block, lookup_result.cblock, new_ocell);
1105 		release_cell = false;
1106 		break;
1107 
1108 	case POLICY_REPLACE:
1109 		cell_prealloc = prealloc_get_cell(structs);
1110 		r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
1111 			       (cell_free_fn) prealloc_put_cell,
1112 			       structs, &old_ocell);
1113 		if (r > 0) {
1114 			/*
1115 			 * We have to be careful to avoid lock inversion of
1116 			 * the cells.  So we back off, and wait for the
1117 			 * old_ocell to become free.
1118 			 */
1119 			policy_force_mapping(cache->policy, block,
1120 					     lookup_result.old_oblock);
1121 			atomic_inc(&cache->stats.cache_cell_clash);
1122 			break;
1123 		}
1124 		atomic_inc(&cache->stats.demotion);
1125 		atomic_inc(&cache->stats.promotion);
1126 
1127 		demote_then_promote(cache, structs, lookup_result.old_oblock,
1128 				    block, lookup_result.cblock,
1129 				    old_ocell, new_ocell);
1130 		release_cell = false;
1131 		break;
1132 
1133 	default:
1134 		DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1135 			    (unsigned) lookup_result.op);
1136 		bio_io_error(bio);
1137 	}
1138 
1139 	if (release_cell)
1140 		cell_defer(cache, new_ocell, false);
1141 }
1142 
1143 static int need_commit_due_to_time(struct cache *cache)
1144 {
1145 	return jiffies < cache->last_commit_jiffies ||
1146 	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1147 }
1148 
1149 static int commit_if_needed(struct cache *cache)
1150 {
1151 	if (dm_cache_changed_this_transaction(cache->cmd) &&
1152 	    (cache->commit_requested || need_commit_due_to_time(cache))) {
1153 		atomic_inc(&cache->stats.commit_count);
1154 		cache->last_commit_jiffies = jiffies;
1155 		cache->commit_requested = false;
1156 		return dm_cache_commit(cache->cmd, false);
1157 	}
1158 
1159 	return 0;
1160 }
1161 
1162 static void process_deferred_bios(struct cache *cache)
1163 {
1164 	unsigned long flags;
1165 	struct bio_list bios;
1166 	struct bio *bio;
1167 	struct prealloc structs;
1168 
1169 	memset(&structs, 0, sizeof(structs));
1170 	bio_list_init(&bios);
1171 
1172 	spin_lock_irqsave(&cache->lock, flags);
1173 	bio_list_merge(&bios, &cache->deferred_bios);
1174 	bio_list_init(&cache->deferred_bios);
1175 	spin_unlock_irqrestore(&cache->lock, flags);
1176 
1177 	while (!bio_list_empty(&bios)) {
1178 		/*
1179 		 * If we've got no free migration structs, and processing
1180 		 * this bio might require one, we pause until there are some
1181 		 * prepared mappings to process.
1182 		 */
1183 		if (prealloc_data_structs(cache, &structs)) {
1184 			spin_lock_irqsave(&cache->lock, flags);
1185 			bio_list_merge(&cache->deferred_bios, &bios);
1186 			spin_unlock_irqrestore(&cache->lock, flags);
1187 			break;
1188 		}
1189 
1190 		bio = bio_list_pop(&bios);
1191 
1192 		if (bio->bi_rw & REQ_FLUSH)
1193 			process_flush_bio(cache, bio);
1194 		else if (bio->bi_rw & REQ_DISCARD)
1195 			process_discard_bio(cache, bio);
1196 		else
1197 			process_bio(cache, &structs, bio);
1198 	}
1199 
1200 	prealloc_free_structs(cache, &structs);
1201 }
1202 
1203 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1204 {
1205 	unsigned long flags;
1206 	struct bio_list bios;
1207 	struct bio *bio;
1208 
1209 	bio_list_init(&bios);
1210 
1211 	spin_lock_irqsave(&cache->lock, flags);
1212 	bio_list_merge(&bios, &cache->deferred_flush_bios);
1213 	bio_list_init(&cache->deferred_flush_bios);
1214 	spin_unlock_irqrestore(&cache->lock, flags);
1215 
1216 	while ((bio = bio_list_pop(&bios)))
1217 		submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1218 }
1219 
1220 static void writeback_some_dirty_blocks(struct cache *cache)
1221 {
1222 	int r = 0;
1223 	dm_oblock_t oblock;
1224 	dm_cblock_t cblock;
1225 	struct prealloc structs;
1226 	struct dm_bio_prison_cell *old_ocell;
1227 
1228 	memset(&structs, 0, sizeof(structs));
1229 
1230 	while (spare_migration_bandwidth(cache)) {
1231 		if (prealloc_data_structs(cache, &structs))
1232 			break;
1233 
1234 		r = policy_writeback_work(cache->policy, &oblock, &cblock);
1235 		if (r)
1236 			break;
1237 
1238 		r = get_cell(cache, oblock, &structs, &old_ocell);
1239 		if (r) {
1240 			policy_set_dirty(cache->policy, oblock);
1241 			break;
1242 		}
1243 
1244 		writeback(cache, &structs, oblock, cblock, old_ocell);
1245 	}
1246 
1247 	prealloc_free_structs(cache, &structs);
1248 }
1249 
1250 /*----------------------------------------------------------------
1251  * Main worker loop
1252  *--------------------------------------------------------------*/
1253 static void start_quiescing(struct cache *cache)
1254 {
1255 	unsigned long flags;
1256 
1257 	spin_lock_irqsave(&cache->lock, flags);
1258 	cache->quiescing = 1;
1259 	spin_unlock_irqrestore(&cache->lock, flags);
1260 }
1261 
1262 static void stop_quiescing(struct cache *cache)
1263 {
1264 	unsigned long flags;
1265 
1266 	spin_lock_irqsave(&cache->lock, flags);
1267 	cache->quiescing = 0;
1268 	spin_unlock_irqrestore(&cache->lock, flags);
1269 }
1270 
1271 static bool is_quiescing(struct cache *cache)
1272 {
1273 	int r;
1274 	unsigned long flags;
1275 
1276 	spin_lock_irqsave(&cache->lock, flags);
1277 	r = cache->quiescing;
1278 	spin_unlock_irqrestore(&cache->lock, flags);
1279 
1280 	return r;
1281 }
1282 
1283 static void wait_for_migrations(struct cache *cache)
1284 {
1285 	wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
1286 }
1287 
1288 static void stop_worker(struct cache *cache)
1289 {
1290 	cancel_delayed_work(&cache->waker);
1291 	flush_workqueue(cache->wq);
1292 }
1293 
1294 static void requeue_deferred_io(struct cache *cache)
1295 {
1296 	struct bio *bio;
1297 	struct bio_list bios;
1298 
1299 	bio_list_init(&bios);
1300 	bio_list_merge(&bios, &cache->deferred_bios);
1301 	bio_list_init(&cache->deferred_bios);
1302 
1303 	while ((bio = bio_list_pop(&bios)))
1304 		bio_endio(bio, DM_ENDIO_REQUEUE);
1305 }
1306 
1307 static int more_work(struct cache *cache)
1308 {
1309 	if (is_quiescing(cache))
1310 		return !list_empty(&cache->quiesced_migrations) ||
1311 			!list_empty(&cache->completed_migrations) ||
1312 			!list_empty(&cache->need_commit_migrations);
1313 	else
1314 		return !bio_list_empty(&cache->deferred_bios) ||
1315 			!bio_list_empty(&cache->deferred_flush_bios) ||
1316 			!list_empty(&cache->quiesced_migrations) ||
1317 			!list_empty(&cache->completed_migrations) ||
1318 			!list_empty(&cache->need_commit_migrations);
1319 }
1320 
1321 static void do_worker(struct work_struct *ws)
1322 {
1323 	struct cache *cache = container_of(ws, struct cache, worker);
1324 
1325 	do {
1326 		if (!is_quiescing(cache))
1327 			process_deferred_bios(cache);
1328 
1329 		process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1330 		process_migrations(cache, &cache->completed_migrations, complete_migration);
1331 
1332 		writeback_some_dirty_blocks(cache);
1333 
1334 		if (commit_if_needed(cache)) {
1335 			process_deferred_flush_bios(cache, false);
1336 
1337 			/*
1338 			 * FIXME: rollback metadata or just go into a
1339 			 * failure mode and error everything
1340 			 */
1341 		} else {
1342 			process_deferred_flush_bios(cache, true);
1343 			process_migrations(cache, &cache->need_commit_migrations,
1344 					   migration_success_post_commit);
1345 		}
1346 	} while (more_work(cache));
1347 }
1348 
1349 /*
1350  * We want to commit periodically so that not too much
1351  * unwritten metadata builds up.
1352  */
1353 static void do_waker(struct work_struct *ws)
1354 {
1355 	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1356 	wake_worker(cache);
1357 	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1358 }
1359 
1360 /*----------------------------------------------------------------*/
1361 
1362 static int is_congested(struct dm_dev *dev, int bdi_bits)
1363 {
1364 	struct request_queue *q = bdev_get_queue(dev->bdev);
1365 	return bdi_congested(&q->backing_dev_info, bdi_bits);
1366 }
1367 
1368 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1369 {
1370 	struct cache *cache = container_of(cb, struct cache, callbacks);
1371 
1372 	return is_congested(cache->origin_dev, bdi_bits) ||
1373 		is_congested(cache->cache_dev, bdi_bits);
1374 }
1375 
1376 /*----------------------------------------------------------------
1377  * Target methods
1378  *--------------------------------------------------------------*/
1379 
1380 /*
1381  * This function gets called on the error paths of the constructor, so we
1382  * have to cope with a partially initialised struct.
1383  */
1384 static void destroy(struct cache *cache)
1385 {
1386 	unsigned i;
1387 
1388 	if (cache->next_migration)
1389 		mempool_free(cache->next_migration, cache->migration_pool);
1390 
1391 	if (cache->migration_pool)
1392 		mempool_destroy(cache->migration_pool);
1393 
1394 	if (cache->all_io_ds)
1395 		dm_deferred_set_destroy(cache->all_io_ds);
1396 
1397 	if (cache->prison)
1398 		dm_bio_prison_destroy(cache->prison);
1399 
1400 	if (cache->wq)
1401 		destroy_workqueue(cache->wq);
1402 
1403 	if (cache->dirty_bitset)
1404 		free_bitset(cache->dirty_bitset);
1405 
1406 	if (cache->discard_bitset)
1407 		free_bitset(cache->discard_bitset);
1408 
1409 	if (cache->copier)
1410 		dm_kcopyd_client_destroy(cache->copier);
1411 
1412 	if (cache->cmd)
1413 		dm_cache_metadata_close(cache->cmd);
1414 
1415 	if (cache->metadata_dev)
1416 		dm_put_device(cache->ti, cache->metadata_dev);
1417 
1418 	if (cache->origin_dev)
1419 		dm_put_device(cache->ti, cache->origin_dev);
1420 
1421 	if (cache->cache_dev)
1422 		dm_put_device(cache->ti, cache->cache_dev);
1423 
1424 	if (cache->policy)
1425 		dm_cache_policy_destroy(cache->policy);
1426 
1427 	for (i = 0; i < cache->nr_ctr_args ; i++)
1428 		kfree(cache->ctr_args[i]);
1429 	kfree(cache->ctr_args);
1430 
1431 	kfree(cache);
1432 }
1433 
1434 static void cache_dtr(struct dm_target *ti)
1435 {
1436 	struct cache *cache = ti->private;
1437 
1438 	destroy(cache);
1439 }
1440 
1441 static sector_t get_dev_size(struct dm_dev *dev)
1442 {
1443 	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1444 }
1445 
1446 /*----------------------------------------------------------------*/
1447 
1448 /*
1449  * Construct a cache device mapping.
1450  *
1451  * cache <metadata dev> <cache dev> <origin dev> <block size>
1452  *       <#feature args> [<feature arg>]*
1453  *       <policy> <#policy args> [<policy arg>]*
1454  *
1455  * metadata dev    : fast device holding the persistent metadata
1456  * cache dev	   : fast device holding cached data blocks
1457  * origin dev	   : slow device holding original data blocks
1458  * block size	   : cache unit size in sectors
1459  *
1460  * #feature args   : number of feature arguments passed
1461  * feature args    : writethrough.  (The default is writeback.)
1462  *
1463  * policy	   : the replacement policy to use
1464  * #policy args    : an even number of policy arguments corresponding
1465  *		     to key/value pairs passed to the policy
1466  * policy args	   : key/value pairs passed to the policy
1467  *		     E.g. 'sequential_threshold 1024'
1468  *		     See cache-policies.txt for details.
1469  *
1470  * Optional feature arguments are:
1471  *   writethrough  : write through caching that prohibits cache block
1472  *		     content from being different from origin block content.
1473  *		     Without this argument, the default behaviour is to write
1474  *		     back cache block contents later for performance reasons,
1475  *		     so they may differ from the corresponding origin blocks.
1476  */
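
/*
 * An example target line in the above format (device names are purely
 * illustrative, and the mq policy module is assumed to be available),
 * using a 256KB block size, writethrough mode and one key/value pair:
 *
 *	cache /dev/mapper/ssd-meta /dev/mapper/ssd-cache /dev/sdb 512
 *	      1 writethrough mq 2 sequential_threshold 1024
 */
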
1477 struct cache_args {
1478 	struct dm_target *ti;
1479 
1480 	struct dm_dev *metadata_dev;
1481 
1482 	struct dm_dev *cache_dev;
1483 	sector_t cache_sectors;
1484 
1485 	struct dm_dev *origin_dev;
1486 	sector_t origin_sectors;
1487 
1488 	uint32_t block_size;
1489 
1490 	const char *policy_name;
1491 	int policy_argc;
1492 	const char **policy_argv;
1493 
1494 	struct cache_features features;
1495 };
1496 
1497 static void destroy_cache_args(struct cache_args *ca)
1498 {
1499 	if (ca->metadata_dev)
1500 		dm_put_device(ca->ti, ca->metadata_dev);
1501 
1502 	if (ca->cache_dev)
1503 		dm_put_device(ca->ti, ca->cache_dev);
1504 
1505 	if (ca->origin_dev)
1506 		dm_put_device(ca->ti, ca->origin_dev);
1507 
1508 	kfree(ca);
1509 }
1510 
1511 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
1512 {
1513 	if (!as->argc) {
1514 		*error = "Insufficient args";
1515 		return false;
1516 	}
1517 
1518 	return true;
1519 }
1520 
1521 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
1522 			      char **error)
1523 {
1524 	int r;
1525 	sector_t metadata_dev_size;
1526 	char b[BDEVNAME_SIZE];
1527 
1528 	if (!at_least_one_arg(as, error))
1529 		return -EINVAL;
1530 
1531 	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1532 			  &ca->metadata_dev);
1533 	if (r) {
1534 		*error = "Error opening metadata device";
1535 		return r;
1536 	}
1537 
1538 	metadata_dev_size = get_dev_size(ca->metadata_dev);
1539 	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
1540 		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1541 		       bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS_WARNING);
1542 
1543 	return 0;
1544 }
1545 
1546 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
1547 			   char **error)
1548 {
1549 	int r;
1550 
1551 	if (!at_least_one_arg(as, error))
1552 		return -EINVAL;
1553 
1554 	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1555 			  &ca->cache_dev);
1556 	if (r) {
1557 		*error = "Error opening cache device";
1558 		return r;
1559 	}
1560 	ca->cache_sectors = get_dev_size(ca->cache_dev);
1561 
1562 	return 0;
1563 }
1564 
1565 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
1566 			    char **error)
1567 {
1568 	int r;
1569 
1570 	if (!at_least_one_arg(as, error))
1571 		return -EINVAL;
1572 
1573 	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1574 			  &ca->origin_dev);
1575 	if (r) {
1576 		*error = "Error opening origin device";
1577 		return r;
1578 	}
1579 
1580 	ca->origin_sectors = get_dev_size(ca->origin_dev);
1581 	if (ca->ti->len > ca->origin_sectors) {
1582 		*error = "Device size larger than cached device";
1583 		return -EINVAL;
1584 	}
1585 
1586 	return 0;
1587 }
1588 
1589 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1590 			    char **error)
1591 {
1592 	unsigned long tmp;
1593 
1594 	if (!at_least_one_arg(as, error))
1595 		return -EINVAL;
1596 
1597 	if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp ||
1598 	    tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1599 	    tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1600 		*error = "Invalid data block size";
1601 		return -EINVAL;
1602 	}
1603 
1604 	if (tmp > ca->cache_sectors) {
1605 		*error = "Data block size is larger than the cache device";
1606 		return -EINVAL;
1607 	}
1608 
1609 	ca->block_size = tmp;
1610 
1611 	return 0;
1612 }
1613 
1614 static void init_features(struct cache_features *cf)
1615 {
1616 	cf->mode = CM_WRITE;
1617 	cf->write_through = false;
1618 }
1619 
1620 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
1621 			  char **error)
1622 {
1623 	static struct dm_arg _args[] = {
1624 		{0, 1, "Invalid number of cache feature arguments"},
1625 	};
1626 
1627 	int r;
1628 	unsigned argc;
1629 	const char *arg;
1630 	struct cache_features *cf = &ca->features;
1631 
1632 	init_features(cf);
1633 
1634 	r = dm_read_arg_group(_args, as, &argc, error);
1635 	if (r)
1636 		return -EINVAL;
1637 
1638 	while (argc--) {
1639 		arg = dm_shift_arg(as);
1640 
1641 		if (!strcasecmp(arg, "writeback"))
1642 			cf->write_through = false;
1643 
1644 		else if (!strcasecmp(arg, "writethrough"))
1645 			cf->write_through = true;
1646 
1647 		else {
1648 			*error = "Unrecognised cache feature requested";
1649 			return -EINVAL;
1650 		}
1651 	}
1652 
1653 	return 0;
1654 }
1655 
1656 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
1657 			char **error)
1658 {
1659 	static struct dm_arg _args[] = {
1660 		{0, 1024, "Invalid number of policy arguments"},
1661 	};
1662 
1663 	int r;
1664 
1665 	if (!at_least_one_arg(as, error))
1666 		return -EINVAL;
1667 
1668 	ca->policy_name = dm_shift_arg(as);
1669 
1670 	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
1671 	if (r)
1672 		return -EINVAL;
1673 
1674 	ca->policy_argv = (const char **)as->argv;
1675 	dm_consume_args(as, ca->policy_argc);
1676 
1677 	return 0;
1678 }
1679 
1680 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
1681 			    char **error)
1682 {
1683 	int r;
1684 	struct dm_arg_set as;
1685 
1686 	as.argc = argc;
1687 	as.argv = argv;
1688 
1689 	r = parse_metadata_dev(ca, &as, error);
1690 	if (r)
1691 		return r;
1692 
1693 	r = parse_cache_dev(ca, &as, error);
1694 	if (r)
1695 		return r;
1696 
1697 	r = parse_origin_dev(ca, &as, error);
1698 	if (r)
1699 		return r;
1700 
1701 	r = parse_block_size(ca, &as, error);
1702 	if (r)
1703 		return r;
1704 
1705 	r = parse_features(ca, &as, error);
1706 	if (r)
1707 		return r;
1708 
1709 	r = parse_policy(ca, &as, error);
1710 	if (r)
1711 		return r;
1712 
1713 	return 0;
1714 }
1715 
1716 /*----------------------------------------------------------------*/
1717 
1718 static struct kmem_cache *migration_cache;
1719 
1720 static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv)
1721 {
1722 	int r = 0;
1723 
1724 	if (argc & 1) {
1725 		DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
1726 		return -EINVAL;
1727 	}
1728 
1729 	while (argc) {
1730 		r = policy_set_config_value(p, argv[0], argv[1]);
1731 		if (r) {
1732 			DMWARN("policy_set_config_value failed: key = '%s', value = '%s'",
1733 			       argv[0], argv[1]);
1734 			return r;
1735 		}
1736 
1737 		argc -= 2;
1738 		argv += 2;
1739 	}
1740 
1741 	return r;
1742 }
1743 
1744 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
1745 			       char **error)
1746 {
1747 	int r;
1748 
1749 	cache->policy =	dm_cache_policy_create(ca->policy_name,
1750 					       cache->cache_size,
1751 					       cache->origin_sectors,
1752 					       cache->sectors_per_block);
1753 	if (!cache->policy) {
1754 		*error = "Error creating cache's policy";
1755 		return -ENOMEM;
1756 	}
1757 
1758 	r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
1759 	if (r)
1760 		dm_cache_policy_destroy(cache->policy);
1761 
1762 	return r;
1763 }
1764 
1765 /*
1766  * We want the discard block size to be a power of two, at least as large
1767  * as the cache block size, and to have no more than 2^14 discard blocks
1768  * across the origin.
1769  */
1770 #define MAX_DISCARD_BLOCKS (1 << 14)
1771 
1772 static bool too_many_discard_blocks(sector_t discard_block_size,
1773 				    sector_t origin_size)
1774 {
1775 	(void) sector_div(origin_size, discard_block_size);
1776 
1777 	return origin_size > MAX_DISCARD_BLOCKS;
1778 }
1779 
1780 static sector_t calculate_discard_block_size(sector_t cache_block_size,
1781 					     sector_t origin_size)
1782 {
1783 	sector_t discard_block_size;
1784 
1785 	discard_block_size = roundup_pow_of_two(cache_block_size);
1786 
1787 	if (origin_size)
1788 		while (too_many_discard_blocks(discard_block_size, origin_size))
1789 			discard_block_size *= 2;
1790 
1791 	return discard_block_size;
1792 }
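
/*
 * For example, a 96 sector cache block rounds up to a 128 sector discard
 * block; with a 1TiB (2147483648 sector) origin that would give far more
 * than 2^14 discard blocks, so the size doubles up to 131072 sectors
 * (64MiB), leaving exactly 2^14 discard blocks.
 */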
1793 
1794 #define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
1795 
1796 static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio);
1797 
1798 static int cache_create(struct cache_args *ca, struct cache **result)
1799 {
1800 	int r = 0;
1801 	char **error = &ca->ti->error;
1802 	struct cache *cache;
1803 	struct dm_target *ti = ca->ti;
1804 	dm_block_t origin_blocks;
1805 	struct dm_cache_metadata *cmd;
1806 	bool may_format = ca->features.mode == CM_WRITE;
1807 
1808 	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
1809 	if (!cache)
1810 		return -ENOMEM;
1811 
1812 	cache->ti = ca->ti;
1813 	ti->private = cache;
1814 	ti->per_bio_data_size = sizeof(struct per_bio_data);
1815 	ti->num_flush_bios = 2;
1816 	ti->flush_supported = true;
1817 
1818 	ti->num_discard_bios = 1;
1819 	ti->discards_supported = true;
1820 	ti->discard_zeroes_data_unsupported = true;
1821 
1822 	memcpy(&cache->features, &ca->features, sizeof(cache->features));
1823 
1824 	if (cache->features.write_through)
1825 		ti->num_write_bios = cache_num_write_bios;
1826 
1827 	cache->callbacks.congested_fn = cache_is_congested;
1828 	dm_table_add_target_callbacks(ti->table, &cache->callbacks);
1829 
1830 	cache->metadata_dev = ca->metadata_dev;
1831 	cache->origin_dev = ca->origin_dev;
1832 	cache->cache_dev = ca->cache_dev;
1833 
1834 	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
1835 
1836 	/* FIXME: factor out this whole section */
1837 	origin_blocks = cache->origin_sectors = ca->origin_sectors;
1838 	(void) sector_div(origin_blocks, ca->block_size);
1839 	cache->origin_blocks = to_oblock(origin_blocks);
1840 
1841 	cache->sectors_per_block = ca->block_size;
1842 	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
1843 		r = -EINVAL;
1844 		goto bad;
1845 	}
1846 
1847 	if (ca->block_size & (ca->block_size - 1)) {
1848 		dm_block_t cache_size = ca->cache_sectors;
1849 
1850 		cache->sectors_per_block_shift = -1;
1851 		(void) sector_div(cache_size, ca->block_size);
1852 		cache->cache_size = to_cblock(cache_size);
1853 	} else {
1854 		cache->sectors_per_block_shift = __ffs(ca->block_size);
1855 		cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
1856 	}
1857 
1858 	r = create_cache_policy(cache, ca, error);
1859 	if (r)
1860 		goto bad;
1861 	cache->policy_nr_args = ca->policy_argc;
1862 
1863 	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
1864 				     ca->block_size, may_format,
1865 				     dm_cache_policy_get_hint_size(cache->policy));
1866 	if (IS_ERR(cmd)) {
1867 		*error = "Error creating metadata object";
1868 		r = PTR_ERR(cmd);
1869 		goto bad;
1870 	}
1871 	cache->cmd = cmd;
1872 
1873 	spin_lock_init(&cache->lock);
1874 	bio_list_init(&cache->deferred_bios);
1875 	bio_list_init(&cache->deferred_flush_bios);
1876 	INIT_LIST_HEAD(&cache->quiesced_migrations);
1877 	INIT_LIST_HEAD(&cache->completed_migrations);
1878 	INIT_LIST_HEAD(&cache->need_commit_migrations);
1879 	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
1880 	atomic_set(&cache->nr_migrations, 0);
1881 	init_waitqueue_head(&cache->migration_wait);
1882 
1883 	cache->nr_dirty = 0;
1884 	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
1885 	if (!cache->dirty_bitset) {
1886 		*error = "could not allocate dirty bitset";
1887 		goto bad;
1888 	}
1889 	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
1890 
1891 	cache->discard_block_size =
1892 		calculate_discard_block_size(cache->sectors_per_block,
1893 					     cache->origin_sectors);
1894 	cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
1895 	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
1896 	if (!cache->discard_bitset) {
1897 		*error = "could not allocate discard bitset";
1898 		goto bad;
1899 	}
1900 	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
1901 
1902 	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1903 	if (IS_ERR(cache->copier)) {
1904 		*error = "could not create kcopyd client";
1905 		r = PTR_ERR(cache->copier);
1906 		goto bad;
1907 	}
1908 
1909 	cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1910 	if (!cache->wq) {
1911 		*error = "could not create workqueue for metadata object";
1912 		goto bad;
1913 	}
1914 	INIT_WORK(&cache->worker, do_worker);
1915 	INIT_DELAYED_WORK(&cache->waker, do_waker);
1916 	cache->last_commit_jiffies = jiffies;
1917 
1918 	cache->prison = dm_bio_prison_create(PRISON_CELLS);
1919 	if (!cache->prison) {
1920 		*error = "could not create bio prison";
1921 		goto bad;
1922 	}
1923 
1924 	cache->all_io_ds = dm_deferred_set_create();
1925 	if (!cache->all_io_ds) {
1926 		*error = "could not create all_io deferred set";
1927 		goto bad;
1928 	}
1929 
1930 	cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
1931 							 migration_cache);
1932 	if (!cache->migration_pool) {
1933 		*error = "Error creating cache's migration mempool";
		r = -ENOMEM;
1934 		goto bad;
1935 	}
1936 
1937 	cache->next_migration = NULL;
1938 
1939 	cache->need_tick_bio = true;
1940 	cache->sized = false;
1941 	cache->quiescing = false;
1942 	cache->commit_requested = false;
1943 	cache->loaded_mappings = false;
1944 	cache->loaded_discards = false;
1945 
1946 	load_stats(cache);
1947 
1948 	atomic_set(&cache->stats.demotion, 0);
1949 	atomic_set(&cache->stats.promotion, 0);
1950 	atomic_set(&cache->stats.copies_avoided, 0);
1951 	atomic_set(&cache->stats.cache_cell_clash, 0);
1952 	atomic_set(&cache->stats.commit_count, 0);
1953 	atomic_set(&cache->stats.discard_count, 0);
1954 
1955 	*result = cache;
1956 	return 0;
1957 
1958 bad:
1959 	destroy(cache);
1960 	return r;
1961 }
1962 
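/*
 * Keep a private copy of the constructor arguments (everything after the
 * three device paths); cache_status() re-emits them verbatim for
 * STATUSTYPE_TABLE.
 */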
1963 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
1964 {
1965 	unsigned i;
1966 	const char **copy;
1967 
1968 	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
1969 	if (!copy)
1970 		return -ENOMEM;
1971 	for (i = 0; i < argc; i++) {
1972 		copy[i] = kstrdup(argv[i], GFP_KERNEL);
1973 		if (!copy[i]) {
1974 			while (i--)
1975 				kfree(copy[i]);
1976 			kfree(copy);
1977 			return -ENOMEM;
1978 		}
1979 	}
1980 
1981 	cache->nr_ctr_args = argc;
1982 	cache->ctr_args = copy;
1983 
1984 	return 0;
1985 }
1986 
1987 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
1988 {
1989 	int r = -EINVAL;
1990 	struct cache_args *ca;
1991 	struct cache *cache = NULL;
1992 
1993 	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1994 	if (!ca) {
1995 		ti->error = "Error allocating memory for cache";
1996 		return -ENOMEM;
1997 	}
1998 	ca->ti = ti;
1999 
2000 	r = parse_cache_args(ca, argc, argv, &ti->error);
2001 	if (r)
2002 		goto out;
2003 
2004 	r = cache_create(ca, &cache);
	if (r)
		goto out;
2005 
2006 	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2007 	if (r) {
2008 		destroy(cache);
2009 		goto out;
2010 	}
2011 
2012 	ti->private = cache;
2013 
2014 out:
2015 	destroy_cache_args(ca);
2016 	return r;
2017 }
2018 
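/*
 * Tell dm core how many clones of a write bio to hand to cache_map();
 * the extra clone is what implements writethrough (req_nr 0 is remapped
 * to the cache, req_nr 1 to the origin - see the POLICY_HIT case below).
 * Report two for a write to a clean, cached block, or when the policy
 * lookup fails and we assume the worst; otherwise one is enough.
 */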
2019 static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio)
2020 {
2021 	int r;
2022 	struct cache *cache = ti->private;
2023 	dm_oblock_t block = get_bio_block(cache, bio);
2024 	dm_cblock_t cblock;
2025 
2026 	r = policy_lookup(cache->policy, block, &cblock);
2027 	if (r < 0)
2028 		return 2;	/* assume the worst */
2029 
2030 	return (!r && !is_dirty(cache, cblock)) ? 2 : 1;
2031 }
2032 
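/*
 * Fast-path map function.  The policy is consulted with can_migrate set
 * to false, so a lookup that would need data copied returns -EWOULDBLOCK
 * and the bio is deferred to the worker thread.  Flush, FUA and discard
 * bios are always deferred, and a bio for a block that is currently
 * migrating is held in the bio prison until the cell is released.
 */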
2033 static int cache_map(struct dm_target *ti, struct bio *bio)
2034 {
2035 	struct cache *cache = ti->private;
2036 
2037 	int r;
2038 	dm_oblock_t block = get_bio_block(cache, bio);
2039 	bool can_migrate = false;
2040 	bool discarded_block;
2041 	struct dm_bio_prison_cell *cell;
2042 	struct policy_result lookup_result;
2043 	struct per_bio_data *pb;
2044 
2045 	if (from_oblock(block) >= from_oblock(cache->origin_blocks)) {
2046 		/*
2047 		 * This can only occur if the io goes to a partial block at
2048 		 * the end of the origin device.  We don't cache these.
2049 		 * Just remap to the origin and carry on.
2050 		 */
2051 		remap_to_origin_clear_discard(cache, bio, block);
2052 		return DM_MAPIO_REMAPPED;
2053 	}
2054 
2055 	pb = init_per_bio_data(bio);
2056 
2057 	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2058 		defer_bio(cache, bio);
2059 		return DM_MAPIO_SUBMITTED;
2060 	}
2061 
2062 	/*
2063 	 * Check to see if that block is currently migrating.
2064 	 */
2065 	cell = alloc_prison_cell(cache);
2066 	if (!cell) {
2067 		defer_bio(cache, bio);
2068 		return DM_MAPIO_SUBMITTED;
2069 	}
2070 
2071 	r = bio_detain(cache, block, bio, cell,
2072 		       (cell_free_fn) free_prison_cell,
2073 		       cache, &cell);
2074 	if (r) {
2075 		if (r < 0)
2076 			defer_bio(cache, bio);
2077 
2078 		return DM_MAPIO_SUBMITTED;
2079 	}
2080 
2081 	discarded_block = is_discarded_oblock(cache, block);
2082 
2083 	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2084 		       bio, &lookup_result);
2085 	if (r == -EWOULDBLOCK) {
2086 		cell_defer(cache, cell, true);
2087 		return DM_MAPIO_SUBMITTED;
2088 
2089 	} else if (r) {
2090 		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2091 		bio_io_error(bio);
2092 		return DM_MAPIO_SUBMITTED;
2093 	}
2094 
2095 	switch (lookup_result.op) {
2096 	case POLICY_HIT:
2097 		inc_hit_counter(cache, bio);
2098 		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2099 
2100 		if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
2101 			/*
2102 			 * No need to mark anything dirty in write through mode.
2103 			 */
2104 			if (pb->req_nr == 0)
2105 				remap_to_cache(cache, bio, lookup_result.cblock);
			else
2106 				remap_to_origin_clear_discard(cache, bio, block);
2107 			cell_defer(cache, cell, false);
2108 		} else {
2109 			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2110 			cell_defer(cache, cell, false);
2111 		}
2112 		break;
2113 
2114 	case POLICY_MISS:
2115 		inc_miss_counter(cache, bio);
2116 		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2117 
2118 		if (pb->req_nr != 0) {
2119 			/*
2120 			 * This is a duplicate writethrough io that is no
2121 			 * longer needed because the block has been demoted.
2122 			 */
2123 			bio_endio(bio, 0);
2124 			cell_defer(cache, cell, false);
2125 			return DM_MAPIO_SUBMITTED;
2126 		} else {
2127 			remap_to_origin_clear_discard(cache, bio, block);
2128 			cell_defer(cache, cell, false);
2129 		}
2130 		break;
2131 
2132 	default:
2133 		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2134 			    (unsigned) lookup_result.op);
2135 		bio_io_error(bio);
2136 		return DM_MAPIO_SUBMITTED;
2137 	}
2138 
2139 	return DM_MAPIO_REMAPPED;
2140 }
2141 
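/*
 * At most one in-flight bio carries the "tick" (pb->tick is set when the
 * bio is remapped).  When it completes, advance the policy's notion of
 * time and arm need_tick_bio so that a later bio picks the tick up
 * again.
 */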
2142 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2143 {
2144 	struct cache *cache = ti->private;
2145 	unsigned long flags;
2146 	struct per_bio_data *pb = get_per_bio_data(bio);
2147 
2148 	if (pb->tick) {
2149 		policy_tick(cache->policy);
2150 
2151 		spin_lock_irqsave(&cache->lock, flags);
2152 		cache->need_tick_bio = true;
2153 		spin_unlock_irqrestore(&cache->lock, flags);
2154 	}
2155 
2156 	check_for_quiesced_migrations(cache, pb);
2157 
2158 	return 0;
2159 }
2160 
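/*
 * Metadata writeback helpers used at suspend time (see sync_metadata()
 * below): mirror the in-core dirty and discard bitsets and the policy's
 * per-block hints into the metadata device before committing.
 */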
2161 static int write_dirty_bitset(struct cache *cache)
2162 {
2163 	unsigned i;
	int r;
2164 
2165 	for (i = 0; i < from_cblock(cache->cache_size); i++) {
2166 		r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2167 				       is_dirty(cache, to_cblock(i)));
2168 		if (r)
2169 			return r;
2170 	}
2171 
2172 	return 0;
2173 }
2174 
2175 static int write_discard_bitset(struct cache *cache)
2176 {
2177 	unsigned i;
	int r;
2178 
2179 	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2180 					   cache->discard_nr_blocks);
2181 	if (r) {
2182 		DMERR("could not resize on-disk discard bitset");
2183 		return r;
2184 	}
2185 
2186 	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2187 		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2188 					 is_discarded(cache, to_dblock(i)));
2189 		if (r)
2190 			return r;
2191 	}
2192 
2193 	return 0;
2194 }
2195 
2196 static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
2197 		     uint32_t hint)
2198 {
2199 	struct cache *cache = context;
2200 	return dm_cache_save_hint(cache->cmd, cblock, hint);
2201 }
2202 
2203 static int write_hints(struct cache *cache)
2204 {
2205 	int r;
2206 
2207 	r = dm_cache_begin_hints(cache->cmd, cache->policy);
2208 	if (r) {
2209 		DMERR("dm_cache_begin_hints failed");
2210 		return r;
2211 	}
2212 
2213 	r = policy_walk_mappings(cache->policy, save_hint, cache);
2214 	if (r)
2215 		DMERR("policy_walk_mappings failed");
2216 
2217 	return r;
2218 }
2219 
2220 /*
2221  * returns true on success
2222  */
2223 static bool sync_metadata(struct cache *cache)
2224 {
2225 	int r1, r2, r3, r4;
2226 
2227 	r1 = write_dirty_bitset(cache);
2228 	if (r1)
2229 		DMERR("could not write dirty bitset");
2230 
2231 	r2 = write_discard_bitset(cache);
2232 	if (r2)
2233 		DMERR("could not write discard bitset");
2234 
2235 	save_stats(cache);
2236 
2237 	r3 = write_hints(cache);
2238 	if (r3)
2239 		DMERR("could not write hints");
2240 
2241 	/*
2242 	 * If writing the above metadata failed, we still commit, but don't
2243 	 * set the clean shutdown flag.  This will effectively force every
2244 	 * dirty bit to be set on reload.
2245 	 */
2246 	r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2247 	if (r4)
2248 		DMERR("could not write cache metadata.  Data loss may occur.");
2249 
2250 	return !r1 && !r2 && !r3 && !r4;
2251 }
2252 
2253 static void cache_postsuspend(struct dm_target *ti)
2254 {
2255 	struct cache *cache = ti->private;
2256 
2257 	start_quiescing(cache);
2258 	wait_for_migrations(cache);
2259 	stop_worker(cache);
2260 	requeue_deferred_io(cache);
2261 	stop_quiescing(cache);
2262 
2263 	(void) sync_metadata(cache);
2264 }
2265 
2266 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2267 			bool dirty, uint32_t hint, bool hint_valid)
2268 {
2269 	int r;
2270 	struct cache *cache = context;
2271 
2272 	r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2273 	if (r)
2274 		return r;
2275 
2276 	if (dirty)
2277 		set_dirty(cache, oblock, cblock);
2278 	else
2279 		clear_dirty(cache, oblock, cblock);
2280 
2281 	return 0;
2282 }
2283 
2284 static int load_discard(void *context, sector_t discard_block_size,
2285 			dm_dblock_t dblock, bool discard)
2286 {
2287 	struct cache *cache = context;
2288 
2289 	/* FIXME: handle mis-matched block size */
2290 
2291 	if (discard)
2292 		set_discard(cache, dblock);
2293 	else
2294 		clear_discard(cache, dblock);
2295 
2296 	return 0;
2297 }
2298 
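/*
 * Runs before the target is resumed, while no I/O is in flight: pick up
 * any change in the size of the cache device, and on first resume load
 * the mappings and discard state from the metadata device.
 */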
2299 static int cache_preresume(struct dm_target *ti)
2300 {
2301 	int r = 0;
2302 	struct cache *cache = ti->private;
2303 	sector_t actual_cache_size = get_dev_size(cache->cache_dev);
2304 	(void) sector_div(actual_cache_size, cache->sectors_per_block);
2305 
2306 	/*
2307 	 * Check to see if the cache has resized.
2308 	 */
2309 	if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
2310 		cache->cache_size = to_cblock(actual_cache_size);
2311 
2312 		r = dm_cache_resize(cache->cmd, cache->cache_size);
2313 		if (r) {
2314 			DMERR("could not resize cache metadata");
2315 			return r;
2316 		}
2317 
2318 		cache->sized = true;
2319 	}
2320 
2321 	if (!cache->loaded_mappings) {
2322 		r = dm_cache_load_mappings(cache->cmd,
2323 					   dm_cache_policy_get_name(cache->policy),
2324 					   load_mapping, cache);
2325 		if (r) {
2326 			DMERR("could not load cache mappings");
2327 			return r;
2328 		}
2329 
2330 		cache->loaded_mappings = true;
2331 	}
2332 
2333 	if (!cache->loaded_discards) {
2334 		r = dm_cache_load_discards(cache->cmd, load_discard, cache);
2335 		if (r) {
2336 			DMERR("could not load origin discards");
2337 			return r;
2338 		}
2339 
2340 		cache->loaded_discards = true;
2341 	}
2342 
2343 	return r;
2344 }
2345 
2346 static void cache_resume(struct dm_target *ti)
2347 {
2348 	struct cache *cache = ti->private;
2349 
2350 	cache->need_tick_bio = true;
2351 	do_waker(&cache->waker.work);
2352 }
2353 
2354 /*
2355  * Status format:
2356  *
2357  * <#used metadata blocks>/<#total metadata blocks>
2358  * <#read hits> <#read misses> <#write hits> <#write misses>
2359  * <#demotions> <#promotions> <#blocks in cache> <#dirty>
2360  * <#features> <features>*
2361  * <#core args> <core args>
2362  * <#policy args> <policy args>*
2363  */
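/*
 * Illustrative example of the STATUSTYPE_INFO output (hypothetical
 * values; the trailing policy arguments depend on the policy in use):
 *
 *   17/4096 2821 24 5 353 23 1 1536 12 1 writethrough 2 migration_threshold 2048 4 random_threshold 4 sequential_threshold 512
 */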
2364 static void cache_status(struct dm_target *ti, status_type_t type,
2365 			 unsigned status_flags, char *result, unsigned maxlen)
2366 {
2367 	int r = 0;
2368 	unsigned i;
2369 	ssize_t sz = 0;
2370 	dm_block_t nr_free_blocks_metadata = 0;
2371 	dm_block_t nr_blocks_metadata = 0;
2372 	char buf[BDEVNAME_SIZE];
2373 	struct cache *cache = ti->private;
2374 	dm_cblock_t residency;
2375 
2376 	switch (type) {
2377 	case STATUSTYPE_INFO:
2378 		/* Commit to ensure statistics aren't out-of-date */
2379 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
2380 			r = dm_cache_commit(cache->cmd, false);
2381 			if (r)
2382 				DMERR("could not commit metadata for accurate status");
2383 		}
2384 
2385 		r = dm_cache_get_free_metadata_block_count(cache->cmd,
2386 							   &nr_free_blocks_metadata);
2387 		if (r) {
2388 			DMERR("could not get metadata free block count");
2389 			goto err;
2390 		}
2391 
2392 		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
2393 		if (r) {
2394 			DMERR("could not get metadata device size");
2395 			goto err;
2396 		}
2397 
2398 		residency = policy_residency(cache->policy);
2399 
2400 		DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
2401 		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2402 		       (unsigned long long)nr_blocks_metadata,
2403 		       (unsigned) atomic_read(&cache->stats.read_hit),
2404 		       (unsigned) atomic_read(&cache->stats.read_miss),
2405 		       (unsigned) atomic_read(&cache->stats.write_hit),
2406 		       (unsigned) atomic_read(&cache->stats.write_miss),
2407 		       (unsigned) atomic_read(&cache->stats.demotion),
2408 		       (unsigned) atomic_read(&cache->stats.promotion),
2409 		       (unsigned long long) from_cblock(residency),
2410 		       cache->nr_dirty);
2411 
2412 		if (cache->features.write_through)
2413 			DMEMIT("1 writethrough ");
2414 		else
2415 			DMEMIT("0 ");
2416 
2417 		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2418 		if (sz < maxlen) {
2419 			r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2420 			if (r)
2421 				DMERR("policy_emit_config_values returned %d", r);
2422 		}
2423 
2424 		break;
2425 
2426 	case STATUSTYPE_TABLE:
2427 		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
2428 		DMEMIT("%s ", buf);
2429 		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
2430 		DMEMIT("%s ", buf);
2431 		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
2432 		DMEMIT("%s", buf);
2433 
2434 		for (i = 0; i < cache->nr_ctr_args - 1; i++)
2435 			DMEMIT(" %s", cache->ctr_args[i]);
2436 		if (cache->nr_ctr_args)
2437 			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
2438 	}
2439 
2440 	return;
2441 
2442 err:
2443 	DMEMIT("Error");
2444 }
2445 
2446 #define NOT_CORE_OPTION 1
2447 
2448 static int process_config_option(struct cache *cache, char **argv)
2449 {
2450 	unsigned long tmp;
2451 
2452 	if (!strcasecmp(argv[0], "migration_threshold")) {
2453 		if (kstrtoul(argv[1], 10, &tmp))
2454 			return -EINVAL;
2455 
2456 		cache->migration_threshold = tmp;
2457 		return 0;
2458 	}
2459 
2460 	return NOT_CORE_OPTION;
2461 }
2462 
2463 /*
2464  * Supports <key> <value>.
2465  *
2466  * The key migration_threshold is supported by the cache target core.
2467  */
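/*
 * For example (hypothetical device name and value):
 *
 *   dmsetup message my-cache 0 migration_threshold 2048
 *
 * Keys the core does not recognise are handed to the policy via
 * policy_set_config_value().
 */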
2468 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
2469 {
2470 	int r;
2471 	struct cache *cache = ti->private;
2472 
2473 	if (argc != 2)
2474 		return -EINVAL;
2475 
2476 	r = process_config_option(cache, argv);
2477 	if (r == NOT_CORE_OPTION)
2478 		return policy_set_config_value(cache->policy, argv[0], argv[1]);
2479 
2480 	return r;
2481 }
2482 
2483 static int cache_iterate_devices(struct dm_target *ti,
2484 				 iterate_devices_callout_fn fn, void *data)
2485 {
2486 	int r = 0;
2487 	struct cache *cache = ti->private;
2488 
2489 	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
2490 	if (!r)
2491 		r = fn(ti, cache->origin_dev, 0, ti->len, data);
2492 
2493 	return r;
2494 }
2495 
2496 /*
2497  * We assume I/O is going to the origin (which is the volume
2498  * more likely to have restrictions e.g. by being striped).
2499  * (Looking up the exact location of the data would be expensive
2500  * and could always be out of date by the time the bio is submitted.)
2501  */
2502 static int cache_bvec_merge(struct dm_target *ti,
2503 			    struct bvec_merge_data *bvm,
2504 			    struct bio_vec *biovec, int max_size)
2505 {
2506 	struct cache *cache = ti->private;
2507 	struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
2508 
2509 	if (!q->merge_bvec_fn)
2510 		return max_size;
2511 
2512 	bvm->bi_bdev = cache->origin_dev->bdev;
2513 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2514 }
2515 
2516 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
2517 {
2518 	/*
2519 	 * FIXME: these limits may be incompatible with the cache device
2520 	 */
2521 	limits->max_discard_sectors = cache->discard_block_size * 1024;
2522 	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
2523 }
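/*
 * Worked example (illustrative): with a discard_block_size of 1024
 * sectors, the limits above are max_discard_sectors = 1024 * 1024
 * sectors (512 MiB) and discard_granularity = 1024 << SECTOR_SHIFT =
 * 524288 bytes (512 KiB).  Note max_discard_sectors is in 512-byte
 * sectors while discard_granularity is in bytes.
 */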
2524 
2525 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
2526 {
2527 	struct cache *cache = ti->private;
2528 
2529 	blk_limits_io_min(limits, 0);
2530 	blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
2531 	set_discard_limits(cache, limits);
2532 }
2533 
2534 /*----------------------------------------------------------------*/
2535 
2536 static struct target_type cache_target = {
2537 	.name = "cache",
2538 	.version = {1, 0, 0},
2539 	.module = THIS_MODULE,
2540 	.ctr = cache_ctr,
2541 	.dtr = cache_dtr,
2542 	.map = cache_map,
2543 	.end_io = cache_end_io,
2544 	.postsuspend = cache_postsuspend,
2545 	.preresume = cache_preresume,
2546 	.resume = cache_resume,
2547 	.status = cache_status,
2548 	.message = cache_message,
2549 	.iterate_devices = cache_iterate_devices,
2550 	.merge = cache_bvec_merge,
2551 	.io_hints = cache_io_hints,
2552 };
2553 
2554 static int __init dm_cache_init(void)
2555 {
2556 	int r;
2557 
2558 	r = dm_register_target(&cache_target);
2559 	if (r) {
2560 		DMERR("cache target registration failed: %d", r);
2561 		return r;
2562 	}
2563 
2564 	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
2565 	if (!migration_cache) {
2566 		dm_unregister_target(&cache_target);
2567 		return -ENOMEM;
2568 	}
2569 
2570 	return 0;
2571 }
2572 
2573 static void __exit dm_cache_exit(void)
2574 {
2575 	dm_unregister_target(&cache_target);
2576 	kmem_cache_destroy(migration_cache);
2577 }
2578 
2579 module_init(dm_cache_init);
2580 module_exit(dm_cache_exit);
2581 
2582 MODULE_DESCRIPTION(DM_NAME " cache target");
2583 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
2584 MODULE_LICENSE("GPL");
2585