xref: /openbmc/linux/drivers/md/dm-cache-target.c (revision 4949009e)
1 /*
2  * Copyright (C) 2012 Red Hat. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6 
7 #include "dm.h"
8 #include "dm-bio-prison.h"
9 #include "dm-bio-record.h"
10 #include "dm-cache-metadata.h"
11 
12 #include <linux/dm-io.h>
13 #include <linux/dm-kcopyd.h>
14 #include <linux/init.h>
15 #include <linux/mempool.h>
16 #include <linux/module.h>
17 #include <linux/slab.h>
18 #include <linux/vmalloc.h>
19 
20 #define DM_MSG_PREFIX "cache"
21 
22 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
23 	"A percentage of time allocated for copying to and/or from cache");
24 
25 /*----------------------------------------------------------------*/
26 
27 /*
28  * Glossary:
29  *
30  * oblock: index of an origin block
31  * cblock: index of a cache block
32  * promotion: movement of a block from origin to cache
33  * demotion: movement of a block from cache to origin
34  * migration: movement of a block between the origin and cache device,
35  *	      either direction
36  */
37 
38 /*----------------------------------------------------------------*/
39 
40 static size_t bitset_size_in_bytes(unsigned nr_entries)
41 {
42 	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
43 }
44 
45 static unsigned long *alloc_bitset(unsigned nr_entries)
46 {
47 	size_t s = bitset_size_in_bytes(nr_entries);
48 	return vzalloc(s);
49 }
50 
51 static void clear_bitset(void *bitset, unsigned nr_entries)
52 {
53 	size_t s = bitset_size_in_bytes(nr_entries);
54 	memset(bitset, 0, s);
55 }
56 
57 static void free_bitset(unsigned long *bits)
58 {
59 	vfree(bits);
60 }
61 
62 /*----------------------------------------------------------------*/
63 
64 /*
65  * There are a couple of places where we let a bio run, but want to do some
66  * work before calling its endio function.  We do this by temporarily
67  * changing the endio fn.
68  */
69 struct dm_hook_info {
70 	bio_end_io_t *bi_end_io;
71 	void *bi_private;
72 };
73 
74 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
75 			bio_end_io_t *bi_end_io, void *bi_private)
76 {
77 	h->bi_end_io = bio->bi_end_io;
78 	h->bi_private = bio->bi_private;
79 
80 	bio->bi_end_io = bi_end_io;
81 	bio->bi_private = bi_private;
82 }
83 
84 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
85 {
86 	bio->bi_end_io = h->bi_end_io;
87 	bio->bi_private = h->bi_private;
88 
89 	/*
90 	 * Must bump bi_remaining to allow bio to complete with
91 	 * restored bi_end_io.
92 	 */
93 	atomic_inc(&bio->bi_remaining);
94 }
95 
96 /*----------------------------------------------------------------*/
97 
98 #define MIGRATION_POOL_SIZE 128
99 #define COMMIT_PERIOD HZ
100 #define MIGRATION_COUNT_WINDOW 10
101 
102 /*
103  * The block size of the device holding cache data must be
104  * between 32KB and 1GB.
105  */
106 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
107 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
108 
109 /*
110  * FIXME: the cache is read/write for the time being.
111  */
112 enum cache_metadata_mode {
113 	CM_WRITE,		/* metadata may be changed */
114 	CM_READ_ONLY,		/* metadata may not be changed */
115 };
116 
117 enum cache_io_mode {
118 	/*
119 	 * Data is written to cached blocks only.  These blocks are marked
120 	 * dirty.  If you lose the cache device you will lose data.
121 	 * Potential performance increase for both reads and writes.
122 	 */
123 	CM_IO_WRITEBACK,
124 
125 	/*
126 	 * Data is written to both cache and origin.  Blocks are never
127  * dirty.  Potential performance benefit for reads only.
128 	 */
129 	CM_IO_WRITETHROUGH,
130 
131 	/*
132 	 * A degraded mode useful for various cache coherency situations
133 	 * (eg, rolling back snapshots).  Reads and writes always go to the
134 	 * origin.  If a write goes to a cached oblock, then the cache
135 	 * block is invalidated.
136 	 */
137 	CM_IO_PASSTHROUGH
138 };
139 
140 struct cache_features {
141 	enum cache_metadata_mode mode;
142 	enum cache_io_mode io_mode;
143 };
144 
145 struct cache_stats {
146 	atomic_t read_hit;
147 	atomic_t read_miss;
148 	atomic_t write_hit;
149 	atomic_t write_miss;
150 	atomic_t demotion;
151 	atomic_t promotion;
152 	atomic_t copies_avoided;
153 	atomic_t cache_cell_clash;
154 	atomic_t commit_count;
155 	atomic_t discard_count;
156 };
157 
158 /*
159  * Defines a range of cblocks: blocks begin to (end - 1) are in the range;
160  * end is the one-past-the-end value.
161  */
162 struct cblock_range {
163 	dm_cblock_t begin;
164 	dm_cblock_t end;
165 };
166 
167 struct invalidation_request {
168 	struct list_head list;
169 	struct cblock_range *cblocks;
170 
171 	atomic_t complete;
172 	int err;
173 
174 	wait_queue_head_t result_wait;
175 };
176 
177 struct cache {
178 	struct dm_target *ti;
179 	struct dm_target_callbacks callbacks;
180 
181 	struct dm_cache_metadata *cmd;
182 
183 	/*
184 	 * Metadata is written to this device.
185 	 */
186 	struct dm_dev *metadata_dev;
187 
188 	/*
189 	 * The slower of the two data devices.  Typically a spindle.
190 	 */
191 	struct dm_dev *origin_dev;
192 
193 	/*
194 	 * The faster of the two data devices.  Typically an SSD.
195 	 */
196 	struct dm_dev *cache_dev;
197 
198 	/*
199 	 * Size of the origin device in _complete_ blocks and native sectors.
200 	 */
201 	dm_oblock_t origin_blocks;
202 	sector_t origin_sectors;
203 
204 	/*
205 	 * Size of the cache device in blocks.
206 	 */
207 	dm_cblock_t cache_size;
208 
209 	/*
210 	 * Fields for converting from sectors to blocks.
211 	 */
212 	uint32_t sectors_per_block;
213 	int sectors_per_block_shift;
214 
215 	spinlock_t lock;
216 	struct bio_list deferred_bios;
217 	struct bio_list deferred_flush_bios;
218 	struct bio_list deferred_writethrough_bios;
219 	struct list_head quiesced_migrations;
220 	struct list_head completed_migrations;
221 	struct list_head need_commit_migrations;
222 	sector_t migration_threshold;
223 	wait_queue_head_t migration_wait;
224 	atomic_t nr_allocated_migrations;
225 
226 	/*
227  * The number of in-flight migrations that are performing
228  * background io, e.g. promotion or writeback.
229 	 */
230 	atomic_t nr_io_migrations;
231 
232 	wait_queue_head_t quiescing_wait;
233 	atomic_t quiescing;
234 	atomic_t quiescing_ack;
235 
236 	/*
237 	 * cache_size entries, dirty if set
238 	 */
239 	atomic_t nr_dirty;
240 	unsigned long *dirty_bitset;
241 
242 	/*
243 	 * origin_blocks entries, discarded if set.
244 	 */
245 	dm_dblock_t discard_nr_blocks;
246 	unsigned long *discard_bitset;
247 	uint32_t discard_block_size; /* a power of 2 times sectors per block */
248 
249 	/*
250  * Rather than reconstructing the table line for status output, we just
251  * save it and regurgitate it.
252 	 */
253 	unsigned nr_ctr_args;
254 	const char **ctr_args;
255 
256 	struct dm_kcopyd_client *copier;
257 	struct workqueue_struct *wq;
258 	struct work_struct worker;
259 
260 	struct delayed_work waker;
261 	unsigned long last_commit_jiffies;
262 
263 	struct dm_bio_prison *prison;
264 	struct dm_deferred_set *all_io_ds;
265 
266 	mempool_t *migration_pool;
267 
268 	struct dm_cache_policy *policy;
269 	unsigned policy_nr_args;
270 
271 	bool need_tick_bio:1;
272 	bool sized:1;
273 	bool invalidate:1;
274 	bool commit_requested:1;
275 	bool loaded_mappings:1;
276 	bool loaded_discards:1;
277 
278 	/*
279 	 * Cache features such as write-through.
280 	 */
281 	struct cache_features features;
282 
283 	struct cache_stats stats;
284 
285 	/*
286 	 * Invalidation fields.
287 	 */
288 	spinlock_t invalidation_lock;
289 	struct list_head invalidation_requests;
290 };
291 
292 struct per_bio_data {
293 	bool tick:1;
294 	unsigned req_nr:2;
295 	struct dm_deferred_entry *all_io_entry;
296 	struct dm_hook_info hook_info;
297 
298 	/*
299  * writethrough fields.  These MUST remain at the end of this
300  * structure and the 'cache' member must be the first of them, as its
301  * offset determines the size used when the writethrough fields are left out.
302 	 */
303 	struct cache *cache;
304 	dm_cblock_t cblock;
305 	struct dm_bio_details bio_details;
306 };
307 
308 struct dm_cache_migration {
309 	struct list_head list;
310 	struct cache *cache;
311 
312 	unsigned long start_jiffies;
313 	dm_oblock_t old_oblock;
314 	dm_oblock_t new_oblock;
315 	dm_cblock_t cblock;
316 
317 	bool err:1;
318 	bool discard:1;
319 	bool writeback:1;
320 	bool demote:1;
321 	bool promote:1;
322 	bool requeue_holder:1;
323 	bool invalidate:1;
324 
325 	struct dm_bio_prison_cell *old_ocell;
326 	struct dm_bio_prison_cell *new_ocell;
327 };
328 
329 /*
330  * Processing a bio in the worker thread may require these memory
331  * allocations.  We prealloc to avoid deadlocks (the same worker thread
332  * frees them back to the mempool).
333  */
334 struct prealloc {
335 	struct dm_cache_migration *mg;
336 	struct dm_bio_prison_cell *cell1;
337 	struct dm_bio_prison_cell *cell2;
338 };
339 
340 static void wake_worker(struct cache *cache)
341 {
342 	queue_work(cache->wq, &cache->worker);
343 }
344 
345 /*----------------------------------------------------------------*/
346 
347 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
348 {
349 	/* FIXME: change to use a local slab. */
350 	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
351 }
352 
353 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
354 {
355 	dm_bio_prison_free_cell(cache->prison, cell);
356 }
357 
358 static struct dm_cache_migration *alloc_migration(struct cache *cache)
359 {
360 	struct dm_cache_migration *mg;
361 
362 	mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
363 	if (mg) {
364 		mg->cache = cache;
365 		atomic_inc(&mg->cache->nr_allocated_migrations);
366 	}
367 
368 	return mg;
369 }
370 
371 static void free_migration(struct dm_cache_migration *mg)
372 {
373 	if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations))
374 		wake_up(&mg->cache->migration_wait);
375 
376 	mempool_free(mg, mg->cache->migration_pool);
377 }
378 
379 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
380 {
381 	if (!p->mg) {
382 		p->mg = alloc_migration(cache);
383 		if (!p->mg)
384 			return -ENOMEM;
385 	}
386 
387 	if (!p->cell1) {
388 		p->cell1 = alloc_prison_cell(cache);
389 		if (!p->cell1)
390 			return -ENOMEM;
391 	}
392 
393 	if (!p->cell2) {
394 		p->cell2 = alloc_prison_cell(cache);
395 		if (!p->cell2)
396 			return -ENOMEM;
397 	}
398 
399 	return 0;
400 }
401 
402 static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
403 {
404 	if (p->cell2)
405 		free_prison_cell(cache, p->cell2);
406 
407 	if (p->cell1)
408 		free_prison_cell(cache, p->cell1);
409 
410 	if (p->mg)
411 		free_migration(p->mg);
412 }
413 
414 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
415 {
416 	struct dm_cache_migration *mg = p->mg;
417 
418 	BUG_ON(!mg);
419 	p->mg = NULL;
420 
421 	return mg;
422 }
423 
424 /*
425  * You must have a cell within the prealloc struct to return.  If not, this
426  * function will BUG() rather than return NULL.
427  */
428 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
429 {
430 	struct dm_bio_prison_cell *r = NULL;
431 
432 	if (p->cell1) {
433 		r = p->cell1;
434 		p->cell1 = NULL;
435 
436 	} else if (p->cell2) {
437 		r = p->cell2;
438 		p->cell2 = NULL;
439 	} else
440 		BUG();
441 
442 	return r;
443 }
444 
445 /*
446  * You can't have more than two cells in a prealloc struct.  BUG() will be
447  * called if you try to overfill.
448  */
449 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
450 {
451 	if (!p->cell2)
452 		p->cell2 = cell;
453 
454 	else if (!p->cell1)
455 		p->cell1 = cell;
456 
457 	else
458 		BUG();
459 }
460 
461 /*----------------------------------------------------------------*/
462 
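/*
 * Build a bio prison key covering the half-open origin block range
 * [begin, end).
 */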
463 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
464 {
465 	key->virtual = 0;
466 	key->dev = 0;
467 	key->block_begin = from_oblock(begin);
468 	key->block_end = from_oblock(end);
469 }
470 
471 /*
472  * The caller hands in a preallocated cell, and a free function for it.
473  * The cell will be freed if there's an error, or if it wasn't used because
474  * a cell with that key already exists.
475  */
476 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
477 
478 static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
479 			    struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
480 			    cell_free_fn free_fn, void *free_context,
481 			    struct dm_bio_prison_cell **cell_result)
482 {
483 	int r;
484 	struct dm_cell_key key;
485 
486 	build_key(oblock_begin, oblock_end, &key);
487 	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
488 	if (r)
489 		free_fn(free_context, cell_prealloc);
490 
491 	return r;
492 }
493 
494 static int bio_detain(struct cache *cache, dm_oblock_t oblock,
495 		      struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
496 		      cell_free_fn free_fn, void *free_context,
497 		      struct dm_bio_prison_cell **cell_result)
498 {
499 	dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
500 	return bio_detain_range(cache, oblock, end, bio,
501 				cell_prealloc, free_fn, free_context, cell_result);
502 }
503 
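/*
 * Lock a single origin block in the bio prison without an accompanying bio;
 * used when scheduling writeback of a dirty block.
 */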
504 static int get_cell(struct cache *cache,
505 		    dm_oblock_t oblock,
506 		    struct prealloc *structs,
507 		    struct dm_bio_prison_cell **cell_result)
508 {
509 	int r;
510 	struct dm_cell_key key;
511 	struct dm_bio_prison_cell *cell_prealloc;
512 
513 	cell_prealloc = prealloc_get_cell(structs);
514 
515 	build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
516 	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
517 	if (r)
518 		prealloc_put_cell(structs, cell_prealloc);
519 
520 	return r;
521 }
522 
523 /*----------------------------------------------------------------*/
524 
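/*
 * The dirty bitset has one bit per cache block, set if the cached copy is
 * newer than the origin.  set_dirty()/clear_dirty() keep nr_dirty and the
 * policy in sync with the bitset, and a dm table event is raised when the
 * last dirty block is cleaned.
 */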
525 static bool is_dirty(struct cache *cache, dm_cblock_t b)
526 {
527 	return test_bit(from_cblock(b), cache->dirty_bitset);
528 }
529 
530 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
531 {
532 	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
533 		atomic_inc(&cache->nr_dirty);
534 		policy_set_dirty(cache->policy, oblock);
535 	}
536 }
537 
538 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
539 {
540 	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
541 		policy_clear_dirty(cache->policy, oblock);
542 		if (atomic_dec_return(&cache->nr_dirty) == 0)
543 			dm_table_event(cache->ti->table);
544 	}
545 }
546 
547 /*----------------------------------------------------------------*/
548 
549 static bool block_size_is_power_of_two(struct cache *cache)
550 {
551 	return cache->sectors_per_block_shift >= 0;
552 }
553 
554 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
555 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
556 __always_inline
557 #endif
558 static dm_block_t block_div(dm_block_t b, uint32_t n)
559 {
560 	do_div(b, n);
561 
562 	return b;
563 }
564 
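/*
 * Number of origin blocks that make up one discard block
 * (discard_block_size is expressed in sectors).
 */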
565 static dm_block_t oblocks_per_dblock(struct cache *cache)
566 {
567 	dm_block_t oblocks = cache->discard_block_size;
568 
569 	if (block_size_is_power_of_two(cache))
570 		oblocks >>= cache->sectors_per_block_shift;
571 	else
572 		oblocks = block_div(oblocks, cache->sectors_per_block);
573 
574 	return oblocks;
575 }
576 
577 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
578 {
579 	return to_dblock(block_div(from_oblock(oblock),
580 				   oblocks_per_dblock(cache)));
581 }
582 
583 static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
584 {
585 	return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
586 }
587 
588 static void set_discard(struct cache *cache, dm_dblock_t b)
589 {
590 	unsigned long flags;
591 
592 	BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
593 	atomic_inc(&cache->stats.discard_count);
594 
595 	spin_lock_irqsave(&cache->lock, flags);
596 	set_bit(from_dblock(b), cache->discard_bitset);
597 	spin_unlock_irqrestore(&cache->lock, flags);
598 }
599 
600 static void clear_discard(struct cache *cache, dm_dblock_t b)
601 {
602 	unsigned long flags;
603 
604 	spin_lock_irqsave(&cache->lock, flags);
605 	clear_bit(from_dblock(b), cache->discard_bitset);
606 	spin_unlock_irqrestore(&cache->lock, flags);
607 }
608 
609 static bool is_discarded(struct cache *cache, dm_dblock_t b)
610 {
611 	int r;
612 	unsigned long flags;
613 
614 	spin_lock_irqsave(&cache->lock, flags);
615 	r = test_bit(from_dblock(b), cache->discard_bitset);
616 	spin_unlock_irqrestore(&cache->lock, flags);
617 
618 	return r;
619 }
620 
621 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
622 {
623 	int r;
624 	unsigned long flags;
625 
626 	spin_lock_irqsave(&cache->lock, flags);
627 	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
628 		     cache->discard_bitset);
629 	spin_unlock_irqrestore(&cache->lock, flags);
630 
631 	return r;
632 }
633 
634 /*----------------------------------------------------------------*/
635 
636 static void load_stats(struct cache *cache)
637 {
638 	struct dm_cache_statistics stats;
639 
640 	dm_cache_metadata_get_stats(cache->cmd, &stats);
641 	atomic_set(&cache->stats.read_hit, stats.read_hits);
642 	atomic_set(&cache->stats.read_miss, stats.read_misses);
643 	atomic_set(&cache->stats.write_hit, stats.write_hits);
644 	atomic_set(&cache->stats.write_miss, stats.write_misses);
645 }
646 
647 static void save_stats(struct cache *cache)
648 {
649 	struct dm_cache_statistics stats;
650 
651 	stats.read_hits = atomic_read(&cache->stats.read_hit);
652 	stats.read_misses = atomic_read(&cache->stats.read_miss);
653 	stats.write_hits = atomic_read(&cache->stats.write_hit);
654 	stats.write_misses = atomic_read(&cache->stats.write_miss);
655 
656 	dm_cache_metadata_set_stats(cache->cmd, &stats);
657 }
658 
659 /*----------------------------------------------------------------
660  * Per bio data
661  *--------------------------------------------------------------*/
662 
663 /*
664  * If using writeback, leave out struct per_bio_data's writethrough fields.
665  */
666 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
667 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
668 
669 static bool writethrough_mode(struct cache_features *f)
670 {
671 	return f->io_mode == CM_IO_WRITETHROUGH;
672 }
673 
674 static bool writeback_mode(struct cache_features *f)
675 {
676 	return f->io_mode == CM_IO_WRITEBACK;
677 }
678 
679 static bool passthrough_mode(struct cache_features *f)
680 {
681 	return f->io_mode == CM_IO_PASSTHROUGH;
682 }
683 
684 static size_t get_per_bio_data_size(struct cache *cache)
685 {
686 	return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
687 }
688 
689 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
690 {
691 	struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
692 	BUG_ON(!pb);
693 	return pb;
694 }
695 
696 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
697 {
698 	struct per_bio_data *pb = get_per_bio_data(bio, data_size);
699 
700 	pb->tick = false;
701 	pb->req_nr = dm_bio_get_target_bio_nr(bio);
702 	pb->all_io_entry = NULL;
703 
704 	return pb;
705 }
706 
707 /*----------------------------------------------------------------
708  * Remapping
709  *--------------------------------------------------------------*/
710 static void remap_to_origin(struct cache *cache, struct bio *bio)
711 {
712 	bio->bi_bdev = cache->origin_dev->bdev;
713 }
714 
715 static void remap_to_cache(struct cache *cache, struct bio *bio,
716 			   dm_cblock_t cblock)
717 {
718 	sector_t bi_sector = bio->bi_iter.bi_sector;
719 	sector_t block = from_cblock(cblock);
720 
721 	bio->bi_bdev = cache->cache_dev->bdev;
722 	if (!block_size_is_power_of_two(cache))
723 		bio->bi_iter.bi_sector =
724 			(block * cache->sectors_per_block) +
725 			sector_div(bi_sector, cache->sectors_per_block);
726 	else
727 		bio->bi_iter.bi_sector =
728 			(block << cache->sectors_per_block_shift) |
729 			(bi_sector & (cache->sectors_per_block - 1));
730 }
731 
732 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
733 {
734 	unsigned long flags;
735 	size_t pb_data_size = get_per_bio_data_size(cache);
736 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
737 
738 	spin_lock_irqsave(&cache->lock, flags);
739 	if (cache->need_tick_bio &&
740 	    !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
741 		pb->tick = true;
742 		cache->need_tick_bio = false;
743 	}
744 	spin_unlock_irqrestore(&cache->lock, flags);
745 }
746 
747 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
748 				  dm_oblock_t oblock)
749 {
750 	check_if_tick_bio_needed(cache, bio);
751 	remap_to_origin(cache, bio);
752 	if (bio_data_dir(bio) == WRITE)
753 		clear_discard(cache, oblock_to_dblock(cache, oblock));
754 }
755 
756 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
757 				 dm_oblock_t oblock, dm_cblock_t cblock)
758 {
759 	check_if_tick_bio_needed(cache, bio);
760 	remap_to_cache(cache, bio, cblock);
761 	if (bio_data_dir(bio) == WRITE) {
762 		set_dirty(cache, oblock, cblock);
763 		clear_discard(cache, oblock_to_dblock(cache, oblock));
764 	}
765 }
766 
767 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
768 {
769 	sector_t block_nr = bio->bi_iter.bi_sector;
770 
771 	if (!block_size_is_power_of_two(cache))
772 		(void) sector_div(block_nr, cache->sectors_per_block);
773 	else
774 		block_nr >>= cache->sectors_per_block_shift;
775 
776 	return to_oblock(block_nr);
777 }
778 
779 static int bio_triggers_commit(struct cache *cache, struct bio *bio)
780 {
781 	return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
782 }
783 
784 /*
785  * You must increment the deferred set whilst the prison cell is held.  To
786  * encourage this, we ask for 'cell' to be passed in.
787  */
788 static void inc_ds(struct cache *cache, struct bio *bio,
789 		   struct dm_bio_prison_cell *cell)
790 {
791 	size_t pb_data_size = get_per_bio_data_size(cache);
792 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
793 
794 	BUG_ON(!cell);
795 	BUG_ON(pb->all_io_entry);
796 
797 	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
798 }
799 
800 static void issue(struct cache *cache, struct bio *bio)
801 {
802 	unsigned long flags;
803 
804 	if (!bio_triggers_commit(cache, bio)) {
805 		generic_make_request(bio);
806 		return;
807 	}
808 
809 	/*
810 	 * Batch together any bios that trigger commits and then issue a
811 	 * single commit for them in do_worker().
812 	 */
813 	spin_lock_irqsave(&cache->lock, flags);
814 	cache->commit_requested = true;
815 	bio_list_add(&cache->deferred_flush_bios, bio);
816 	spin_unlock_irqrestore(&cache->lock, flags);
817 }
818 
819 static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
820 {
821 	inc_ds(cache, bio, cell);
822 	issue(cache, bio);
823 }
824 
825 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
826 {
827 	unsigned long flags;
828 
829 	spin_lock_irqsave(&cache->lock, flags);
830 	bio_list_add(&cache->deferred_writethrough_bios, bio);
831 	spin_unlock_irqrestore(&cache->lock, flags);
832 
833 	wake_worker(cache);
834 }
835 
836 static void writethrough_endio(struct bio *bio, int err)
837 {
838 	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
839 
840 	dm_unhook_bio(&pb->hook_info, bio);
841 
842 	if (err) {
843 		bio_endio(bio, err);
844 		return;
845 	}
846 
847 	dm_bio_restore(&pb->bio_details, bio);
848 	remap_to_cache(pb->cache, bio, pb->cblock);
849 
850 	/*
851 	 * We can't issue this bio directly, since we're in interrupt
852 	 * context.  So it gets put on a bio list for processing by the
853 	 * worker thread.
854 	 */
855 	defer_writethrough_bio(pb->cache, bio);
856 }
857 
858 /*
859  * When running in writethrough mode we need to send writes to clean blocks
860  * to both the cache and origin devices.  In the future we'd like to clone the
861  * bio and send the copies in parallel, but for now we issue them in
862  * series as this is easier.
863  */
864 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
865 				       dm_oblock_t oblock, dm_cblock_t cblock)
866 {
867 	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
868 
869 	pb->cache = cache;
870 	pb->cblock = cblock;
871 	dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
872 	dm_bio_record(&pb->bio_details, bio);
873 
874 	remap_to_origin_clear_discard(pb->cache, bio, oblock);
875 }
876 
877 /*----------------------------------------------------------------
878  * Migration processing
879  *
880  * Migration covers moving data from the origin device to the cache, or
881  * vice versa.
882  *--------------------------------------------------------------*/
883 static void inc_io_migrations(struct cache *cache)
884 {
885 	atomic_inc(&cache->nr_io_migrations);
886 }
887 
888 static void dec_io_migrations(struct cache *cache)
889 {
890 	atomic_dec(&cache->nr_io_migrations);
891 }
892 
893 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
894 			 bool holder)
895 {
896 	(holder ? dm_cell_release : dm_cell_release_no_holder)
897 		(cache->prison, cell, &cache->deferred_bios);
898 	free_prison_cell(cache, cell);
899 }
900 
901 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
902 		       bool holder)
903 {
904 	unsigned long flags;
905 
906 	spin_lock_irqsave(&cache->lock, flags);
907 	__cell_defer(cache, cell, holder);
908 	spin_unlock_irqrestore(&cache->lock, flags);
909 
910 	wake_worker(cache);
911 }
912 
913 static void free_io_migration(struct dm_cache_migration *mg)
914 {
915 	dec_io_migrations(mg->cache);
916 	free_migration(mg);
917 }
918 
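/*
 * Roll back a failed migration: restore the policy's mapping/dirty state
 * and release the prison cells so any held bios can be reissued.
 */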
919 static void migration_failure(struct dm_cache_migration *mg)
920 {
921 	struct cache *cache = mg->cache;
922 
923 	if (mg->writeback) {
924 		DMWARN_LIMIT("writeback failed; couldn't copy block");
925 		set_dirty(cache, mg->old_oblock, mg->cblock);
926 		cell_defer(cache, mg->old_ocell, false);
927 
928 	} else if (mg->demote) {
929 		DMWARN_LIMIT("demotion failed; couldn't copy block");
930 		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
931 
932 		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
933 		if (mg->promote)
934 			cell_defer(cache, mg->new_ocell, true);
935 	} else {
936 		DMWARN_LIMIT("promotion failed; couldn't copy block");
937 		policy_remove_mapping(cache->policy, mg->new_oblock);
938 		cell_defer(cache, mg->new_ocell, true);
939 	}
940 
941 	free_io_migration(mg);
942 }
943 
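/*
 * The copy succeeded.  For writebacks just clear the dirty bit and release
 * the cell; for demotions and promotions update the on-disk metadata and
 * queue the migration on need_commit_migrations so the change is committed
 * before the held bios are released.
 */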
944 static void migration_success_pre_commit(struct dm_cache_migration *mg)
945 {
946 	unsigned long flags;
947 	struct cache *cache = mg->cache;
948 
949 	if (mg->writeback) {
950 		clear_dirty(cache, mg->old_oblock, mg->cblock);
951 		cell_defer(cache, mg->old_ocell, false);
952 		free_io_migration(mg);
953 		return;
954 
955 	} else if (mg->demote) {
956 		if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
957 			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
958 			policy_force_mapping(cache->policy, mg->new_oblock,
959 					     mg->old_oblock);
960 			if (mg->promote)
961 				cell_defer(cache, mg->new_ocell, true);
962 			free_io_migration(mg);
963 			return;
964 		}
965 	} else {
966 		if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
967 			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
968 			policy_remove_mapping(cache->policy, mg->new_oblock);
969 			free_io_migration(mg);
970 			return;
971 		}
972 	}
973 
974 	spin_lock_irqsave(&cache->lock, flags);
975 	list_add_tail(&mg->list, &cache->need_commit_migrations);
976 	cache->commit_requested = true;
977 	spin_unlock_irqrestore(&cache->lock, flags);
978 }
979 
980 static void migration_success_post_commit(struct dm_cache_migration *mg)
981 {
982 	unsigned long flags;
983 	struct cache *cache = mg->cache;
984 
985 	if (mg->writeback) {
986 		DMWARN("writeback unexpectedly triggered commit");
987 		return;
988 
989 	} else if (mg->demote) {
990 		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
991 
992 		if (mg->promote) {
993 			mg->demote = false;
994 
995 			spin_lock_irqsave(&cache->lock, flags);
996 			list_add_tail(&mg->list, &cache->quiesced_migrations);
997 			spin_unlock_irqrestore(&cache->lock, flags);
998 
999 		} else {
1000 			if (mg->invalidate)
1001 				policy_remove_mapping(cache->policy, mg->old_oblock);
1002 			free_io_migration(mg);
1003 		}
1004 
1005 	} else {
1006 		if (mg->requeue_holder) {
1007 			clear_dirty(cache, mg->new_oblock, mg->cblock);
1008 			cell_defer(cache, mg->new_ocell, true);
1009 		} else {
1010 			/*
1011 			 * The block was promoted via an overwrite, so it's dirty.
1012 			 */
1013 			set_dirty(cache, mg->new_oblock, mg->cblock);
1014 			bio_endio(mg->new_ocell->holder, 0);
1015 			cell_defer(cache, mg->new_ocell, false);
1016 		}
1017 		free_io_migration(mg);
1018 	}
1019 }
1020 
1021 static void copy_complete(int read_err, unsigned long write_err, void *context)
1022 {
1023 	unsigned long flags;
1024 	struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
1025 	struct cache *cache = mg->cache;
1026 
1027 	if (read_err || write_err)
1028 		mg->err = true;
1029 
1030 	spin_lock_irqsave(&cache->lock, flags);
1031 	list_add_tail(&mg->list, &cache->completed_migrations);
1032 	spin_unlock_irqrestore(&cache->lock, flags);
1033 
1034 	wake_worker(cache);
1035 }
1036 
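/*
 * Kick off the kcopyd copy for a migration: cache -> origin for writeback
 * and demotion, origin -> cache for promotion.  copy_complete() moves the
 * migration onto completed_migrations when the copy finishes.
 */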
1037 static void issue_copy(struct dm_cache_migration *mg)
1038 {
1039 	int r;
1040 	struct dm_io_region o_region, c_region;
1041 	struct cache *cache = mg->cache;
1042 	sector_t cblock = from_cblock(mg->cblock);
1043 
1044 	o_region.bdev = cache->origin_dev->bdev;
1045 	o_region.count = cache->sectors_per_block;
1046 
1047 	c_region.bdev = cache->cache_dev->bdev;
1048 	c_region.sector = cblock * cache->sectors_per_block;
1049 	c_region.count = cache->sectors_per_block;
1050 
1051 	if (mg->writeback || mg->demote) {
1052 		/* demote */
1053 		o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
1054 		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
1055 	} else {
1056 		/* promote */
1057 		o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
1058 		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
1059 	}
1060 
1061 	if (r < 0) {
1062 		DMERR_LIMIT("issuing migration failed");
1063 		migration_failure(mg);
1064 	}
1065 }
1066 
1067 static void overwrite_endio(struct bio *bio, int err)
1068 {
1069 	struct dm_cache_migration *mg = bio->bi_private;
1070 	struct cache *cache = mg->cache;
1071 	size_t pb_data_size = get_per_bio_data_size(cache);
1072 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1073 	unsigned long flags;
1074 
1075 	dm_unhook_bio(&pb->hook_info, bio);
1076 
1077 	if (err)
1078 		mg->err = true;
1079 
1080 	mg->requeue_holder = false;
1081 
1082 	spin_lock_irqsave(&cache->lock, flags);
1083 	list_add_tail(&mg->list, &cache->completed_migrations);
1084 	spin_unlock_irqrestore(&cache->lock, flags);
1085 
1086 	wake_worker(cache);
1087 }
1088 
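/*
 * Promote by letting a bio that overwrites the whole block be written
 * straight to the cache device, avoiding a separate copy.
 */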
1089 static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
1090 {
1091 	size_t pb_data_size = get_per_bio_data_size(mg->cache);
1092 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1093 
1094 	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1095 	remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
1096 
1097 	/*
1098 	 * No need to inc_ds() here, since the cell will be held for the
1099 	 * duration of the io.
1100 	 */
1101 	generic_make_request(bio);
1102 }
1103 
1104 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1105 {
1106 	return (bio_data_dir(bio) == WRITE) &&
1107 		(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1108 }
1109 
1110 static void avoid_copy(struct dm_cache_migration *mg)
1111 {
1112 	atomic_inc(&mg->cache->stats.copies_avoided);
1113 	migration_success_pre_commit(mg);
1114 }
1115 
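/*
 * Convert a discard bio into a range of discard blocks, rounding inwards so
 * only whole discard blocks are covered.  The result may be empty (*b == *e).
 */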
1116 static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1117 				     dm_dblock_t *b, dm_dblock_t *e)
1118 {
1119 	sector_t sb = bio->bi_iter.bi_sector;
1120 	sector_t se = bio_end_sector(bio);
1121 
1122 	*b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1123 
1124 	if (se - sb < cache->discard_block_size)
1125 		*e = *b;
1126 	else
1127 		*e = to_dblock(block_div(se, cache->discard_block_size));
1128 }
1129 
1130 static void issue_discard(struct dm_cache_migration *mg)
1131 {
1132 	dm_dblock_t b, e;
1133 	struct bio *bio = mg->new_ocell->holder;
1134 
1135 	calc_discard_block_range(mg->cache, bio, &b, &e);
1136 	while (b != e) {
1137 		set_discard(mg->cache, b);
1138 		b = to_dblock(from_dblock(b) + 1);
1139 	}
1140 
1141 	bio_endio(bio, 0);
1142 	cell_defer(mg->cache, mg->new_ocell, false);
1143 	free_migration(mg);
1144 }
1145 
1146 static void issue_copy_or_discard(struct dm_cache_migration *mg)
1147 {
1148 	bool avoid;
1149 	struct cache *cache = mg->cache;
1150 
1151 	if (mg->discard) {
1152 		issue_discard(mg);
1153 		return;
1154 	}
1155 
1156 	if (mg->writeback || mg->demote)
1157 		avoid = !is_dirty(cache, mg->cblock) ||
1158 			is_discarded_oblock(cache, mg->old_oblock);
1159 	else {
1160 		struct bio *bio = mg->new_ocell->holder;
1161 
1162 		avoid = is_discarded_oblock(cache, mg->new_oblock);
1163 
1164 		if (writeback_mode(&cache->features) &&
1165 		    !avoid && bio_writes_complete_block(cache, bio)) {
1166 			issue_overwrite(mg, bio);
1167 			return;
1168 		}
1169 	}
1170 
1171 	avoid ? avoid_copy(mg) : issue_copy(mg);
1172 }
1173 
1174 static void complete_migration(struct dm_cache_migration *mg)
1175 {
1176 	if (mg->err)
1177 		migration_failure(mg);
1178 	else
1179 		migration_success_pre_commit(mg);
1180 }
1181 
1182 static void process_migrations(struct cache *cache, struct list_head *head,
1183 			       void (*fn)(struct dm_cache_migration *))
1184 {
1185 	unsigned long flags;
1186 	struct list_head list;
1187 	struct dm_cache_migration *mg, *tmp;
1188 
1189 	INIT_LIST_HEAD(&list);
1190 	spin_lock_irqsave(&cache->lock, flags);
1191 	list_splice_init(head, &list);
1192 	spin_unlock_irqrestore(&cache->lock, flags);
1193 
1194 	list_for_each_entry_safe(mg, tmp, &list, list)
1195 		fn(mg);
1196 }
1197 
1198 static void __queue_quiesced_migration(struct dm_cache_migration *mg)
1199 {
1200 	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
1201 }
1202 
1203 static void queue_quiesced_migration(struct dm_cache_migration *mg)
1204 {
1205 	unsigned long flags;
1206 	struct cache *cache = mg->cache;
1207 
1208 	spin_lock_irqsave(&cache->lock, flags);
1209 	__queue_quiesced_migration(mg);
1210 	spin_unlock_irqrestore(&cache->lock, flags);
1211 
1212 	wake_worker(cache);
1213 }
1214 
1215 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
1216 {
1217 	unsigned long flags;
1218 	struct dm_cache_migration *mg, *tmp;
1219 
1220 	spin_lock_irqsave(&cache->lock, flags);
1221 	list_for_each_entry_safe(mg, tmp, work, list)
1222 		__queue_quiesced_migration(mg);
1223 	spin_unlock_irqrestore(&cache->lock, flags);
1224 
1225 	wake_worker(cache);
1226 }
1227 
1228 static void check_for_quiesced_migrations(struct cache *cache,
1229 					  struct per_bio_data *pb)
1230 {
1231 	struct list_head work;
1232 
1233 	if (!pb->all_io_entry)
1234 		return;
1235 
1236 	INIT_LIST_HEAD(&work);
1237 	dm_deferred_entry_dec(pb->all_io_entry, &work);
1238 
1239 	if (!list_empty(&work))
1240 		queue_quiesced_migrations(cache, &work);
1241 }
1242 
1243 static void quiesce_migration(struct dm_cache_migration *mg)
1244 {
1245 	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
1246 		queue_quiesced_migration(mg);
1247 }
1248 
1249 static void promote(struct cache *cache, struct prealloc *structs,
1250 		    dm_oblock_t oblock, dm_cblock_t cblock,
1251 		    struct dm_bio_prison_cell *cell)
1252 {
1253 	struct dm_cache_migration *mg = prealloc_get_migration(structs);
1254 
1255 	mg->err = false;
1256 	mg->discard = false;
1257 	mg->writeback = false;
1258 	mg->demote = false;
1259 	mg->promote = true;
1260 	mg->requeue_holder = true;
1261 	mg->invalidate = false;
1262 	mg->cache = cache;
1263 	mg->new_oblock = oblock;
1264 	mg->cblock = cblock;
1265 	mg->old_ocell = NULL;
1266 	mg->new_ocell = cell;
1267 	mg->start_jiffies = jiffies;
1268 
1269 	inc_io_migrations(cache);
1270 	quiesce_migration(mg);
1271 }
1272 
1273 static void writeback(struct cache *cache, struct prealloc *structs,
1274 		      dm_oblock_t oblock, dm_cblock_t cblock,
1275 		      struct dm_bio_prison_cell *cell)
1276 {
1277 	struct dm_cache_migration *mg = prealloc_get_migration(structs);
1278 
1279 	mg->err = false;
1280 	mg->discard = false;
1281 	mg->writeback = true;
1282 	mg->demote = false;
1283 	mg->promote = false;
1284 	mg->requeue_holder = true;
1285 	mg->invalidate = false;
1286 	mg->cache = cache;
1287 	mg->old_oblock = oblock;
1288 	mg->cblock = cblock;
1289 	mg->old_ocell = cell;
1290 	mg->new_ocell = NULL;
1291 	mg->start_jiffies = jiffies;
1292 
1293 	inc_io_migrations(cache);
1294 	quiesce_migration(mg);
1295 }
1296 
1297 static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1298 				dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1299 				dm_cblock_t cblock,
1300 				struct dm_bio_prison_cell *old_ocell,
1301 				struct dm_bio_prison_cell *new_ocell)
1302 {
1303 	struct dm_cache_migration *mg = prealloc_get_migration(structs);
1304 
1305 	mg->err = false;
1306 	mg->discard = false;
1307 	mg->writeback = false;
1308 	mg->demote = true;
1309 	mg->promote = true;
1310 	mg->requeue_holder = true;
1311 	mg->invalidate = false;
1312 	mg->cache = cache;
1313 	mg->old_oblock = old_oblock;
1314 	mg->new_oblock = new_oblock;
1315 	mg->cblock = cblock;
1316 	mg->old_ocell = old_ocell;
1317 	mg->new_ocell = new_ocell;
1318 	mg->start_jiffies = jiffies;
1319 
1320 	inc_io_migrations(cache);
1321 	quiesce_migration(mg);
1322 }
1323 
1324 /*
1325  * Invalidate a cache entry.  No writeback occurs; any changes in the cache
1326  * block are thrown away.
1327  */
1328 static void invalidate(struct cache *cache, struct prealloc *structs,
1329 		       dm_oblock_t oblock, dm_cblock_t cblock,
1330 		       struct dm_bio_prison_cell *cell)
1331 {
1332 	struct dm_cache_migration *mg = prealloc_get_migration(structs);
1333 
1334 	mg->err = false;
1335 	mg->discard = false;
1336 	mg->writeback = false;
1337 	mg->demote = true;
1338 	mg->promote = false;
1339 	mg->requeue_holder = true;
1340 	mg->invalidate = true;
1341 	mg->cache = cache;
1342 	mg->old_oblock = oblock;
1343 	mg->cblock = cblock;
1344 	mg->old_ocell = cell;
1345 	mg->new_ocell = NULL;
1346 	mg->start_jiffies = jiffies;
1347 
1348 	inc_io_migrations(cache);
1349 	quiesce_migration(mg);
1350 }
1351 
1352 static void discard(struct cache *cache, struct prealloc *structs,
1353 		    struct dm_bio_prison_cell *cell)
1354 {
1355 	struct dm_cache_migration *mg = prealloc_get_migration(structs);
1356 
1357 	mg->err = false;
1358 	mg->discard = true;
1359 	mg->writeback = false;
1360 	mg->demote = false;
1361 	mg->promote = false;
1362 	mg->requeue_holder = false;
1363 	mg->invalidate = false;
1364 	mg->cache = cache;
1365 	mg->old_ocell = NULL;
1366 	mg->new_ocell = cell;
1367 	mg->start_jiffies = jiffies;
1368 
1369 	quiesce_migration(mg);
1370 }
1371 
1372 /*----------------------------------------------------------------
1373  * bio processing
1374  *--------------------------------------------------------------*/
1375 static void defer_bio(struct cache *cache, struct bio *bio)
1376 {
1377 	unsigned long flags;
1378 
1379 	spin_lock_irqsave(&cache->lock, flags);
1380 	bio_list_add(&cache->deferred_bios, bio);
1381 	spin_unlock_irqrestore(&cache->lock, flags);
1382 
1383 	wake_worker(cache);
1384 }
1385 
1386 static void process_flush_bio(struct cache *cache, struct bio *bio)
1387 {
1388 	size_t pb_data_size = get_per_bio_data_size(cache);
1389 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1390 
1391 	BUG_ON(bio->bi_iter.bi_size);
1392 	if (!pb->req_nr)
1393 		remap_to_origin(cache, bio);
1394 	else
1395 		remap_to_cache(cache, bio, 0);
1396 
1397 	/*
1398 	 * REQ_FLUSH is not directed at any particular block so we don't
1399 	 * need to inc_ds().  REQ_FUA's are split into a write + REQ_FLUSH
1400 	 * by dm-core.
1401 	 */
1402 	issue(cache, bio);
1403 }
1404 
1405 static void process_discard_bio(struct cache *cache, struct prealloc *structs,
1406 				struct bio *bio)
1407 {
1408 	int r;
1409 	dm_dblock_t b, e;
1410 	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1411 
1412 	calc_discard_block_range(cache, bio, &b, &e);
1413 	if (b == e) {
1414 		bio_endio(bio, 0);
1415 		return;
1416 	}
1417 
1418 	cell_prealloc = prealloc_get_cell(structs);
1419 	r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
1420 			     (cell_free_fn) prealloc_put_cell,
1421 			     structs, &new_ocell);
1422 	if (r > 0)
1423 		return;
1424 
1425 	discard(cache, structs, new_ocell);
1426 }
1427 
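/*
 * Returns true if starting one more migration would still keep the volume
 * of background copy io below migration_threshold (expressed in sectors).
 */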
1428 static bool spare_migration_bandwidth(struct cache *cache)
1429 {
1430 	sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1431 		cache->sectors_per_block;
1432 	return current_volume < cache->migration_threshold;
1433 }
1434 
1435 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1436 {
1437 	atomic_inc(bio_data_dir(bio) == READ ?
1438 		   &cache->stats.read_hit : &cache->stats.write_hit);
1439 }
1440 
1441 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1442 {
1443 	atomic_inc(bio_data_dir(bio) == READ ?
1444 		   &cache->stats.read_miss : &cache->stats.write_miss);
1445 }
1446 
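/*
 * Map a deferred data bio: detain its origin block in the bio prison, ask
 * the policy where the bio should go, then remap it, or start a promotion,
 * demotion + promotion, or invalidation as required.
 */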
1447 static void process_bio(struct cache *cache, struct prealloc *structs,
1448 			struct bio *bio)
1449 {
1450 	int r;
1451 	bool release_cell = true;
1452 	dm_oblock_t block = get_bio_block(cache, bio);
1453 	struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1454 	struct policy_result lookup_result;
1455 	bool passthrough = passthrough_mode(&cache->features);
1456 	bool discarded_block, can_migrate;
1457 
1458 	/*
1459 	 * Check to see if that block is currently migrating.
1460 	 */
1461 	cell_prealloc = prealloc_get_cell(structs);
1462 	r = bio_detain(cache, block, bio, cell_prealloc,
1463 		       (cell_free_fn) prealloc_put_cell,
1464 		       structs, &new_ocell);
1465 	if (r > 0)
1466 		return;
1467 
1468 	discarded_block = is_discarded_oblock(cache, block);
1469 	can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
1470 
1471 	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1472 		       bio, &lookup_result);
1473 
1474 	if (r == -EWOULDBLOCK)
1475 		/* migration has been denied */
1476 		lookup_result.op = POLICY_MISS;
1477 
1478 	switch (lookup_result.op) {
1479 	case POLICY_HIT:
1480 		if (passthrough) {
1481 			inc_miss_counter(cache, bio);
1482 
1483 			/*
1484 			 * Passthrough always maps to the origin,
1485 			 * invalidating any cache blocks that are written
1486 			 * to.
1487 			 */
1488 
1489 			if (bio_data_dir(bio) == WRITE) {
1490 				atomic_inc(&cache->stats.demotion);
1491 				invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
1492 				release_cell = false;
1493 
1494 			} else {
1495 				/* FIXME: factor out issue_origin() */
1496 				remap_to_origin_clear_discard(cache, bio, block);
1497 				inc_and_issue(cache, bio, new_ocell);
1498 			}
1499 		} else {
1500 			inc_hit_counter(cache, bio);
1501 
1502 			if (bio_data_dir(bio) == WRITE &&
1503 			    writethrough_mode(&cache->features) &&
1504 			    !is_dirty(cache, lookup_result.cblock)) {
1505 				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1506 				inc_and_issue(cache, bio, new_ocell);
1507 
1508 			} else  {
1509 				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1510 				inc_and_issue(cache, bio, new_ocell);
1511 			}
1512 		}
1513 
1514 		break;
1515 
1516 	case POLICY_MISS:
1517 		inc_miss_counter(cache, bio);
1518 		remap_to_origin_clear_discard(cache, bio, block);
1519 		inc_and_issue(cache, bio, new_ocell);
1520 		break;
1521 
1522 	case POLICY_NEW:
1523 		atomic_inc(&cache->stats.promotion);
1524 		promote(cache, structs, block, lookup_result.cblock, new_ocell);
1525 		release_cell = false;
1526 		break;
1527 
1528 	case POLICY_REPLACE:
1529 		cell_prealloc = prealloc_get_cell(structs);
1530 		r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
1531 			       (cell_free_fn) prealloc_put_cell,
1532 			       structs, &old_ocell);
1533 		if (r > 0) {
1534 			/*
1535 			 * We have to be careful to avoid lock inversion of
1536 			 * the cells.  So we back off, and wait for the
1537 			 * old_ocell to become free.
1538 			 */
1539 			policy_force_mapping(cache->policy, block,
1540 					     lookup_result.old_oblock);
1541 			atomic_inc(&cache->stats.cache_cell_clash);
1542 			break;
1543 		}
1544 		atomic_inc(&cache->stats.demotion);
1545 		atomic_inc(&cache->stats.promotion);
1546 
1547 		demote_then_promote(cache, structs, lookup_result.old_oblock,
1548 				    block, lookup_result.cblock,
1549 				    old_ocell, new_ocell);
1550 		release_cell = false;
1551 		break;
1552 
1553 	default:
1554 		DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1555 			    (unsigned) lookup_result.op);
1556 		bio_io_error(bio);
1557 	}
1558 
1559 	if (release_cell)
1560 		cell_defer(cache, new_ocell, false);
1561 }
1562 
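/*
 * Returns true if more than COMMIT_PERIOD has passed since the last commit
 * (the first check handles jiffies wrapping).
 */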
1563 static int need_commit_due_to_time(struct cache *cache)
1564 {
1565 	return jiffies < cache->last_commit_jiffies ||
1566 	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1567 }
1568 
1569 static int commit_if_needed(struct cache *cache)
1570 {
1571 	int r = 0;
1572 
1573 	if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
1574 	    dm_cache_changed_this_transaction(cache->cmd)) {
1575 		atomic_inc(&cache->stats.commit_count);
1576 		cache->commit_requested = false;
1577 		r = dm_cache_commit(cache->cmd, false);
1578 		cache->last_commit_jiffies = jiffies;
1579 	}
1580 
1581 	return r;
1582 }
1583 
1584 static void process_deferred_bios(struct cache *cache)
1585 {
1586 	unsigned long flags;
1587 	struct bio_list bios;
1588 	struct bio *bio;
1589 	struct prealloc structs;
1590 
1591 	memset(&structs, 0, sizeof(structs));
1592 	bio_list_init(&bios);
1593 
1594 	spin_lock_irqsave(&cache->lock, flags);
1595 	bio_list_merge(&bios, &cache->deferred_bios);
1596 	bio_list_init(&cache->deferred_bios);
1597 	spin_unlock_irqrestore(&cache->lock, flags);
1598 
1599 	while (!bio_list_empty(&bios)) {
1600 		/*
1601 		 * If we've got no free migration structs, and processing
1602 		 * this bio might require one, we pause until there are some
1603 		 * prepared mappings to process.
1604 		 */
1605 		if (prealloc_data_structs(cache, &structs)) {
1606 			spin_lock_irqsave(&cache->lock, flags);
1607 			bio_list_merge(&cache->deferred_bios, &bios);
1608 			spin_unlock_irqrestore(&cache->lock, flags);
1609 			break;
1610 		}
1611 
1612 		bio = bio_list_pop(&bios);
1613 
1614 		if (bio->bi_rw & REQ_FLUSH)
1615 			process_flush_bio(cache, bio);
1616 		else if (bio->bi_rw & REQ_DISCARD)
1617 			process_discard_bio(cache, &structs, bio);
1618 		else
1619 			process_bio(cache, &structs, bio);
1620 	}
1621 
1622 	prealloc_free_structs(cache, &structs);
1623 }
1624 
1625 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1626 {
1627 	unsigned long flags;
1628 	struct bio_list bios;
1629 	struct bio *bio;
1630 
1631 	bio_list_init(&bios);
1632 
1633 	spin_lock_irqsave(&cache->lock, flags);
1634 	bio_list_merge(&bios, &cache->deferred_flush_bios);
1635 	bio_list_init(&cache->deferred_flush_bios);
1636 	spin_unlock_irqrestore(&cache->lock, flags);
1637 
1638 	/*
1639 	 * These bios have already been through inc_ds()
1640 	 */
1641 	while ((bio = bio_list_pop(&bios)))
1642 		submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1643 }
1644 
1645 static void process_deferred_writethrough_bios(struct cache *cache)
1646 {
1647 	unsigned long flags;
1648 	struct bio_list bios;
1649 	struct bio *bio;
1650 
1651 	bio_list_init(&bios);
1652 
1653 	spin_lock_irqsave(&cache->lock, flags);
1654 	bio_list_merge(&bios, &cache->deferred_writethrough_bios);
1655 	bio_list_init(&cache->deferred_writethrough_bios);
1656 	spin_unlock_irqrestore(&cache->lock, flags);
1657 
1658 	/*
1659 	 * These bios have already been through inc_ds()
1660 	 */
1661 	while ((bio = bio_list_pop(&bios)))
1662 		generic_make_request(bio);
1663 }
1664 
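/*
 * Ask the policy for dirty blocks to clean and issue writeback migrations
 * for them while spare migration bandwidth remains.
 */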
1665 static void writeback_some_dirty_blocks(struct cache *cache)
1666 {
1667 	int r = 0;
1668 	dm_oblock_t oblock;
1669 	dm_cblock_t cblock;
1670 	struct prealloc structs;
1671 	struct dm_bio_prison_cell *old_ocell;
1672 
1673 	memset(&structs, 0, sizeof(structs));
1674 
1675 	while (spare_migration_bandwidth(cache)) {
1676 		if (prealloc_data_structs(cache, &structs))
1677 			break;
1678 
1679 		r = policy_writeback_work(cache->policy, &oblock, &cblock);
1680 		if (r)
1681 			break;
1682 
1683 		r = get_cell(cache, oblock, &structs, &old_ocell);
1684 		if (r) {
1685 			policy_set_dirty(cache->policy, oblock);
1686 			break;
1687 		}
1688 
1689 		writeback(cache, &structs, oblock, cblock, old_ocell);
1690 	}
1691 
1692 	prealloc_free_structs(cache, &structs);
1693 }
1694 
1695 /*----------------------------------------------------------------
1696  * Invalidations.
1697  * Dropping something from the cache *without* writing back.
1698  *--------------------------------------------------------------*/
1699 
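/*
 * Walk the requested cblock range, removing each block from the policy and
 * from the on-disk metadata; the waiter is woken once the range is done.
 */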
1700 static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
1701 {
1702 	int r = 0;
1703 	uint64_t begin = from_cblock(req->cblocks->begin);
1704 	uint64_t end = from_cblock(req->cblocks->end);
1705 
1706 	while (begin != end) {
1707 		r = policy_remove_cblock(cache->policy, to_cblock(begin));
1708 		if (!r) {
1709 			r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
1710 			if (r)
1711 				break;
1712 
1713 		} else if (r == -ENODATA) {
1714 			/* harmless, already unmapped */
1715 			r = 0;
1716 
1717 		} else {
1718 			DMERR("policy_remove_cblock failed");
1719 			break;
1720 		}
1721 
1722 		begin++;
1723 	}
1724 
1725 	cache->commit_requested = true;
1726 
1727 	req->err = r;
1728 	atomic_set(&req->complete, 1);
1729 
1730 	wake_up(&req->result_wait);
1731 }
1732 
1733 static void process_invalidation_requests(struct cache *cache)
1734 {
1735 	struct list_head list;
1736 	struct invalidation_request *req, *tmp;
1737 
1738 	INIT_LIST_HEAD(&list);
1739 	spin_lock(&cache->invalidation_lock);
1740 	list_splice_init(&cache->invalidation_requests, &list);
1741 	spin_unlock(&cache->invalidation_lock);
1742 
1743 	list_for_each_entry_safe (req, tmp, &list, list)
1744 		process_invalidation_request(cache, req);
1745 }
1746 
1747 /*----------------------------------------------------------------
1748  * Main worker loop
1749  *--------------------------------------------------------------*/
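/*
 * Quiescing: start_quiescing() sets the flag and waits for the worker to
 * acknowledge it.  Once quiescing, the worker stops processing deferred bios
 * and stops starting new migrations, but still drives in-flight migrations
 * to completion.
 */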
1750 static bool is_quiescing(struct cache *cache)
1751 {
1752 	return atomic_read(&cache->quiescing);
1753 }
1754 
1755 static void ack_quiescing(struct cache *cache)
1756 {
1757 	if (is_quiescing(cache)) {
1758 		atomic_inc(&cache->quiescing_ack);
1759 		wake_up(&cache->quiescing_wait);
1760 	}
1761 }
1762 
1763 static void wait_for_quiescing_ack(struct cache *cache)
1764 {
1765 	wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
1766 }
1767 
1768 static void start_quiescing(struct cache *cache)
1769 {
1770 	atomic_inc(&cache->quiescing);
1771 	wait_for_quiescing_ack(cache);
1772 }
1773 
1774 static void stop_quiescing(struct cache *cache)
1775 {
1776 	atomic_set(&cache->quiescing, 0);
1777 	atomic_set(&cache->quiescing_ack, 0);
1778 }
1779 
1780 static void wait_for_migrations(struct cache *cache)
1781 {
1782 	wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
1783 }
1784 
1785 static void stop_worker(struct cache *cache)
1786 {
1787 	cancel_delayed_work(&cache->waker);
1788 	flush_workqueue(cache->wq);
1789 }
1790 
1791 static void requeue_deferred_io(struct cache *cache)
1792 {
1793 	struct bio *bio;
1794 	struct bio_list bios;
1795 
1796 	bio_list_init(&bios);
1797 	bio_list_merge(&bios, &cache->deferred_bios);
1798 	bio_list_init(&cache->deferred_bios);
1799 
1800 	while ((bio = bio_list_pop(&bios)))
1801 		bio_endio(bio, DM_ENDIO_REQUEUE);
1802 }
1803 
1804 static int more_work(struct cache *cache)
1805 {
1806 	if (is_quiescing(cache))
1807 		return !list_empty(&cache->quiesced_migrations) ||
1808 			!list_empty(&cache->completed_migrations) ||
1809 			!list_empty(&cache->need_commit_migrations);
1810 	else
1811 		return !bio_list_empty(&cache->deferred_bios) ||
1812 			!bio_list_empty(&cache->deferred_flush_bios) ||
1813 			!bio_list_empty(&cache->deferred_writethrough_bios) ||
1814 			!list_empty(&cache->quiesced_migrations) ||
1815 			!list_empty(&cache->completed_migrations) ||
1816 			!list_empty(&cache->need_commit_migrations) ||
1817 			cache->invalidate;
1818 }
1819 
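/*
 * The main worker loop: process deferred bios and migrations, commit
 * metadata when required, and keep going until no more work is queued.
 */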
1820 static void do_worker(struct work_struct *ws)
1821 {
1822 	struct cache *cache = container_of(ws, struct cache, worker);
1823 
1824 	do {
1825 		if (!is_quiescing(cache)) {
1826 			writeback_some_dirty_blocks(cache);
1827 			process_deferred_writethrough_bios(cache);
1828 			process_deferred_bios(cache);
1829 			process_invalidation_requests(cache);
1830 		}
1831 
1832 		process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
1833 		process_migrations(cache, &cache->completed_migrations, complete_migration);
1834 
1835 		if (commit_if_needed(cache)) {
1836 			process_deferred_flush_bios(cache, false);
1837 			process_migrations(cache, &cache->need_commit_migrations, migration_failure);
1838 
1839 			/*
1840 			 * FIXME: rollback metadata or just go into a
1841 			 * failure mode and error everything
1842 			 */
1843 		} else {
1844 			process_deferred_flush_bios(cache, true);
1845 			process_migrations(cache, &cache->need_commit_migrations,
1846 					   migration_success_post_commit);
1847 		}
1848 
1849 		ack_quiescing(cache);
1850 
1851 	} while (more_work(cache));
1852 }
1853 
1854 /*
1855  * We want to commit periodically so that not too much
1856  * unwritten metadata builds up.
1857  */
1858 static void do_waker(struct work_struct *ws)
1859 {
1860 	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1861 	policy_tick(cache->policy);
1862 	wake_worker(cache);
1863 	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1864 }
1865 
1866 /*----------------------------------------------------------------*/
1867 
1868 static int is_congested(struct dm_dev *dev, int bdi_bits)
1869 {
1870 	struct request_queue *q = bdev_get_queue(dev->bdev);
1871 	return bdi_congested(&q->backing_dev_info, bdi_bits);
1872 }
1873 
1874 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1875 {
1876 	struct cache *cache = container_of(cb, struct cache, callbacks);
1877 
1878 	return is_congested(cache->origin_dev, bdi_bits) ||
1879 		is_congested(cache->cache_dev, bdi_bits);
1880 }
1881 
1882 /*----------------------------------------------------------------
1883  * Target methods
1884  *--------------------------------------------------------------*/
1885 
1886 /*
1887  * This function gets called on the error paths of the constructor, so we
1888  * have to cope with a partially initialised struct.
1889  */
1890 static void destroy(struct cache *cache)
1891 {
1892 	unsigned i;
1893 
1894 	if (cache->migration_pool)
1895 		mempool_destroy(cache->migration_pool);
1896 
1897 	if (cache->all_io_ds)
1898 		dm_deferred_set_destroy(cache->all_io_ds);
1899 
1900 	if (cache->prison)
1901 		dm_bio_prison_destroy(cache->prison);
1902 
1903 	if (cache->wq)
1904 		destroy_workqueue(cache->wq);
1905 
1906 	if (cache->dirty_bitset)
1907 		free_bitset(cache->dirty_bitset);
1908 
1909 	if (cache->discard_bitset)
1910 		free_bitset(cache->discard_bitset);
1911 
1912 	if (cache->copier)
1913 		dm_kcopyd_client_destroy(cache->copier);
1914 
1915 	if (cache->cmd)
1916 		dm_cache_metadata_close(cache->cmd);
1917 
1918 	if (cache->metadata_dev)
1919 		dm_put_device(cache->ti, cache->metadata_dev);
1920 
1921 	if (cache->origin_dev)
1922 		dm_put_device(cache->ti, cache->origin_dev);
1923 
1924 	if (cache->cache_dev)
1925 		dm_put_device(cache->ti, cache->cache_dev);
1926 
1927 	if (cache->policy)
1928 		dm_cache_policy_destroy(cache->policy);
1929 
1930 	for (i = 0; i < cache->nr_ctr_args; i++)
1931 		kfree(cache->ctr_args[i]);
1932 	kfree(cache->ctr_args);
1933 
1934 	kfree(cache);
1935 }
1936 
1937 static void cache_dtr(struct dm_target *ti)
1938 {
1939 	struct cache *cache = ti->private;
1940 
1941 	destroy(cache);
1942 }
1943 
1944 static sector_t get_dev_size(struct dm_dev *dev)
1945 {
1946 	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1947 }
1948 
1949 /*----------------------------------------------------------------*/
1950 
1951 /*
1952  * Construct a cache device mapping.
1953  *
1954  * cache <metadata dev> <cache dev> <origin dev> <block size>
1955  *       <#feature args> [<feature arg>]*
1956  *       <policy> <#policy args> [<policy arg>]*
1957  *
1958  * metadata dev    : fast device holding the persistent metadata
1959  * cache dev	   : fast device holding cached data blocks
1960  * origin dev	   : slow device holding original data blocks
1961  * block size	   : cache unit size in sectors
1962  *
1963  * #feature args   : number of feature arguments passed
1964  * feature args    : writethrough, writeback or passthrough.  (The default is writeback.)
1965  *
1966  * policy	   : the replacement policy to use
1967  * #policy args    : an even number of policy arguments corresponding
1968  *		     to key/value pairs passed to the policy
1969  * policy args	   : key/value pairs passed to the policy
1970  *		     E.g. 'sequential_threshold 1024'
1971  *		     See cache-policies.txt for details.
1972  *
1973  * Optional feature arguments are:
1974  *   writethrough  : write through caching that prohibits cache block
1975  *		     content from being different from origin block content.
1976  *		     Without this argument, the default behaviour is to write
1977  *		     back cache block contents later for performance reasons,
1978  *		     so they may differ from the corresponding origin blocks.
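 *   passthrough   : a degraded mode useful when the cache contents are not
 *		     known to be coherent with the origin: reads are served
 *		     from the origin and writes are forwarded to the origin,
 *		     invalidating any cached copy of the block.  It may only
 *		     be entered when every cache block is clean.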
1979  */
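
/*
 * For illustration only (device names, block size and policy settings below
 * are made up rather than taken from this file), a constructor line using
 * the format above might look like:
 *
 *   cache /dev/mapper/fast-meta /dev/mapper/fast /dev/mapper/slow 512 \
 *         1 writeback default 4 random_threshold 4 sequential_threshold 1024
 */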
1980 struct cache_args {
1981 	struct dm_target *ti;
1982 
1983 	struct dm_dev *metadata_dev;
1984 
1985 	struct dm_dev *cache_dev;
1986 	sector_t cache_sectors;
1987 
1988 	struct dm_dev *origin_dev;
1989 	sector_t origin_sectors;
1990 
1991 	uint32_t block_size;
1992 
1993 	const char *policy_name;
1994 	int policy_argc;
1995 	const char **policy_argv;
1996 
1997 	struct cache_features features;
1998 };
1999 
2000 static void destroy_cache_args(struct cache_args *ca)
2001 {
2002 	if (ca->metadata_dev)
2003 		dm_put_device(ca->ti, ca->metadata_dev);
2004 
2005 	if (ca->cache_dev)
2006 		dm_put_device(ca->ti, ca->cache_dev);
2007 
2008 	if (ca->origin_dev)
2009 		dm_put_device(ca->ti, ca->origin_dev);
2010 
2011 	kfree(ca);
2012 }
2013 
2014 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
2015 {
2016 	if (!as->argc) {
2017 		*error = "Insufficient args";
2018 		return false;
2019 	}
2020 
2021 	return true;
2022 }
2023 
2024 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
2025 			      char **error)
2026 {
2027 	int r;
2028 	sector_t metadata_dev_size;
2029 	char b[BDEVNAME_SIZE];
2030 
2031 	if (!at_least_one_arg(as, error))
2032 		return -EINVAL;
2033 
2034 	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2035 			  &ca->metadata_dev);
2036 	if (r) {
2037 		*error = "Error opening metadata device";
2038 		return r;
2039 	}
2040 
2041 	metadata_dev_size = get_dev_size(ca->metadata_dev);
2042 	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
2043 		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2044 		       bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS_WARNING);
2045 
2046 	return 0;
2047 }
2048 
2049 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
2050 			   char **error)
2051 {
2052 	int r;
2053 
2054 	if (!at_least_one_arg(as, error))
2055 		return -EINVAL;
2056 
2057 	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2058 			  &ca->cache_dev);
2059 	if (r) {
2060 		*error = "Error opening cache device";
2061 		return r;
2062 	}
2063 	ca->cache_sectors = get_dev_size(ca->cache_dev);
2064 
2065 	return 0;
2066 }
2067 
2068 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
2069 			    char **error)
2070 {
2071 	int r;
2072 
2073 	if (!at_least_one_arg(as, error))
2074 		return -EINVAL;
2075 
2076 	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2077 			  &ca->origin_dev);
2078 	if (r) {
2079 		*error = "Error opening origin device";
2080 		return r;
2081 	}
2082 
2083 	ca->origin_sectors = get_dev_size(ca->origin_dev);
2084 	if (ca->ti->len > ca->origin_sectors) {
2085 		*error = "Device size larger than cached device";
2086 		return -EINVAL;
2087 	}
2088 
2089 	return 0;
2090 }
2091 
2092 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
2093 			    char **error)
2094 {
2095 	unsigned long block_size;
2096 
2097 	if (!at_least_one_arg(as, error))
2098 		return -EINVAL;
2099 
2100 	if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
2101 	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2102 	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2103 	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2104 		*error = "Invalid data block size";
2105 		return -EINVAL;
2106 	}
2107 
2108 	if (block_size > ca->cache_sectors) {
2109 		*error = "Data block size is larger than the cache device";
2110 		return -EINVAL;
2111 	}
2112 
2113 	ca->block_size = block_size;
2114 
2115 	return 0;
2116 }
2117 
2118 static void init_features(struct cache_features *cf)
2119 {
2120 	cf->mode = CM_WRITE;
2121 	cf->io_mode = CM_IO_WRITEBACK;
2122 }
2123 
2124 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2125 			  char **error)
2126 {
2127 	static struct dm_arg _args[] = {
2128 		{0, 1, "Invalid number of cache feature arguments"},
2129 	};
2130 
2131 	int r;
2132 	unsigned argc;
2133 	const char *arg;
2134 	struct cache_features *cf = &ca->features;
2135 
2136 	init_features(cf);
2137 
2138 	r = dm_read_arg_group(_args, as, &argc, error);
2139 	if (r)
2140 		return -EINVAL;
2141 
2142 	while (argc--) {
2143 		arg = dm_shift_arg(as);
2144 
2145 		if (!strcasecmp(arg, "writeback"))
2146 			cf->io_mode = CM_IO_WRITEBACK;
2147 
2148 		else if (!strcasecmp(arg, "writethrough"))
2149 			cf->io_mode = CM_IO_WRITETHROUGH;
2150 
2151 		else if (!strcasecmp(arg, "passthrough"))
2152 			cf->io_mode = CM_IO_PASSTHROUGH;
2153 
2154 		else {
2155 			*error = "Unrecognised cache feature requested";
2156 			return -EINVAL;
2157 		}
2158 	}
2159 
2160 	return 0;
2161 }
2162 
2163 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
2164 			char **error)
2165 {
2166 	static struct dm_arg _args[] = {
2167 		{0, 1024, "Invalid number of policy arguments"},
2168 	};
2169 
2170 	int r;
2171 
2172 	if (!at_least_one_arg(as, error))
2173 		return -EINVAL;
2174 
2175 	ca->policy_name = dm_shift_arg(as);
2176 
2177 	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
2178 	if (r)
2179 		return -EINVAL;
2180 
2181 	ca->policy_argv = (const char **)as->argv;
2182 	dm_consume_args(as, ca->policy_argc);
2183 
2184 	return 0;
2185 }
2186 
2187 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
2188 			    char **error)
2189 {
2190 	int r;
2191 	struct dm_arg_set as;
2192 
2193 	as.argc = argc;
2194 	as.argv = argv;
2195 
2196 	r = parse_metadata_dev(ca, &as, error);
2197 	if (r)
2198 		return r;
2199 
2200 	r = parse_cache_dev(ca, &as, error);
2201 	if (r)
2202 		return r;
2203 
2204 	r = parse_origin_dev(ca, &as, error);
2205 	if (r)
2206 		return r;
2207 
2208 	r = parse_block_size(ca, &as, error);
2209 	if (r)
2210 		return r;
2211 
2212 	r = parse_features(ca, &as, error);
2213 	if (r)
2214 		return r;
2215 
2216 	r = parse_policy(ca, &as, error);
2217 	if (r)
2218 		return r;
2219 
2220 	return 0;
2221 }
2222 
2223 /*----------------------------------------------------------------*/
2224 
2225 static struct kmem_cache *migration_cache;
2226 
2227 #define NOT_CORE_OPTION 1
2228 
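/*
 * Handle a <key> <value> pair owned by the core target (currently just
 * migration_threshold).  Anything else returns NOT_CORE_OPTION so the
 * caller can hand it to the policy instead.
 */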
2229 static int process_config_option(struct cache *cache, const char *key, const char *value)
2230 {
2231 	unsigned long tmp;
2232 
2233 	if (!strcasecmp(key, "migration_threshold")) {
2234 		if (kstrtoul(value, 10, &tmp))
2235 			return -EINVAL;
2236 
2237 		cache->migration_threshold = tmp;
2238 		return 0;
2239 	}
2240 
2241 	return NOT_CORE_OPTION;
2242 }
2243 
2244 static int set_config_value(struct cache *cache, const char *key, const char *value)
2245 {
2246 	int r = process_config_option(cache, key, value);
2247 
2248 	if (r == NOT_CORE_OPTION)
2249 		r = policy_set_config_value(cache->policy, key, value);
2250 
2251 	if (r)
2252 		DMWARN("bad config value for %s: %s", key, value);
2253 
2254 	return r;
2255 }
2256 
2257 static int set_config_values(struct cache *cache, int argc, const char **argv)
2258 {
2259 	int r = 0;
2260 
2261 	if (argc & 1) {
2262 		DMWARN("Odd number of policy arguments given; they should be <key> <value> pairs.");
2263 		return -EINVAL;
2264 	}
2265 
2266 	while (argc) {
2267 		r = set_config_value(cache, argv[0], argv[1]);
2268 		if (r)
2269 			break;
2270 
2271 		argc -= 2;
2272 		argv += 2;
2273 	}
2274 
2275 	return r;
2276 }
2277 
2278 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2279 			       char **error)
2280 {
2281 	struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
2282 							   cache->cache_size,
2283 							   cache->origin_sectors,
2284 							   cache->sectors_per_block);
2285 	if (IS_ERR(p)) {
2286 		*error = "Error creating cache's policy";
2287 		return PTR_ERR(p);
2288 	}
2289 	cache->policy = p;
2290 
2291 	return 0;
2292 }
2293 
2294 /*
2295  * We want the discard block size to be at least as large as the cache
2296  * block size, and to give no more than 2^14 discard blocks across the origin.
2297  */
2298 #define MAX_DISCARD_BLOCKS (1 << 14)
2299 
2300 static bool too_many_discard_blocks(sector_t discard_block_size,
2301 				    sector_t origin_size)
2302 {
2303 	(void) sector_div(origin_size, discard_block_size);
2304 
2305 	return origin_size > MAX_DISCARD_BLOCKS;
2306 }
2307 
2308 static sector_t calculate_discard_block_size(sector_t cache_block_size,
2309 					     sector_t origin_size)
2310 {
2311 	sector_t discard_block_size = cache_block_size;
2312 
2313 	if (origin_size)
2314 		while (too_many_discard_blocks(discard_block_size, origin_size))
2315 			discard_block_size *= 2;
2316 
2317 	return discard_block_size;
2318 }
2319 
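
/*
 * Worked example (numbers invented for illustration): with 512 sector cache
 * blocks and a 16TiB origin (2^35 sectors), a 512 sector discard block would
 * give 2^26 discard blocks.  The loop above doubles the size twelve times,
 * 512 -> 1024 -> ... -> 2097152 sectors (1GiB), at which point the origin
 * spans exactly 2^14 discard blocks.
 */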
2320 static void set_cache_size(struct cache *cache, dm_cblock_t size)
2321 {
2322 	dm_block_t nr_blocks = from_cblock(size);
2323 
2324 	if (nr_blocks > (1 << 20) && cache->cache_size != size)
2325 		DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2326 			     "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2327 			     "Please consider increasing the cache block size to reduce the overall cache block count.",
2328 			     (unsigned long long) nr_blocks);
2329 
2330 	cache->cache_size = size;
2331 }
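
/*
 * Illustrative numbers: a 1TiB cache device with 64KiB (128 sector) blocks
 * holds 2^24 (roughly 16.8 million) cache blocks, well past the 2^20
 * threshold above, so the warning would fire.
 */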
2332 
2333 #define DEFAULT_MIGRATION_THRESHOLD 2048
2334 
2335 static int cache_create(struct cache_args *ca, struct cache **result)
2336 {
2337 	int r = 0;
2338 	char **error = &ca->ti->error;
2339 	struct cache *cache;
2340 	struct dm_target *ti = ca->ti;
2341 	dm_block_t origin_blocks;
2342 	struct dm_cache_metadata *cmd;
2343 	bool may_format = ca->features.mode == CM_WRITE;
2344 
2345 	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
2346 	if (!cache)
2347 		return -ENOMEM;
2348 
2349 	cache->ti = ca->ti;
2350 	ti->private = cache;
2351 	ti->num_flush_bios = 2;
2352 	ti->flush_supported = true;
2353 
2354 	ti->num_discard_bios = 1;
2355 	ti->discards_supported = true;
2356 	ti->discard_zeroes_data_unsupported = true;
2357 	ti->split_discard_bios = false;
2358 
2359 	cache->features = ca->features;
2360 	ti->per_bio_data_size = get_per_bio_data_size(cache);
2361 
2362 	cache->callbacks.congested_fn = cache_is_congested;
2363 	dm_table_add_target_callbacks(ti->table, &cache->callbacks);
2364 
2365 	cache->metadata_dev = ca->metadata_dev;
2366 	cache->origin_dev = ca->origin_dev;
2367 	cache->cache_dev = ca->cache_dev;
2368 
2369 	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2370 
2371 	/* FIXME: factor out this whole section */
2372 	origin_blocks = cache->origin_sectors = ca->origin_sectors;
2373 	origin_blocks = block_div(origin_blocks, ca->block_size);
2374 	cache->origin_blocks = to_oblock(origin_blocks);
2375 
2376 	cache->sectors_per_block = ca->block_size;
2377 	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
2378 		r = -EINVAL;
2379 		goto bad;
2380 	}
2381 
2382 	if (ca->block_size & (ca->block_size - 1)) {
2383 		dm_block_t cache_size = ca->cache_sectors;
2384 
2385 		cache->sectors_per_block_shift = -1;
2386 		cache_size = block_div(cache_size, ca->block_size);
2387 		set_cache_size(cache, to_cblock(cache_size));
2388 	} else {
2389 		cache->sectors_per_block_shift = __ffs(ca->block_size);
2390 		set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
2391 	}
2392 
2393 	r = create_cache_policy(cache, ca, error);
2394 	if (r)
2395 		goto bad;
2396 
2397 	cache->policy_nr_args = ca->policy_argc;
2398 	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2399 
2400 	r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
2401 	if (r) {
2402 		*error = "Error setting cache policy's config values";
2403 		goto bad;
2404 	}
2405 
2406 	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2407 				     ca->block_size, may_format,
2408 				     dm_cache_policy_get_hint_size(cache->policy));
2409 	if (IS_ERR(cmd)) {
2410 		*error = "Error creating metadata object";
2411 		r = PTR_ERR(cmd);
2412 		goto bad;
2413 	}
2414 	cache->cmd = cmd;
2415 
2416 	if (passthrough_mode(&cache->features)) {
2417 		bool all_clean;
2418 
2419 		r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2420 		if (r) {
2421 			*error = "dm_cache_metadata_all_clean() failed";
2422 			goto bad;
2423 		}
2424 
2425 		if (!all_clean) {
2426 			*error = "Cannot enter passthrough mode unless all blocks are clean";
2427 			r = -EINVAL;
2428 			goto bad;
2429 		}
2430 	}
2431 
2432 	spin_lock_init(&cache->lock);
2433 	bio_list_init(&cache->deferred_bios);
2434 	bio_list_init(&cache->deferred_flush_bios);
2435 	bio_list_init(&cache->deferred_writethrough_bios);
2436 	INIT_LIST_HEAD(&cache->quiesced_migrations);
2437 	INIT_LIST_HEAD(&cache->completed_migrations);
2438 	INIT_LIST_HEAD(&cache->need_commit_migrations);
2439 	atomic_set(&cache->nr_allocated_migrations, 0);
2440 	atomic_set(&cache->nr_io_migrations, 0);
2441 	init_waitqueue_head(&cache->migration_wait);
2442 
2443 	init_waitqueue_head(&cache->quiescing_wait);
2444 	atomic_set(&cache->quiescing, 0);
2445 	atomic_set(&cache->quiescing_ack, 0);
2446 
2447 	r = -ENOMEM;
2448 	atomic_set(&cache->nr_dirty, 0);
2449 	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2450 	if (!cache->dirty_bitset) {
2451 		*error = "could not allocate dirty bitset";
2452 		goto bad;
2453 	}
2454 	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2455 
2456 	cache->discard_block_size =
2457 		calculate_discard_block_size(cache->sectors_per_block,
2458 					     cache->origin_sectors);
2459 	cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2460 							      cache->discard_block_size));
2461 	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2462 	if (!cache->discard_bitset) {
2463 		*error = "could not allocate discard bitset";
2464 		goto bad;
2465 	}
2466 	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2467 
2468 	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2469 	if (IS_ERR(cache->copier)) {
2470 		*error = "could not create kcopyd client";
2471 		r = PTR_ERR(cache->copier);
2472 		goto bad;
2473 	}
2474 
2475 	cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2476 	if (!cache->wq) {
2477 		*error = "could not create workqueue";
2478 		goto bad;
2479 	}
2480 	INIT_WORK(&cache->worker, do_worker);
2481 	INIT_DELAYED_WORK(&cache->waker, do_waker);
2482 	cache->last_commit_jiffies = jiffies;
2483 
2484 	cache->prison = dm_bio_prison_create();
2485 	if (!cache->prison) {
2486 		*error = "could not create bio prison";
2487 		goto bad;
2488 	}
2489 
2490 	cache->all_io_ds = dm_deferred_set_create();
2491 	if (!cache->all_io_ds) {
2492 		*error = "could not create all_io deferred set";
2493 		goto bad;
2494 	}
2495 
2496 	cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2497 							 migration_cache);
2498 	if (!cache->migration_pool) {
2499 		*error = "Error creating cache's migration mempool";
2500 		goto bad;
2501 	}
2502 
2503 	cache->need_tick_bio = true;
2504 	cache->sized = false;
2505 	cache->invalidate = false;
2506 	cache->commit_requested = false;
2507 	cache->loaded_mappings = false;
2508 	cache->loaded_discards = false;
2509 
2510 	load_stats(cache);
2511 
2512 	atomic_set(&cache->stats.demotion, 0);
2513 	atomic_set(&cache->stats.promotion, 0);
2514 	atomic_set(&cache->stats.copies_avoided, 0);
2515 	atomic_set(&cache->stats.cache_cell_clash, 0);
2516 	atomic_set(&cache->stats.commit_count, 0);
2517 	atomic_set(&cache->stats.discard_count, 0);
2518 
2519 	spin_lock_init(&cache->invalidation_lock);
2520 	INIT_LIST_HEAD(&cache->invalidation_requests);
2521 
2522 	*result = cache;
2523 	return 0;
2524 
2525 bad:
2526 	destroy(cache);
2527 	return r;
2528 }
2529 
2530 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2531 {
2532 	unsigned i;
2533 	const char **copy;
2534 
2535 	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2536 	if (!copy)
2537 		return -ENOMEM;
2538 	for (i = 0; i < argc; i++) {
2539 		copy[i] = kstrdup(argv[i], GFP_KERNEL);
2540 		if (!copy[i]) {
2541 			while (i--)
2542 				kfree(copy[i]);
2543 			kfree(copy);
2544 			return -ENOMEM;
2545 		}
2546 	}
2547 
2548 	cache->nr_ctr_args = argc;
2549 	cache->ctr_args = copy;
2550 
2551 	return 0;
2552 }
2553 
2554 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2555 {
2556 	int r = -EINVAL;
2557 	struct cache_args *ca;
2558 	struct cache *cache = NULL;
2559 
2560 	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2561 	if (!ca) {
2562 		ti->error = "Error allocating memory for cache";
2563 		return -ENOMEM;
2564 	}
2565 	ca->ti = ti;
2566 
2567 	r = parse_cache_args(ca, argc, argv, &ti->error);
2568 	if (r)
2569 		goto out;
2570 
2571 	r = cache_create(ca, &cache);
2572 	if (r)
2573 		goto out;
2574 
2575 	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2576 	if (r) {
2577 		destroy(cache);
2578 		goto out;
2579 	}
2580 
2581 	ti->private = cache;
2582 
2583 out:
2584 	destroy_cache_args(ca);
2585 	return r;
2586 }
2587 
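/*
 * Core of the map function.  On DM_MAPIO_REMAPPED the bio has been remapped
 * to the cache or origin device; if *cell was set, the caller must add the
 * bio to the deferred set and then release the cell.  DM_MAPIO_SUBMITTED
 * means the bio has already been deferred, completed or errored here.
 */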
2588 static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
2589 {
2590 	int r;
2591 	dm_oblock_t block = get_bio_block(cache, bio);
2592 	size_t pb_data_size = get_per_bio_data_size(cache);
2593 	bool can_migrate = false;
2594 	bool discarded_block;
2595 	struct policy_result lookup_result;
2596 	struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
2597 
2598 	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2599 		/*
2600 		 * This can only occur if the io goes to a partial block at
2601 		 * the end of the origin device.  We don't cache these.
2602 		 * Just remap to the origin and carry on.
2603 		 */
2604 		remap_to_origin(cache, bio);
2605 		return DM_MAPIO_REMAPPED;
2606 	}
2607 
2608 	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2609 		defer_bio(cache, bio);
2610 		return DM_MAPIO_SUBMITTED;
2611 	}
2612 
2613 	/*
2614 	 * Check to see if that block is currently migrating.
2615 	 */
2616 	*cell = alloc_prison_cell(cache);
2617 	if (!*cell) {
2618 		defer_bio(cache, bio);
2619 		return DM_MAPIO_SUBMITTED;
2620 	}
2621 
2622 	r = bio_detain(cache, block, bio, *cell,
2623 		       (cell_free_fn) free_prison_cell,
2624 		       cache, cell);
2625 	if (r) {
2626 		if (r < 0)
2627 			defer_bio(cache, bio);
2628 
2629 		return DM_MAPIO_SUBMITTED;
2630 	}
2631 
2632 	discarded_block = is_discarded_oblock(cache, block);
2633 
2634 	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2635 		       bio, &lookup_result);
2636 	if (r == -EWOULDBLOCK) {
2637 		cell_defer(cache, *cell, true);
2638 		return DM_MAPIO_SUBMITTED;
2639 
2640 	} else if (r) {
2641 		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2642 		cell_defer(cache, *cell, false);
2643 		bio_io_error(bio);
2644 		return DM_MAPIO_SUBMITTED;
2645 	}
2646 
2647 	r = DM_MAPIO_REMAPPED;
2648 	switch (lookup_result.op) {
2649 	case POLICY_HIT:
2650 		if (passthrough_mode(&cache->features)) {
2651 			if (bio_data_dir(bio) == WRITE) {
2652 				/*
2653 				 * We need to invalidate this block, so
2654 				 * defer for the worker thread.
2655 				 */
2656 				cell_defer(cache, *cell, true);
2657 				r = DM_MAPIO_SUBMITTED;
2658 
2659 			} else {
2660 				inc_miss_counter(cache, bio);
2661 				remap_to_origin_clear_discard(cache, bio, block);
2662 			}
2663 
2664 		} else {
2665 			inc_hit_counter(cache, bio);
2666 			if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
2667 			    !is_dirty(cache, lookup_result.cblock))
2668 				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
2669 			else
2670 				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2671 		}
2672 		break;
2673 
2674 	case POLICY_MISS:
2675 		inc_miss_counter(cache, bio);
2676 		if (pb->req_nr != 0) {
2677 			/*
2678 			 * This is a duplicate writethrough io that is no
2679 			 * longer needed because the block has been demoted.
2680 			 */
2681 			bio_endio(bio, 0);
2682 			cell_defer(cache, *cell, false);
2683 			r = DM_MAPIO_SUBMITTED;
2684 
2685 		} else
2686 			remap_to_origin_clear_discard(cache, bio, block);
2687 
2688 		break;
2689 
2690 	default:
2691 		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2692 			    (unsigned) lookup_result.op);
2693 		cell_defer(cache, *cell, false);
2694 		bio_io_error(bio);
2695 		r = DM_MAPIO_SUBMITTED;
2696 	}
2697 
2698 	return r;
2699 }
2700 
2701 static int cache_map(struct dm_target *ti, struct bio *bio)
2702 {
2703 	int r;
2704 	struct dm_bio_prison_cell *cell = NULL;
2705 	struct cache *cache = ti->private;
2706 
2707 	r = __cache_map(cache, bio, &cell);
2708 	if (r == DM_MAPIO_REMAPPED && cell) {
2709 		inc_ds(cache, bio, cell);
2710 		cell_defer(cache, cell, false);
2711 	}
2712 
2713 	return r;
2714 }
2715 
2716 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2717 {
2718 	struct cache *cache = ti->private;
2719 	unsigned long flags;
2720 	size_t pb_data_size = get_per_bio_data_size(cache);
2721 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
2722 
2723 	if (pb->tick) {
2724 		policy_tick(cache->policy);
2725 
2726 		spin_lock_irqsave(&cache->lock, flags);
2727 		cache->need_tick_bio = true;
2728 		spin_unlock_irqrestore(&cache->lock, flags);
2729 	}
2730 
2731 	check_for_quiesced_migrations(cache, pb);
2732 
2733 	return 0;
2734 }
2735 
2736 static int write_dirty_bitset(struct cache *cache)
2737 {
2738 	unsigned i, r;
2739 	unsigned i;
	int r;
2740 	for (i = 0; i < from_cblock(cache->cache_size); i++) {
2741 		r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2742 				       is_dirty(cache, to_cblock(i)));
2743 		if (r)
2744 			return r;
2745 	}
2746 
2747 	return 0;
2748 }
2749 
2750 static int write_discard_bitset(struct cache *cache)
2751 {
2752 	unsigned i, r;
2753 	unsigned i;
	int r;
2754 	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2755 					   cache->discard_nr_blocks);
2756 	if (r) {
2757 		DMERR("could not resize on-disk discard bitset");
2758 		return r;
2759 	}
2760 
2761 	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2762 		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2763 					 is_discarded(cache, to_dblock(i)));
2764 		if (r)
2765 			return r;
2766 	}
2767 
2768 	return 0;
2769 }
2770 
2771 /*
2772  * returns true on success
2773  */
2774 static bool sync_metadata(struct cache *cache)
2775 {
2776 	int r1, r2, r3, r4;
2777 
2778 	r1 = write_dirty_bitset(cache);
2779 	if (r1)
2780 		DMERR("could not write dirty bitset");
2781 
2782 	r2 = write_discard_bitset(cache);
2783 	if (r2)
2784 		DMERR("could not write discard bitset");
2785 
2786 	save_stats(cache);
2787 
2788 	r3 = dm_cache_write_hints(cache->cmd, cache->policy);
2789 	if (r3)
2790 		DMERR("could not write hints");
2791 
2792 	/*
2793 	 * If writing the above metadata failed, we still commit, but don't
2794 	 * set the clean shutdown flag.  This will effectively force every
2795 	 * dirty bit to be set on reload.
2796 	 */
2797 	r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2798 	if (r4)
2799 		DMERR("could not write cache metadata.  Data loss may occur.");
2800 
2801 	return !r1 && !r2 && !r3 && !r4;
2802 }
2803 
2804 static void cache_postsuspend(struct dm_target *ti)
2805 {
2806 	struct cache *cache = ti->private;
2807 
2808 	start_quiescing(cache);
2809 	wait_for_migrations(cache);
2810 	stop_worker(cache);
2811 	requeue_deferred_io(cache);
2812 	stop_quiescing(cache);
2813 
2814 	(void) sync_metadata(cache);
2815 }
2816 
2817 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2818 			bool dirty, uint32_t hint, bool hint_valid)
2819 {
2820 	int r;
2821 	struct cache *cache = context;
2822 
2823 	r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2824 	if (r)
2825 		return r;
2826 
2827 	if (dirty)
2828 		set_dirty(cache, oblock, cblock);
2829 	else
2830 		clear_dirty(cache, oblock, cblock);
2831 
2832 	return 0;
2833 }
2834 
2835 /*
2836  * The discard block size in the on disk metadata is not
2837  * necessarily the same as we're currently using.  So we have to
2838  * be careful to only set the discarded attribute if we know it
2839  * covers a complete block of the new size.
2840  */
2841 struct discard_load_info {
2842 	struct cache *cache;
2843 
2844 	/*
2845 	 * These blocks are sized using the on disk dblock size, rather
2846 	 * than the current one.
2847 	 */
2848 	dm_block_t block_size;
2849 	dm_block_t discard_begin, discard_end;
2850 };
2851 
2852 static void discard_load_info_init(struct cache *cache,
2853 				   struct discard_load_info *li)
2854 {
2855 	li->cache = cache;
2856 	li->discard_begin = li->discard_end = 0;
2857 }
2858 
2859 static void set_discard_range(struct discard_load_info *li)
2860 {
2861 	sector_t b, e;
2862 
2863 	if (li->discard_begin == li->discard_end)
2864 		return;
2865 
2866 	/*
2867 	 * Convert to sectors.
2868 	 */
2869 	b = li->discard_begin * li->block_size;
2870 	e = li->discard_end * li->block_size;
2871 
2872 	/*
2873 	 * Then convert back to the current dblock size.
2874 	 */
2875 	b = dm_sector_div_up(b, li->cache->discard_block_size);
2876 	sector_div(e, li->cache->discard_block_size);
2877 
2878 	/*
2879 	 * The origin may have shrunk, so we need to check we're still in
2880 	 * bounds.
2881 	 */
2882 	if (e > from_dblock(li->cache->discard_nr_blocks))
2883 		e = from_dblock(li->cache->discard_nr_blocks);
2884 
2885 	for (; b < e; b++)
2886 		set_discard(li->cache, to_dblock(b));
2887 }
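
/*
 * Illustrative numbers: if the metadata was written with 1024 sector dblocks
 * and records dblocks 10..13 (exclusive end) as discarded, that covers
 * sectors 10240..13312.  With a current discard_block_size of 4096 sectors
 * the calculation above gives b = 3 and e = 3, so nothing is marked: the
 * range does not cover a complete block of the new size.
 */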
2888 
2889 static int load_discard(void *context, sector_t discard_block_size,
2890 			dm_dblock_t dblock, bool discard)
2891 {
2892 	struct discard_load_info *li = context;
2893 
2894 	li->block_size = discard_block_size;
2895 
2896 	if (discard) {
2897 		if (from_dblock(dblock) == li->discard_end)
2898 			/*
2899 			 * We're already in a discard range, just extend it.
2900 			 */
2901 			li->discard_end = li->discard_end + 1ULL;
2902 
2903 		else {
2904 			/*
2905 			 * Emit the old range and start a new one.
2906 			 */
2907 			set_discard_range(li);
2908 			li->discard_begin = from_dblock(dblock);
2909 			li->discard_end = li->discard_begin + 1ULL;
2910 		}
2911 	} else {
2912 		set_discard_range(li);
2913 		li->discard_begin = li->discard_end = 0;
2914 	}
2915 
2916 	return 0;
2917 }
2918 
2919 static dm_cblock_t get_cache_dev_size(struct cache *cache)
2920 {
2921 	sector_t size = get_dev_size(cache->cache_dev);
2922 	(void) sector_div(size, cache->sectors_per_block);
2923 	return to_cblock(size);
2924 }
2925 
2926 static bool can_resize(struct cache *cache, dm_cblock_t new_size)
2927 {
2928 	if (from_cblock(new_size) > from_cblock(cache->cache_size))
2929 		return true;
2930 
2931 	/*
2932 	 * We can't drop a dirty block when shrinking the cache.
2933 	 */
2934 	while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
2935 		new_size = to_cblock(from_cblock(new_size) + 1);
2936 		if (is_dirty(cache, new_size)) {
2937 			DMERR("unable to shrink cache; cache block %llu is dirty",
2938 			      (unsigned long long) from_cblock(new_size));
2939 			return false;
2940 		}
2941 	}
2942 
2943 	return true;
2944 }
2945 
2946 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
2947 {
2948 	int r;
2949 
2950 	r = dm_cache_resize(cache->cmd, new_size);
2951 	if (r) {
2952 		DMERR("could not resize cache metadata");
2953 		return r;
2954 	}
2955 
2956 	set_cache_size(cache, new_size);
2957 
2958 	return 0;
2959 }
2960 
2961 static int cache_preresume(struct dm_target *ti)
2962 {
2963 	int r = 0;
2964 	struct cache *cache = ti->private;
2965 	dm_cblock_t csize = get_cache_dev_size(cache);
2966 
2967 	/*
2968 	 * Check to see if the cache has resized.
2969 	 */
2970 	if (!cache->sized) {
2971 		r = resize_cache_dev(cache, csize);
2972 		if (r)
2973 			return r;
2974 
2975 		cache->sized = true;
2976 
2977 	} else if (csize != cache->cache_size) {
2978 		if (!can_resize(cache, csize))
2979 			return -EINVAL;
2980 
2981 		r = resize_cache_dev(cache, csize);
2982 		if (r)
2983 			return r;
2984 	}
2985 
2986 	if (!cache->loaded_mappings) {
2987 		r = dm_cache_load_mappings(cache->cmd, cache->policy,
2988 					   load_mapping, cache);
2989 		if (r) {
2990 			DMERR("could not load cache mappings");
2991 			return r;
2992 		}
2993 
2994 		cache->loaded_mappings = true;
2995 	}
2996 
2997 	if (!cache->loaded_discards) {
2998 		struct discard_load_info li;
2999 
3000 		/*
3001 		 * The discard bitset could have been resized, or the
3002 		 * discard block size changed.  To be safe we start by
3003 		 * setting every dblock to not discarded.
3004 		 */
3005 		clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
3006 
3007 		discard_load_info_init(cache, &li);
3008 		r = dm_cache_load_discards(cache->cmd, load_discard, &li);
3009 		if (r) {
3010 			DMERR("could not load origin discards");
3011 			return r;
3012 		}
3013 		set_discard_range(&li);
3014 
3015 		cache->loaded_discards = true;
3016 	}
3017 
3018 	return r;
3019 }
3020 
3021 static void cache_resume(struct dm_target *ti)
3022 {
3023 	struct cache *cache = ti->private;
3024 
3025 	cache->need_tick_bio = true;
3026 	do_waker(&cache->waker.work);
3027 }
3028 
3029 /*
3030  * Status format:
3031  *
3032  * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
3033  * <cache block size> <#used cache blocks>/<#total cache blocks>
3034  * <#read hits> <#read misses> <#write hits> <#write misses>
3035  * <#demotions> <#promotions> <#dirty>
3036  * <#features> <features>*
3037  * <#core args> <core args>
3038  * <policy name> <#policy args> <policy args>*
3039  */
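
/*
 * A made-up example of the STATUSTYPE_INFO output described above (all
 * numbers invented for illustration; the real output is a single line,
 * wrapped here for readability):
 *
 *   8 72/1024 128 591/4096 143 2871 1201 644 12 58 37 1 writeback
 *   2 migration_threshold 2048 mq 4 random_threshold 4 sequential_threshold 1024
 */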
3040 static void cache_status(struct dm_target *ti, status_type_t type,
3041 			 unsigned status_flags, char *result, unsigned maxlen)
3042 {
3043 	int r = 0;
3044 	unsigned i;
3045 	ssize_t sz = 0;
3046 	dm_block_t nr_free_blocks_metadata = 0;
3047 	dm_block_t nr_blocks_metadata = 0;
3048 	char buf[BDEVNAME_SIZE];
3049 	struct cache *cache = ti->private;
3050 	dm_cblock_t residency;
3051 
3052 	switch (type) {
3053 	case STATUSTYPE_INFO:
3054 		/* Commit to ensure statistics aren't out-of-date */
3055 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
3056 			r = dm_cache_commit(cache->cmd, false);
3057 			if (r)
3058 				DMERR("could not commit metadata for accurate status");
3059 		}
3060 
3061 		r = dm_cache_get_free_metadata_block_count(cache->cmd,
3062 							   &nr_free_blocks_metadata);
3063 		if (r) {
3064 			DMERR("could not get metadata free block count");
3065 			goto err;
3066 		}
3067 
3068 		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
3069 		if (r) {
3070 			DMERR("could not get metadata device size");
3071 			goto err;
3072 		}
3073 
3074 		residency = policy_residency(cache->policy);
3075 
3076 		DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
3077 		       (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
3078 		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3079 		       (unsigned long long)nr_blocks_metadata,
3080 		       cache->sectors_per_block,
3081 		       (unsigned long long) from_cblock(residency),
3082 		       (unsigned long long) from_cblock(cache->cache_size),
3083 		       (unsigned) atomic_read(&cache->stats.read_hit),
3084 		       (unsigned) atomic_read(&cache->stats.read_miss),
3085 		       (unsigned) atomic_read(&cache->stats.write_hit),
3086 		       (unsigned) atomic_read(&cache->stats.write_miss),
3087 		       (unsigned) atomic_read(&cache->stats.demotion),
3088 		       (unsigned) atomic_read(&cache->stats.promotion),
3089 		       (unsigned long) atomic_read(&cache->nr_dirty));
3090 
3091 		if (writethrough_mode(&cache->features))
3092 			DMEMIT("1 writethrough ");
3093 
3094 		else if (passthrough_mode(&cache->features))
3095 			DMEMIT("1 passthrough ");
3096 
3097 		else if (writeback_mode(&cache->features))
3098 			DMEMIT("1 writeback ");
3099 
3100 		else {
3101 			DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
3102 			goto err;
3103 		}
3104 
3105 		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
3106 
3107 		DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
3108 		if (sz < maxlen) {
3109 			r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
3110 			if (r)
3111 				DMERR("policy_emit_config_values returned %d", r);
3112 		}
3113 
3114 		break;
3115 
3116 	case STATUSTYPE_TABLE:
3117 		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3118 		DMEMIT("%s ", buf);
3119 		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3120 		DMEMIT("%s ", buf);
3121 		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3122 		DMEMIT("%s", buf);
3123 
3124 		for (i = 0; i < cache->nr_ctr_args - 1; i++)
3125 			DMEMIT(" %s", cache->ctr_args[i]);
3126 		if (cache->nr_ctr_args)
3127 			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
3128 	}
3129 
3130 	return;
3131 
3132 err:
3133 	DMEMIT("Error");
3134 }
3135 
3136 /*
3137  * A cache block range can take two forms:
3138  *
3139  * i) A single cblock, eg. '3456'
3140  * ii) A begin and end cblock with a dash between, eg. 123-234
3141  */
3142 static int parse_cblock_range(struct cache *cache, const char *str,
3143 			      struct cblock_range *result)
3144 {
3145 	char dummy;
3146 	uint64_t b, e;
3147 	int r;
3148 
3149 	/*
3150 	 * Try and parse form (ii) first.
3151 	 */
3152 	r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
3153 	if (r < 0)
3154 		return r;
3155 
3156 	if (r == 2) {
3157 		result->begin = to_cblock(b);
3158 		result->end = to_cblock(e);
3159 		return 0;
3160 	}
3161 
3162 	/*
3163 	 * That didn't work, try form (i).
3164 	 */
3165 	r = sscanf(str, "%llu%c", &b, &dummy);
3166 	if (r < 0)
3167 		return r;
3168 
3169 	if (r == 1) {
3170 		result->begin = to_cblock(b);
3171 		result->end = to_cblock(from_cblock(result->begin) + 1u);
3172 		return 0;
3173 	}
3174 
3175 	DMERR("invalid cblock range '%s'", str);
3176 	return -EINVAL;
3177 }
3178 
3179 static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
3180 {
3181 	uint64_t b = from_cblock(range->begin);
3182 	uint64_t e = from_cblock(range->end);
3183 	uint64_t n = from_cblock(cache->cache_size);
3184 
3185 	if (b >= n) {
3186 		DMERR("begin cblock out of range: %llu >= %llu", b, n);
3187 		return -EINVAL;
3188 	}
3189 
3190 	if (e > n) {
3191 		DMERR("end cblock out of range: %llu > %llu", e, n);
3192 		return -EINVAL;
3193 	}
3194 
3195 	if (b >= e) {
3196 		DMERR("invalid cblock range: %llu >= %llu", b, e);
3197 		return -EINVAL;
3198 	}
3199 
3200 	return 0;
3201 }
3202 
3203 static int request_invalidation(struct cache *cache, struct cblock_range *range)
3204 {
3205 	struct invalidation_request req;
3206 
3207 	INIT_LIST_HEAD(&req.list);
3208 	req.cblocks = range;
3209 	atomic_set(&req.complete, 0);
3210 	req.err = 0;
3211 	init_waitqueue_head(&req.result_wait);
3212 
3213 	spin_lock(&cache->invalidation_lock);
3214 	list_add(&req.list, &cache->invalidation_requests);
3215 	spin_unlock(&cache->invalidation_lock);
3216 	wake_worker(cache);
3217 
3218 	wait_event(req.result_wait, atomic_read(&req.complete));
3219 	return req.err;
3220 }
3221 
3222 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
3223 					      const char **cblock_ranges)
3224 {
3225 	int r = 0;
3226 	unsigned i;
3227 	struct cblock_range range;
3228 
3229 	if (!passthrough_mode(&cache->features)) {
3230 		DMERR("cache has to be in passthrough mode for invalidation");
3231 		return -EPERM;
3232 	}
3233 
3234 	for (i = 0; i < count; i++) {
3235 		r = parse_cblock_range(cache, cblock_ranges[i], &range);
3236 		if (r)
3237 			break;
3238 
3239 		r = validate_cblock_range(cache, &range);
3240 		if (r)
3241 			break;
3242 
3243 		/*
3244 		 * Pass begin and end origin blocks to the worker and wake it.
3245 		 */
3246 		r = request_invalidation(cache, &range);
3247 		if (r)
3248 			break;
3249 	}
3250 
3251 	return r;
3252 }
3253 
3254 /*
3255  * Supports
3256  *	"<key> <value>"
3257  * and
3258  *     "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
3259  *
3260  * The key migration_threshold is supported by the cache target core.
3261  */
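
/*
 * Example invocations (the device name is hypothetical; invalidate_cblocks
 * only succeeds while the cache is in passthrough mode):
 *
 *   dmsetup message my-cache 0 migration_threshold 4096
 *   dmsetup message my-cache 0 invalidate_cblocks 3456 7000-8000
 */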
3262 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
3263 {
3264 	struct cache *cache = ti->private;
3265 
3266 	if (!argc)
3267 		return -EINVAL;
3268 
3269 	if (!strcasecmp(argv[0], "invalidate_cblocks"))
3270 		return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3271 
3272 	if (argc != 2)
3273 		return -EINVAL;
3274 
3275 	return set_config_value(cache, argv[0], argv[1]);
3276 }
3277 
3278 static int cache_iterate_devices(struct dm_target *ti,
3279 				 iterate_devices_callout_fn fn, void *data)
3280 {
3281 	int r = 0;
3282 	struct cache *cache = ti->private;
3283 
3284 	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
3285 	if (!r)
3286 		r = fn(ti, cache->origin_dev, 0, ti->len, data);
3287 
3288 	return r;
3289 }
3290 
3291 /*
3292  * We assume I/O is going to the origin (which is the volume
3293  * more likely to have restrictions e.g. by being striped).
3294  * (Looking up the exact location of the data would be expensive
3295  * and could always be out of date by the time the bio is submitted.)
3296  */
3297 static int cache_bvec_merge(struct dm_target *ti,
3298 			    struct bvec_merge_data *bvm,
3299 			    struct bio_vec *biovec, int max_size)
3300 {
3301 	struct cache *cache = ti->private;
3302 	struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
3303 
3304 	if (!q->merge_bvec_fn)
3305 		return max_size;
3306 
3307 	bvm->bi_bdev = cache->origin_dev->bdev;
3308 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3309 }
3310 
3311 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3312 {
3313 	/*
3314 	 * FIXME: these limits may be incompatible with the cache device
3315 	 */
3316 	limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
3317 					    cache->origin_sectors);
3318 	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3319 }
3320 
3321 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3322 {
3323 	struct cache *cache = ti->private;
3324 	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3325 
3326 	/*
3327 	 * If the system-determined stacked limits are compatible with the
3328 	 * cache's blocksize (io_opt is a factor) do not override them.
3329 	 */
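	/*
	 * For example (figures illustrative): an io_opt of 2048 sectors with
	 * 512 sector cache blocks divides evenly and is left alone, whereas
	 * an io_opt of 768 sectors would be overridden below.
	 */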
3330 	if (io_opt_sectors < cache->sectors_per_block ||
3331 	    do_div(io_opt_sectors, cache->sectors_per_block)) {
3332 		blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
3333 		blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
3334 	}
3335 	set_discard_limits(cache, limits);
3336 }
3337 
3338 /*----------------------------------------------------------------*/
3339 
3340 static struct target_type cache_target = {
3341 	.name = "cache",
3342 	.version = {1, 6, 0},
3343 	.module = THIS_MODULE,
3344 	.ctr = cache_ctr,
3345 	.dtr = cache_dtr,
3346 	.map = cache_map,
3347 	.end_io = cache_end_io,
3348 	.postsuspend = cache_postsuspend,
3349 	.preresume = cache_preresume,
3350 	.resume = cache_resume,
3351 	.status = cache_status,
3352 	.message = cache_message,
3353 	.iterate_devices = cache_iterate_devices,
3354 	.merge = cache_bvec_merge,
3355 	.io_hints = cache_io_hints,
3356 };
3357 
3358 static int __init dm_cache_init(void)
3359 {
3360 	int r;
3361 
3362 	r = dm_register_target(&cache_target);
3363 	if (r) {
3364 		DMERR("cache target registration failed: %d", r);
3365 		return r;
3366 	}
3367 
3368 	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
3369 	if (!migration_cache) {
3370 		dm_unregister_target(&cache_target);
3371 		return -ENOMEM;
3372 	}
3373 
3374 	return 0;
3375 }
3376 
3377 static void __exit dm_cache_exit(void)
3378 {
3379 	dm_unregister_target(&cache_target);
3380 	kmem_cache_destroy(migration_cache);
3381 }
3382 
3383 module_init(dm_cache_init);
3384 module_exit(dm_cache_exit);
3385 
3386 MODULE_DESCRIPTION(DM_NAME " cache target");
3387 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
3388 MODULE_LICENSE("GPL");
3389