xref: /openbmc/linux/drivers/md/dm-cache-target.c (revision ecc23d0a422a3118fcf6e4f0a46e17a6c2047b02)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * Copyright (C) 2012 Red Hat. All rights reserved.
4   *
5   * This file is released under the GPL.
6   */
7  
8  #include "dm.h"
9  #include "dm-bio-prison-v2.h"
10  #include "dm-bio-record.h"
11  #include "dm-cache-metadata.h"
12  #include "dm-io-tracker.h"
13  #include "dm-cache-background-tracker.h"
14  
15  #include <linux/dm-io.h>
16  #include <linux/dm-kcopyd.h>
17  #include <linux/jiffies.h>
18  #include <linux/init.h>
19  #include <linux/mempool.h>
20  #include <linux/module.h>
21  #include <linux/rwsem.h>
22  #include <linux/slab.h>
23  #include <linux/vmalloc.h>
24  
25  #define DM_MSG_PREFIX "cache"
26  
27  DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
28  	"A percentage of time allocated for copying to and/or from cache");
29  
30  /*----------------------------------------------------------------*/
31  
32  /*
33   * Glossary:
34   *
35   * oblock: index of an origin block
36   * cblock: index of a cache block
37   * promotion: movement of a block from origin to cache
38   * demotion: movement of a block from cache to origin
39   * migration: movement of a block between the origin and cache device,
40   *	      either direction
41   */
42  
43  /*----------------------------------------------------------------*/
44  
45  /*
46   * Represents a chunk of future work.  'input' allows continuations to pass
47   * values between themselves, typically error values.
48   */
49  struct continuation {
50  	struct work_struct ws;
51  	blk_status_t input;
52  };
53  
54  static inline void init_continuation(struct continuation *k,
55  				     void (*fn)(struct work_struct *))
56  {
57  	INIT_WORK(&k->ws, fn);
58  	k->input = 0;
59  }
60  
61  static inline void queue_continuation(struct workqueue_struct *wq,
62  				      struct continuation *k)
63  {
64  	queue_work(wq, &k->ws);
65  }
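
/*
 * Illustrative sketch (not used by the driver): a continuation packages
 * "what happens next" as a work item.  'my_done', 'k' and 'wq' below are
 * hypothetical names:
 *
 *	static void my_done(struct work_struct *ws)
 *	{
 *		struct continuation *k = container_of(ws, struct continuation, ws);
 *
 *		if (k->input)
 *			return;		// a previous stage passed us an error
 *		// ... carry on with the next stage ...
 *	}
 *
 *	init_continuation(&k, my_done);
 *	queue_continuation(wq, &k);	// my_done() runs from the workqueue
 */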
66  
67  /*----------------------------------------------------------------*/
68  
69  /*
70   * The batcher collects together pieces of work that need a particular
71   * operation to occur before they can proceed (typically a commit).
72   */
73  struct batcher {
74  	/*
75  	 * The operation that everyone is waiting for.
76  	 */
77  	blk_status_t (*commit_op)(void *context);
78  	void *commit_context;
79  
80  	/*
81  	 * This is how bios should be issued once the commit op is complete
82  	 * (accounted_request).
83  	 */
84  	void (*issue_op)(struct bio *bio, void *context);
85  	void *issue_context;
86  
87  	/*
88  	 * Queued work gets put on here after commit.
89  	 */
90  	struct workqueue_struct *wq;
91  
92  	spinlock_t lock;
93  	struct list_head work_items;
94  	struct bio_list bios;
95  	struct work_struct commit_work;
96  
97  	bool commit_scheduled;
98  };
99  
100  static void __commit(struct work_struct *_ws)
101  {
102  	struct batcher *b = container_of(_ws, struct batcher, commit_work);
103  	blk_status_t r;
104  	struct list_head work_items;
105  	struct work_struct *ws, *tmp;
106  	struct continuation *k;
107  	struct bio *bio;
108  	struct bio_list bios;
109  
110  	INIT_LIST_HEAD(&work_items);
111  	bio_list_init(&bios);
112  
113  	/*
114  	 * We have to grab these before the commit_op to avoid a race
115  	 * condition.
116  	 */
117  	spin_lock_irq(&b->lock);
118  	list_splice_init(&b->work_items, &work_items);
119  	bio_list_merge(&bios, &b->bios);
120  	bio_list_init(&b->bios);
121  	b->commit_scheduled = false;
122  	spin_unlock_irq(&b->lock);
123  
124  	r = b->commit_op(b->commit_context);
125  
126  	list_for_each_entry_safe(ws, tmp, &work_items, entry) {
127  		k = container_of(ws, struct continuation, ws);
128  		k->input = r;
129  		INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
130  		queue_work(b->wq, ws);
131  	}
132  
133  	while ((bio = bio_list_pop(&bios))) {
134  		if (r) {
135  			bio->bi_status = r;
136  			bio_endio(bio);
137  		} else
138  			b->issue_op(bio, b->issue_context);
139  	}
140  }
141  
142  static void batcher_init(struct batcher *b,
143  			 blk_status_t (*commit_op)(void *),
144  			 void *commit_context,
145  			 void (*issue_op)(struct bio *bio, void *),
146  			 void *issue_context,
147  			 struct workqueue_struct *wq)
148  {
149  	b->commit_op = commit_op;
150  	b->commit_context = commit_context;
151  	b->issue_op = issue_op;
152  	b->issue_context = issue_context;
153  	b->wq = wq;
154  
155  	spin_lock_init(&b->lock);
156  	INIT_LIST_HEAD(&b->work_items);
157  	bio_list_init(&b->bios);
158  	INIT_WORK(&b->commit_work, __commit);
159  	b->commit_scheduled = false;
160  }
161  
162  static void async_commit(struct batcher *b)
163  {
164  	queue_work(b->wq, &b->commit_work);
165  }
166  
167  static void continue_after_commit(struct batcher *b, struct continuation *k)
168  {
169  	bool commit_scheduled;
170  
171  	spin_lock_irq(&b->lock);
172  	commit_scheduled = b->commit_scheduled;
173  	list_add_tail(&k->ws.entry, &b->work_items);
174  	spin_unlock_irq(&b->lock);
175  
176  	if (commit_scheduled)
177  		async_commit(b);
178  }
179  
180  /*
181   * Bios are errored if commit failed.
182   */
183  static void issue_after_commit(struct batcher *b, struct bio *bio)
184  {
185  	bool commit_scheduled;
186  
187  	spin_lock_irq(&b->lock);
188  	commit_scheduled = b->commit_scheduled;
189  	bio_list_add(&b->bios, bio);
190  	spin_unlock_irq(&b->lock);
191  
192  	if (commit_scheduled)
193  		async_commit(b);
194  }
195  
196  /*
197   * Call this if some urgent work is waiting for the commit to complete.
198   */
199  static void schedule_commit(struct batcher *b)
200  {
201  	bool immediate;
202  
203  	spin_lock_irq(&b->lock);
204  	immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
205  	b->commit_scheduled = true;
206  	spin_unlock_irq(&b->lock);
207  
208  	if (immediate)
209  		async_commit(b);
210  }
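
/*
 * Sketch of how the committer batcher is used by this target.  The
 * batcher_init() call itself lives in the cache constructor, outside this
 * excerpt; commit_op() and issue_op() are defined further down:
 *
 *	batcher_init(&cache->committer, commit_op, cache,
 *		     issue_op, cache, cache->wq);
 *
 *	issue_after_commit(&cache->committer, bio);	   // bio issued or errored after commit
 *	continue_after_commit(&cache->committer, &mg->k); // continuation runs after commit
 *	schedule_commit(&cache->committer);		   // kicks __commit() if work is queued
 */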
211  
212  /*
213   * There are a couple of places where we let a bio run, but want to do some
214   * work before calling its endio function.  We do this by temporarily
215   * changing the endio fn.
216   */
217  struct dm_hook_info {
218  	bio_end_io_t *bi_end_io;
219  };
220  
221  static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
222  			bio_end_io_t *bi_end_io, void *bi_private)
223  {
224  	h->bi_end_io = bio->bi_end_io;
225  
226  	bio->bi_end_io = bi_end_io;
227  	bio->bi_private = bi_private;
228  }
229  
230  static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
231  {
232  	bio->bi_end_io = h->bi_end_io;
233  }
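
/*
 * Sketch of the hook in use (this is the pattern overwrite() and
 * overwrite_endio() follow later in the file); 'my_endio' and 'my_context'
 * are hypothetical names:
 *
 *	dm_hook_bio(&pb->hook_info, bio, my_endio, my_context);
 *	... submit the bio ...
 *
 *	static void my_endio(struct bio *bio)
 *	{
 *		struct per_bio_data *pb = get_per_bio_data(bio);
 *
 *		dm_unhook_bio(&pb->hook_info, bio);
 *		// do the extra work, then complete or requeue the bio
 *	}
 */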
234  
235  /*----------------------------------------------------------------*/
236  
237  #define MIGRATION_POOL_SIZE 128
238  #define COMMIT_PERIOD HZ
239  #define MIGRATION_COUNT_WINDOW 10
240  
241  /*
242   * The block size of the device holding cache data must be
243   * between 32KB and 1GB.
244   */
245  #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
246  #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
247  
248  enum cache_metadata_mode {
249  	CM_WRITE,		/* metadata may be changed */
250  	CM_READ_ONLY,		/* metadata may not be changed */
251  	CM_FAIL
252  };
253  
254  enum cache_io_mode {
255  	/*
256  	 * Data is written to cached blocks only.  These blocks are marked
257  	 * dirty.  If you lose the cache device you will lose data.
258  	 * Potential performance increase for both reads and writes.
259  	 */
260  	CM_IO_WRITEBACK,
261  
262  	/*
263  	 * Data is written to both cache and origin.  Blocks are never
264  	 * dirty.  Potential performance benefit for reads only.
265  	 */
266  	CM_IO_WRITETHROUGH,
267  
268  	/*
269  	 * A degraded mode useful for various cache coherency situations
270  	 * (eg, rolling back snapshots).  Reads and writes always go to the
271  	 * origin.  If a write goes to a cached oblock, then the cache
272  	 * block is invalidated.
273  	 */
274  	CM_IO_PASSTHROUGH
275  };
276  
277  struct cache_features {
278  	enum cache_metadata_mode mode;
279  	enum cache_io_mode io_mode;
280  	unsigned int metadata_version;
281  	bool discard_passdown:1;
282  };
283  
284  struct cache_stats {
285  	atomic_t read_hit;
286  	atomic_t read_miss;
287  	atomic_t write_hit;
288  	atomic_t write_miss;
289  	atomic_t demotion;
290  	atomic_t promotion;
291  	atomic_t writeback;
292  	atomic_t copies_avoided;
293  	atomic_t cache_cell_clash;
294  	atomic_t commit_count;
295  	atomic_t discard_count;
296  };
297  
298  struct cache {
299  	struct dm_target *ti;
300  	spinlock_t lock;
301  
302  	/*
303  	 * Fields for converting from sectors to blocks.
304  	 */
305  	int sectors_per_block_shift;
306  	sector_t sectors_per_block;
307  
308  	struct dm_cache_metadata *cmd;
309  
310  	/*
311  	 * Metadata is written to this device.
312  	 */
313  	struct dm_dev *metadata_dev;
314  
315  	/*
316  	 * The slower of the two data devices.  Typically a spindle.
317  	 */
318  	struct dm_dev *origin_dev;
319  
320  	/*
321  	 * The faster of the two data devices.  Typically an SSD.
322  	 */
323  	struct dm_dev *cache_dev;
324  
325  	/*
326  	 * Size of the origin device in _complete_ blocks and native sectors.
327  	 */
328  	dm_oblock_t origin_blocks;
329  	sector_t origin_sectors;
330  
331  	/*
332  	 * Size of the cache device in blocks.
333  	 */
334  	dm_cblock_t cache_size;
335  
336  	/*
337  	 * Invalidation fields.
338  	 */
339  	spinlock_t invalidation_lock;
340  	struct list_head invalidation_requests;
341  
342  	sector_t migration_threshold;
343  	wait_queue_head_t migration_wait;
344  	atomic_t nr_allocated_migrations;
345  
346  	/*
347  	 * The number of in flight migrations that are performing
348  	 * background io. eg, promotion, writeback.
349  	 */
350  	atomic_t nr_io_migrations;
351  
352  	struct bio_list deferred_bios;
353  
354  	struct rw_semaphore quiesce_lock;
355  
356  	/*
357  	 * origin_blocks entries, discarded if set.
358  	 */
359  	dm_dblock_t discard_nr_blocks;
360  	unsigned long *discard_bitset;
361  	uint32_t discard_block_size; /* a power of 2 times sectors per block */
362  
363  	/*
364  	 * Rather than reconstructing the table line for the status we just
365  	 * save it and regurgitate.
366  	 */
367  	unsigned int nr_ctr_args;
368  	const char **ctr_args;
369  
370  	struct dm_kcopyd_client *copier;
371  	struct work_struct deferred_bio_worker;
372  	struct work_struct migration_worker;
373  	struct workqueue_struct *wq;
374  	struct delayed_work waker;
375  	struct dm_bio_prison_v2 *prison;
376  
377  	/*
378  	 * cache_size entries, dirty if set
379  	 */
380  	unsigned long *dirty_bitset;
381  	atomic_t nr_dirty;
382  
383  	unsigned int policy_nr_args;
384  	struct dm_cache_policy *policy;
385  
386  	/*
387  	 * Cache features such as write-through.
388  	 */
389  	struct cache_features features;
390  
391  	struct cache_stats stats;
392  
393  	bool need_tick_bio:1;
394  	bool sized:1;
395  	bool invalidate:1;
396  	bool commit_requested:1;
397  	bool loaded_mappings:1;
398  	bool loaded_discards:1;
399  
400  	struct rw_semaphore background_work_lock;
401  
402  	struct batcher committer;
403  	struct work_struct commit_ws;
404  
405  	struct dm_io_tracker tracker;
406  
407  	mempool_t migration_pool;
408  
409  	struct bio_set bs;
410  };
411  
412  struct per_bio_data {
413  	bool tick:1;
414  	unsigned int req_nr:2;
415  	struct dm_bio_prison_cell_v2 *cell;
416  	struct dm_hook_info hook_info;
417  	sector_t len;
418  };
419  
420  struct dm_cache_migration {
421  	struct continuation k;
422  	struct cache *cache;
423  
424  	struct policy_work *op;
425  	struct bio *overwrite_bio;
426  	struct dm_bio_prison_cell_v2 *cell;
427  
428  	dm_cblock_t invalidate_cblock;
429  	dm_oblock_t invalidate_oblock;
430  };
431  
432  /*----------------------------------------------------------------*/
433  
434  static bool writethrough_mode(struct cache *cache)
435  {
436  	return cache->features.io_mode == CM_IO_WRITETHROUGH;
437  }
438  
439  static bool writeback_mode(struct cache *cache)
440  {
441  	return cache->features.io_mode == CM_IO_WRITEBACK;
442  }
443  
444  static inline bool passthrough_mode(struct cache *cache)
445  {
446  	return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH);
447  }
448  
449  /*----------------------------------------------------------------*/
450  
451  static void wake_deferred_bio_worker(struct cache *cache)
452  {
453  	queue_work(cache->wq, &cache->deferred_bio_worker);
454  }
455  
456  static void wake_migration_worker(struct cache *cache)
457  {
458  	if (passthrough_mode(cache))
459  		return;
460  
461  	queue_work(cache->wq, &cache->migration_worker);
462  }
463  
464  /*----------------------------------------------------------------*/
465  
466  static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
467  {
468  	return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO);
469  }
470  
471  static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
472  {
473  	dm_bio_prison_free_cell_v2(cache->prison, cell);
474  }
475  
476  static struct dm_cache_migration *alloc_migration(struct cache *cache)
477  {
478  	struct dm_cache_migration *mg;
479  
480  	mg = mempool_alloc(&cache->migration_pool, GFP_NOIO);
481  
482  	memset(mg, 0, sizeof(*mg));
483  
484  	mg->cache = cache;
485  	atomic_inc(&cache->nr_allocated_migrations);
486  
487  	return mg;
488  }
489  
490  static void free_migration(struct dm_cache_migration *mg)
491  {
492  	struct cache *cache = mg->cache;
493  
494  	if (atomic_dec_and_test(&cache->nr_allocated_migrations))
495  		wake_up(&cache->migration_wait);
496  
497  	mempool_free(mg, &cache->migration_pool);
498  }
499  
500  /*----------------------------------------------------------------*/
501  
502  static inline dm_oblock_t oblock_succ(dm_oblock_t b)
503  {
504  	return to_oblock(from_oblock(b) + 1ull);
505  }
506  
507  static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
508  {
509  	key->virtual = 0;
510  	key->dev = 0;
511  	key->block_begin = from_oblock(begin);
512  	key->block_end = from_oblock(end);
513  }
514  
515  /*
516   * We have two lock levels.  Level 0, which is used to prevent WRITEs, and
517   * level 1 which prevents *both* READs and WRITEs.
518   */
519  #define WRITE_LOCK_LEVEL 0
520  #define READ_WRITE_LOCK_LEVEL 1
521  
522  static unsigned int lock_level(struct bio *bio)
523  {
524  	return bio_data_dir(bio) == WRITE ?
525  		WRITE_LOCK_LEVEL :
526  		READ_WRITE_LOCK_LEVEL;
527  }
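
/*
 * Example: a migration initially holds its exclusive lock at
 * WRITE_LOCK_LEVEL, so an incoming WRITE (which asks for level 0) must wait
 * while a READ (which asks for level 1) can still be granted a shared lock.
 * Once the migration upgrades to READ_WRITE_LOCK_LEVEL, both are held back.
 */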
528  
529  /*
530   *--------------------------------------------------------------
531   * Per bio data
532   *--------------------------------------------------------------
533   */
534  
535  static struct per_bio_data *get_per_bio_data(struct bio *bio)
536  {
537  	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
538  
539  	BUG_ON(!pb);
540  	return pb;
541  }
542  
543  static struct per_bio_data *init_per_bio_data(struct bio *bio)
544  {
545  	struct per_bio_data *pb = get_per_bio_data(bio);
546  
547  	pb->tick = false;
548  	pb->req_nr = dm_bio_get_target_bio_nr(bio);
549  	pb->cell = NULL;
550  	pb->len = 0;
551  
552  	return pb;
553  }
554  
555  /*----------------------------------------------------------------*/
556  
557  static void defer_bio(struct cache *cache, struct bio *bio)
558  {
559  	spin_lock_irq(&cache->lock);
560  	bio_list_add(&cache->deferred_bios, bio);
561  	spin_unlock_irq(&cache->lock);
562  
563  	wake_deferred_bio_worker(cache);
564  }
565  
566  static void defer_bios(struct cache *cache, struct bio_list *bios)
567  {
568  	spin_lock_irq(&cache->lock);
569  	bio_list_merge(&cache->deferred_bios, bios);
570  	bio_list_init(bios);
571  	spin_unlock_irq(&cache->lock);
572  
573  	wake_deferred_bio_worker(cache);
574  }
575  
576  /*----------------------------------------------------------------*/
577  
578  static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
579  {
580  	bool r;
581  	struct per_bio_data *pb;
582  	struct dm_cell_key_v2 key;
583  	dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
584  	struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
585  
586  	cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
587  
588  	build_key(oblock, end, &key);
589  	r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
590  	if (!r) {
591  		/*
592  		 * Failed to get the lock.
593  		 */
594  		free_prison_cell(cache, cell_prealloc);
595  		return r;
596  	}
597  
598  	if (cell != cell_prealloc)
599  		free_prison_cell(cache, cell_prealloc);
600  
601  	pb = get_per_bio_data(bio);
602  	pb->cell = cell;
603  
604  	return r;
605  }
606  
607  /*----------------------------------------------------------------*/
608  
609  static bool is_dirty(struct cache *cache, dm_cblock_t b)
610  {
611  	return test_bit(from_cblock(b), cache->dirty_bitset);
612  }
613  
614  static void set_dirty(struct cache *cache, dm_cblock_t cblock)
615  {
616  	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
617  		atomic_inc(&cache->nr_dirty);
618  		policy_set_dirty(cache->policy, cblock);
619  	}
620  }
621  
622  /*
623   * These two are called after migrations to force the policy
624   * and dirty bitset to be in sync.
625   */
626  static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
627  {
628  	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
629  		atomic_inc(&cache->nr_dirty);
630  	policy_set_dirty(cache->policy, cblock);
631  }
632  
633  static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
634  {
635  	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
636  		if (atomic_dec_return(&cache->nr_dirty) == 0)
637  			dm_table_event(cache->ti->table);
638  	}
639  
640  	policy_clear_dirty(cache->policy, cblock);
641  }
642  
643  /*----------------------------------------------------------------*/
644  
645  static bool block_size_is_power_of_two(struct cache *cache)
646  {
647  	return cache->sectors_per_block_shift >= 0;
648  }
649  
650  static dm_block_t block_div(dm_block_t b, uint32_t n)
651  {
652  	do_div(b, n);
653  
654  	return b;
655  }
656  
657  static dm_block_t oblocks_per_dblock(struct cache *cache)
658  {
659  	dm_block_t oblocks = cache->discard_block_size;
660  
661  	if (block_size_is_power_of_two(cache))
662  		oblocks >>= cache->sectors_per_block_shift;
663  	else
664  		oblocks = block_div(oblocks, cache->sectors_per_block);
665  
666  	return oblocks;
667  }
668  
669  static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
670  {
671  	return to_dblock(block_div(from_oblock(oblock),
672  				   oblocks_per_dblock(cache)));
673  }
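
/*
 * Worked example: with sectors_per_block = 128 and discard_block_size = 2048
 * sectors, oblocks_per_dblock() is 16, so oblock 40 falls in dblock 2
 * (40 / 16 = 2).
 */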
674  
675  static void set_discard(struct cache *cache, dm_dblock_t b)
676  {
677  	BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
678  	atomic_inc(&cache->stats.discard_count);
679  
680  	spin_lock_irq(&cache->lock);
681  	set_bit(from_dblock(b), cache->discard_bitset);
682  	spin_unlock_irq(&cache->lock);
683  }
684  
685  static void clear_discard(struct cache *cache, dm_dblock_t b)
686  {
687  	spin_lock_irq(&cache->lock);
688  	clear_bit(from_dblock(b), cache->discard_bitset);
689  	spin_unlock_irq(&cache->lock);
690  }
691  
692  static bool is_discarded(struct cache *cache, dm_dblock_t b)
693  {
694  	int r;
695  
696  	spin_lock_irq(&cache->lock);
697  	r = test_bit(from_dblock(b), cache->discard_bitset);
698  	spin_unlock_irq(&cache->lock);
699  
700  	return r;
701  }
702  
703  static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
704  {
705  	int r;
706  
707  	spin_lock_irq(&cache->lock);
708  	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
709  		     cache->discard_bitset);
710  	spin_unlock_irq(&cache->lock);
711  
712  	return r;
713  }
714  
715  /*
716   * -------------------------------------------------------------
717   * Remapping
718   *--------------------------------------------------------------
719   */
720  static void remap_to_origin(struct cache *cache, struct bio *bio)
721  {
722  	bio_set_dev(bio, cache->origin_dev->bdev);
723  }
724  
725  static void remap_to_cache(struct cache *cache, struct bio *bio,
726  			   dm_cblock_t cblock)
727  {
728  	sector_t bi_sector = bio->bi_iter.bi_sector;
729  	sector_t block = from_cblock(cblock);
730  
731  	bio_set_dev(bio, cache->cache_dev->bdev);
732  	if (!block_size_is_power_of_two(cache))
733  		bio->bi_iter.bi_sector =
734  			(block * cache->sectors_per_block) +
735  			sector_div(bi_sector, cache->sectors_per_block);
736  	else
737  		bio->bi_iter.bi_sector =
738  			(block << cache->sectors_per_block_shift) |
739  			(bi_sector & (cache->sectors_per_block - 1));
740  }
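
/*
 * Worked example (power of two path): sectors_per_block = 128, so
 * sectors_per_block_shift = 7.  A bio at sector 1000 remapped to cblock 5
 * lands at sector (5 << 7) | (1000 & 127) = 640 + 104 = 744 on the cache
 * device, i.e. the offset within the block (104) is preserved.
 */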
741  
742  static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
743  {
744  	struct per_bio_data *pb;
745  
746  	spin_lock_irq(&cache->lock);
747  	if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
748  	    bio_op(bio) != REQ_OP_DISCARD) {
749  		pb = get_per_bio_data(bio);
750  		pb->tick = true;
751  		cache->need_tick_bio = false;
752  	}
753  	spin_unlock_irq(&cache->lock);
754  }
755  
756  static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
757  					  dm_oblock_t oblock)
758  {
759  	// FIXME: check_if_tick_bio_needed() is called way too much through this interface
760  	check_if_tick_bio_needed(cache, bio);
761  	remap_to_origin(cache, bio);
762  	if (bio_data_dir(bio) == WRITE)
763  		clear_discard(cache, oblock_to_dblock(cache, oblock));
764  }
765  
766  static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
767  				 dm_oblock_t oblock, dm_cblock_t cblock)
768  {
769  	check_if_tick_bio_needed(cache, bio);
770  	remap_to_cache(cache, bio, cblock);
771  	if (bio_data_dir(bio) == WRITE) {
772  		set_dirty(cache, cblock);
773  		clear_discard(cache, oblock_to_dblock(cache, oblock));
774  	}
775  }
776  
777  static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
778  {
779  	sector_t block_nr = bio->bi_iter.bi_sector;
780  
781  	if (!block_size_is_power_of_two(cache))
782  		(void) sector_div(block_nr, cache->sectors_per_block);
783  	else
784  		block_nr >>= cache->sectors_per_block_shift;
785  
786  	return to_oblock(block_nr);
787  }
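
/*
 * Worked example: with 128 sectors per block, a bio starting at sector 1000
 * belongs to oblock 7 (1000 >> 7).
 */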
788  
789  static bool accountable_bio(struct cache *cache, struct bio *bio)
790  {
791  	return bio_op(bio) != REQ_OP_DISCARD;
792  }
793  
794  static void accounted_begin(struct cache *cache, struct bio *bio)
795  {
796  	struct per_bio_data *pb;
797  
798  	if (accountable_bio(cache, bio)) {
799  		pb = get_per_bio_data(bio);
800  		pb->len = bio_sectors(bio);
801  		dm_iot_io_begin(&cache->tracker, pb->len);
802  	}
803  }
804  
805  static void accounted_complete(struct cache *cache, struct bio *bio)
806  {
807  	struct per_bio_data *pb = get_per_bio_data(bio);
808  
809  	dm_iot_io_end(&cache->tracker, pb->len);
810  }
811  
812  static void accounted_request(struct cache *cache, struct bio *bio)
813  {
814  	accounted_begin(cache, bio);
815  	dm_submit_bio_remap(bio, NULL);
816  }
817  
818  static void issue_op(struct bio *bio, void *context)
819  {
820  	struct cache *cache = context;
821  
822  	accounted_request(cache, bio);
823  }
824  
825  /*
826   * When running in writethrough mode we need to send writes to clean blocks
827   * to both the cache and origin devices.  Clone the bio and send them in parallel.
828   */
829  static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio,
830  				      dm_oblock_t oblock, dm_cblock_t cblock)
831  {
832  	struct bio *origin_bio = bio_alloc_clone(cache->origin_dev->bdev, bio,
833  						 GFP_NOIO, &cache->bs);
834  
835  	BUG_ON(!origin_bio);
836  
837  	bio_chain(origin_bio, bio);
838  
839  	if (bio_data_dir(origin_bio) == WRITE)
840  		clear_discard(cache, oblock_to_dblock(cache, oblock));
841  	submit_bio(origin_bio);
842  
843  	remap_to_cache(cache, bio, cblock);
844  }
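
/*
 * Note: bio_chain() makes the original bio the parent of origin_bio, so the
 * original bio does not complete until the cloned write to the origin has
 * also completed; the original is then remapped to the cache device and
 * submitted by the caller.
 */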
845  
846  /*
847   *--------------------------------------------------------------
848   * Failure modes
849   *--------------------------------------------------------------
850   */
851  static enum cache_metadata_mode get_cache_mode(struct cache *cache)
852  {
853  	return cache->features.mode;
854  }
855  
856  static const char *cache_device_name(struct cache *cache)
857  {
858  	return dm_table_device_name(cache->ti->table);
859  }
860  
861  static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
862  {
863  	static const char *descs[] = {
864  		"write",
865  		"read-only",
866  		"fail"
867  	};
868  
869  	dm_table_event(cache->ti->table);
870  	DMINFO("%s: switching cache to %s mode",
871  	       cache_device_name(cache), descs[(int)mode]);
872  }
873  
874  static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
875  {
876  	bool needs_check;
877  	enum cache_metadata_mode old_mode = get_cache_mode(cache);
878  
879  	if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
880  		DMERR("%s: unable to read needs_check flag, setting failure mode.",
881  		      cache_device_name(cache));
882  		new_mode = CM_FAIL;
883  	}
884  
885  	if (new_mode == CM_WRITE && needs_check) {
886  		DMERR("%s: unable to switch cache to write mode until repaired.",
887  		      cache_device_name(cache));
888  		if (old_mode != new_mode)
889  			new_mode = old_mode;
890  		else
891  			new_mode = CM_READ_ONLY;
892  	}
893  
894  	/* Never move out of fail mode */
895  	if (old_mode == CM_FAIL)
896  		new_mode = CM_FAIL;
897  
898  	switch (new_mode) {
899  	case CM_FAIL:
900  	case CM_READ_ONLY:
901  		dm_cache_metadata_set_read_only(cache->cmd);
902  		break;
903  
904  	case CM_WRITE:
905  		dm_cache_metadata_set_read_write(cache->cmd);
906  		break;
907  	}
908  
909  	cache->features.mode = new_mode;
910  
911  	if (new_mode != old_mode)
912  		notify_mode_switch(cache, new_mode);
913  }
914  
915  static void abort_transaction(struct cache *cache)
916  {
917  	const char *dev_name = cache_device_name(cache);
918  
919  	if (get_cache_mode(cache) >= CM_READ_ONLY)
920  		return;
921  
922  	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
923  	if (dm_cache_metadata_abort(cache->cmd)) {
924  		DMERR("%s: failed to abort metadata transaction", dev_name);
925  		set_cache_mode(cache, CM_FAIL);
926  	}
927  
928  	if (dm_cache_metadata_set_needs_check(cache->cmd)) {
929  		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
930  		set_cache_mode(cache, CM_FAIL);
931  	}
932  }
933  
934  static void metadata_operation_failed(struct cache *cache, const char *op, int r)
935  {
936  	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
937  		    cache_device_name(cache), op, r);
938  	abort_transaction(cache);
939  	set_cache_mode(cache, CM_READ_ONLY);
940  }
941  
942  /*----------------------------------------------------------------*/
943  
944  static void load_stats(struct cache *cache)
945  {
946  	struct dm_cache_statistics stats;
947  
948  	dm_cache_metadata_get_stats(cache->cmd, &stats);
949  	atomic_set(&cache->stats.read_hit, stats.read_hits);
950  	atomic_set(&cache->stats.read_miss, stats.read_misses);
951  	atomic_set(&cache->stats.write_hit, stats.write_hits);
952  	atomic_set(&cache->stats.write_miss, stats.write_misses);
953  }
954  
955  static void save_stats(struct cache *cache)
956  {
957  	struct dm_cache_statistics stats;
958  
959  	if (get_cache_mode(cache) >= CM_READ_ONLY)
960  		return;
961  
962  	stats.read_hits = atomic_read(&cache->stats.read_hit);
963  	stats.read_misses = atomic_read(&cache->stats.read_miss);
964  	stats.write_hits = atomic_read(&cache->stats.write_hit);
965  	stats.write_misses = atomic_read(&cache->stats.write_miss);
966  
967  	dm_cache_metadata_set_stats(cache->cmd, &stats);
968  }
969  
970  static void update_stats(struct cache_stats *stats, enum policy_operation op)
971  {
972  	switch (op) {
973  	case POLICY_PROMOTE:
974  		atomic_inc(&stats->promotion);
975  		break;
976  
977  	case POLICY_DEMOTE:
978  		atomic_inc(&stats->demotion);
979  		break;
980  
981  	case POLICY_WRITEBACK:
982  		atomic_inc(&stats->writeback);
983  		break;
984  	}
985  }
986  
987  /*
988   *---------------------------------------------------------------------
989   * Migration processing
990   *
991   * Migration covers moving data from the origin device to the cache, or
992   * vice versa.
993   *---------------------------------------------------------------------
994   */
995  static void inc_io_migrations(struct cache *cache)
996  {
997  	atomic_inc(&cache->nr_io_migrations);
998  }
999  
1000  static void dec_io_migrations(struct cache *cache)
1001  {
1002  	atomic_dec(&cache->nr_io_migrations);
1003  }
1004  
1005  static bool discard_or_flush(struct bio *bio)
1006  {
1007  	return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
1008  }
1009  
1010  static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1011  				     dm_dblock_t *b, dm_dblock_t *e)
1012  {
1013  	sector_t sb = bio->bi_iter.bi_sector;
1014  	sector_t se = bio_end_sector(bio);
1015  
1016  	*b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1017  
1018  	if (se - sb < cache->discard_block_size)
1019  		*e = *b;
1020  	else
1021  		*e = to_dblock(block_div(se, cache->discard_block_size));
1022  }
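
/*
 * Worked example: with discard_block_size = 2048 sectors, a discard covering
 * sectors [3000, 9000) gives b = 2 (start rounded up) and e = 4 (end rounded
 * down), so only dblocks 2 and 3, the ones completely covered, are marked.
 */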
1023  
1024  /*----------------------------------------------------------------*/
1025  
1026  static void prevent_background_work(struct cache *cache)
1027  {
1028  	lockdep_off();
1029  	down_write(&cache->background_work_lock);
1030  	lockdep_on();
1031  }
1032  
1033  static void allow_background_work(struct cache *cache)
1034  {
1035  	lockdep_off();
1036  	up_write(&cache->background_work_lock);
1037  	lockdep_on();
1038  }
1039  
1040  static bool background_work_begin(struct cache *cache)
1041  {
1042  	bool r;
1043  
1044  	lockdep_off();
1045  	r = down_read_trylock(&cache->background_work_lock);
1046  	lockdep_on();
1047  
1048  	return r;
1049  }
1050  
1051  static void background_work_end(struct cache *cache)
1052  {
1053  	lockdep_off();
1054  	up_read(&cache->background_work_lock);
1055  	lockdep_on();
1056  }
1057  
1058  /*----------------------------------------------------------------*/
1059  
1060  static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1061  {
1062  	return (bio_data_dir(bio) == WRITE) &&
1063  		(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1064  }
1065  
1066  static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
1067  {
1068  	return writeback_mode(cache) &&
1069  		(is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
1070  }
1071  
1072  static void quiesce(struct dm_cache_migration *mg,
1073  		    void (*continuation)(struct work_struct *))
1074  {
1075  	init_continuation(&mg->k, continuation);
1076  	dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
1077  }
1078  
1079  static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
1080  {
1081  	struct continuation *k = container_of(ws, struct continuation, ws);
1082  
1083  	return container_of(k, struct dm_cache_migration, k);
1084  }
1085  
1086  static void copy_complete(int read_err, unsigned long write_err, void *context)
1087  {
1088  	struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
1089  
1090  	if (read_err || write_err)
1091  		mg->k.input = BLK_STS_IOERR;
1092  
1093  	queue_continuation(mg->cache->wq, &mg->k);
1094  }
1095  
1096  static void copy(struct dm_cache_migration *mg, bool promote)
1097  {
1098  	struct dm_io_region o_region, c_region;
1099  	struct cache *cache = mg->cache;
1100  
1101  	o_region.bdev = cache->origin_dev->bdev;
1102  	o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
1103  	o_region.count = cache->sectors_per_block;
1104  
1105  	c_region.bdev = cache->cache_dev->bdev;
1106  	c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
1107  	c_region.count = cache->sectors_per_block;
1108  
1109  	if (promote)
1110  		dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
1111  	else
1112  		dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
1113  }
1114  
1115  static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
1116  {
1117  	struct per_bio_data *pb = get_per_bio_data(bio);
1118  
1119  	if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
1120  		free_prison_cell(cache, pb->cell);
1121  	pb->cell = NULL;
1122  }
1123  
1124  static void overwrite_endio(struct bio *bio)
1125  {
1126  	struct dm_cache_migration *mg = bio->bi_private;
1127  	struct cache *cache = mg->cache;
1128  	struct per_bio_data *pb = get_per_bio_data(bio);
1129  
1130  	dm_unhook_bio(&pb->hook_info, bio);
1131  
1132  	if (bio->bi_status)
1133  		mg->k.input = bio->bi_status;
1134  
1135  	queue_continuation(cache->wq, &mg->k);
1136  }
1137  
1138  static void overwrite(struct dm_cache_migration *mg,
1139  		      void (*continuation)(struct work_struct *))
1140  {
1141  	struct bio *bio = mg->overwrite_bio;
1142  	struct per_bio_data *pb = get_per_bio_data(bio);
1143  
1144  	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1145  
1146  	/*
1147  	 * The overwrite bio is part of the copy operation, as such it does
1148  	 * not set/clear discard or dirty flags.
1149  	 */
1150  	if (mg->op->op == POLICY_PROMOTE)
1151  		remap_to_cache(mg->cache, bio, mg->op->cblock);
1152  	else
1153  		remap_to_origin(mg->cache, bio);
1154  
1155  	init_continuation(&mg->k, continuation);
1156  	accounted_request(mg->cache, bio);
1157  }
1158  
1159  /*
1160   * Migration steps:
1161   *
1162   * 1) exclusive lock preventing WRITEs
1163   * 2) quiesce
1164   * 3) copy or issue overwrite bio
1165   * 4) upgrade to exclusive lock preventing READs and WRITEs
1166   * 5) quiesce
1167   * 6) update metadata and commit
1168   * 7) unlock
1169   */
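
/*
 * Roughly how those steps map onto the functions below: mg_start() calls
 * mg_lock_writes() (step 1), which leads to mg_copy() (steps 2-3: either a
 * full copy via mg_full_copy()/copy(), or an overwrite bio via overwrite(),
 * which already holds the full lock and so skips the upgrade), then
 * mg_upgrade_lock() (steps 4-5), mg_update_metadata() (step 6) and finally
 * mg_complete() (step 7).
 */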
1170  static void mg_complete(struct dm_cache_migration *mg, bool success)
1171  {
1172  	struct bio_list bios;
1173  	struct cache *cache = mg->cache;
1174  	struct policy_work *op = mg->op;
1175  	dm_cblock_t cblock = op->cblock;
1176  
1177  	if (success)
1178  		update_stats(&cache->stats, op->op);
1179  
1180  	switch (op->op) {
1181  	case POLICY_PROMOTE:
1182  		clear_discard(cache, oblock_to_dblock(cache, op->oblock));
1183  		policy_complete_background_work(cache->policy, op, success);
1184  
1185  		if (mg->overwrite_bio) {
1186  			if (success)
1187  				force_set_dirty(cache, cblock);
1188  			else if (mg->k.input)
1189  				mg->overwrite_bio->bi_status = mg->k.input;
1190  			else
1191  				mg->overwrite_bio->bi_status = BLK_STS_IOERR;
1192  			bio_endio(mg->overwrite_bio);
1193  		} else {
1194  			if (success)
1195  				force_clear_dirty(cache, cblock);
1196  			dec_io_migrations(cache);
1197  		}
1198  		break;
1199  
1200  	case POLICY_DEMOTE:
1201  		/*
1202  		 * We clear dirty here to update the nr_dirty counter.
1203  		 */
1204  		if (success)
1205  			force_clear_dirty(cache, cblock);
1206  		policy_complete_background_work(cache->policy, op, success);
1207  		dec_io_migrations(cache);
1208  		break;
1209  
1210  	case POLICY_WRITEBACK:
1211  		if (success)
1212  			force_clear_dirty(cache, cblock);
1213  		policy_complete_background_work(cache->policy, op, success);
1214  		dec_io_migrations(cache);
1215  		break;
1216  	}
1217  
1218  	bio_list_init(&bios);
1219  	if (mg->cell) {
1220  		if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1221  			free_prison_cell(cache, mg->cell);
1222  	}
1223  
1224  	free_migration(mg);
1225  	defer_bios(cache, &bios);
1226  	wake_migration_worker(cache);
1227  
1228  	background_work_end(cache);
1229  }
1230  
1231  static void mg_success(struct work_struct *ws)
1232  {
1233  	struct dm_cache_migration *mg = ws_to_mg(ws);
1234  
1235  	mg_complete(mg, mg->k.input == 0);
1236  }
1237  
1238  static void mg_update_metadata(struct work_struct *ws)
1239  {
1240  	int r;
1241  	struct dm_cache_migration *mg = ws_to_mg(ws);
1242  	struct cache *cache = mg->cache;
1243  	struct policy_work *op = mg->op;
1244  
1245  	switch (op->op) {
1246  	case POLICY_PROMOTE:
1247  		r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
1248  		if (r) {
1249  			DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
1250  				    cache_device_name(cache));
1251  			metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1252  
1253  			mg_complete(mg, false);
1254  			return;
1255  		}
1256  		mg_complete(mg, true);
1257  		break;
1258  
1259  	case POLICY_DEMOTE:
1260  		r = dm_cache_remove_mapping(cache->cmd, op->cblock);
1261  		if (r) {
1262  			DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
1263  				    cache_device_name(cache));
1264  			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1265  
1266  			mg_complete(mg, false);
1267  			return;
1268  		}
1269  
1270  		/*
1271  		 * It would be nice if we only had to commit when a REQ_FLUSH
1272  		 * comes through.  But there's one scenario that we have to
1273  		 * look out for:
1274  		 *
1275  		 * - oblock x is mapped to a cache block
1276  		 * - demotion occurs
1277  		 * - cache block gets reallocated and overwritten
1278  		 * - crash
1279  		 *
1280  		 * When we recover, because there was no commit the cache will
1281  		 * roll back to having the data for oblock x in the cache block.
1282  		 * But the cache block has since been overwritten, so it'll end
1283  		 * up pointing to data that was never in 'x' during the history
1284  		 * of the device.
1285  		 *
1286  		 * To avoid this issue we require a commit as part of the
1287  		 * demotion operation.
1288  		 */
1289  		init_continuation(&mg->k, mg_success);
1290  		continue_after_commit(&cache->committer, &mg->k);
1291  		schedule_commit(&cache->committer);
1292  		break;
1293  
1294  	case POLICY_WRITEBACK:
1295  		mg_complete(mg, true);
1296  		break;
1297  	}
1298  }
1299  
1300  static void mg_update_metadata_after_copy(struct work_struct *ws)
1301  {
1302  	struct dm_cache_migration *mg = ws_to_mg(ws);
1303  
1304  	/*
1305  	 * Did the copy succeed?
1306  	 */
1307  	if (mg->k.input)
1308  		mg_complete(mg, false);
1309  	else
1310  		mg_update_metadata(ws);
1311  }
1312  
1313  static void mg_upgrade_lock(struct work_struct *ws)
1314  {
1315  	int r;
1316  	struct dm_cache_migration *mg = ws_to_mg(ws);
1317  
1318  	/*
1319  	 * Did the copy succeed?
1320  	 */
1321  	if (mg->k.input)
1322  		mg_complete(mg, false);
1323  
1324  	else {
1325  		/*
1326  		 * Now we want the lock to prevent both reads and writes.
1327  		 */
1328  		r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
1329  					    READ_WRITE_LOCK_LEVEL);
1330  		if (r < 0)
1331  			mg_complete(mg, false);
1332  
1333  		else if (r)
1334  			quiesce(mg, mg_update_metadata);
1335  
1336  		else
1337  			mg_update_metadata(ws);
1338  	}
1339  }
1340  
1341  static void mg_full_copy(struct work_struct *ws)
1342  {
1343  	struct dm_cache_migration *mg = ws_to_mg(ws);
1344  	struct cache *cache = mg->cache;
1345  	struct policy_work *op = mg->op;
1346  	bool is_policy_promote = (op->op == POLICY_PROMOTE);
1347  
1348  	if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
1349  	    is_discarded_oblock(cache, op->oblock)) {
1350  		mg_upgrade_lock(ws);
1351  		return;
1352  	}
1353  
1354  	init_continuation(&mg->k, mg_upgrade_lock);
1355  	copy(mg, is_policy_promote);
1356  }
1357  
1358  static void mg_copy(struct work_struct *ws)
1359  {
1360  	struct dm_cache_migration *mg = ws_to_mg(ws);
1361  
1362  	if (mg->overwrite_bio) {
1363  		/*
1364  		 * No exclusive lock was held when we last checked if the bio
1365  		 * was optimisable.  So we have to check again in case things
1366  		 * have changed (eg, the block may no longer be discarded).
1367  		 */
1368  		if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
1369  			/*
1370  			 * Fallback to a real full copy after doing some tidying up.
1371  			 */
1372  			bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
1373  
1374  			BUG_ON(rb); /* An exclusive lock must _not_ be held for this block */
1375  			mg->overwrite_bio = NULL;
1376  			inc_io_migrations(mg->cache);
1377  			mg_full_copy(ws);
1378  			return;
1379  		}
1380  
1381  		/*
1382  		 * It's safe to do this here, even though it's new data
1383  		 * because all IO has been locked out of the block.
1384  		 *
1385  		 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
1386  		 * so _not_ using mg_upgrade_lock() as continuation.
1387  		 */
1388  		overwrite(mg, mg_update_metadata_after_copy);
1389  
1390  	} else
1391  		mg_full_copy(ws);
1392  }
1393  
1394  static int mg_lock_writes(struct dm_cache_migration *mg)
1395  {
1396  	int r;
1397  	struct dm_cell_key_v2 key;
1398  	struct cache *cache = mg->cache;
1399  	struct dm_bio_prison_cell_v2 *prealloc;
1400  
1401  	prealloc = alloc_prison_cell(cache);
1402  
1403  	/*
1404  	 * Prevent writes to the block, but allow reads to continue.
1405  	 * Unless we're using an overwrite bio, in which case we lock
1406  	 * everything.
1407  	 */
1408  	build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
1409  	r = dm_cell_lock_v2(cache->prison, &key,
1410  			    mg->overwrite_bio ?  READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
1411  			    prealloc, &mg->cell);
1412  	if (r < 0) {
1413  		free_prison_cell(cache, prealloc);
1414  		mg_complete(mg, false);
1415  		return r;
1416  	}
1417  
1418  	if (mg->cell != prealloc)
1419  		free_prison_cell(cache, prealloc);
1420  
1421  	if (r == 0)
1422  		mg_copy(&mg->k.ws);
1423  	else
1424  		quiesce(mg, mg_copy);
1425  
1426  	return 0;
1427  }
1428  
1429  static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
1430  {
1431  	struct dm_cache_migration *mg;
1432  
1433  	if (!background_work_begin(cache)) {
1434  		policy_complete_background_work(cache->policy, op, false);
1435  		return -EPERM;
1436  	}
1437  
1438  	mg = alloc_migration(cache);
1439  
1440  	mg->op = op;
1441  	mg->overwrite_bio = bio;
1442  
1443  	if (!bio)
1444  		inc_io_migrations(cache);
1445  
1446  	return mg_lock_writes(mg);
1447  }
1448  
1449  /*
1450   *--------------------------------------------------------------
1451   * invalidation processing
1452   *--------------------------------------------------------------
1453   */
1454  
1455  static void invalidate_complete(struct dm_cache_migration *mg, bool success)
1456  {
1457  	struct bio_list bios;
1458  	struct cache *cache = mg->cache;
1459  
1460  	bio_list_init(&bios);
1461  	if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1462  		free_prison_cell(cache, mg->cell);
1463  
1464  	if (!success && mg->overwrite_bio)
1465  		bio_io_error(mg->overwrite_bio);
1466  
1467  	free_migration(mg);
1468  	defer_bios(cache, &bios);
1469  
1470  	background_work_end(cache);
1471  }
1472  
1473  static void invalidate_completed(struct work_struct *ws)
1474  {
1475  	struct dm_cache_migration *mg = ws_to_mg(ws);
1476  
1477  	invalidate_complete(mg, !mg->k.input);
1478  }
1479  
1480  static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
1481  {
1482  	int r;
1483  
1484  	r = policy_invalidate_mapping(cache->policy, cblock);
1485  	if (!r) {
1486  		r = dm_cache_remove_mapping(cache->cmd, cblock);
1487  		if (r) {
1488  			DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
1489  				    cache_device_name(cache));
1490  			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1491  		}
1492  
1493  	} else if (r == -ENODATA) {
1494  		/*
1495  		 * Harmless, already unmapped.
1496  		 */
1497  		r = 0;
1498  
1499  	} else
1500  		DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
1501  
1502  	return r;
1503  }
1504  
1505  static void invalidate_remove(struct work_struct *ws)
1506  {
1507  	int r;
1508  	struct dm_cache_migration *mg = ws_to_mg(ws);
1509  	struct cache *cache = mg->cache;
1510  
1511  	r = invalidate_cblock(cache, mg->invalidate_cblock);
1512  	if (r) {
1513  		invalidate_complete(mg, false);
1514  		return;
1515  	}
1516  
1517  	init_continuation(&mg->k, invalidate_completed);
1518  	continue_after_commit(&cache->committer, &mg->k);
1519  	remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
1520  	mg->overwrite_bio = NULL;
1521  	schedule_commit(&cache->committer);
1522  }
1523  
1524  static int invalidate_lock(struct dm_cache_migration *mg)
1525  {
1526  	int r;
1527  	struct dm_cell_key_v2 key;
1528  	struct cache *cache = mg->cache;
1529  	struct dm_bio_prison_cell_v2 *prealloc;
1530  
1531  	prealloc = alloc_prison_cell(cache);
1532  
1533  	build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
1534  	r = dm_cell_lock_v2(cache->prison, &key,
1535  			    READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
1536  	if (r < 0) {
1537  		free_prison_cell(cache, prealloc);
1538  		invalidate_complete(mg, false);
1539  		return r;
1540  	}
1541  
1542  	if (mg->cell != prealloc)
1543  		free_prison_cell(cache, prealloc);
1544  
1545  	if (r)
1546  		quiesce(mg, invalidate_remove);
1547  
1548  	else {
1549  		/*
1550  		 * We can't call invalidate_remove() directly here because we
1551  		 * might still be in request context.
1552  		 */
1553  		init_continuation(&mg->k, invalidate_remove);
1554  		queue_work(cache->wq, &mg->k.ws);
1555  	}
1556  
1557  	return 0;
1558  }
1559  
1560  static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
1561  			    dm_oblock_t oblock, struct bio *bio)
1562  {
1563  	struct dm_cache_migration *mg;
1564  
1565  	if (!background_work_begin(cache))
1566  		return -EPERM;
1567  
1568  	mg = alloc_migration(cache);
1569  
1570  	mg->overwrite_bio = bio;
1571  	mg->invalidate_cblock = cblock;
1572  	mg->invalidate_oblock = oblock;
1573  
1574  	return invalidate_lock(mg);
1575  }
1576  
1577  /*
1578   *--------------------------------------------------------------
1579   * bio processing
1580   *--------------------------------------------------------------
1581   */
1582  
1583  enum busy {
1584  	IDLE,
1585  	BUSY
1586  };
1587  
1588  static enum busy spare_migration_bandwidth(struct cache *cache)
1589  {
1590  	bool idle = dm_iot_idle_for(&cache->tracker, HZ);
1591  	sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1592  		cache->sectors_per_block;
1593  
1594  	if (idle && current_volume <= cache->migration_threshold)
1595  		return IDLE;
1596  	else
1597  		return BUSY;
1598  }
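
/*
 * Example: with sectors_per_block = 128 and migration_threshold = 2048
 * sectors, IDLE is only reported while the device has seen no accounted I/O
 * for a second (HZ) and at most 15 io migrations are in flight, since
 * (15 + 1) * 128 = 2048 <= 2048.
 */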
1599  
1600  static void inc_hit_counter(struct cache *cache, struct bio *bio)
1601  {
1602  	atomic_inc(bio_data_dir(bio) == READ ?
1603  		   &cache->stats.read_hit : &cache->stats.write_hit);
1604  }
1605  
1606  static void inc_miss_counter(struct cache *cache, struct bio *bio)
1607  {
1608  	atomic_inc(bio_data_dir(bio) == READ ?
1609  		   &cache->stats.read_miss : &cache->stats.write_miss);
1610  }
1611  
1612  /*----------------------------------------------------------------*/
1613  
1614  static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
1615  		   bool *commit_needed)
1616  {
1617  	int r, data_dir;
1618  	bool rb, background_queued;
1619  	dm_cblock_t cblock;
1620  
1621  	*commit_needed = false;
1622  
1623  	rb = bio_detain_shared(cache, block, bio);
1624  	if (!rb) {
1625  		/*
1626  		 * An exclusive lock is held for this block, so we have to
1627  		 * wait.  We set the commit_needed flag so the current
1628  		 * transaction will be committed asap, allowing this lock
1629  		 * to be dropped.
1630  		 */
1631  		*commit_needed = true;
1632  		return DM_MAPIO_SUBMITTED;
1633  	}
1634  
1635  	data_dir = bio_data_dir(bio);
1636  
1637  	if (optimisable_bio(cache, bio, block)) {
1638  		struct policy_work *op = NULL;
1639  
1640  		r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
1641  		if (unlikely(r && r != -ENOENT)) {
1642  			DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
1643  				    cache_device_name(cache), r);
1644  			bio_io_error(bio);
1645  			return DM_MAPIO_SUBMITTED;
1646  		}
1647  
1648  		if (r == -ENOENT && op) {
1649  			bio_drop_shared_lock(cache, bio);
1650  			BUG_ON(op->op != POLICY_PROMOTE);
1651  			mg_start(cache, op, bio);
1652  			return DM_MAPIO_SUBMITTED;
1653  		}
1654  	} else {
1655  		r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
1656  		if (unlikely(r && r != -ENOENT)) {
1657  			DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
1658  				    cache_device_name(cache), r);
1659  			bio_io_error(bio);
1660  			return DM_MAPIO_SUBMITTED;
1661  		}
1662  
1663  		if (background_queued)
1664  			wake_migration_worker(cache);
1665  	}
1666  
1667  	if (r == -ENOENT) {
1668  		struct per_bio_data *pb = get_per_bio_data(bio);
1669  
1670  		/*
1671  		 * Miss.
1672  		 */
1673  		inc_miss_counter(cache, bio);
1674  		if (pb->req_nr == 0) {
1675  			accounted_begin(cache, bio);
1676  			remap_to_origin_clear_discard(cache, bio, block);
1677  		} else {
1678  			/*
1679  			 * This is a duplicate writethrough io that is no
1680  			 * longer needed because the block has been demoted.
1681  			 */
1682  			bio_endio(bio);
1683  			return DM_MAPIO_SUBMITTED;
1684  		}
1685  	} else {
1686  		/*
1687  		 * Hit.
1688  		 */
1689  		inc_hit_counter(cache, bio);
1690  
1691  		/*
1692  		 * Passthrough always maps to the origin, invalidating any
1693  		 * cache blocks that are written to.
1694  		 */
1695  		if (passthrough_mode(cache)) {
1696  			if (bio_data_dir(bio) == WRITE) {
1697  				bio_drop_shared_lock(cache, bio);
1698  				atomic_inc(&cache->stats.demotion);
1699  				invalidate_start(cache, cblock, block, bio);
1700  			} else
1701  				remap_to_origin_clear_discard(cache, bio, block);
1702  		} else {
1703  			if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) &&
1704  			    !is_dirty(cache, cblock)) {
1705  				remap_to_origin_and_cache(cache, bio, block, cblock);
1706  				accounted_begin(cache, bio);
1707  			} else
1708  				remap_to_cache_dirty(cache, bio, block, cblock);
1709  		}
1710  	}
1711  
1712  	/*
1713  	 * dm core turns FUA requests into a separate payload and FLUSH req.
1714  	 */
1715  	if (bio->bi_opf & REQ_FUA) {
1716  		/*
1717  		 * issue_after_commit will call accounted_begin a second time.  So
1718  		 * we call accounted_complete() to avoid double accounting.
1719  		 */
1720  		accounted_complete(cache, bio);
1721  		issue_after_commit(&cache->committer, bio);
1722  		*commit_needed = true;
1723  		return DM_MAPIO_SUBMITTED;
1724  	}
1725  
1726  	return DM_MAPIO_REMAPPED;
1727  }
1728  
1729  static bool process_bio(struct cache *cache, struct bio *bio)
1730  {
1731  	bool commit_needed;
1732  
1733  	if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
1734  		dm_submit_bio_remap(bio, NULL);
1735  
1736  	return commit_needed;
1737  }
1738  
1739  /*
1740   * A non-zero return indicates read_only or fail_io mode.
1741   */
commit(struct cache * cache,bool clean_shutdown)1742  static int commit(struct cache *cache, bool clean_shutdown)
1743  {
1744  	int r;
1745  
1746  	if (get_cache_mode(cache) >= CM_READ_ONLY)
1747  		return -EINVAL;
1748  
1749  	atomic_inc(&cache->stats.commit_count);
1750  	r = dm_cache_commit(cache->cmd, clean_shutdown);
1751  	if (r)
1752  		metadata_operation_failed(cache, "dm_cache_commit", r);
1753  
1754  	return r;
1755  }
1756  
1757  /*
1758   * Used by the batcher.
1759   */
commit_op(void * context)1760  static blk_status_t commit_op(void *context)
1761  {
1762  	struct cache *cache = context;
1763  
1764  	if (dm_cache_changed_this_transaction(cache->cmd))
1765  		return errno_to_blk_status(commit(cache, false));
1766  
1767  	return 0;
1768  }
1769  
1770  /*----------------------------------------------------------------*/
1771  
process_flush_bio(struct cache * cache,struct bio * bio)1772  static bool process_flush_bio(struct cache *cache, struct bio *bio)
1773  {
1774  	struct per_bio_data *pb = get_per_bio_data(bio);
1775  
1776  	if (!pb->req_nr)
1777  		remap_to_origin(cache, bio);
1778  	else
1779  		remap_to_cache(cache, bio, 0);
1780  
1781  	issue_after_commit(&cache->committer, bio);
1782  	return true;
1783  }
1784  
process_discard_bio(struct cache * cache,struct bio * bio)1785  static bool process_discard_bio(struct cache *cache, struct bio *bio)
1786  {
1787  	dm_dblock_t b, e;
1788  
1789  	/*
1790  	 * FIXME: do we need to lock the region?  Or can we just assume the
1791  	 * user won't be so foolish as to issue discard concurrently with
1792  	 * other IO?
1793  	 */
1794  	calc_discard_block_range(cache, bio, &b, &e);
1795  	while (b != e) {
1796  		set_discard(cache, b);
1797  		b = to_dblock(from_dblock(b) + 1);
1798  	}
1799  
1800  	if (cache->features.discard_passdown) {
1801  		remap_to_origin(cache, bio);
1802  		dm_submit_bio_remap(bio, NULL);
1803  	} else
1804  		bio_endio(bio);
1805  
1806  	return false;
1807  }
1808  
process_deferred_bios(struct work_struct * ws)1809  static void process_deferred_bios(struct work_struct *ws)
1810  {
1811  	struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
1812  
1813  	bool commit_needed = false;
1814  	struct bio_list bios;
1815  	struct bio *bio;
1816  
1817  	bio_list_init(&bios);
1818  
1819  	spin_lock_irq(&cache->lock);
1820  	bio_list_merge(&bios, &cache->deferred_bios);
1821  	bio_list_init(&cache->deferred_bios);
1822  	spin_unlock_irq(&cache->lock);
1823  
1824  	while ((bio = bio_list_pop(&bios))) {
1825  		if (bio->bi_opf & REQ_PREFLUSH)
1826  			commit_needed = process_flush_bio(cache, bio) || commit_needed;
1827  
1828  		else if (bio_op(bio) == REQ_OP_DISCARD)
1829  			commit_needed = process_discard_bio(cache, bio) || commit_needed;
1830  
1831  		else
1832  			commit_needed = process_bio(cache, bio) || commit_needed;
1833  		cond_resched();
1834  	}
1835  
1836  	if (commit_needed)
1837  		schedule_commit(&cache->committer);
1838  }
1839  
1840  /*
1841   *--------------------------------------------------------------
1842   * Main worker loop
1843   *--------------------------------------------------------------
1844   */
requeue_deferred_bios(struct cache * cache)1845  static void requeue_deferred_bios(struct cache *cache)
1846  {
1847  	struct bio *bio;
1848  	struct bio_list bios;
1849  
1850  	bio_list_init(&bios);
1851  	bio_list_merge(&bios, &cache->deferred_bios);
1852  	bio_list_init(&cache->deferred_bios);
1853  
1854  	while ((bio = bio_list_pop(&bios))) {
1855  		bio->bi_status = BLK_STS_DM_REQUEUE;
1856  		bio_endio(bio);
1857  		cond_resched();
1858  	}
1859  }
1860  
1861  /*
1862   * We want to commit periodically so that not too much
1863   * unwritten metadata builds up.
1864   */
do_waker(struct work_struct * ws)1865  static void do_waker(struct work_struct *ws)
1866  {
1867  	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1868  
1869  	policy_tick(cache->policy, true);
1870  	wake_migration_worker(cache);
1871  	schedule_commit(&cache->committer);
1872  	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1873  }
1874  
check_migrations(struct work_struct * ws)1875  static void check_migrations(struct work_struct *ws)
1876  {
1877  	int r;
1878  	struct policy_work *op;
1879  	struct cache *cache = container_of(ws, struct cache, migration_worker);
1880  	enum busy b;
1881  
1882  	for (;;) {
1883  		b = spare_migration_bandwidth(cache);
1884  
1885  		r = policy_get_background_work(cache->policy, b == IDLE, &op);
1886  		if (r == -ENODATA)
1887  			break;
1888  
1889  		if (r) {
1890  			DMERR_LIMIT("%s: policy_get_background_work failed",
1891  				    cache_device_name(cache));
1892  			break;
1893  		}
1894  
1895  		r = mg_start(cache, op, NULL);
1896  		if (r)
1897  			break;
1898  
1899  		cond_resched();
1900  	}
1901  }
1902  
1903  /*
1904   *--------------------------------------------------------------
1905   * Target methods
1906   *--------------------------------------------------------------
1907   */
1908  
1909  /*
1910   * This function gets called on the error paths of the constructor, so we
1911   * have to cope with a partially initialised struct.
1912   */
__destroy(struct cache * cache)1913  static void __destroy(struct cache *cache)
1914  {
1915  	mempool_exit(&cache->migration_pool);
1916  
1917  	if (cache->prison)
1918  		dm_bio_prison_destroy_v2(cache->prison);
1919  
1920  	if (cache->wq)
1921  		destroy_workqueue(cache->wq);
1922  
1923  	if (cache->dirty_bitset)
1924  		free_bitset(cache->dirty_bitset);
1925  
1926  	if (cache->discard_bitset)
1927  		free_bitset(cache->discard_bitset);
1928  
1929  	if (cache->copier)
1930  		dm_kcopyd_client_destroy(cache->copier);
1931  
1932  	if (cache->cmd)
1933  		dm_cache_metadata_close(cache->cmd);
1934  
1935  	if (cache->metadata_dev)
1936  		dm_put_device(cache->ti, cache->metadata_dev);
1937  
1938  	if (cache->origin_dev)
1939  		dm_put_device(cache->ti, cache->origin_dev);
1940  
1941  	if (cache->cache_dev)
1942  		dm_put_device(cache->ti, cache->cache_dev);
1943  
1944  	if (cache->policy)
1945  		dm_cache_policy_destroy(cache->policy);
1946  
1947  	bioset_exit(&cache->bs);
1948  
1949  	kfree(cache);
1950  }
1951  
destroy(struct cache * cache)1952  static void destroy(struct cache *cache)
1953  {
1954  	unsigned int i;
1955  
1956  	cancel_delayed_work_sync(&cache->waker);
1957  
1958  	for (i = 0; i < cache->nr_ctr_args ; i++)
1959  		kfree(cache->ctr_args[i]);
1960  	kfree(cache->ctr_args);
1961  
1962  	__destroy(cache);
1963  }
1964  
cache_dtr(struct dm_target * ti)1965  static void cache_dtr(struct dm_target *ti)
1966  {
1967  	struct cache *cache = ti->private;
1968  
1969  	destroy(cache);
1970  }
1971  
get_dev_size(struct dm_dev * dev)1972  static sector_t get_dev_size(struct dm_dev *dev)
1973  {
1974  	return bdev_nr_sectors(dev->bdev);
1975  }
1976  
1977  /*----------------------------------------------------------------*/
1978  
1979  /*
1980   * Construct a cache device mapping.
1981   *
1982   * cache <metadata dev> <cache dev> <origin dev> <block size>
1983   *       <#feature args> [<feature arg>]*
1984   *       <policy> <#policy args> [<policy arg>]*
1985   *
1986   * metadata dev    : fast device holding the persistent metadata
1987   * cache dev	   : fast device holding cached data blocks
1988   * origin dev	   : slow device holding original data blocks
1989   * block size	   : cache unit size in sectors
1990   *
1991   * #feature args   : number of feature arguments passed
1992   * feature args    : writeback (default), writethrough, passthrough,
1993   *		     metadata2, no_discard_passdown
1994   * policy	   : the replacement policy to use
1995   * #policy args    : an even number of policy arguments corresponding
1996   *		     to key/value pairs passed to the policy
1997   * policy args	   : key/value pairs passed to the policy
1998   *		     E.g. 'sequential_threshold 1024'
1999   *		     See Documentation/admin-guide/device-mapper/cache-policies.rst for details.
2000   *
2001   * Optional feature arguments are:
2002   *   writethrough  : write through caching that prohibits cache block
2003   *		     content from being different from origin block content.
2004   *		     Without this argument, the default behaviour is to write
2005   *		     back cache block contents later for performance reasons,
2006   *		     so they may differ from the corresponding origin blocks.
2007   */
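/*
 * A minimal usage sketch (not part of this file; device names, sizes and
 * policy choice are hypothetical): for an 80GiB origin, a 512 sector (256KiB)
 * cache block size, the writethrough feature and the smq policy with no
 * tuning arguments, a table line might be loaded with:
 *
 *   dmsetup create cached --table \
 *     '0 167772160 cache /dev/sdb /dev/sdc /dev/sdd 512 1 writethrough smq 0'
 *
 * See Documentation/admin-guide/device-mapper/cache.rst for the authoritative
 * description of the table line.
 */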
2008  struct cache_args {
2009  	struct dm_target *ti;
2010  
2011  	struct dm_dev *metadata_dev;
2012  
2013  	struct dm_dev *cache_dev;
2014  	sector_t cache_sectors;
2015  
2016  	struct dm_dev *origin_dev;
2017  
2018  	uint32_t block_size;
2019  
2020  	const char *policy_name;
2021  	int policy_argc;
2022  	const char **policy_argv;
2023  
2024  	struct cache_features features;
2025  };
2026  
destroy_cache_args(struct cache_args * ca)2027  static void destroy_cache_args(struct cache_args *ca)
2028  {
2029  	if (ca->metadata_dev)
2030  		dm_put_device(ca->ti, ca->metadata_dev);
2031  
2032  	if (ca->cache_dev)
2033  		dm_put_device(ca->ti, ca->cache_dev);
2034  
2035  	if (ca->origin_dev)
2036  		dm_put_device(ca->ti, ca->origin_dev);
2037  
2038  	kfree(ca);
2039  }
2040  
at_least_one_arg(struct dm_arg_set * as,char ** error)2041  static bool at_least_one_arg(struct dm_arg_set *as, char **error)
2042  {
2043  	if (!as->argc) {
2044  		*error = "Insufficient args";
2045  		return false;
2046  	}
2047  
2048  	return true;
2049  }
2050  
parse_metadata_dev(struct cache_args * ca,struct dm_arg_set * as,char ** error)2051  static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
2052  			      char **error)
2053  {
2054  	int r;
2055  	sector_t metadata_dev_size;
2056  
2057  	if (!at_least_one_arg(as, error))
2058  		return -EINVAL;
2059  
2060  	r = dm_get_device(ca->ti, dm_shift_arg(as),
2061  			  BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->metadata_dev);
2062  	if (r) {
2063  		*error = "Error opening metadata device";
2064  		return r;
2065  	}
2066  
2067  	metadata_dev_size = get_dev_size(ca->metadata_dev);
2068  	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
2069  		DMWARN("Metadata device %pg is larger than %llu sectors: excess space will not be used.",
2070  		       ca->metadata_dev->bdev, (unsigned long long)DM_CACHE_METADATA_MAX_SECTORS_WARNING);
2071  
2072  	return 0;
2073  }
2074  
parse_cache_dev(struct cache_args * ca,struct dm_arg_set * as,char ** error)2075  static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
2076  			   char **error)
2077  {
2078  	int r;
2079  
2080  	if (!at_least_one_arg(as, error))
2081  		return -EINVAL;
2082  
2083  	r = dm_get_device(ca->ti, dm_shift_arg(as),
2084  			  BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->cache_dev);
2085  	if (r) {
2086  		*error = "Error opening cache device";
2087  		return r;
2088  	}
2089  	ca->cache_sectors = get_dev_size(ca->cache_dev);
2090  
2091  	return 0;
2092  }
2093  
parse_origin_dev(struct cache_args * ca,struct dm_arg_set * as,char ** error)2094  static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
2095  			    char **error)
2096  {
2097  	sector_t origin_sectors;
2098  	int r;
2099  
2100  	if (!at_least_one_arg(as, error))
2101  		return -EINVAL;
2102  
2103  	r = dm_get_device(ca->ti, dm_shift_arg(as),
2104  			  BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->origin_dev);
2105  	if (r) {
2106  		*error = "Error opening origin device";
2107  		return r;
2108  	}
2109  
2110  	origin_sectors = get_dev_size(ca->origin_dev);
2111  	if (ca->ti->len > origin_sectors) {
2112  		*error = "Device size larger than cached device";
2113  		return -EINVAL;
2114  	}
2115  
2116  	return 0;
2117  }
2118  
parse_block_size(struct cache_args * ca,struct dm_arg_set * as,char ** error)2119  static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
2120  			    char **error)
2121  {
2122  	unsigned long block_size;
2123  
2124  	if (!at_least_one_arg(as, error))
2125  		return -EINVAL;
2126  
2127  	if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
2128  	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2129  	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2130  	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2131  		*error = "Invalid data block size";
2132  		return -EINVAL;
2133  	}
2134  
2135  	if (block_size > ca->cache_sectors) {
2136  		*error = "Data block size is larger than the cache device";
2137  		return -EINVAL;
2138  	}
2139  
2140  	ca->block_size = block_size;
2141  
2142  	return 0;
2143  }
2144  
init_features(struct cache_features * cf)2145  static void init_features(struct cache_features *cf)
2146  {
2147  	cf->mode = CM_WRITE;
2148  	cf->io_mode = CM_IO_WRITEBACK;
2149  	cf->metadata_version = 1;
2150  	cf->discard_passdown = true;
2151  }
2152  
parse_features(struct cache_args * ca,struct dm_arg_set * as,char ** error)2153  static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2154  			  char **error)
2155  {
2156  	static const struct dm_arg _args[] = {
2157  		{0, 3, "Invalid number of cache feature arguments"},
2158  	};
2159  
2160  	int r, mode_ctr = 0;
2161  	unsigned int argc;
2162  	const char *arg;
2163  	struct cache_features *cf = &ca->features;
2164  
2165  	init_features(cf);
2166  
2167  	r = dm_read_arg_group(_args, as, &argc, error);
2168  	if (r)
2169  		return -EINVAL;
2170  
2171  	while (argc--) {
2172  		arg = dm_shift_arg(as);
2173  
2174  		if (!strcasecmp(arg, "writeback")) {
2175  			cf->io_mode = CM_IO_WRITEBACK;
2176  			mode_ctr++;
2177  		}
2178  
2179  		else if (!strcasecmp(arg, "writethrough")) {
2180  			cf->io_mode = CM_IO_WRITETHROUGH;
2181  			mode_ctr++;
2182  		}
2183  
2184  		else if (!strcasecmp(arg, "passthrough")) {
2185  			cf->io_mode = CM_IO_PASSTHROUGH;
2186  			mode_ctr++;
2187  		}
2188  
2189  		else if (!strcasecmp(arg, "metadata2"))
2190  			cf->metadata_version = 2;
2191  
2192  		else if (!strcasecmp(arg, "no_discard_passdown"))
2193  			cf->discard_passdown = false;
2194  
2195  		else {
2196  			*error = "Unrecognised cache feature requested";
2197  			return -EINVAL;
2198  		}
2199  	}
2200  
2201  	if (mode_ctr > 1) {
2202  		*error = "Duplicate cache io_mode features requested";
2203  		return -EINVAL;
2204  	}
2205  
2206  	return 0;
2207  }
2208  
parse_policy(struct cache_args * ca,struct dm_arg_set * as,char ** error)2209  static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
2210  			char **error)
2211  {
2212  	static const struct dm_arg _args[] = {
2213  		{0, 1024, "Invalid number of policy arguments"},
2214  	};
2215  
2216  	int r;
2217  
2218  	if (!at_least_one_arg(as, error))
2219  		return -EINVAL;
2220  
2221  	ca->policy_name = dm_shift_arg(as);
2222  
2223  	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
2224  	if (r)
2225  		return -EINVAL;
2226  
2227  	ca->policy_argv = (const char **)as->argv;
2228  	dm_consume_args(as, ca->policy_argc);
2229  
2230  	return 0;
2231  }
2232  
parse_cache_args(struct cache_args * ca,int argc,char ** argv,char ** error)2233  static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
2234  			    char **error)
2235  {
2236  	int r;
2237  	struct dm_arg_set as;
2238  
2239  	as.argc = argc;
2240  	as.argv = argv;
2241  
2242  	r = parse_metadata_dev(ca, &as, error);
2243  	if (r)
2244  		return r;
2245  
2246  	r = parse_cache_dev(ca, &as, error);
2247  	if (r)
2248  		return r;
2249  
2250  	r = parse_origin_dev(ca, &as, error);
2251  	if (r)
2252  		return r;
2253  
2254  	r = parse_block_size(ca, &as, error);
2255  	if (r)
2256  		return r;
2257  
2258  	r = parse_features(ca, &as, error);
2259  	if (r)
2260  		return r;
2261  
2262  	r = parse_policy(ca, &as, error);
2263  	if (r)
2264  		return r;
2265  
2266  	return 0;
2267  }
2268  
2269  /*----------------------------------------------------------------*/
2270  
2271  static struct kmem_cache *migration_cache;
2272  
2273  #define NOT_CORE_OPTION 1
2274  
process_config_option(struct cache * cache,const char * key,const char * value)2275  static int process_config_option(struct cache *cache, const char *key, const char *value)
2276  {
2277  	unsigned long tmp;
2278  
2279  	if (!strcasecmp(key, "migration_threshold")) {
2280  		if (kstrtoul(value, 10, &tmp))
2281  			return -EINVAL;
2282  
2283  		cache->migration_threshold = tmp;
2284  		return 0;
2285  	}
2286  
2287  	return NOT_CORE_OPTION;
2288  }
2289  
set_config_value(struct cache * cache,const char * key,const char * value)2290  static int set_config_value(struct cache *cache, const char *key, const char *value)
2291  {
2292  	int r = process_config_option(cache, key, value);
2293  
2294  	if (r == NOT_CORE_OPTION)
2295  		r = policy_set_config_value(cache->policy, key, value);
2296  
2297  	if (r)
2298  		DMWARN("bad config value for %s: %s", key, value);
2299  
2300  	return r;
2301  }
2302  
set_config_values(struct cache * cache,int argc,const char ** argv)2303  static int set_config_values(struct cache *cache, int argc, const char **argv)
2304  {
2305  	int r = 0;
2306  
2307  	if (argc & 1) {
2308  		DMWARN("Odd number of policy arguments given; they should be <key> <value> pairs.");
2309  		return -EINVAL;
2310  	}
2311  
2312  	while (argc) {
2313  		r = set_config_value(cache, argv[0], argv[1]);
2314  		if (r)
2315  			break;
2316  
2317  		argc -= 2;
2318  		argv += 2;
2319  	}
2320  
2321  	return r;
2322  }
2323  
create_cache_policy(struct cache * cache,struct cache_args * ca,char ** error)2324  static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2325  			       char **error)
2326  {
2327  	struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
2328  							   cache->cache_size,
2329  							   cache->origin_sectors,
2330  							   cache->sectors_per_block);
2331  	if (IS_ERR(p)) {
2332  		*error = "Error creating cache's policy";
2333  		return PTR_ERR(p);
2334  	}
2335  	cache->policy = p;
2336  	BUG_ON(!cache->policy);
2337  
2338  	return 0;
2339  }
2340  
2341  /*
2342   * We want the discard block size to be at least the cache block size
2343   * and to have no more than 2^14 discard blocks across the origin.
2344   */
2345  #define MAX_DISCARD_BLOCKS (1 << 14)
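/*
 * Worked example (illustrative numbers): with a 1TiB origin (2147483648
 * sectors) and a 512 sector cache block, 2147483648 / 512 = 4194304 discard
 * blocks, which exceeds MAX_DISCARD_BLOCKS.  calculate_discard_block_size()
 * below therefore keeps doubling: 512 -> 1024 -> ... -> 131072 sectors, at
 * which point 2147483648 / 131072 = 16384 and the doubling stops, giving a
 * 64MiB discard block.
 */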
2346  
too_many_discard_blocks(sector_t discard_block_size,sector_t origin_size)2347  static bool too_many_discard_blocks(sector_t discard_block_size,
2348  				    sector_t origin_size)
2349  {
2350  	(void) sector_div(origin_size, discard_block_size);
2351  
2352  	return origin_size > MAX_DISCARD_BLOCKS;
2353  }
2354  
calculate_discard_block_size(sector_t cache_block_size,sector_t origin_size)2355  static sector_t calculate_discard_block_size(sector_t cache_block_size,
2356  					     sector_t origin_size)
2357  {
2358  	sector_t discard_block_size = cache_block_size;
2359  
2360  	if (origin_size)
2361  		while (too_many_discard_blocks(discard_block_size, origin_size))
2362  			discard_block_size *= 2;
2363  
2364  	return discard_block_size;
2365  }
2366  
set_cache_size(struct cache * cache,dm_cblock_t size)2367  static void set_cache_size(struct cache *cache, dm_cblock_t size)
2368  {
2369  	dm_block_t nr_blocks = from_cblock(size);
2370  
2371  	if (nr_blocks > (1 << 20) && cache->cache_size != size)
2372  		DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2373  			     "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2374  			     "Please consider increasing the cache block size to reduce the overall cache block count.",
2375  			     (unsigned long long) nr_blocks);
2376  
2377  	cache->cache_size = size;
2378  }
2379  
2380  #define DEFAULT_MIGRATION_THRESHOLD 2048
2381  
cache_create(struct cache_args * ca,struct cache ** result)2382  static int cache_create(struct cache_args *ca, struct cache **result)
2383  {
2384  	int r = 0;
2385  	char **error = &ca->ti->error;
2386  	struct cache *cache;
2387  	struct dm_target *ti = ca->ti;
2388  	dm_block_t origin_blocks;
2389  	struct dm_cache_metadata *cmd;
2390  	bool may_format = ca->features.mode == CM_WRITE;
2391  
2392  	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
2393  	if (!cache)
2394  		return -ENOMEM;
2395  
2396  	cache->ti = ca->ti;
2397  	ti->private = cache;
2398  	ti->accounts_remapped_io = true;
2399  	ti->num_flush_bios = 2;
2400  	ti->flush_supported = true;
2401  
2402  	ti->num_discard_bios = 1;
2403  	ti->discards_supported = true;
2404  
2405  	ti->per_io_data_size = sizeof(struct per_bio_data);
2406  
2407  	cache->features = ca->features;
2408  	if (writethrough_mode(cache)) {
2409  		/* Create bioset for writethrough bios issued to origin */
2410  		r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0);
2411  		if (r)
2412  			goto bad;
2413  	}
2414  
2415  	cache->metadata_dev = ca->metadata_dev;
2416  	cache->origin_dev = ca->origin_dev;
2417  	cache->cache_dev = ca->cache_dev;
2418  
2419  	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2420  
2421  	origin_blocks = cache->origin_sectors = ti->len;
2422  	origin_blocks = block_div(origin_blocks, ca->block_size);
2423  	cache->origin_blocks = to_oblock(origin_blocks);
2424  
2425  	cache->sectors_per_block = ca->block_size;
2426  	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
2427  		r = -EINVAL;
2428  		goto bad;
2429  	}
2430  
2431  	if (ca->block_size & (ca->block_size - 1)) {
2432  		dm_block_t cache_size = ca->cache_sectors;
2433  
2434  		cache->sectors_per_block_shift = -1;
2435  		cache_size = block_div(cache_size, ca->block_size);
2436  		set_cache_size(cache, to_cblock(cache_size));
2437  	} else {
2438  		cache->sectors_per_block_shift = __ffs(ca->block_size);
2439  		set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
2440  	}
2441  
2442  	r = create_cache_policy(cache, ca, error);
2443  	if (r)
2444  		goto bad;
2445  
2446  	cache->policy_nr_args = ca->policy_argc;
2447  	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2448  
2449  	r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
2450  	if (r) {
2451  		*error = "Error setting cache policy's config values";
2452  		goto bad;
2453  	}
2454  
2455  	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2456  				     ca->block_size, may_format,
2457  				     dm_cache_policy_get_hint_size(cache->policy),
2458  				     ca->features.metadata_version);
2459  	if (IS_ERR(cmd)) {
2460  		*error = "Error creating metadata object";
2461  		r = PTR_ERR(cmd);
2462  		goto bad;
2463  	}
2464  	cache->cmd = cmd;
2465  	set_cache_mode(cache, CM_WRITE);
2466  	if (get_cache_mode(cache) != CM_WRITE) {
2467  		*error = "Unable to get write access to metadata, please check/repair metadata.";
2468  		r = -EINVAL;
2469  		goto bad;
2470  	}
2471  
2472  	if (passthrough_mode(cache)) {
2473  		bool all_clean;
2474  
2475  		r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2476  		if (r) {
2477  			*error = "dm_cache_metadata_all_clean() failed";
2478  			goto bad;
2479  		}
2480  
2481  		if (!all_clean) {
2482  			*error = "Cannot enter passthrough mode unless all blocks are clean";
2483  			r = -EINVAL;
2484  			goto bad;
2485  		}
2486  
2487  		policy_allow_migrations(cache->policy, false);
2488  	}
2489  
2490  	spin_lock_init(&cache->lock);
2491  	bio_list_init(&cache->deferred_bios);
2492  	atomic_set(&cache->nr_allocated_migrations, 0);
2493  	atomic_set(&cache->nr_io_migrations, 0);
2494  	init_waitqueue_head(&cache->migration_wait);
2495  
2496  	r = -ENOMEM;
2497  	atomic_set(&cache->nr_dirty, 0);
2498  	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2499  	if (!cache->dirty_bitset) {
2500  		*error = "could not allocate dirty bitset";
2501  		goto bad;
2502  	}
2503  	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2504  
2505  	cache->discard_block_size =
2506  		calculate_discard_block_size(cache->sectors_per_block,
2507  					     cache->origin_sectors);
2508  	cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2509  							      cache->discard_block_size));
2510  	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2511  	if (!cache->discard_bitset) {
2512  		*error = "could not allocate discard bitset";
2513  		goto bad;
2514  	}
2515  	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2516  
2517  	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2518  	if (IS_ERR(cache->copier)) {
2519  		*error = "could not create kcopyd client";
2520  		r = PTR_ERR(cache->copier);
2521  		goto bad;
2522  	}
2523  
2524  	cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
2525  	if (!cache->wq) {
2526  		*error = "could not create workqueue for metadata object";
2527  		goto bad;
2528  	}
2529  	INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
2530  	INIT_WORK(&cache->migration_worker, check_migrations);
2531  	INIT_DELAYED_WORK(&cache->waker, do_waker);
2532  
2533  	cache->prison = dm_bio_prison_create_v2(cache->wq);
2534  	if (!cache->prison) {
2535  		*error = "could not create bio prison";
2536  		goto bad;
2537  	}
2538  
2539  	r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE,
2540  				   migration_cache);
2541  	if (r) {
2542  		*error = "Error creating cache's migration mempool";
2543  		goto bad;
2544  	}
2545  
2546  	cache->need_tick_bio = true;
2547  	cache->sized = false;
2548  	cache->invalidate = false;
2549  	cache->commit_requested = false;
2550  	cache->loaded_mappings = false;
2551  	cache->loaded_discards = false;
2552  
2553  	load_stats(cache);
2554  
2555  	atomic_set(&cache->stats.demotion, 0);
2556  	atomic_set(&cache->stats.promotion, 0);
2557  	atomic_set(&cache->stats.copies_avoided, 0);
2558  	atomic_set(&cache->stats.cache_cell_clash, 0);
2559  	atomic_set(&cache->stats.commit_count, 0);
2560  	atomic_set(&cache->stats.discard_count, 0);
2561  
2562  	spin_lock_init(&cache->invalidation_lock);
2563  	INIT_LIST_HEAD(&cache->invalidation_requests);
2564  
2565  	batcher_init(&cache->committer, commit_op, cache,
2566  		     issue_op, cache, cache->wq);
2567  	dm_iot_init(&cache->tracker);
2568  
2569  	init_rwsem(&cache->background_work_lock);
2570  	prevent_background_work(cache);
2571  
2572  	*result = cache;
2573  	return 0;
2574  bad:
2575  	__destroy(cache);
2576  	return r;
2577  }
2578  
copy_ctr_args(struct cache * cache,int argc,const char ** argv)2579  static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2580  {
2581  	unsigned int i;
2582  	const char **copy;
2583  
2584  	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2585  	if (!copy)
2586  		return -ENOMEM;
2587  	for (i = 0; i < argc; i++) {
2588  		copy[i] = kstrdup(argv[i], GFP_KERNEL);
2589  		if (!copy[i]) {
2590  			while (i--)
2591  				kfree(copy[i]);
2592  			kfree(copy);
2593  			return -ENOMEM;
2594  		}
2595  	}
2596  
2597  	cache->nr_ctr_args = argc;
2598  	cache->ctr_args = copy;
2599  
2600  	return 0;
2601  }
2602  
cache_ctr(struct dm_target * ti,unsigned int argc,char ** argv)2603  static int cache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
2604  {
2605  	int r = -EINVAL;
2606  	struct cache_args *ca;
2607  	struct cache *cache = NULL;
2608  
2609  	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2610  	if (!ca) {
2611  		ti->error = "Error allocating memory for cache";
2612  		return -ENOMEM;
2613  	}
2614  	ca->ti = ti;
2615  
2616  	r = parse_cache_args(ca, argc, argv, &ti->error);
2617  	if (r)
2618  		goto out;
2619  
2620  	r = cache_create(ca, &cache);
2621  	if (r)
2622  		goto out;
2623  
2624  	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2625  	if (r) {
2626  		__destroy(cache);
2627  		goto out;
2628  	}
2629  
2630  	ti->private = cache;
2631  out:
2632  	destroy_cache_args(ca);
2633  	return r;
2634  }
2635  
2636  /*----------------------------------------------------------------*/
2637  
cache_map(struct dm_target * ti,struct bio * bio)2638  static int cache_map(struct dm_target *ti, struct bio *bio)
2639  {
2640  	struct cache *cache = ti->private;
2641  
2642  	int r;
2643  	bool commit_needed;
2644  	dm_oblock_t block = get_bio_block(cache, bio);
2645  
2646  	init_per_bio_data(bio);
2647  	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2648  		/*
2649  		 * This can only occur if the io goes to a partial block at
2650  		 * the end of the origin device.  We don't cache these.
2651  		 * Just remap to the origin and carry on.
2652  		 */
2653  		remap_to_origin(cache, bio);
2654  		accounted_begin(cache, bio);
2655  		return DM_MAPIO_REMAPPED;
2656  	}
2657  
2658  	if (discard_or_flush(bio)) {
2659  		defer_bio(cache, bio);
2660  		return DM_MAPIO_SUBMITTED;
2661  	}
2662  
2663  	r = map_bio(cache, bio, block, &commit_needed);
2664  	if (commit_needed)
2665  		schedule_commit(&cache->committer);
2666  
2667  	return r;
2668  }
2669  
cache_end_io(struct dm_target * ti,struct bio * bio,blk_status_t * error)2670  static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error)
2671  {
2672  	struct cache *cache = ti->private;
2673  	unsigned long flags;
2674  	struct per_bio_data *pb = get_per_bio_data(bio);
2675  
2676  	if (pb->tick) {
2677  		policy_tick(cache->policy, false);
2678  
2679  		spin_lock_irqsave(&cache->lock, flags);
2680  		cache->need_tick_bio = true;
2681  		spin_unlock_irqrestore(&cache->lock, flags);
2682  	}
2683  
2684  	bio_drop_shared_lock(cache, bio);
2685  	accounted_complete(cache, bio);
2686  
2687  	return DM_ENDIO_DONE;
2688  }
2689  
write_dirty_bitset(struct cache * cache)2690  static int write_dirty_bitset(struct cache *cache)
2691  {
2692  	int r;
2693  
2694  	if (get_cache_mode(cache) >= CM_READ_ONLY)
2695  		return -EINVAL;
2696  
2697  	r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
2698  	if (r)
2699  		metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
2700  
2701  	return r;
2702  }
2703  
write_discard_bitset(struct cache * cache)2704  static int write_discard_bitset(struct cache *cache)
2705  {
2706  	unsigned int i, r;
2707  
2708  	if (get_cache_mode(cache) >= CM_READ_ONLY)
2709  		return -EINVAL;
2710  
2711  	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2712  					   cache->discard_nr_blocks);
2713  	if (r) {
2714  		DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
2715  		metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
2716  		return r;
2717  	}
2718  
2719  	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2720  		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2721  					 is_discarded(cache, to_dblock(i)));
2722  		if (r) {
2723  			metadata_operation_failed(cache, "dm_cache_set_discard", r);
2724  			return r;
2725  		}
2726  	}
2727  
2728  	return 0;
2729  }
2730  
write_hints(struct cache * cache)2731  static int write_hints(struct cache *cache)
2732  {
2733  	int r;
2734  
2735  	if (get_cache_mode(cache) >= CM_READ_ONLY)
2736  		return -EINVAL;
2737  
2738  	r = dm_cache_write_hints(cache->cmd, cache->policy);
2739  	if (r) {
2740  		metadata_operation_failed(cache, "dm_cache_write_hints", r);
2741  		return r;
2742  	}
2743  
2744  	return 0;
2745  }
2746  
2747  /*
2748   * returns true on success
2749   */
sync_metadata(struct cache * cache)2750  static bool sync_metadata(struct cache *cache)
2751  {
2752  	int r1, r2, r3, r4;
2753  
2754  	r1 = write_dirty_bitset(cache);
2755  	if (r1)
2756  		DMERR("%s: could not write dirty bitset", cache_device_name(cache));
2757  
2758  	r2 = write_discard_bitset(cache);
2759  	if (r2)
2760  		DMERR("%s: could not write discard bitset", cache_device_name(cache));
2761  
2762  	save_stats(cache);
2763  
2764  	r3 = write_hints(cache);
2765  	if (r3)
2766  		DMERR("%s: could not write hints", cache_device_name(cache));
2767  
2768  	/*
2769  	 * If writing the above metadata failed, we still commit, but don't
2770  	 * set the clean shutdown flag.  This will effectively force every
2771  	 * dirty bit to be set on reload.
2772  	 */
2773  	r4 = commit(cache, !r1 && !r2 && !r3);
2774  	if (r4)
2775  		DMERR("%s: could not write cache metadata", cache_device_name(cache));
2776  
2777  	return !r1 && !r2 && !r3 && !r4;
2778  }
2779  
cache_postsuspend(struct dm_target * ti)2780  static void cache_postsuspend(struct dm_target *ti)
2781  {
2782  	struct cache *cache = ti->private;
2783  
2784  	prevent_background_work(cache);
2785  	BUG_ON(atomic_read(&cache->nr_io_migrations));
2786  
2787  	cancel_delayed_work_sync(&cache->waker);
2788  	drain_workqueue(cache->wq);
2789  	WARN_ON(cache->tracker.in_flight);
2790  
2791  	/*
2792  	 * If it's a flush suspend there won't be any deferred bios, so this
2793  	 * call is harmless.
2794  	 */
2795  	requeue_deferred_bios(cache);
2796  
2797  	if (get_cache_mode(cache) == CM_WRITE)
2798  		(void) sync_metadata(cache);
2799  }
2800  
load_mapping(void * context,dm_oblock_t oblock,dm_cblock_t cblock,bool dirty,uint32_t hint,bool hint_valid)2801  static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2802  			bool dirty, uint32_t hint, bool hint_valid)
2803  {
2804  	struct cache *cache = context;
2805  
2806  	if (dirty) {
2807  		set_bit(from_cblock(cblock), cache->dirty_bitset);
2808  		atomic_inc(&cache->nr_dirty);
2809  	} else
2810  		clear_bit(from_cblock(cblock), cache->dirty_bitset);
2811  
2812  	return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
2813  }
2814  
2815  /*
2816   * The discard block size in the on disk metadata is not
2817   * necessarily the same as we're currently using.  So we have to
2818   * be careful to only set the discarded attribute if we know it
2819   * covers a complete block of the new size.
2820   */
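/*
 * Worked example (hypothetical sizes): if the on-disk dblock size was 1024
 * sectors and dblocks 3..4 were discarded, the range covers sectors
 * 3072..5119.  With a current discard_block_size of 2048 sectors,
 * set_discard_range() below rounds the start up (dm_sector_div_up(3072, 2048)
 * = 2) and the end down (5120 / 2048 = 2), so no complete new-size block is
 * covered and nothing is marked discarded.
 */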
2821  struct discard_load_info {
2822  	struct cache *cache;
2823  
2824  	/*
2825  	 * These blocks are sized using the on disk dblock size, rather
2826  	 * than the current one.
2827  	 */
2828  	dm_block_t block_size;
2829  	dm_block_t discard_begin, discard_end;
2830  };
2831  
discard_load_info_init(struct cache * cache,struct discard_load_info * li)2832  static void discard_load_info_init(struct cache *cache,
2833  				   struct discard_load_info *li)
2834  {
2835  	li->cache = cache;
2836  	li->discard_begin = li->discard_end = 0;
2837  }
2838  
set_discard_range(struct discard_load_info * li)2839  static void set_discard_range(struct discard_load_info *li)
2840  {
2841  	sector_t b, e;
2842  
2843  	if (li->discard_begin == li->discard_end)
2844  		return;
2845  
2846  	/*
2847  	 * Convert to sectors.
2848  	 */
2849  	b = li->discard_begin * li->block_size;
2850  	e = li->discard_end * li->block_size;
2851  
2852  	/*
2853  	 * Then convert back to the current dblock size.
2854  	 */
2855  	b = dm_sector_div_up(b, li->cache->discard_block_size);
2856  	sector_div(e, li->cache->discard_block_size);
2857  
2858  	/*
2859  	 * The origin may have shrunk, so we need to check we're still in
2860  	 * bounds.
2861  	 */
2862  	if (e > from_dblock(li->cache->discard_nr_blocks))
2863  		e = from_dblock(li->cache->discard_nr_blocks);
2864  
2865  	for (; b < e; b++)
2866  		set_discard(li->cache, to_dblock(b));
2867  }
2868  
load_discard(void * context,sector_t discard_block_size,dm_dblock_t dblock,bool discard)2869  static int load_discard(void *context, sector_t discard_block_size,
2870  			dm_dblock_t dblock, bool discard)
2871  {
2872  	struct discard_load_info *li = context;
2873  
2874  	li->block_size = discard_block_size;
2875  
2876  	if (discard) {
2877  		if (from_dblock(dblock) == li->discard_end)
2878  			/*
2879  			 * We're already in a discard range, just extend it.
2880  			 */
2881  			li->discard_end = li->discard_end + 1ULL;
2882  
2883  		else {
2884  			/*
2885  			 * Emit the old range and start a new one.
2886  			 */
2887  			set_discard_range(li);
2888  			li->discard_begin = from_dblock(dblock);
2889  			li->discard_end = li->discard_begin + 1ULL;
2890  		}
2891  	} else {
2892  		set_discard_range(li);
2893  		li->discard_begin = li->discard_end = 0;
2894  	}
2895  
2896  	return 0;
2897  }
2898  
get_cache_dev_size(struct cache * cache)2899  static dm_cblock_t get_cache_dev_size(struct cache *cache)
2900  {
2901  	sector_t size = get_dev_size(cache->cache_dev);
2902  	(void) sector_div(size, cache->sectors_per_block);
2903  	return to_cblock(size);
2904  }
2905  
can_resize(struct cache * cache,dm_cblock_t new_size)2906  static bool can_resize(struct cache *cache, dm_cblock_t new_size)
2907  {
2908  	if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
2909  		DMERR("%s: unable to extend cache due to missing cache table reload",
2910  		      cache_device_name(cache));
2911  		return false;
2912  	}
2913  
2914  	/*
2915  	 * We can't drop a dirty block when shrinking the cache.
2916  	 */
2917  	if (cache->loaded_mappings) {
2918  		new_size = to_cblock(find_next_bit(cache->dirty_bitset,
2919  						   from_cblock(cache->cache_size),
2920  						   from_cblock(new_size)));
2921  		if (new_size != cache->cache_size) {
2922  			DMERR("%s: unable to shrink cache; cache block %llu is dirty",
2923  			      cache_device_name(cache),
2924  			      (unsigned long long) from_cblock(new_size));
2925  			return false;
2926  		}
2927  	}
2928  
2929  	return true;
2930  }
2931  
resize_cache_dev(struct cache * cache,dm_cblock_t new_size)2932  static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
2933  {
2934  	int r;
2935  
2936  	r = dm_cache_resize(cache->cmd, new_size);
2937  	if (r) {
2938  		DMERR("%s: could not resize cache metadata", cache_device_name(cache));
2939  		metadata_operation_failed(cache, "dm_cache_resize", r);
2940  		return r;
2941  	}
2942  
2943  	set_cache_size(cache, new_size);
2944  
2945  	return 0;
2946  }
2947  
cache_preresume(struct dm_target * ti)2948  static int cache_preresume(struct dm_target *ti)
2949  {
2950  	int r = 0;
2951  	struct cache *cache = ti->private;
2952  	dm_cblock_t csize = get_cache_dev_size(cache);
2953  
2954  	/*
2955  	 * Check to see if the cache has resized.
2956  	 */
2957  	if (!cache->sized || csize != cache->cache_size) {
2958  		if (!can_resize(cache, csize))
2959  			return -EINVAL;
2960  
2961  		r = resize_cache_dev(cache, csize);
2962  		if (r)
2963  			return r;
2964  
2965  		cache->sized = true;
2966  	}
2967  
2968  	if (!cache->loaded_mappings) {
2969  		r = dm_cache_load_mappings(cache->cmd, cache->policy,
2970  					   load_mapping, cache);
2971  		if (r) {
2972  			DMERR("%s: could not load cache mappings", cache_device_name(cache));
2973  			metadata_operation_failed(cache, "dm_cache_load_mappings", r);
2974  			return r;
2975  		}
2976  
2977  		cache->loaded_mappings = true;
2978  	}
2979  
2980  	if (!cache->loaded_discards) {
2981  		struct discard_load_info li;
2982  
2983  		/*
2984  		 * The discard bitset could have been resized, or the
2985  		 * discard block size changed.  To be safe we start by
2986  		 * setting every dblock to not discarded.
2987  		 */
2988  		clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2989  
2990  		discard_load_info_init(cache, &li);
2991  		r = dm_cache_load_discards(cache->cmd, load_discard, &li);
2992  		if (r) {
2993  			DMERR("%s: could not load origin discards", cache_device_name(cache));
2994  			metadata_operation_failed(cache, "dm_cache_load_discards", r);
2995  			return r;
2996  		}
2997  		set_discard_range(&li);
2998  
2999  		cache->loaded_discards = true;
3000  	}
3001  
3002  	return r;
3003  }
3004  
cache_resume(struct dm_target * ti)3005  static void cache_resume(struct dm_target *ti)
3006  {
3007  	struct cache *cache = ti->private;
3008  
3009  	cache->need_tick_bio = true;
3010  	allow_background_work(cache);
3011  	do_waker(&cache->waker.work);
3012  }
3013  
emit_flags(struct cache * cache,char * result,unsigned int maxlen,ssize_t * sz_ptr)3014  static void emit_flags(struct cache *cache, char *result,
3015  		       unsigned int maxlen, ssize_t *sz_ptr)
3016  {
3017  	ssize_t sz = *sz_ptr;
3018  	struct cache_features *cf = &cache->features;
3019  	unsigned int count = (cf->metadata_version == 2) + !cf->discard_passdown + 1;
3020  
3021  	DMEMIT("%u ", count);
3022  
3023  	if (cf->metadata_version == 2)
3024  		DMEMIT("metadata2 ");
3025  
3026  	if (writethrough_mode(cache))
3027  		DMEMIT("writethrough ");
3028  
3029  	else if (passthrough_mode(cache))
3030  		DMEMIT("passthrough ");
3031  
3032  	else if (writeback_mode(cache))
3033  		DMEMIT("writeback ");
3034  
3035  	else {
3036  		DMEMIT("unknown ");
3037  		DMERR("%s: internal error: unknown io mode: %d",
3038  		      cache_device_name(cache), (int) cf->io_mode);
3039  	}
3040  
3041  	if (!cf->discard_passdown)
3042  		DMEMIT("no_discard_passdown ");
3043  
3044  	*sz_ptr = sz;
3045  }
3046  
3047  /*
3048   * Status format:
3049   *
3050   * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
3051   * <cache block size> <#used cache blocks>/<#total cache blocks>
3052   * <#read hits> <#read misses> <#write hits> <#write misses>
3053   * <#demotions> <#promotions> <#dirty>
3054   * <#features> <features>*
3055   * <#core args> <core args>
3056   * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
3057   */
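/*
 * An illustrative STATUSTYPE_INFO line (made-up counters, wrapped here for
 * width; the target emits it as a single line) for a writeback cache using
 * the smq policy:
 *
 *   8 72/65536 512 341/32768 6234 102 4561 77 12 6 3
 *   1 writeback 2 migration_threshold 2048 smq 0 rw -
 *
 * The policy args emitted depend on the policy in use.
 */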
cache_status(struct dm_target * ti,status_type_t type,unsigned int status_flags,char * result,unsigned int maxlen)3058  static void cache_status(struct dm_target *ti, status_type_t type,
3059  			 unsigned int status_flags, char *result, unsigned int maxlen)
3060  {
3061  	int r = 0;
3062  	unsigned int i;
3063  	ssize_t sz = 0;
3064  	dm_block_t nr_free_blocks_metadata = 0;
3065  	dm_block_t nr_blocks_metadata = 0;
3066  	char buf[BDEVNAME_SIZE];
3067  	struct cache *cache = ti->private;
3068  	dm_cblock_t residency;
3069  	bool needs_check;
3070  
3071  	switch (type) {
3072  	case STATUSTYPE_INFO:
3073  		if (get_cache_mode(cache) == CM_FAIL) {
3074  			DMEMIT("Fail");
3075  			break;
3076  		}
3077  
3078  		/* Commit to ensure statistics aren't out-of-date */
3079  		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3080  			(void) commit(cache, false);
3081  
3082  		r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
3083  		if (r) {
3084  			DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
3085  			      cache_device_name(cache), r);
3086  			goto err;
3087  		}
3088  
3089  		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
3090  		if (r) {
3091  			DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
3092  			      cache_device_name(cache), r);
3093  			goto err;
3094  		}
3095  
3096  		residency = policy_residency(cache->policy);
3097  
3098  		DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
3099  		       (unsigned int)DM_CACHE_METADATA_BLOCK_SIZE,
3100  		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3101  		       (unsigned long long)nr_blocks_metadata,
3102  		       (unsigned long long)cache->sectors_per_block,
3103  		       (unsigned long long) from_cblock(residency),
3104  		       (unsigned long long) from_cblock(cache->cache_size),
3105  		       (unsigned int) atomic_read(&cache->stats.read_hit),
3106  		       (unsigned int) atomic_read(&cache->stats.read_miss),
3107  		       (unsigned int) atomic_read(&cache->stats.write_hit),
3108  		       (unsigned int) atomic_read(&cache->stats.write_miss),
3109  		       (unsigned int) atomic_read(&cache->stats.demotion),
3110  		       (unsigned int) atomic_read(&cache->stats.promotion),
3111  		       (unsigned long) atomic_read(&cache->nr_dirty));
3112  
3113  		emit_flags(cache, result, maxlen, &sz);
3114  
3115  		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
3116  
3117  		DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
3118  		if (sz < maxlen) {
3119  			r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
3120  			if (r)
3121  				DMERR("%s: policy_emit_config_values returned %d",
3122  				      cache_device_name(cache), r);
3123  		}
3124  
3125  		if (get_cache_mode(cache) == CM_READ_ONLY)
3126  			DMEMIT("ro ");
3127  		else
3128  			DMEMIT("rw ");
3129  
3130  		r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);
3131  
3132  		if (r || needs_check)
3133  			DMEMIT("needs_check ");
3134  		else
3135  			DMEMIT("- ");
3136  
3137  		break;
3138  
3139  	case STATUSTYPE_TABLE:
3140  		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3141  		DMEMIT("%s ", buf);
3142  		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3143  		DMEMIT("%s ", buf);
3144  		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3145  		DMEMIT("%s", buf);
3146  
3147  		for (i = 0; i < cache->nr_ctr_args - 1; i++)
3148  			DMEMIT(" %s", cache->ctr_args[i]);
3149  		if (cache->nr_ctr_args)
3150  			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
3151  		break;
3152  
3153  	case STATUSTYPE_IMA:
3154  		DMEMIT_TARGET_NAME_VERSION(ti->type);
3155  		if (get_cache_mode(cache) == CM_FAIL)
3156  			DMEMIT(",metadata_mode=fail");
3157  		else if (get_cache_mode(cache) == CM_READ_ONLY)
3158  			DMEMIT(",metadata_mode=ro");
3159  		else
3160  			DMEMIT(",metadata_mode=rw");
3161  
3162  		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3163  		DMEMIT(",cache_metadata_device=%s", buf);
3164  		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3165  		DMEMIT(",cache_device=%s", buf);
3166  		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3167  		DMEMIT(",cache_origin_device=%s", buf);
3168  		DMEMIT(",writethrough=%c", writethrough_mode(cache) ? 'y' : 'n');
3169  		DMEMIT(",writeback=%c", writeback_mode(cache) ? 'y' : 'n');
3170  		DMEMIT(",passthrough=%c", passthrough_mode(cache) ? 'y' : 'n');
3171  		DMEMIT(",metadata2=%c", cache->features.metadata_version == 2 ? 'y' : 'n');
3172  		DMEMIT(",no_discard_passdown=%c", cache->features.discard_passdown ? 'n' : 'y');
3173  		DMEMIT(";");
3174  		break;
3175  	}
3176  
3177  	return;
3178  
3179  err:
3180  	DMEMIT("Error");
3181  }
3182  
3183  /*
3184   * Defines a range of cblocks, begin to (end - 1) are in the range.  end is
3185   * the one-past-the-end value.
3186   */
3187  struct cblock_range {
3188  	dm_cblock_t begin;
3189  	dm_cblock_t end;
3190  };
3191  
3192  /*
3193   * A cache block range can take two forms:
3194   *
3195   * i) A single cblock, e.g. '3456'
3196   * ii) A begin and end cblock with a dash between, e.g. '123-234'
3197   */
parse_cblock_range(struct cache * cache,const char * str,struct cblock_range * result)3198  static int parse_cblock_range(struct cache *cache, const char *str,
3199  			      struct cblock_range *result)
3200  {
3201  	char dummy;
3202  	uint64_t b, e;
3203  	int r;
3204  
3205  	/*
3206  	 * Try and parse form (ii) first.
3207  	 */
3208  	r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
3209  	if (r < 0)
3210  		return r;
3211  
3212  	if (r == 2) {
3213  		result->begin = to_cblock(b);
3214  		result->end = to_cblock(e);
3215  		return 0;
3216  	}
3217  
3218  	/*
3219  	 * That didn't work, try form (i).
3220  	 */
3221  	r = sscanf(str, "%llu%c", &b, &dummy);
3222  	if (r < 0)
3223  		return r;
3224  
3225  	if (r == 1) {
3226  		result->begin = to_cblock(b);
3227  		result->end = to_cblock(from_cblock(result->begin) + 1u);
3228  		return 0;
3229  	}
3230  
3231  	DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
3232  	return -EINVAL;
3233  }
3234  
validate_cblock_range(struct cache * cache,struct cblock_range * range)3235  static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
3236  {
3237  	uint64_t b = from_cblock(range->begin);
3238  	uint64_t e = from_cblock(range->end);
3239  	uint64_t n = from_cblock(cache->cache_size);
3240  
3241  	if (b >= n) {
3242  		DMERR("%s: begin cblock out of range: %llu >= %llu",
3243  		      cache_device_name(cache), b, n);
3244  		return -EINVAL;
3245  	}
3246  
3247  	if (e > n) {
3248  		DMERR("%s: end cblock out of range: %llu > %llu",
3249  		      cache_device_name(cache), e, n);
3250  		return -EINVAL;
3251  	}
3252  
3253  	if (b >= e) {
3254  		DMERR("%s: invalid cblock range: %llu >= %llu",
3255  		      cache_device_name(cache), b, e);
3256  		return -EINVAL;
3257  	}
3258  
3259  	return 0;
3260  }
3261  
cblock_succ(dm_cblock_t b)3262  static inline dm_cblock_t cblock_succ(dm_cblock_t b)
3263  {
3264  	return to_cblock(from_cblock(b) + 1);
3265  }
3266  
request_invalidation(struct cache * cache,struct cblock_range * range)3267  static int request_invalidation(struct cache *cache, struct cblock_range *range)
3268  {
3269  	int r = 0;
3270  
3271  	/*
3272  	 * We don't need to do any locking here because we know we're in
3273  	 * passthrough mode.  There is potential for a race between an
3274  	 * invalidation triggered by an io and an invalidation message.  This
3275  	 * is harmless, so we needn't worry if the policy call fails.
3276  	 */
3277  	while (range->begin != range->end) {
3278  		r = invalidate_cblock(cache, range->begin);
3279  		if (r)
3280  			return r;
3281  
3282  		range->begin = cblock_succ(range->begin);
3283  	}
3284  
3285  	cache->commit_requested = true;
3286  	return r;
3287  }
3288  
process_invalidate_cblocks_message(struct cache * cache,unsigned int count,const char ** cblock_ranges)3289  static int process_invalidate_cblocks_message(struct cache *cache, unsigned int count,
3290  					      const char **cblock_ranges)
3291  {
3292  	int r = 0;
3293  	unsigned int i;
3294  	struct cblock_range range;
3295  
3296  	if (!passthrough_mode(cache)) {
3297  		DMERR("%s: cache has to be in passthrough mode for invalidation",
3298  		      cache_device_name(cache));
3299  		return -EPERM;
3300  	}
3301  
3302  	for (i = 0; i < count; i++) {
3303  		r = parse_cblock_range(cache, cblock_ranges[i], &range);
3304  		if (r)
3305  			break;
3306  
3307  		r = validate_cblock_range(cache, &range);
3308  		if (r)
3309  			break;
3310  
3311  		/*
3312  		 * Invalidate the requested range of cblocks.
3313  		 */
3314  		r = request_invalidation(cache, &range);
3315  		if (r)
3316  			break;
3317  	}
3318  
3319  	return r;
3320  }
3321  
3322  /*
3323   * Supports
3324   *	"<key> <value>"
3325   * and
3326   *     "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
3327   *
3328   * The key migration_threshold is supported by the cache target core.
3329   */
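/*
 * Usage sketch (hypothetical device name "cached"):
 *
 *   dmsetup message cached 0 migration_threshold 4096
 *   dmsetup message cached 0 invalidate_cblocks 3456 7890-8899
 *
 * invalidate_cblocks is only honoured in passthrough mode (see
 * process_invalidate_cblocks_message() above).
 */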
cache_message(struct dm_target * ti,unsigned int argc,char ** argv,char * result,unsigned int maxlen)3330  static int cache_message(struct dm_target *ti, unsigned int argc, char **argv,
3331  			 char *result, unsigned int maxlen)
3332  {
3333  	struct cache *cache = ti->private;
3334  
3335  	if (!argc)
3336  		return -EINVAL;
3337  
3338  	if (get_cache_mode(cache) >= CM_READ_ONLY) {
3339  		DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
3340  		      cache_device_name(cache));
3341  		return -EOPNOTSUPP;
3342  	}
3343  
3344  	if (!strcasecmp(argv[0], "invalidate_cblocks"))
3345  		return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3346  
3347  	if (argc != 2)
3348  		return -EINVAL;
3349  
3350  	return set_config_value(cache, argv[0], argv[1]);
3351  }
3352  
cache_iterate_devices(struct dm_target * ti,iterate_devices_callout_fn fn,void * data)3353  static int cache_iterate_devices(struct dm_target *ti,
3354  				 iterate_devices_callout_fn fn, void *data)
3355  {
3356  	int r = 0;
3357  	struct cache *cache = ti->private;
3358  
3359  	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
3360  	if (!r)
3361  		r = fn(ti, cache->origin_dev, 0, ti->len, data);
3362  
3363  	return r;
3364  }
3365  
3366  /*
3367   * If discard_passdown was enabled verify that the origin device
3368   * supports discards.  Disable discard_passdown if not.
3369   */
disable_passdown_if_not_supported(struct cache * cache)3370  static void disable_passdown_if_not_supported(struct cache *cache)
3371  {
3372  	struct block_device *origin_bdev = cache->origin_dev->bdev;
3373  	struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
3374  	const char *reason = NULL;
3375  
3376  	if (!cache->features.discard_passdown)
3377  		return;
3378  
3379  	if (!bdev_max_discard_sectors(origin_bdev))
3380  		reason = "discard unsupported";
3381  
3382  	else if (origin_limits->max_discard_sectors < cache->sectors_per_block)
3383  		reason = "max discard sectors smaller than a block";
3384  
3385  	if (reason) {
3386  		DMWARN("Origin device (%pg) %s: Disabling discard passdown.",
3387  		       origin_bdev, reason);
3388  		cache->features.discard_passdown = false;
3389  	}
3390  }
3391  
set_discard_limits(struct cache * cache,struct queue_limits * limits)3392  static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3393  {
3394  	struct block_device *origin_bdev = cache->origin_dev->bdev;
3395  	struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
3396  
3397  	if (!cache->features.discard_passdown) {
3398  		/* No passdown is done, so set our own virtual limits */
3399  		limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
3400  						    cache->origin_sectors);
3401  		limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3402  		return;
3403  	}
3404  
3405  	/*
3406  	 * cache_iterate_devices() is stacking both origin and fast device limits
3407  	 * but discards aren't passed to fast device, so inherit origin's limits.
3408  	 */
3409  	limits->max_discard_sectors = origin_limits->max_discard_sectors;
3410  	limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors;
3411  	limits->discard_granularity = origin_limits->discard_granularity;
3412  	limits->discard_alignment = origin_limits->discard_alignment;
3413  	limits->discard_misaligned = origin_limits->discard_misaligned;
3414  }
3415  
cache_io_hints(struct dm_target * ti,struct queue_limits * limits)3416  static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3417  {
3418  	struct cache *cache = ti->private;
3419  	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3420  
3421  	/*
3422  	 * If the system-determined stacked limits are compatible with the
3423  	 * cache's blocksize (io_opt is a factor) do not override them.
3424  	 */
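	/*
	 * For example (hypothetical numbers): with a 512 sector cache block
	 * and a stacked io_opt of 2048 sectors, 2048 % 512 == 0 and the
	 * stacked limits are kept; an io_opt of 768 sectors would instead be
	 * overridden with the cache block size.
	 */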
3425  	if (io_opt_sectors < cache->sectors_per_block ||
3426  	    do_div(io_opt_sectors, cache->sectors_per_block)) {
3427  		blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
3428  		blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
3429  	}
3430  
3431  	disable_passdown_if_not_supported(cache);
3432  	set_discard_limits(cache, limits);
3433  }
3434  
3435  /*----------------------------------------------------------------*/
3436  
3437  static struct target_type cache_target = {
3438  	.name = "cache",
3439  	.version = {2, 2, 0},
3440  	.module = THIS_MODULE,
3441  	.ctr = cache_ctr,
3442  	.dtr = cache_dtr,
3443  	.map = cache_map,
3444  	.end_io = cache_end_io,
3445  	.postsuspend = cache_postsuspend,
3446  	.preresume = cache_preresume,
3447  	.resume = cache_resume,
3448  	.status = cache_status,
3449  	.message = cache_message,
3450  	.iterate_devices = cache_iterate_devices,
3451  	.io_hints = cache_io_hints,
3452  };
3453  
dm_cache_init(void)3454  static int __init dm_cache_init(void)
3455  {
3456  	int r;
3457  
3458  	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
3459  	if (!migration_cache) {
3460  		r = -ENOMEM;
3461  		goto err;
3462  	}
3463  
3464  	btracker_work_cache = kmem_cache_create("dm_cache_bt_work",
3465  		sizeof(struct bt_work), __alignof__(struct bt_work), 0, NULL);
3466  	if (!btracker_work_cache) {
3467  		r = -ENOMEM;
3468  		goto err;
3469  	}
3470  
3471  	r = dm_register_target(&cache_target);
3472  	if (r)
3473  		goto err;
3475  
3476  	return 0;
3477  
3478  err:
3479  	kmem_cache_destroy(migration_cache);
3480  	kmem_cache_destroy(btracker_work_cache);
3481  	return r;
3482  }
3483  
dm_cache_exit(void)3484  static void __exit dm_cache_exit(void)
3485  {
3486  	dm_unregister_target(&cache_target);
3487  	kmem_cache_destroy(migration_cache);
3488  	kmem_cache_destroy(btracker_work_cache);
3489  }
3490  
3491  module_init(dm_cache_init);
3492  module_exit(dm_cache_exit);
3493  
3494  MODULE_DESCRIPTION(DM_NAME " cache target");
3495  MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
3496  MODULE_LICENSE("GPL");
3497