xref: /openbmc/linux/drivers/md/dm-thin.c (revision 171f1bc7)
1 /*
2  * Copyright (C) 2011 Red Hat UK.
3  *
4  * This file is released under the GPL.
5  */
6 
7 #include "dm-thin-metadata.h"
8 
9 #include <linux/device-mapper.h>
10 #include <linux/dm-io.h>
11 #include <linux/dm-kcopyd.h>
12 #include <linux/list.h>
13 #include <linux/init.h>
14 #include <linux/module.h>
15 #include <linux/slab.h>
16 
17 #define	DM_MSG_PREFIX	"thin"
18 
19 /*
20  * Tunable constants
21  */
22 #define ENDIO_HOOK_POOL_SIZE 10240
23 #define DEFERRED_SET_SIZE 64
24 #define MAPPING_POOL_SIZE 1024
25 #define PRISON_CELLS 1024
26 
27 /*
28  * The block size of the device holding pool data must be
29  * between 64KB and 1GB.
30  */
31 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
32 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
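/*
 * With 512 byte sectors (SECTOR_SHIFT == 9) these work out to 128 and
 * 2097152 sectors respectively.
 */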
33 
34 /*
35  * The metadata device is currently limited in size.  The limitation is
36  * checked lower down in dm-space-map-metadata, but we also check it here
37  * so we can fail early.
38  *
39  * We have one block of index, which can hold 255 index entries.  Each
40  * index entry contains allocation info about 16k metadata blocks.
41  */
42 #define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
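/*
 * Assuming the metadata block size is 4KiB (THIN_METADATA_BLOCK_SIZE ==
 * 4096 bytes) this works out as 255 * 16384 * 8 = 33423360 sectors, i.e.
 * a little under 16GiB of metadata.
 */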
43 
44 /*
45  * Device id is restricted to 24 bits.
46  */
47 #define MAX_DEV_ID ((1 << 24) - 1)
48 
49 /*
50  * How do we handle breaking sharing of data blocks?
51  * =================================================
52  *
53  * We use a standard copy-on-write btree to store the mappings for the
54  * devices (note I'm talking about copy-on-write of the metadata here, not
55  * the data).  When you take an internal snapshot you clone the root node
56  * of the origin btree.  After this there is no concept of an origin or a
57  * snapshot.  They are just two device trees that happen to point to the
58  * same data blocks.
59  *
60  * When we get a write in we decide if it's to a shared data block using
61  * some timestamp magic.  If it is, we have to break sharing.
62  *
63  * Let's say we write to a shared block in what was the origin.  The
64  * steps are:
65  *
66  * i) plug any further io to this physical block. (see bio_prison code).
67  *
68  * ii) quiesce any read io to that shared data block.  This obviously
69  * includes all devices that share this block.  (see deferred_set code)
70  *
71  * iii) copy the data block to a newly allocated block.  This step can be
72  * skipped if the io covers the whole block. (schedule_copy).
73  *
74  * iv) insert the new mapping into the origin's btree
75  * (process_prepared_mappings).  This act of inserting breaks some
76  * sharing of btree nodes between the two devices.  Breaking sharing only
77  * affects the btree of that specific device.  Btrees for the other
78  * devices that share the block never change.  The btree for the origin
79  * device as it was after the last commit is untouched, i.e. we're using
80  * persistent data structures in the functional programming sense.
81  *
82  * v) unplug io to this physical block, including the io that triggered
83  * the breaking of sharing.
84  *
85  * Steps (ii) and (iii) occur in parallel.
86  *
87  * The metadata _doesn't_ need to be committed before the io continues.  We
88  * get away with this because the io is always written to a _new_ block.
89  * If there's a crash, then:
90  *
91  * - The origin mapping will point to the old origin block (the shared
92  * one).  This will contain the data as it was before the io that triggered
93  * the breaking of sharing came in.
94  *
95  * - The snap mapping still points to the old block.  As it would after
96  * the commit.
97  *
98  * The downside of this scheme is that the timestamp magic isn't perfect, and
99  * will continue to think the data block in the snapshot device is shared
100  * even after the write to the origin has broken sharing.  I suspect data
101  * blocks will typically be shared by many different devices, so we're
102  * breaking sharing n + 1 times, rather than n, where n is the number of
103  * devices that reference this data block.  At the moment I think the
104  * benefits far, far outweigh the disadvantages.
105  */
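/*
 * An illustrative sketch (not part of the driver) of how the steps above
 * map onto the helpers defined later in this file; break_sharing() and
 * process_prepared_mapping() are the real implementations:
 *
 *	build_data_key(tc->td, lookup_result->block, &key);
 *	if (bio_detain(pool->prison, &key, bio, &cell))
 *		return;			(i) someone else already holds the cell
 *	alloc_data_block(tc, &data_block);
 *	schedule_copy(tc, block, lookup_result->block, data_block, cell, bio);
 *					(ii) + (iii) via the deferred set and kcopyd
 *
 * and later, from the worker thread:
 *
 *	process_prepared_mapping(m);	(iv) insert the new mapping, then
 *					(v) cell_defer() the held bios
 */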
106 
107 /*----------------------------------------------------------------*/
108 
109 /*
110  * Sometimes we can't deal with a bio straight away.  We put it in prison
111  * where it can't cause any mischief.  Bios are put in a cell identified
112  * by a key; multiple bios can be in the same cell.  When the cell is
113  * subsequently unlocked the bios become available.
114  */
115 struct bio_prison;
116 
117 struct cell_key {
118 	int virtual;
119 	dm_thin_id dev;
120 	dm_block_t block;
121 };
122 
123 struct cell {
124 	struct hlist_node list;
125 	struct bio_prison *prison;
126 	struct cell_key key;
127 	unsigned count;
128 	struct bio_list bios;
129 };
130 
131 struct bio_prison {
132 	spinlock_t lock;
133 	mempool_t *cell_pool;
134 
135 	unsigned nr_buckets;
136 	unsigned hash_mask;
137 	struct hlist_head *cells;
138 };
139 
140 static uint32_t calc_nr_buckets(unsigned nr_cells)
141 {
142 	uint32_t n = 128;
143 
144 	nr_cells /= 4;
145 	nr_cells = min(nr_cells, 8192u);
146 
147 	while (n < nr_cells)
148 		n <<= 1;
149 
150 	return n;
151 }
152 
153 /*
154  * @nr_cells should be the number of cells you want in use _concurrently_.
155  * Don't confuse it with the number of distinct keys.
156  */
157 static struct bio_prison *prison_create(unsigned nr_cells)
158 {
159 	unsigned i;
160 	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
161 	size_t len = sizeof(struct bio_prison) +
162 		(sizeof(struct hlist_head) * nr_buckets);
163 	struct bio_prison *prison = kmalloc(len, GFP_KERNEL);
164 
165 	if (!prison)
166 		return NULL;
167 
168 	spin_lock_init(&prison->lock);
169 	prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
170 							sizeof(struct cell));
171 	if (!prison->cell_pool) {
172 		kfree(prison);
173 		return NULL;
174 	}
175 
176 	prison->nr_buckets = nr_buckets;
177 	prison->hash_mask = nr_buckets - 1;
178 	prison->cells = (struct hlist_head *) (prison + 1);
179 	for (i = 0; i < nr_buckets; i++)
180 		INIT_HLIST_HEAD(prison->cells + i);
181 
182 	return prison;
183 }
184 
185 static void prison_destroy(struct bio_prison *prison)
186 {
187 	mempool_destroy(prison->cell_pool);
188 	kfree(prison);
189 }
190 
191 static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
192 {
193 	const unsigned long BIG_PRIME = 4294967291UL;
194 	uint64_t hash = key->block * BIG_PRIME;
195 
196 	return (uint32_t) (hash & prison->hash_mask);
197 }
198 
199 static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
200 {
201 	return (lhs->virtual == rhs->virtual) &&
202 		(lhs->dev == rhs->dev) &&
203 		(lhs->block == rhs->block);
204 }
205 
206 static struct cell *__search_bucket(struct hlist_head *bucket,
207 				    struct cell_key *key)
208 {
209 	struct cell *cell;
210 	struct hlist_node *tmp;
211 
212 	hlist_for_each_entry(cell, tmp, bucket, list)
213 		if (keys_equal(&cell->key, key))
214 			return cell;
215 
216 	return NULL;
217 }
218 
219 /*
220  * This may block if a new cell needs allocating.  You must ensure that
221  * cells will be unlocked even if the calling thread is blocked.
222  *
223  * Returns the number of entries in the cell prior to the new addition
224  * or < 0 on failure.
225  */
226 static int bio_detain(struct bio_prison *prison, struct cell_key *key,
227 		      struct bio *inmate, struct cell **ref)
228 {
229 	int r;
230 	unsigned long flags;
231 	uint32_t hash = hash_key(prison, key);
232 	struct cell *uninitialized_var(cell), *cell2 = NULL;
233 
234 	BUG_ON(hash > prison->nr_buckets);
235 
236 	spin_lock_irqsave(&prison->lock, flags);
237 	cell = __search_bucket(prison->cells + hash, key);
238 
239 	if (!cell) {
240 		/*
241 		 * Allocate a new cell
242 		 */
243 		spin_unlock_irqrestore(&prison->lock, flags);
244 		cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
245 		spin_lock_irqsave(&prison->lock, flags);
246 
247 		/*
248 		 * We've been unlocked, so we have to double check that
249 		 * nobody else has inserted this cell in the meantime.
250 		 */
251 		cell = __search_bucket(prison->cells + hash, key);
252 
253 		if (!cell) {
254 			cell = cell2;
255 			cell2 = NULL;
256 
257 			cell->prison = prison;
258 			memcpy(&cell->key, key, sizeof(cell->key));
259 			cell->count = 0;
260 			bio_list_init(&cell->bios);
261 			hlist_add_head(&cell->list, prison->cells + hash);
262 		}
263 	}
264 
265 	r = cell->count++;
266 	bio_list_add(&cell->bios, inmate);
267 	spin_unlock_irqrestore(&prison->lock, flags);
268 
269 	if (cell2)
270 		mempool_free(cell2, prison->cell_pool);
271 
272 	*ref = cell;
273 
274 	return r;
275 }
276 
277 /*
278  * @inmates must have been initialised prior to this call
279  */
280 static void __cell_release(struct cell *cell, struct bio_list *inmates)
281 {
282 	struct bio_prison *prison = cell->prison;
283 
284 	hlist_del(&cell->list);
285 
286 	if (inmates)
287 		bio_list_merge(inmates, &cell->bios);
288 
289 	mempool_free(cell, prison->cell_pool);
290 }
291 
292 static void cell_release(struct cell *cell, struct bio_list *bios)
293 {
294 	unsigned long flags;
295 	struct bio_prison *prison = cell->prison;
296 
297 	spin_lock_irqsave(&prison->lock, flags);
298 	__cell_release(cell, bios);
299 	spin_unlock_irqrestore(&prison->lock, flags);
300 }
301 
302 /*
303  * There are a couple of places where we put a bio into a cell briefly
304  * before taking it out again.  In these situations we know that no other
305  * bio may be in the cell.  This function releases the cell, and also does
306  * a sanity check.
307  */
308 static void cell_release_singleton(struct cell *cell, struct bio *bio)
309 {
310 	struct bio_prison *prison = cell->prison;
311 	struct bio_list bios;
312 	struct bio *b;
313 	unsigned long flags;
314 
315 	bio_list_init(&bios);
316 
317 	spin_lock_irqsave(&prison->lock, flags);
318 	__cell_release(cell, &bios);
319 	spin_unlock_irqrestore(&prison->lock, flags);
320 
321 	b = bio_list_pop(&bios);
322 	BUG_ON(b != bio);
323 	BUG_ON(!bio_list_empty(&bios));
324 }
325 
326 static void cell_error(struct cell *cell)
327 {
328 	struct bio_prison *prison = cell->prison;
329 	struct bio_list bios;
330 	struct bio *bio;
331 	unsigned long flags;
332 
333 	bio_list_init(&bios);
334 
335 	spin_lock_irqsave(&prison->lock, flags);
336 	__cell_release(cell, &bios);
337 	spin_unlock_irqrestore(&prison->lock, flags);
338 
339 	while ((bio = bio_list_pop(&bios)))
340 		bio_io_error(bio);
341 }
342 
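/*
 * Typical use of the prison, as in process_bio() further down (a sketch
 * only, error handling omitted):
 *
 *	struct cell *cell;
 *	struct cell_key key;
 *
 *	build_virtual_key(tc->td, block, &key);
 *	if (bio_detain(tc->pool->prison, &key, bio, &cell))
 *		return;		key already locked; our bio waits in the cell
 *
 *	...do the work that needed exclusive access to the block...
 *
 *	cell_release_singleton(cell, bio);	or cell_defer()/cell_error()
 */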
343 /*----------------------------------------------------------------*/
344 
345 /*
346  * We use the deferred set to keep track of pending reads to shared blocks.
347  * We do this to ensure the new mapping caused by a write isn't performed
348  * until these prior reads have completed.  Otherwise the insertion of the
349  * new mapping could free the old block that the read bios are mapped to.
350  */
351 
352 struct deferred_set;
353 struct deferred_entry {
354 	struct deferred_set *ds;
355 	unsigned count;
356 	struct list_head work_items;
357 };
358 
359 struct deferred_set {
360 	spinlock_t lock;
361 	unsigned current_entry;
362 	unsigned sweeper;
363 	struct deferred_entry entries[DEFERRED_SET_SIZE];
364 };
365 
366 static void ds_init(struct deferred_set *ds)
367 {
368 	int i;
369 
370 	spin_lock_init(&ds->lock);
371 	ds->current_entry = 0;
372 	ds->sweeper = 0;
373 	for (i = 0; i < DEFERRED_SET_SIZE; i++) {
374 		ds->entries[i].ds = ds;
375 		ds->entries[i].count = 0;
376 		INIT_LIST_HEAD(&ds->entries[i].work_items);
377 	}
378 }
379 
380 static struct deferred_entry *ds_inc(struct deferred_set *ds)
381 {
382 	unsigned long flags;
383 	struct deferred_entry *entry;
384 
385 	spin_lock_irqsave(&ds->lock, flags);
386 	entry = ds->entries + ds->current_entry;
387 	entry->count++;
388 	spin_unlock_irqrestore(&ds->lock, flags);
389 
390 	return entry;
391 }
392 
393 static unsigned ds_next(unsigned index)
394 {
395 	return (index + 1) % DEFERRED_SET_SIZE;
396 }
397 
398 static void __sweep(struct deferred_set *ds, struct list_head *head)
399 {
400 	while ((ds->sweeper != ds->current_entry) &&
401 	       !ds->entries[ds->sweeper].count) {
402 		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
403 		ds->sweeper = ds_next(ds->sweeper);
404 	}
405 
406 	if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
407 		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
408 }
409 
410 static void ds_dec(struct deferred_entry *entry, struct list_head *head)
411 {
412 	unsigned long flags;
413 
414 	spin_lock_irqsave(&entry->ds->lock, flags);
415 	BUG_ON(!entry->count);
416 	--entry->count;
417 	__sweep(entry->ds, head);
418 	spin_unlock_irqrestore(&entry->ds->lock, flags);
419 }
420 
421 /*
422  * Returns 1 if the work was deferred, or 0 if it can be processed immediately.
423  */
424 static int ds_add_work(struct deferred_set *ds, struct list_head *work)
425 {
426 	int r = 1;
427 	unsigned long flags;
428 	unsigned next_entry;
429 
430 	spin_lock_irqsave(&ds->lock, flags);
431 	if ((ds->sweeper == ds->current_entry) &&
432 	    !ds->entries[ds->current_entry].count)
433 		r = 0;
434 	else {
435 		list_add(work, &ds->entries[ds->current_entry].work_items);
436 		next_entry = ds_next(ds->current_entry);
437 		if (!ds->entries[next_entry].count)
438 			ds->current_entry = next_entry;
439 	}
440 	spin_unlock_irqrestore(&ds->lock, flags);
441 
442 	return r;
443 }
444 
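/*
 * Usage sketch, as wired up later in this file (see process_shared_bio(),
 * shared_read_endio() and schedule_copy()):
 *
 *	a read to a shared block pins the current entry:
 *		h->entry = ds_inc(&pool->ds);
 *	and releases it from its endio, collecting any work that has become
 *	runnable:
 *		ds_dec(h->entry, &mappings);
 *
 *	a write that breaks sharing queues its new_mapping behind those
 *	in-flight reads:
 *		ds_add_work(&pool->ds, &m->list);
 */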
445 /*----------------------------------------------------------------*/
446 
447 /*
448  * Key building.
449  */
450 static void build_data_key(struct dm_thin_device *td,
451 			   dm_block_t b, struct cell_key *key)
452 {
453 	key->virtual = 0;
454 	key->dev = dm_thin_dev_id(td);
455 	key->block = b;
456 }
457 
458 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
459 			      struct cell_key *key)
460 {
461 	key->virtual = 1;
462 	key->dev = dm_thin_dev_id(td);
463 	key->block = b;
464 }
465 
466 /*----------------------------------------------------------------*/
467 
468 /*
469  * A pool device ties together a metadata device and a data device.  It
470  * also provides the interface for creating and destroying internal
471  * devices.
472  */
473 struct new_mapping;
474 struct pool {
475 	struct list_head list;
476 	struct dm_target *ti;	/* Only set if a pool target is bound */
477 
478 	struct mapped_device *pool_md;
479 	struct block_device *md_dev;
480 	struct dm_pool_metadata *pmd;
481 
482 	uint32_t sectors_per_block;
483 	unsigned block_shift;
484 	dm_block_t offset_mask;
485 	dm_block_t low_water_blocks;
486 
487 	unsigned zero_new_blocks:1;
488 	unsigned low_water_triggered:1;	/* A dm event has been sent */
489 	unsigned no_free_space:1;	/* A -ENOSPC warning has been issued */
490 
491 	struct bio_prison *prison;
492 	struct dm_kcopyd_client *copier;
493 
494 	struct workqueue_struct *wq;
495 	struct work_struct worker;
496 
497 	unsigned ref_count;
498 
499 	spinlock_t lock;
500 	struct bio_list deferred_bios;
501 	struct bio_list deferred_flush_bios;
502 	struct list_head prepared_mappings;
503 
504 	struct bio_list retry_on_resume_list;
505 
506 	struct deferred_set ds;	/* FIXME: move to thin_c */
507 
508 	struct new_mapping *next_mapping;
509 	mempool_t *mapping_pool;
510 	mempool_t *endio_hook_pool;
511 };
512 
513 /*
514  * Target context for a pool.
515  */
516 struct pool_c {
517 	struct dm_target *ti;
518 	struct pool *pool;
519 	struct dm_dev *data_dev;
520 	struct dm_dev *metadata_dev;
521 	struct dm_target_callbacks callbacks;
522 
523 	dm_block_t low_water_blocks;
524 	unsigned zero_new_blocks:1;
525 };
526 
527 /*
528  * Target context for a thin.
529  */
530 struct thin_c {
531 	struct dm_dev *pool_dev;
532 	dm_thin_id dev_id;
533 
534 	struct pool *pool;
535 	struct dm_thin_device *td;
536 };
537 
538 /*----------------------------------------------------------------*/
539 
540 /*
541  * A global list of pools that uses a struct mapped_device as a key.
542  */
543 static struct dm_thin_pool_table {
544 	struct mutex mutex;
545 	struct list_head pools;
546 } dm_thin_pool_table;
547 
548 static void pool_table_init(void)
549 {
550 	mutex_init(&dm_thin_pool_table.mutex);
551 	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
552 }
553 
554 static void __pool_table_insert(struct pool *pool)
555 {
556 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
557 	list_add(&pool->list, &dm_thin_pool_table.pools);
558 }
559 
560 static void __pool_table_remove(struct pool *pool)
561 {
562 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
563 	list_del(&pool->list);
564 }
565 
566 static struct pool *__pool_table_lookup(struct mapped_device *md)
567 {
568 	struct pool *pool = NULL, *tmp;
569 
570 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
571 
572 	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
573 		if (tmp->pool_md == md) {
574 			pool = tmp;
575 			break;
576 		}
577 	}
578 
579 	return pool;
580 }
581 
582 static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
583 {
584 	struct pool *pool = NULL, *tmp;
585 
586 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
587 
588 	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
589 		if (tmp->md_dev == md_dev) {
590 			pool = tmp;
591 			break;
592 		}
593 	}
594 
595 	return pool;
596 }
597 
598 /*----------------------------------------------------------------*/
599 
600 static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
601 {
602 	struct bio *bio;
603 	struct bio_list bios;
604 
605 	bio_list_init(&bios);
606 	bio_list_merge(&bios, master);
607 	bio_list_init(master);
608 
609 	while ((bio = bio_list_pop(&bios))) {
610 		if (dm_get_mapinfo(bio)->ptr == tc)
611 			bio_endio(bio, DM_ENDIO_REQUEUE);
612 		else
613 			bio_list_add(master, bio);
614 	}
615 }
616 
617 static void requeue_io(struct thin_c *tc)
618 {
619 	struct pool *pool = tc->pool;
620 	unsigned long flags;
621 
622 	spin_lock_irqsave(&pool->lock, flags);
623 	__requeue_bio_list(tc, &pool->deferred_bios);
624 	__requeue_bio_list(tc, &pool->retry_on_resume_list);
625 	spin_unlock_irqrestore(&pool->lock, flags);
626 }
627 
628 /*
629  * This section of code contains the logic for processing a thin device's IO.
630  * Much of the code depends on pool object resources (lists, workqueues, etc)
631  * but most is exclusively called from the thin target rather than the thin-pool
632  * target.
633  */
634 
635 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
636 {
637 	return bio->bi_sector >> tc->pool->block_shift;
638 }
639 
640 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
641 {
642 	struct pool *pool = tc->pool;
643 
644 	bio->bi_bdev = tc->pool_dev->bdev;
645 	bio->bi_sector = (block << pool->block_shift) +
646 		(bio->bi_sector & pool->offset_mask);
647 }
648 
649 static void remap_and_issue(struct thin_c *tc, struct bio *bio,
650 			    dm_block_t block)
651 {
652 	struct pool *pool = tc->pool;
653 	unsigned long flags;
654 
655 	remap(tc, bio, block);
656 
657 	/*
658 	 * Batch together any FUA/FLUSH bios we find and then issue
659 	 * a single commit for them in process_deferred_bios().
660 	 */
661 	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
662 		spin_lock_irqsave(&pool->lock, flags);
663 		bio_list_add(&pool->deferred_flush_bios, bio);
664 		spin_unlock_irqrestore(&pool->lock, flags);
665 	} else
666 		generic_make_request(bio);
667 }
668 
669 /*
670  * wake_worker() is used when new work is queued and when pool_resume is
671  * ready to continue deferred IO processing.
672  */
673 static void wake_worker(struct pool *pool)
674 {
675 	queue_work(pool->wq, &pool->worker);
676 }
677 
678 /*----------------------------------------------------------------*/
679 
680 /*
681  * Bio endio functions.
682  */
683 struct endio_hook {
684 	struct thin_c *tc;
685 	bio_end_io_t *saved_bi_end_io;
686 	struct deferred_entry *entry;
687 };
688 
689 struct new_mapping {
690 	struct list_head list;
691 
692 	int prepared;
693 
694 	struct thin_c *tc;
695 	dm_block_t virt_block;
696 	dm_block_t data_block;
697 	struct cell *cell;
698 	int err;
699 
700 	/*
701 	 * If the bio covers the whole area of a block then we can avoid
702 	 * zeroing or copying.  Instead this bio is hooked.  The bio will
703 	 * still be in the cell, so care has to be taken to avoid issuing
704 	 * the bio twice.
705 	 */
706 	struct bio *bio;
707 	bio_end_io_t *saved_bi_end_io;
708 };
709 
710 static void __maybe_add_mapping(struct new_mapping *m)
711 {
712 	struct pool *pool = m->tc->pool;
713 
714 	if (list_empty(&m->list) && m->prepared) {
715 		list_add(&m->list, &pool->prepared_mappings);
716 		wake_worker(pool);
717 	}
718 }
719 
720 static void copy_complete(int read_err, unsigned long write_err, void *context)
721 {
722 	unsigned long flags;
723 	struct new_mapping *m = context;
724 	struct pool *pool = m->tc->pool;
725 
726 	m->err = read_err || write_err ? -EIO : 0;
727 
728 	spin_lock_irqsave(&pool->lock, flags);
729 	m->prepared = 1;
730 	__maybe_add_mapping(m);
731 	spin_unlock_irqrestore(&pool->lock, flags);
732 }
733 
734 static void overwrite_endio(struct bio *bio, int err)
735 {
736 	unsigned long flags;
737 	struct new_mapping *m = dm_get_mapinfo(bio)->ptr;
738 	struct pool *pool = m->tc->pool;
739 
740 	m->err = err;
741 
742 	spin_lock_irqsave(&pool->lock, flags);
743 	m->prepared = 1;
744 	__maybe_add_mapping(m);
745 	spin_unlock_irqrestore(&pool->lock, flags);
746 }
747 
748 static void shared_read_endio(struct bio *bio, int err)
749 {
750 	struct list_head mappings;
751 	struct new_mapping *m, *tmp;
752 	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
753 	unsigned long flags;
754 	struct pool *pool = h->tc->pool;
755 
756 	bio->bi_end_io = h->saved_bi_end_io;
757 	bio_endio(bio, err);
758 
759 	INIT_LIST_HEAD(&mappings);
760 	ds_dec(h->entry, &mappings);
761 
762 	spin_lock_irqsave(&pool->lock, flags);
763 	list_for_each_entry_safe(m, tmp, &mappings, list) {
764 		list_del(&m->list);
765 		INIT_LIST_HEAD(&m->list);
766 		__maybe_add_mapping(m);
767 	}
768 	spin_unlock_irqrestore(&pool->lock, flags);
769 
770 	mempool_free(h, pool->endio_hook_pool);
771 }
772 
773 /*----------------------------------------------------------------*/
774 
775 /*
776  * Workqueue.
777  */
778 
779 /*
780  * Prepared mapping jobs.
781  */
782 
783 /*
784  * This sends the bios in the cell back to the deferred_bios list.
785  */
786 static void cell_defer(struct thin_c *tc, struct cell *cell,
787 		       dm_block_t data_block)
788 {
789 	struct pool *pool = tc->pool;
790 	unsigned long flags;
791 
792 	spin_lock_irqsave(&pool->lock, flags);
793 	cell_release(cell, &pool->deferred_bios);
794 	spin_unlock_irqrestore(&pool->lock, flags);
795 
796 	wake_worker(pool);
797 }
798 
799 /*
800  * Same as cell_defer above, except it omits one particular detainee,
801  * a write bio that covers the block and has already been processed.
802  */
803 static void cell_defer_except(struct thin_c *tc, struct cell *cell,
804 			      struct bio *exception)
805 {
806 	struct bio_list bios;
807 	struct bio *bio;
808 	struct pool *pool = tc->pool;
809 	unsigned long flags;
810 
811 	bio_list_init(&bios);
812 	cell_release(cell, &bios);
813 
814 	spin_lock_irqsave(&pool->lock, flags);
815 	while ((bio = bio_list_pop(&bios)))
816 		if (bio != exception)
817 			bio_list_add(&pool->deferred_bios, bio);
818 	spin_unlock_irqrestore(&pool->lock, flags);
819 
820 	wake_worker(pool);
821 }
822 
823 static void process_prepared_mapping(struct new_mapping *m)
824 {
825 	struct thin_c *tc = m->tc;
826 	struct bio *bio;
827 	int r;
828 
829 	bio = m->bio;
830 	if (bio)
831 		bio->bi_end_io = m->saved_bi_end_io;
832 
833 	if (m->err) {
834 		cell_error(m->cell);
835 		return;
836 	}
837 
838 	/*
839 	 * Commit the prepared block into the mapping btree.
840 	 * Any I/O for this block arriving after this point will get
841 	 * remapped to it directly.
842 	 */
843 	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
844 	if (r) {
845 		DMERR("dm_thin_insert_block() failed");
846 		cell_error(m->cell);
847 		return;
848 	}
849 
850 	/*
851 	 * Release any bios held while the block was being provisioned.
852 	 * If we are processing a write bio that completely covers the block,
853 	 * we already processed it so can ignore it now when processing
854 	 * the bios in the cell.
855 	 */
856 	if (bio) {
857 		cell_defer_except(tc, m->cell, bio);
858 		bio_endio(bio, 0);
859 	} else
860 		cell_defer(tc, m->cell, m->data_block);
861 
862 	list_del(&m->list);
863 	mempool_free(m, tc->pool->mapping_pool);
864 }
865 
866 static void process_prepared_mappings(struct pool *pool)
867 {
868 	unsigned long flags;
869 	struct list_head maps;
870 	struct new_mapping *m, *tmp;
871 
872 	INIT_LIST_HEAD(&maps);
873 	spin_lock_irqsave(&pool->lock, flags);
874 	list_splice_init(&pool->prepared_mappings, &maps);
875 	spin_unlock_irqrestore(&pool->lock, flags);
876 
877 	list_for_each_entry_safe(m, tmp, &maps, list)
878 		process_prepared_mapping(m);
879 }
880 
881 /*
882  * Deferred bio jobs.
883  */
884 static int io_overwrites_block(struct pool *pool, struct bio *bio)
885 {
886 	return ((bio_data_dir(bio) == WRITE) &&
887 		!(bio->bi_sector & pool->offset_mask)) &&
888 		(bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
889 }
890 
891 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
892 			       bio_end_io_t *fn)
893 {
894 	*save = bio->bi_end_io;
895 	bio->bi_end_io = fn;
896 }
897 
898 static int ensure_next_mapping(struct pool *pool)
899 {
900 	if (pool->next_mapping)
901 		return 0;
902 
903 	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
904 
905 	return pool->next_mapping ? 0 : -ENOMEM;
906 }
907 
908 static struct new_mapping *get_next_mapping(struct pool *pool)
909 {
910 	struct new_mapping *r = pool->next_mapping;
911 
912 	BUG_ON(!pool->next_mapping);
913 
914 	pool->next_mapping = NULL;
915 
916 	return r;
917 }
918 
919 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
920 			  dm_block_t data_origin, dm_block_t data_dest,
921 			  struct cell *cell, struct bio *bio)
922 {
923 	int r;
924 	struct pool *pool = tc->pool;
925 	struct new_mapping *m = get_next_mapping(pool);
926 
927 	INIT_LIST_HEAD(&m->list);
928 	m->prepared = 0;
929 	m->tc = tc;
930 	m->virt_block = virt_block;
931 	m->data_block = data_dest;
932 	m->cell = cell;
933 	m->err = 0;
934 	m->bio = NULL;
935 
936 	ds_add_work(&pool->ds, &m->list);
937 
938 	/*
939 	 * IO to pool_dev remaps to the pool target's data_dev.
940 	 *
941 	 * If the whole block of data is being overwritten, we can issue the
942 	 * bio immediately. Otherwise we use kcopyd to clone the data first.
943 	 */
944 	if (io_overwrites_block(pool, bio)) {
945 		m->bio = bio;
946 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
947 		dm_get_mapinfo(bio)->ptr = m;
948 		remap_and_issue(tc, bio, data_dest);
949 	} else {
950 		struct dm_io_region from, to;
951 
952 		from.bdev = tc->pool_dev->bdev;
953 		from.sector = data_origin * pool->sectors_per_block;
954 		from.count = pool->sectors_per_block;
955 
956 		to.bdev = tc->pool_dev->bdev;
957 		to.sector = data_dest * pool->sectors_per_block;
958 		to.count = pool->sectors_per_block;
959 
960 		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
961 				   0, copy_complete, m);
962 		if (r < 0) {
963 			mempool_free(m, pool->mapping_pool);
964 			DMERR("dm_kcopyd_copy() failed");
965 			cell_error(cell);
966 		}
967 	}
968 }
969 
970 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
971 			  dm_block_t data_block, struct cell *cell,
972 			  struct bio *bio)
973 {
974 	struct pool *pool = tc->pool;
975 	struct new_mapping *m = get_next_mapping(pool);
976 
977 	INIT_LIST_HEAD(&m->list);
978 	m->prepared = 0;
979 	m->tc = tc;
980 	m->virt_block = virt_block;
981 	m->data_block = data_block;
982 	m->cell = cell;
983 	m->err = 0;
984 	m->bio = NULL;
985 
986 	/*
987 	 * If the whole block of data is being overwritten or we are not
988 	 * zeroing pre-existing data, we can issue the bio immediately.
989 	 * Otherwise we use kcopyd to zero the data first.
990 	 */
991 	if (!pool->zero_new_blocks)
992 		process_prepared_mapping(m);
993 
994 	else if (io_overwrites_block(pool, bio)) {
995 		m->bio = bio;
996 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
997 		dm_get_mapinfo(bio)->ptr = m;
998 		remap_and_issue(tc, bio, data_block);
999 
1000 	} else {
1001 		int r;
1002 		struct dm_io_region to;
1003 
1004 		to.bdev = tc->pool_dev->bdev;
1005 		to.sector = data_block * pool->sectors_per_block;
1006 		to.count = pool->sectors_per_block;
1007 
1008 		r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
1009 		if (r < 0) {
1010 			mempool_free(m, pool->mapping_pool);
1011 			DMERR("dm_kcopyd_zero() failed");
1012 			cell_error(cell);
1013 		}
1014 	}
1015 }
1016 
1017 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1018 {
1019 	int r;
1020 	dm_block_t free_blocks;
1021 	unsigned long flags;
1022 	struct pool *pool = tc->pool;
1023 
1024 	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1025 	if (r)
1026 		return r;
1027 
1028 	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
1029 		DMWARN("%s: reached low water mark, sending event.",
1030 		       dm_device_name(pool->pool_md));
1031 		spin_lock_irqsave(&pool->lock, flags);
1032 		pool->low_water_triggered = 1;
1033 		spin_unlock_irqrestore(&pool->lock, flags);
1034 		dm_table_event(pool->ti->table);
1035 	}
1036 
1037 	if (!free_blocks) {
1038 		if (pool->no_free_space)
1039 			return -ENOSPC;
1040 		else {
1041 			/*
1042 			 * Try to commit to see if that will free up some
1043 			 * more space.
1044 			 */
1045 			r = dm_pool_commit_metadata(pool->pmd);
1046 			if (r) {
1047 				DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1048 				      __func__, r);
1049 				return r;
1050 			}
1051 
1052 			r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1053 			if (r)
1054 				return r;
1055 
1056 			/*
1057 			 * If we still have no space we set a flag to avoid
1058 			 * doing all this checking and return -ENOSPC.
1059 			 */
1060 			if (!free_blocks) {
1061 				DMWARN("%s: no free space available.",
1062 				       dm_device_name(pool->pool_md));
1063 				spin_lock_irqsave(&pool->lock, flags);
1064 				pool->no_free_space = 1;
1065 				spin_unlock_irqrestore(&pool->lock, flags);
1066 				return -ENOSPC;
1067 			}
1068 		}
1069 	}
1070 
1071 	r = dm_pool_alloc_data_block(pool->pmd, result);
1072 	if (r)
1073 		return r;
1074 
1075 	return 0;
1076 }
1077 
1078 /*
1079  * If we have run out of space, queue bios until the device is
1080  * resumed, presumably after having been reloaded with more space.
1081  */
1082 static void retry_on_resume(struct bio *bio)
1083 {
1084 	struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
1085 	struct pool *pool = tc->pool;
1086 	unsigned long flags;
1087 
1088 	spin_lock_irqsave(&pool->lock, flags);
1089 	bio_list_add(&pool->retry_on_resume_list, bio);
1090 	spin_unlock_irqrestore(&pool->lock, flags);
1091 }
1092 
1093 static void no_space(struct cell *cell)
1094 {
1095 	struct bio *bio;
1096 	struct bio_list bios;
1097 
1098 	bio_list_init(&bios);
1099 	cell_release(cell, &bios);
1100 
1101 	while ((bio = bio_list_pop(&bios)))
1102 		retry_on_resume(bio);
1103 }
1104 
1105 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1106 			  struct cell_key *key,
1107 			  struct dm_thin_lookup_result *lookup_result,
1108 			  struct cell *cell)
1109 {
1110 	int r;
1111 	dm_block_t data_block;
1112 
1113 	r = alloc_data_block(tc, &data_block);
1114 	switch (r) {
1115 	case 0:
1116 		schedule_copy(tc, block, lookup_result->block,
1117 			      data_block, cell, bio);
1118 		break;
1119 
1120 	case -ENOSPC:
1121 		no_space(cell);
1122 		break;
1123 
1124 	default:
1125 		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1126 		cell_error(cell);
1127 		break;
1128 	}
1129 }
1130 
1131 static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1132 			       dm_block_t block,
1133 			       struct dm_thin_lookup_result *lookup_result)
1134 {
1135 	struct cell *cell;
1136 	struct pool *pool = tc->pool;
1137 	struct cell_key key;
1138 
1139 	/*
1140 	 * If cell is already occupied, then sharing is already in the process
1141 	 * of being broken so we have nothing further to do here.
1142 	 */
1143 	build_data_key(tc->td, lookup_result->block, &key);
1144 	if (bio_detain(pool->prison, &key, bio, &cell))
1145 		return;
1146 
1147 	if (bio_data_dir(bio) == WRITE)
1148 		break_sharing(tc, bio, block, &key, lookup_result, cell);
1149 	else {
1150 		struct endio_hook *h;
1151 		h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1152 
1153 		h->tc = tc;
1154 		h->entry = ds_inc(&pool->ds);
1155 		save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
1156 		dm_get_mapinfo(bio)->ptr = h;
1157 
1158 		cell_release_singleton(cell, bio);
1159 		remap_and_issue(tc, bio, lookup_result->block);
1160 	}
1161 }
1162 
1163 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1164 			    struct cell *cell)
1165 {
1166 	int r;
1167 	dm_block_t data_block;
1168 
1169 	/*
1170 	 * Remap empty bios (flushes) immediately, without provisioning.
1171 	 */
1172 	if (!bio->bi_size) {
1173 		cell_release_singleton(cell, bio);
1174 		remap_and_issue(tc, bio, 0);
1175 		return;
1176 	}
1177 
1178 	/*
1179 	 * Fill read bios with zeroes and complete them immediately.
1180 	 */
1181 	if (bio_data_dir(bio) == READ) {
1182 		zero_fill_bio(bio);
1183 		cell_release_singleton(cell, bio);
1184 		bio_endio(bio, 0);
1185 		return;
1186 	}
1187 
1188 	r = alloc_data_block(tc, &data_block);
1189 	switch (r) {
1190 	case 0:
1191 		schedule_zero(tc, block, data_block, cell, bio);
1192 		break;
1193 
1194 	case -ENOSPC:
1195 		no_space(cell);
1196 		break;
1197 
1198 	default:
1199 		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1200 		cell_error(cell);
1201 		break;
1202 	}
1203 }
1204 
1205 static void process_bio(struct thin_c *tc, struct bio *bio)
1206 {
1207 	int r;
1208 	dm_block_t block = get_bio_block(tc, bio);
1209 	struct cell *cell;
1210 	struct cell_key key;
1211 	struct dm_thin_lookup_result lookup_result;
1212 
1213 	/*
1214 	 * If cell is already occupied, then the block is already
1215 	 * being provisioned so we have nothing further to do here.
1216 	 */
1217 	build_virtual_key(tc->td, block, &key);
1218 	if (bio_detain(tc->pool->prison, &key, bio, &cell))
1219 		return;
1220 
1221 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1222 	switch (r) {
1223 	case 0:
1224 		/*
1225 		 * We can release this cell now.  This thread is the only
1226 		 * one that puts bios into a cell, and we know there were
1227 		 * no preceding bios.
1228 		 */
1229 		/*
1230 		 * TODO: this will probably have to change when discard goes
1231 		 * back in.
1232 		 */
1233 		cell_release_singleton(cell, bio);
1234 
1235 		if (lookup_result.shared)
1236 			process_shared_bio(tc, bio, block, &lookup_result);
1237 		else
1238 			remap_and_issue(tc, bio, lookup_result.block);
1239 		break;
1240 
1241 	case -ENODATA:
1242 		provision_block(tc, bio, block, cell);
1243 		break;
1244 
1245 	default:
1246 		DMERR("dm_thin_find_block() failed, error = %d", r);
1247 		bio_io_error(bio);
1248 		break;
1249 	}
1250 }
1251 
1252 static void process_deferred_bios(struct pool *pool)
1253 {
1254 	unsigned long flags;
1255 	struct bio *bio;
1256 	struct bio_list bios;
1257 	int r;
1258 
1259 	bio_list_init(&bios);
1260 
1261 	spin_lock_irqsave(&pool->lock, flags);
1262 	bio_list_merge(&bios, &pool->deferred_bios);
1263 	bio_list_init(&pool->deferred_bios);
1264 	spin_unlock_irqrestore(&pool->lock, flags);
1265 
1266 	while ((bio = bio_list_pop(&bios))) {
1267 		struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
1268 		/*
1269 		 * If we've got no free new_mapping structs, and processing
1270 		 * this bio might require one, we pause until there are some
1271 		 * prepared mappings to process.
1272 		 */
1273 		if (ensure_next_mapping(pool)) {
1274 			spin_lock_irqsave(&pool->lock, flags);
1275 			bio_list_merge(&pool->deferred_bios, &bios);
1276 			spin_unlock_irqrestore(&pool->lock, flags);
1277 
1278 			break;
1279 		}
1280 		process_bio(tc, bio);
1281 	}
1282 
1283 	/*
1284 	 * If there are any deferred flush bios, we must commit
1285 	 * the metadata before issuing them.
1286 	 */
1287 	bio_list_init(&bios);
1288 	spin_lock_irqsave(&pool->lock, flags);
1289 	bio_list_merge(&bios, &pool->deferred_flush_bios);
1290 	bio_list_init(&pool->deferred_flush_bios);
1291 	spin_unlock_irqrestore(&pool->lock, flags);
1292 
1293 	if (bio_list_empty(&bios))
1294 		return;
1295 
1296 	r = dm_pool_commit_metadata(pool->pmd);
1297 	if (r) {
1298 		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1299 		      __func__, r);
1300 		while ((bio = bio_list_pop(&bios)))
1301 			bio_io_error(bio);
1302 		return;
1303 	}
1304 
1305 	while ((bio = bio_list_pop(&bios)))
1306 		generic_make_request(bio);
1307 }
1308 
1309 static void do_worker(struct work_struct *ws)
1310 {
1311 	struct pool *pool = container_of(ws, struct pool, worker);
1312 
1313 	process_prepared_mappings(pool);
1314 	process_deferred_bios(pool);
1315 }
1316 
1317 /*----------------------------------------------------------------*/
1318 
1319 /*
1320  * Mapping functions.
1321  */
1322 
1323 /*
1324  * Called only while mapping a thin bio to hand it over to the workqueue.
1325  */
1326 static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1327 {
1328 	unsigned long flags;
1329 	struct pool *pool = tc->pool;
1330 
1331 	spin_lock_irqsave(&pool->lock, flags);
1332 	bio_list_add(&pool->deferred_bios, bio);
1333 	spin_unlock_irqrestore(&pool->lock, flags);
1334 
1335 	wake_worker(pool);
1336 }
1337 
1338 /*
1339  * Non-blocking function called from the thin target's map function.
1340  */
1341 static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1342 			union map_info *map_context)
1343 {
1344 	int r;
1345 	struct thin_c *tc = ti->private;
1346 	dm_block_t block = get_bio_block(tc, bio);
1347 	struct dm_thin_device *td = tc->td;
1348 	struct dm_thin_lookup_result result;
1349 
1350 	/*
1351 	 * Save the thin context for easy access from the deferred bio later.
1352 	 */
1353 	map_context->ptr = tc;
1354 
1355 	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1356 		thin_defer_bio(tc, bio);
1357 		return DM_MAPIO_SUBMITTED;
1358 	}
1359 
1360 	r = dm_thin_find_block(td, block, 0, &result);
1361 
1362 	/*
1363 	 * Note that we defer readahead too.
1364 	 */
1365 	switch (r) {
1366 	case 0:
1367 		if (unlikely(result.shared)) {
1368 			/*
1369 			 * We have a race condition here between the
1370 			 * result.shared value returned by the lookup and
1371 			 * snapshot creation, which may cause new
1372 			 * sharing.
1373 			 *
1374 			 * To avoid this always quiesce the origin before
1375 			 * taking the snap.  You want to do this anyway to
1376 			 * ensure a consistent application view
1377 			 * (i.e. lockfs).
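			 *
			 * For example, from userspace (a sketch; "thin" and
			 * "pool" are placeholder device names):
			 *
			 *	dmsetup suspend /dev/mapper/thin
			 *	dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
			 *	dmsetup resume /dev/mapper/thin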
1378 			 *
1379 			 * More distant ancestors are irrelevant. The
1380 			 * shared flag will be set in their case.
1381 			 */
1382 			thin_defer_bio(tc, bio);
1383 			r = DM_MAPIO_SUBMITTED;
1384 		} else {
1385 			remap(tc, bio, result.block);
1386 			r = DM_MAPIO_REMAPPED;
1387 		}
1388 		break;
1389 
1390 	case -ENODATA:
1391 		/*
1392 		 * In future, the failed dm_thin_find_block above could
1393 		 * provide the hint to load the metadata into cache.
1394 		 */
1395 	case -EWOULDBLOCK:
1396 		thin_defer_bio(tc, bio);
1397 		r = DM_MAPIO_SUBMITTED;
1398 		break;
1399 	}
1400 
1401 	return r;
1402 }
1403 
1404 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1405 {
1406 	int r;
1407 	unsigned long flags;
1408 	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1409 
1410 	spin_lock_irqsave(&pt->pool->lock, flags);
1411 	r = !bio_list_empty(&pt->pool->retry_on_resume_list);
1412 	spin_unlock_irqrestore(&pt->pool->lock, flags);
1413 
1414 	if (!r) {
1415 		struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1416 		r = bdi_congested(&q->backing_dev_info, bdi_bits);
1417 	}
1418 
1419 	return r;
1420 }
1421 
1422 static void __requeue_bios(struct pool *pool)
1423 {
1424 	bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
1425 	bio_list_init(&pool->retry_on_resume_list);
1426 }
1427 
1428 /*----------------------------------------------------------------
1429  * Binding of control targets to a pool object
1430  *--------------------------------------------------------------*/
1431 static int bind_control_target(struct pool *pool, struct dm_target *ti)
1432 {
1433 	struct pool_c *pt = ti->private;
1434 
1435 	pool->ti = ti;
1436 	pool->low_water_blocks = pt->low_water_blocks;
1437 	pool->zero_new_blocks = pt->zero_new_blocks;
1438 
1439 	return 0;
1440 }
1441 
1442 static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1443 {
1444 	if (pool->ti == ti)
1445 		pool->ti = NULL;
1446 }
1447 
1448 /*----------------------------------------------------------------
1449  * Pool creation
1450  *--------------------------------------------------------------*/
1451 static void __pool_destroy(struct pool *pool)
1452 {
1453 	__pool_table_remove(pool);
1454 
1455 	if (dm_pool_metadata_close(pool->pmd) < 0)
1456 		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1457 
1458 	prison_destroy(pool->prison);
1459 	dm_kcopyd_client_destroy(pool->copier);
1460 
1461 	if (pool->wq)
1462 		destroy_workqueue(pool->wq);
1463 
1464 	if (pool->next_mapping)
1465 		mempool_free(pool->next_mapping, pool->mapping_pool);
1466 	mempool_destroy(pool->mapping_pool);
1467 	mempool_destroy(pool->endio_hook_pool);
1468 	kfree(pool);
1469 }
1470 
1471 static struct pool *pool_create(struct mapped_device *pool_md,
1472 				struct block_device *metadata_dev,
1473 				unsigned long block_size, char **error)
1474 {
1475 	int r;
1476 	void *err_p;
1477 	struct pool *pool;
1478 	struct dm_pool_metadata *pmd;
1479 
1480 	pmd = dm_pool_metadata_open(metadata_dev, block_size);
1481 	if (IS_ERR(pmd)) {
1482 		*error = "Error creating metadata object";
1483 		return (struct pool *)pmd;
1484 	}
1485 
1486 	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1487 	if (!pool) {
1488 		*error = "Error allocating memory for pool";
1489 		err_p = ERR_PTR(-ENOMEM);
1490 		goto bad_pool;
1491 	}
1492 
1493 	pool->pmd = pmd;
1494 	pool->sectors_per_block = block_size;
1495 	pool->block_shift = ffs(block_size) - 1;
1496 	pool->offset_mask = block_size - 1;
1497 	pool->low_water_blocks = 0;
1498 	pool->zero_new_blocks = 1;
1499 	pool->prison = prison_create(PRISON_CELLS);
1500 	if (!pool->prison) {
1501 		*error = "Error creating pool's bio prison";
1502 		err_p = ERR_PTR(-ENOMEM);
1503 		goto bad_prison;
1504 	}
1505 
1506 	pool->copier = dm_kcopyd_client_create();
1507 	if (IS_ERR(pool->copier)) {
1508 		r = PTR_ERR(pool->copier);
1509 		*error = "Error creating pool's kcopyd client";
1510 		err_p = ERR_PTR(r);
1511 		goto bad_kcopyd_client;
1512 	}
1513 
1514 	/*
1515 	 * Create singlethreaded workqueue that will service all devices
1516 	 * that use this metadata.
1517 	 */
1518 	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1519 	if (!pool->wq) {
1520 		*error = "Error creating pool's workqueue";
1521 		err_p = ERR_PTR(-ENOMEM);
1522 		goto bad_wq;
1523 	}
1524 
1525 	INIT_WORK(&pool->worker, do_worker);
1526 	spin_lock_init(&pool->lock);
1527 	bio_list_init(&pool->deferred_bios);
1528 	bio_list_init(&pool->deferred_flush_bios);
1529 	INIT_LIST_HEAD(&pool->prepared_mappings);
1530 	pool->low_water_triggered = 0;
1531 	pool->no_free_space = 0;
1532 	bio_list_init(&pool->retry_on_resume_list);
1533 	ds_init(&pool->ds);
1534 
1535 	pool->next_mapping = NULL;
1536 	pool->mapping_pool =
1537 		mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping));
1538 	if (!pool->mapping_pool) {
1539 		*error = "Error creating pool's mapping mempool";
1540 		err_p = ERR_PTR(-ENOMEM);
1541 		goto bad_mapping_pool;
1542 	}
1543 
1544 	pool->endio_hook_pool =
1545 		mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook));
1546 	if (!pool->endio_hook_pool) {
1547 		*error = "Error creating pool's endio_hook mempool";
1548 		err_p = ERR_PTR(-ENOMEM);
1549 		goto bad_endio_hook_pool;
1550 	}
1551 	pool->ref_count = 1;
1552 	pool->pool_md = pool_md;
1553 	pool->md_dev = metadata_dev;
1554 	__pool_table_insert(pool);
1555 
1556 	return pool;
1557 
1558 bad_endio_hook_pool:
1559 	mempool_destroy(pool->mapping_pool);
1560 bad_mapping_pool:
1561 	destroy_workqueue(pool->wq);
1562 bad_wq:
1563 	dm_kcopyd_client_destroy(pool->copier);
1564 bad_kcopyd_client:
1565 	prison_destroy(pool->prison);
1566 bad_prison:
1567 	kfree(pool);
1568 bad_pool:
1569 	if (dm_pool_metadata_close(pmd))
1570 		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1571 
1572 	return err_p;
1573 }
1574 
1575 static void __pool_inc(struct pool *pool)
1576 {
1577 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1578 	pool->ref_count++;
1579 }
1580 
1581 static void __pool_dec(struct pool *pool)
1582 {
1583 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1584 	BUG_ON(!pool->ref_count);
1585 	if (!--pool->ref_count)
1586 		__pool_destroy(pool);
1587 }
1588 
1589 static struct pool *__pool_find(struct mapped_device *pool_md,
1590 				struct block_device *metadata_dev,
1591 				unsigned long block_size, char **error)
1592 {
1593 	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1594 
1595 	if (pool) {
1596 		if (pool->pool_md != pool_md)
1597 			return ERR_PTR(-EBUSY);
1598 		__pool_inc(pool);
1599 
1600 	} else {
1601 		pool = __pool_table_lookup(pool_md);
1602 		if (pool) {
1603 			if (pool->md_dev != metadata_dev)
1604 				return ERR_PTR(-EINVAL);
1605 			__pool_inc(pool);
1606 
1607 		} else
1608 			pool = pool_create(pool_md, metadata_dev, block_size, error);
1609 	}
1610 
1611 	return pool;
1612 }
1613 
1614 /*----------------------------------------------------------------
1615  * Pool target methods
1616  *--------------------------------------------------------------*/
1617 static void pool_dtr(struct dm_target *ti)
1618 {
1619 	struct pool_c *pt = ti->private;
1620 
1621 	mutex_lock(&dm_thin_pool_table.mutex);
1622 
1623 	unbind_control_target(pt->pool, ti);
1624 	__pool_dec(pt->pool);
1625 	dm_put_device(ti, pt->metadata_dev);
1626 	dm_put_device(ti, pt->data_dev);
1627 	kfree(pt);
1628 
1629 	mutex_unlock(&dm_thin_pool_table.mutex);
1630 }
1631 
1632 struct pool_features {
1633 	unsigned zero_new_blocks:1;
1634 };
1635 
1636 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1637 			       struct dm_target *ti)
1638 {
1639 	int r;
1640 	unsigned argc;
1641 	const char *arg_name;
1642 
1643 	static struct dm_arg _args[] = {
1644 		{0, 1, "Invalid number of pool feature arguments"},
1645 	};
1646 
1647 	/*
1648 	 * No feature arguments supplied.
1649 	 */
1650 	if (!as->argc)
1651 		return 0;
1652 
1653 	r = dm_read_arg_group(_args, as, &argc, &ti->error);
1654 	if (r)
1655 		return -EINVAL;
1656 
1657 	while (argc && !r) {
1658 		arg_name = dm_shift_arg(as);
1659 		argc--;
1660 
1661 		if (!strcasecmp(arg_name, "skip_block_zeroing")) {
1662 			pf->zero_new_blocks = 0;
1663 			continue;
1664 		}
1665 
1666 		ti->error = "Unrecognised pool feature requested";
1667 		r = -EINVAL;
1668 	}
1669 
1670 	return r;
1671 }
1672 
1673 /*
1674  * thin-pool <metadata dev> <data dev>
1675  *	     <data block size (sectors)>
1676  *	     <low water mark (blocks)>
1677  *	     [<#feature args> [<arg>]*]
1678  *
1679  * Optional feature arguments are:
1680  *	     skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
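 *
 * For example (a sketch; device names and sizes are placeholders):
 *
 *	dmsetup create pool \
 *		--table "0 20971520 thin-pool /dev/sdb /dev/sdc 128 32768"
 *
 * creates a 10GiB pool target keeping its metadata on /dev/sdb and its data
 * on /dev/sdc, using 64KiB (128 sector) blocks and raising an event when
 * fewer than 32768 free blocks remain.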
1681  */
1682 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1683 {
1684 	int r;
1685 	struct pool_c *pt;
1686 	struct pool *pool;
1687 	struct pool_features pf;
1688 	struct dm_arg_set as;
1689 	struct dm_dev *data_dev;
1690 	unsigned long block_size;
1691 	dm_block_t low_water_blocks;
1692 	struct dm_dev *metadata_dev;
1693 	sector_t metadata_dev_size;
1694 
1695 	/*
1696 	 * FIXME Remove validation from scope of lock.
1697 	 */
1698 	mutex_lock(&dm_thin_pool_table.mutex);
1699 
1700 	if (argc < 4) {
1701 		ti->error = "Invalid argument count";
1702 		r = -EINVAL;
1703 		goto out_unlock;
1704 	}
1705 	as.argc = argc;
1706 	as.argv = argv;
1707 
1708 	r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
1709 	if (r) {
1710 		ti->error = "Error opening metadata block device";
1711 		goto out_unlock;
1712 	}
1713 
1714 	metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1715 	if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) {
1716 		ti->error = "Metadata device is too large";
1717 		r = -EINVAL;
1718 		goto out_metadata;
1719 	}
1720 
1721 	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1722 	if (r) {
1723 		ti->error = "Error getting data device";
1724 		goto out_metadata;
1725 	}
1726 
1727 	if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1728 	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1729 	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1730 	    !is_power_of_2(block_size)) {
1731 		ti->error = "Invalid block size";
1732 		r = -EINVAL;
1733 		goto out;
1734 	}
1735 
1736 	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
1737 		ti->error = "Invalid low water mark";
1738 		r = -EINVAL;
1739 		goto out;
1740 	}
1741 
1742 	/*
1743 	 * Set default pool features.
1744 	 */
1745 	memset(&pf, 0, sizeof(pf));
1746 	pf.zero_new_blocks = 1;
1747 
1748 	dm_consume_args(&as, 4);
1749 	r = parse_pool_features(&as, &pf, ti);
1750 	if (r)
1751 		goto out;
1752 
1753 	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
1754 	if (!pt) {
1755 		r = -ENOMEM;
1756 		goto out;
1757 	}
1758 
1759 	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1760 			   block_size, &ti->error);
1761 	if (IS_ERR(pool)) {
1762 		r = PTR_ERR(pool);
1763 		goto out_free_pt;
1764 	}
1765 
1766 	pt->pool = pool;
1767 	pt->ti = ti;
1768 	pt->metadata_dev = metadata_dev;
1769 	pt->data_dev = data_dev;
1770 	pt->low_water_blocks = low_water_blocks;
1771 	pt->zero_new_blocks = pf.zero_new_blocks;
1772 	ti->num_flush_requests = 1;
1773 	ti->num_discard_requests = 0;
1774 	ti->private = pt;
1775 
1776 	pt->callbacks.congested_fn = pool_is_congested;
1777 	dm_table_add_target_callbacks(ti->table, &pt->callbacks);
1778 
1779 	mutex_unlock(&dm_thin_pool_table.mutex);
1780 
1781 	return 0;
1782 
1783 out_free_pt:
1784 	kfree(pt);
1785 out:
1786 	dm_put_device(ti, data_dev);
1787 out_metadata:
1788 	dm_put_device(ti, metadata_dev);
1789 out_unlock:
1790 	mutex_unlock(&dm_thin_pool_table.mutex);
1791 
1792 	return r;
1793 }
1794 
1795 static int pool_map(struct dm_target *ti, struct bio *bio,
1796 		    union map_info *map_context)
1797 {
1798 	int r;
1799 	struct pool_c *pt = ti->private;
1800 	struct pool *pool = pt->pool;
1801 	unsigned long flags;
1802 
1803 	/*
1804 	 * As this is a singleton target, ti->begin is always zero.
1805 	 */
1806 	spin_lock_irqsave(&pool->lock, flags);
1807 	bio->bi_bdev = pt->data_dev->bdev;
1808 	r = DM_MAPIO_REMAPPED;
1809 	spin_unlock_irqrestore(&pool->lock, flags);
1810 
1811 	return r;
1812 }
1813 
1814 /*
1815  * Retrieves the number of blocks of the data device from
1816  * the superblock and compares it to the actual device size,
1817  * thus resizing the data device in case it has grown.
1818  *
1819  * This copes both with opening preallocated data devices in the ctr,
1820  * followed by a resume,
1821  * -and-
1822  * with the resume method being called individually after userspace has
1823  * grown the data device in reaction to a table event.
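 *
 * A sketch of the second case from userspace ("pool" and the sizes are
 * placeholders), after the underlying data device has been extended:
 *
 *	dmsetup suspend pool
 *	dmsetup reload pool --table "0 <new data dev size> thin-pool ..."
 *	dmsetup resume pool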
1824  */
1825 static int pool_preresume(struct dm_target *ti)
1826 {
1827 	int r;
1828 	struct pool_c *pt = ti->private;
1829 	struct pool *pool = pt->pool;
1830 	dm_block_t data_size, sb_data_size;
1831 
1832 	/*
1833 	 * Take control of the pool object.
1834 	 */
1835 	r = bind_control_target(pool, ti);
1836 	if (r)
1837 		return r;
1838 
1839 	data_size = ti->len >> pool->block_shift;
1840 	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
1841 	if (r) {
1842 		DMERR("failed to retrieve data device size");
1843 		return r;
1844 	}
1845 
1846 	if (data_size < sb_data_size) {
1847 		DMERR("pool target too small, is %llu blocks (expected %llu)",
1848 		      (unsigned long long)data_size, (unsigned long long)sb_data_size);
1849 		return -EINVAL;
1850 
1851 	} else if (data_size > sb_data_size) {
1852 		r = dm_pool_resize_data_dev(pool->pmd, data_size);
1853 		if (r) {
1854 			DMERR("failed to resize data device");
1855 			return r;
1856 		}
1857 
1858 		r = dm_pool_commit_metadata(pool->pmd);
1859 		if (r) {
1860 			DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1861 			      __func__, r);
1862 			return r;
1863 		}
1864 	}
1865 
1866 	return 0;
1867 }
1868 
1869 static void pool_resume(struct dm_target *ti)
1870 {
1871 	struct pool_c *pt = ti->private;
1872 	struct pool *pool = pt->pool;
1873 	unsigned long flags;
1874 
1875 	spin_lock_irqsave(&pool->lock, flags);
1876 	pool->low_water_triggered = 0;
1877 	pool->no_free_space = 0;
1878 	__requeue_bios(pool);
1879 	spin_unlock_irqrestore(&pool->lock, flags);
1880 
1881 	wake_worker(pool);
1882 }
1883 
1884 static void pool_postsuspend(struct dm_target *ti)
1885 {
1886 	int r;
1887 	struct pool_c *pt = ti->private;
1888 	struct pool *pool = pt->pool;
1889 
1890 	flush_workqueue(pool->wq);
1891 
1892 	r = dm_pool_commit_metadata(pool->pmd);
1893 	if (r < 0) {
1894 		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1895 		      __func__, r);
1896 		/* FIXME: invalidate device? error the next FUA or FLUSH bio? */
1897 	}
1898 }
1899 
1900 static int check_arg_count(unsigned argc, unsigned args_required)
1901 {
1902 	if (argc != args_required) {
1903 		DMWARN("Message received with %u arguments instead of %u.",
1904 		       argc, args_required);
1905 		return -EINVAL;
1906 	}
1907 
1908 	return 0;
1909 }
1910 
1911 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
1912 {
1913 	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
1914 	    *dev_id <= MAX_DEV_ID)
1915 		return 0;
1916 
1917 	if (warning)
1918 		DMWARN("Message received with invalid device id: %s", arg);
1919 
1920 	return -EINVAL;
1921 }
1922 
1923 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
1924 {
1925 	dm_thin_id dev_id;
1926 	int r;
1927 
1928 	r = check_arg_count(argc, 2);
1929 	if (r)
1930 		return r;
1931 
1932 	r = read_dev_id(argv[1], &dev_id, 1);
1933 	if (r)
1934 		return r;
1935 
1936 	r = dm_pool_create_thin(pool->pmd, dev_id);
1937 	if (r) {
1938 		DMWARN("Creation of new thinly-provisioned device with id %s failed.",
1939 		       argv[1]);
1940 		return r;
1941 	}
1942 
1943 	return 0;
1944 }
1945 
1946 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
1947 {
1948 	dm_thin_id dev_id;
1949 	dm_thin_id origin_dev_id;
1950 	int r;
1951 
1952 	r = check_arg_count(argc, 3);
1953 	if (r)
1954 		return r;
1955 
1956 	r = read_dev_id(argv[1], &dev_id, 1);
1957 	if (r)
1958 		return r;
1959 
1960 	r = read_dev_id(argv[2], &origin_dev_id, 1);
1961 	if (r)
1962 		return r;
1963 
1964 	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
1965 	if (r) {
1966 		DMWARN("Creation of new snapshot %s of device %s failed.",
1967 		       argv[1], argv[2]);
1968 		return r;
1969 	}
1970 
1971 	return 0;
1972 }
1973 
1974 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
1975 {
1976 	dm_thin_id dev_id;
1977 	int r;
1978 
1979 	r = check_arg_count(argc, 2);
1980 	if (r)
1981 		return r;
1982 
1983 	r = read_dev_id(argv[1], &dev_id, 1);
1984 	if (r)
1985 		return r;
1986 
1987 	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
1988 	if (r)
1989 		DMWARN("Deletion of thin device %s failed.", argv[1]);
1990 
1991 	return r;
1992 }
1993 
1994 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
1995 {
1996 	dm_thin_id old_id, new_id;
1997 	int r;
1998 
1999 	r = check_arg_count(argc, 3);
2000 	if (r)
2001 		return r;
2002 
2003 	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2004 		DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2005 		return -EINVAL;
2006 	}
2007 
2008 	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2009 		DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2010 		return -EINVAL;
2011 	}
2012 
2013 	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2014 	if (r) {
2015 		DMWARN("Failed to change transaction id from %s to %s.",
2016 		       argv[1], argv[2]);
2017 		return r;
2018 	}
2019 
2020 	return 0;
2021 }
2022 
2023 /*
2024  * Messages supported:
2025  *   create_thin	<dev_id>
2026  *   create_snap	<dev_id> <origin_id>
2027  *   delete		<dev_id>
2029  *   set_transaction_id <current_trans_id> <new_trans_id>
2030  */
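/*
 * For example (the pool device name is illustrative), messages are sent
 * with dmsetup:
 *
 *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *   dmsetup message /dev/mapper/pool 0 "delete 1"
 */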
2031 static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2032 {
2033 	int r = -EINVAL;
2034 	struct pool_c *pt = ti->private;
2035 	struct pool *pool = pt->pool;
2036 
2037 	if (!strcasecmp(argv[0], "create_thin"))
2038 		r = process_create_thin_mesg(argc, argv, pool);
2039 
2040 	else if (!strcasecmp(argv[0], "create_snap"))
2041 		r = process_create_snap_mesg(argc, argv, pool);
2042 
2043 	else if (!strcasecmp(argv[0], "delete"))
2044 		r = process_delete_mesg(argc, argv, pool);
2045 
2046 	else if (!strcasecmp(argv[0], "set_transaction_id"))
2047 		r = process_set_transaction_id_mesg(argc, argv, pool);
2048 
2049 	else
2050 		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2051 
2052 	if (!r) {
2053 		r = dm_pool_commit_metadata(pool->pmd);
2054 		if (r)
2055 			DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
2056 			      argv[0], r);
2057 	}
2058 
2059 	return r;
2060 }
2061 
2062 /*
2063  * Status line is:
2064  *    <transaction id> <used metadata blocks>/<total metadata blocks>
2065  *    <used data blocks>/<total data blocks> <held metadata root>
2066  */
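/*
 * e.g. "0 93/32768 149/2097152 -" (illustrative values; "-" means no
 * held metadata root).
 */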
2067 static int pool_status(struct dm_target *ti, status_type_t type,
2068 		       char *result, unsigned maxlen)
2069 {
2070 	int r;
2071 	unsigned sz = 0;
2072 	uint64_t transaction_id;
2073 	dm_block_t nr_free_blocks_data;
2074 	dm_block_t nr_free_blocks_metadata;
2075 	dm_block_t nr_blocks_data;
2076 	dm_block_t nr_blocks_metadata;
2077 	dm_block_t held_root;
2078 	char buf[BDEVNAME_SIZE];
2079 	char buf2[BDEVNAME_SIZE];
2080 	struct pool_c *pt = ti->private;
2081 	struct pool *pool = pt->pool;
2082 
2083 	switch (type) {
2084 	case STATUSTYPE_INFO:
2085 		r = dm_pool_get_metadata_transaction_id(pool->pmd,
2086 							&transaction_id);
2087 		if (r)
2088 			return r;
2089 
2090 		r = dm_pool_get_free_metadata_block_count(pool->pmd,
2091 							  &nr_free_blocks_metadata);
2092 		if (r)
2093 			return r;
2094 
2095 		r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2096 		if (r)
2097 			return r;
2098 
2099 		r = dm_pool_get_free_block_count(pool->pmd,
2100 						 &nr_free_blocks_data);
2101 		if (r)
2102 			return r;
2103 
2104 		r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2105 		if (r)
2106 			return r;
2107 
2108 		r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
2109 		if (r)
2110 			return r;
2111 
2112 		DMEMIT("%llu %llu/%llu %llu/%llu ",
2113 		       (unsigned long long)transaction_id,
2114 		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2115 		       (unsigned long long)nr_blocks_metadata,
2116 		       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2117 		       (unsigned long long)nr_blocks_data);
2118 
2119 		if (held_root)
2120 			DMEMIT("%llu", (unsigned long long)held_root);
2121 		else
2122 			DMEMIT("-");
2123 
2124 		break;
2125 
2126 	case STATUSTYPE_TABLE:
2127 		DMEMIT("%s %s %lu %llu ",
2128 		       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
2129 		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2130 		       (unsigned long)pool->sectors_per_block,
2131 		       (unsigned long long)pt->low_water_blocks);
2132 
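		/*
		 * Emit the number of optional feature arguments, then the
		 * features themselves; skip_block_zeroing is currently the
		 * only one.
		 */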
2133 		DMEMIT("%u ", !pool->zero_new_blocks);
2134 
2135 		if (!pool->zero_new_blocks)
2136 			DMEMIT("skip_block_zeroing ");
2137 		break;
2138 	}
2139 
2140 	return 0;
2141 }
2142 
2143 static int pool_iterate_devices(struct dm_target *ti,
2144 				iterate_devices_callout_fn fn, void *data)
2145 {
2146 	struct pool_c *pt = ti->private;
2147 
2148 	return fn(ti, pt->data_dev, 0, ti->len, data);
2149 }
2150 
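/*
 * Delegate bvec merge decisions to the underlying data device's queue,
 * if it provides a merge_bvec_fn.
 */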
2151 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2152 		      struct bio_vec *biovec, int max_size)
2153 {
2154 	struct pool_c *pt = ti->private;
2155 	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2156 
2157 	if (!q->merge_bvec_fn)
2158 		return max_size;
2159 
2160 	bvm->bi_bdev = pt->data_dev->bdev;
2161 
2162 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2163 }
2164 
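/*
 * Advertise an optimal I/O size equal to the pool's block size so upper
 * layers can align I/O to pool blocks where possible.
 */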
2165 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2166 {
2167 	struct pool_c *pt = ti->private;
2168 	struct pool *pool = pt->pool;
2169 
2170 	blk_limits_io_min(limits, 0);
2171 	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2172 }
2173 
2174 static struct target_type pool_target = {
2175 	.name = "thin-pool",
2176 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2177 		    DM_TARGET_IMMUTABLE,
2178 	.version = {1, 0, 0},
2179 	.module = THIS_MODULE,
2180 	.ctr = pool_ctr,
2181 	.dtr = pool_dtr,
2182 	.map = pool_map,
2183 	.postsuspend = pool_postsuspend,
2184 	.preresume = pool_preresume,
2185 	.resume = pool_resume,
2186 	.message = pool_message,
2187 	.status = pool_status,
2188 	.merge = pool_merge,
2189 	.iterate_devices = pool_iterate_devices,
2190 	.io_hints = pool_io_hints,
2191 };
2192 
2193 /*----------------------------------------------------------------
2194  * Thin target methods
2195  *--------------------------------------------------------------*/
2196 static void thin_dtr(struct dm_target *ti)
2197 {
2198 	struct thin_c *tc = ti->private;
2199 
2200 	mutex_lock(&dm_thin_pool_table.mutex);
2201 
2202 	__pool_dec(tc->pool);
2203 	dm_pool_close_thin_device(tc->td);
2204 	dm_put_device(ti, tc->pool_dev);
2205 	kfree(tc);
2206 
2207 	mutex_unlock(&dm_thin_pool_table.mutex);
2208 }
2209 
2210 /*
2211  * Thin target parameters:
2212  *
2213  * <pool_dev> <dev_id>
2214  *
2215  * pool_dev: the path to the pool (e.g. /dev/mapper/my_pool)
2216  * dev_id: the internal device identifier
2217  */
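/*
 * For example (names and sizes are illustrative):
 *
 *   dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"
 *
 * creates a 1GiB thin device backed by internal device id 0 of the pool.
 */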
2218 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2219 {
2220 	int r;
2221 	struct thin_c *tc;
2222 	struct dm_dev *pool_dev;
2223 	struct mapped_device *pool_md;
2224 
2225 	mutex_lock(&dm_thin_pool_table.mutex);
2226 
2227 	if (argc != 2) {
2228 		ti->error = "Invalid argument count";
2229 		r = -EINVAL;
2230 		goto out_unlock;
2231 	}
2232 
2233 	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2234 	if (!tc) {
2235 		ti->error = "Out of memory";
2236 		r = -ENOMEM;
2237 		goto out_unlock;
2238 	}
2239 
2240 	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2241 	if (r) {
2242 		ti->error = "Error opening pool device";
2243 		goto bad_pool_dev;
2244 	}
2245 	tc->pool_dev = pool_dev;
2246 
2247 	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
2248 		ti->error = "Invalid device id";
2249 		r = -EINVAL;
2250 		goto bad_common;
2251 	}
2252 
2253 	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2254 	if (!pool_md) {
2255 		ti->error = "Couldn't get pool mapped device";
2256 		r = -EINVAL;
2257 		goto bad_common;
2258 	}
2259 
2260 	tc->pool = __pool_table_lookup(pool_md);
2261 	if (!tc->pool) {
2262 		ti->error = "Couldn't find pool object";
2263 		r = -EINVAL;
2264 		goto bad_pool_lookup;
2265 	}
2266 	__pool_inc(tc->pool);
2267 
2268 	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2269 	if (r) {
2270 		ti->error = "Couldn't open thin internal device";
2271 		goto bad_thin_open;
2272 	}
2273 
2274 	ti->split_io = tc->pool->sectors_per_block;
2275 	ti->num_flush_requests = 1;
2276 	ti->num_discard_requests = 0;
2277 	ti->discards_supported = 0;
2278 
2279 	dm_put(pool_md);
2280 
2281 	mutex_unlock(&dm_thin_pool_table.mutex);
2282 
2283 	return 0;
2284 
2285 bad_thin_open:
2286 	__pool_dec(tc->pool);
2287 bad_pool_lookup:
2288 	dm_put(pool_md);
2289 bad_common:
2290 	dm_put_device(ti, tc->pool_dev);
2291 bad_pool_dev:
2292 	kfree(tc);
2293 out_unlock:
2294 	mutex_unlock(&dm_thin_pool_table.mutex);
2295 
2296 	return r;
2297 }
2298 
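/*
 * Make the bio's sector relative to the start of the target before
 * handing it on to thin_bio_map().
 */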
2299 static int thin_map(struct dm_target *ti, struct bio *bio,
2300 		    union map_info *map_context)
2301 {
2302 	bio->bi_sector -= ti->begin;
2303 
2304 	return thin_bio_map(ti, bio, map_context);
2305 }
2306 
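/*
 * On a noflush suspend, hand queued bios back via requeue_io() rather
 * than waiting for them to complete.
 */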
2307 static void thin_postsuspend(struct dm_target *ti)
2308 {
2309 	if (dm_noflush_suspending(ti))
2310 		requeue_io((struct thin_c *)ti->private);
2311 }
2312 
2313 /*
2314  * <nr mapped sectors> <highest mapped sector>
2315  */
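/*
 * e.g. "1024 2097151" (illustrative), or "-" if the thin device is not
 * open.
 */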
2316 static int thin_status(struct dm_target *ti, status_type_t type,
2317 		       char *result, unsigned maxlen)
2318 {
2319 	int r;
2320 	ssize_t sz = 0;
2321 	dm_block_t mapped, highest;
2322 	char buf[BDEVNAME_SIZE];
2323 	struct thin_c *tc = ti->private;
2324 
2325 	if (!tc->td)
2326 		DMEMIT("-");
2327 	else {
2328 		switch (type) {
2329 		case STATUSTYPE_INFO:
2330 			r = dm_thin_get_mapped_count(tc->td, &mapped);
2331 			if (r)
2332 				return r;
2333 
2334 			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
2335 			if (r < 0)
2336 				return r;
2337 
2338 			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
2339 			if (r)
2340 				DMEMIT("%llu", ((highest + 1) *
2341 						tc->pool->sectors_per_block) - 1);
2342 			else
2343 				DMEMIT("-");
2344 			break;
2345 
2346 		case STATUSTYPE_TABLE:
2347 			DMEMIT("%s %lu",
2348 			       format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2349 			       (unsigned long) tc->dev_id);
2350 			break;
2351 		}
2352 	}
2353 
2354 	return 0;
2355 }
2356 
2357 static int thin_iterate_devices(struct dm_target *ti,
2358 				iterate_devices_callout_fn fn, void *data)
2359 {
2360 	dm_block_t blocks;
2361 	struct thin_c *tc = ti->private;
2362 
2363 	/*
2364 	 * We can't call dm_pool_get_data_dev_size() since that blocks.  So
2365 	 * we follow a more convoluted path through to the pool's target.
2366 	 */
2367 	if (!tc->pool->ti)
2368 		return 0;	/* nothing is bound */
2369 
2370 	blocks = tc->pool->ti->len >> tc->pool->block_shift;
2371 	if (blocks)
2372 		return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data);
2373 
2374 	return 0;
2375 }
2376 
2377 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2378 {
2379 	struct thin_c *tc = ti->private;
2380 
2381 	blk_limits_io_min(limits, 0);
2382 	blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);
2383 }
2384 
2385 static struct target_type thin_target = {
2386 	.name = "thin",
2387 	.version = {1, 0, 0},
2388 	.module	= THIS_MODULE,
2389 	.ctr = thin_ctr,
2390 	.dtr = thin_dtr,
2391 	.map = thin_map,
2392 	.postsuspend = thin_postsuspend,
2393 	.status = thin_status,
2394 	.iterate_devices = thin_iterate_devices,
2395 	.io_hints = thin_io_hints,
2396 };
2397 
2398 /*----------------------------------------------------------------*/
2399 
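/*
 * Register the thin and thin-pool targets; if registering the pool
 * target fails, unregister the thin target again.
 */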
2400 static int __init dm_thin_init(void)
2401 {
2402 	int r;
2403 
2404 	pool_table_init();
2405 
2406 	r = dm_register_target(&thin_target);
2407 	if (r)
2408 		return r;
2409 
2410 	r = dm_register_target(&pool_target);
2411 	if (r)
2412 		dm_unregister_target(&thin_target);
2413 
2414 	return r;
2415 }
2416 
2417 static void dm_thin_exit(void)
2418 {
2419 	dm_unregister_target(&thin_target);
2420 	dm_unregister_target(&pool_target);
2421 }
2422 
2423 module_init(dm_thin_init);
2424 module_exit(dm_thin_exit);
2425 
2426 MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
2427 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2428 MODULE_LICENSE("GPL");
2429