xref: /openbmc/linux/drivers/md/dm-thin.c (revision 05bcf503)
1 /*
2  * Copyright (C) 2011-2012 Red Hat UK.
3  *
4  * This file is released under the GPL.
5  */
6 
7 #include "dm-thin-metadata.h"
8 #include "dm-bio-prison.h"
9 #include "dm.h"
10 
11 #include <linux/device-mapper.h>
12 #include <linux/dm-io.h>
13 #include <linux/dm-kcopyd.h>
14 #include <linux/list.h>
15 #include <linux/init.h>
16 #include <linux/module.h>
17 #include <linux/slab.h>
18 
19 #define	DM_MSG_PREFIX	"thin"
20 
21 /*
22  * Tunable constants
23  */
24 #define ENDIO_HOOK_POOL_SIZE 1024
25 #define MAPPING_POOL_SIZE 1024
26 #define PRISON_CELLS 1024
27 #define COMMIT_PERIOD HZ
28 
29 /*
30  * The block size of the device holding pool data must be
31  * between 64KB and 1GB.
32  */
33 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
34 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
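/*
 * For reference - illustrative arithmetic only, assuming the usual
 * SECTOR_SHIFT of 9 (512-byte sectors):
 *
 *	DATA_DEV_BLOCK_SIZE_MIN_SECTORS = 64 * 1024 >> 9          = 128 sectors
 *	DATA_DEV_BLOCK_SIZE_MAX_SECTORS = 1024 * 1024 * 1024 >> 9 = 2097152 sectors
 */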
35 
36 /*
37  * Device id is restricted to 24 bits.
38  */
39 #define MAX_DEV_ID ((1 << 24) - 1)
40 
41 /*
42  * How do we handle breaking sharing of data blocks?
43  * =================================================
44  *
45  * We use a standard copy-on-write btree to store the mappings for the
46  * devices (note I'm talking about copy-on-write of the metadata here, not
47  * the data).  When you take an internal snapshot you clone the root node
48  * of the origin btree.  After this there is no concept of an origin or a
49  * snapshot.  They are just two device trees that happen to point to the
50  * same data blocks.
51  *
52  * When we get a write in we decide if it's to a shared data block using
53  * some timestamp magic.  If it is, we have to break sharing.
54  *
55  * Let's say we write to a shared block in what was the origin.  The
56  * steps are:
57  *
58  * i) plug further io to this physical block. (see bio_prison code).
59  *
60  * ii) quiesce any read io to that shared data block.  Obviously
61  * including all devices that share this block.  (see dm_deferred_set code)
62  *
63  * iii) copy the data block to a newly allocated block.  This step can be
64  * skipped if the io covers the whole block. (schedule_copy).
65  *
66  * iv) insert the new mapping into the origin's btree
67  * (process_prepared_mapping).  This act of inserting breaks some
68  * sharing of btree nodes between the two devices.  Breaking sharing only
69  * affects the btree of that specific device.  Btrees for the other
70  * devices that share the block never change.  The btree for the origin
71  * device as it was after the last commit is untouched, i.e. we're using
72  * persistent data structures in the functional programming sense.
73  *
74  * v) unplug io to this physical block, including the io that triggered
75  * the breaking of sharing.
76  *
77  * Steps (ii) and (iii) occur in parallel.
78  *
79  * The metadata _doesn't_ need to be committed before the io continues.  We
80  * get away with this because the io is always written to a _new_ block.
81  * If there's a crash, then:
82  *
83  * - The origin mapping will point to the old origin block (the shared
84  * one).  This will contain the data as it was before the io that triggered
85  * the breaking of sharing came in.
86  *
87  * - The snap mapping still points to the old block, as it would after
88  * the commit.
89  *
90  * The downside of this scheme is the timestamp magic isn't perfect, and
91  * will continue to think that the data block in the snapshot device is shared
92  * even after the write to the origin has broken sharing.  I suspect data
93  * blocks will typically be shared by many different devices, so we're
94  * breaking sharing n + 1 times, rather than n, where n is the number of
95  * devices that reference this data block.  At the moment I think the
96  * benefits far, far outweigh the disadvantages.
97  */
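/*
 * A condensed sketch of the steps above, mapped onto the helpers in this
 * file.  Illustrative only: shared_block, virt_block and new_data_block are
 * placeholder names, and the real paths add locking and error handling.
 *
 *	(i)   build_data_key(tc->td, shared_block, &key);
 *	      dm_bio_detain(pool->prison, &key, bio, &cell);
 *	(ii)  dm_deferred_set_add_work(pool->shared_read_ds, &m->list);
 *	(iii) dm_kcopyd_copy(pool->copier, &from, 1, &to, 0, copy_complete, m);
 *	(iv)  dm_thin_insert_block(tc->td, virt_block, new_data_block);
 *	(v)   cell_defer(tc, cell, new_data_block);
 */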
98 
99 /*----------------------------------------------------------------*/
100 
101 /*
102  * Key building.
103  */
104 static void build_data_key(struct dm_thin_device *td,
105 			   dm_block_t b, struct dm_cell_key *key)
106 {
107 	key->virtual = 0;
108 	key->dev = dm_thin_dev_id(td);
109 	key->block = b;
110 }
111 
112 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
113 			      struct dm_cell_key *key)
114 {
115 	key->virtual = 1;
116 	key->dev = dm_thin_dev_id(td);
117 	key->block = b;
118 }
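/*
 * Usage sketch (see process_bio() and process_discard() below): a virtual
 * key serialises io to one virtual block of one thin device, while a data
 * key serialises io to one physical pool block that may be shared between
 * devices.
 *
 *	struct dm_cell_key key;
 *	struct dm_bio_prison_cell *cell;
 *
 *	build_virtual_key(tc->td, block, &key);
 *	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
 *		return;		(another bio already holds this block)
 */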
119 
120 /*----------------------------------------------------------------*/
121 
122 /*
123  * A pool device ties together a metadata device and a data device.  It
124  * also provides the interface for creating and destroying internal
125  * devices.
126  */
127 struct dm_thin_new_mapping;
128 
129 /*
130  * The pool runs in 3 modes, ordered by increasing degradation so modes can be compared.
131  */
132 enum pool_mode {
133 	PM_WRITE,		/* metadata may be changed */
134 	PM_READ_ONLY,		/* metadata may not be changed */
135 	PM_FAIL,		/* all I/O fails */
136 };
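/*
 * Because the modes above are ordered by increasing degradation, a plain
 * comparison is enough to stop a degraded pool from being upgraded by a
 * table reload (see bind_control_target()):
 *
 *	if (old_mode > new_mode)
 *		new_mode = old_mode;
 */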
137 
138 struct pool_features {
139 	enum pool_mode mode;
140 
141 	bool zero_new_blocks:1;
142 	bool discard_enabled:1;
143 	bool discard_passdown:1;
144 };
145 
146 struct thin_c;
147 typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
148 typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
149 
150 struct pool {
151 	struct list_head list;
152 	struct dm_target *ti;	/* Only set if a pool target is bound */
153 
154 	struct mapped_device *pool_md;
155 	struct block_device *md_dev;
156 	struct dm_pool_metadata *pmd;
157 
158 	dm_block_t low_water_blocks;
159 	uint32_t sectors_per_block;
160 	int sectors_per_block_shift;
161 
162 	struct pool_features pf;
163 	unsigned low_water_triggered:1;	/* A dm event has been sent */
164 	unsigned no_free_space:1;	/* A -ENOSPC warning has been issued */
165 
166 	struct dm_bio_prison *prison;
167 	struct dm_kcopyd_client *copier;
168 
169 	struct workqueue_struct *wq;
170 	struct work_struct worker;
171 	struct delayed_work waker;
172 
173 	unsigned long last_commit_jiffies;
174 	unsigned ref_count;
175 
176 	spinlock_t lock;
177 	struct bio_list deferred_bios;
178 	struct bio_list deferred_flush_bios;
179 	struct list_head prepared_mappings;
180 	struct list_head prepared_discards;
181 
182 	struct bio_list retry_on_resume_list;
183 
184 	struct dm_deferred_set *shared_read_ds;
185 	struct dm_deferred_set *all_io_ds;
186 
187 	struct dm_thin_new_mapping *next_mapping;
188 	mempool_t *mapping_pool;
189 	mempool_t *endio_hook_pool;
190 
191 	process_bio_fn process_bio;
192 	process_bio_fn process_discard;
193 
194 	process_mapping_fn process_prepared_mapping;
195 	process_mapping_fn process_prepared_discard;
196 };
197 
198 static enum pool_mode get_pool_mode(struct pool *pool);
199 static void set_pool_mode(struct pool *pool, enum pool_mode mode);
200 
201 /*
202  * Target context for a pool.
203  */
204 struct pool_c {
205 	struct dm_target *ti;
206 	struct pool *pool;
207 	struct dm_dev *data_dev;
208 	struct dm_dev *metadata_dev;
209 	struct dm_target_callbacks callbacks;
210 
211 	dm_block_t low_water_blocks;
212 	struct pool_features requested_pf; /* Features requested during table load */
213 	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
214 };
215 
216 /*
217  * Target context for a thin.
218  */
219 struct thin_c {
220 	struct dm_dev *pool_dev;
221 	struct dm_dev *origin_dev;
222 	dm_thin_id dev_id;
223 
224 	struct pool *pool;
225 	struct dm_thin_device *td;
226 };
227 
228 /*----------------------------------------------------------------*/
229 
230 /*
231  * A global list of pools that uses a struct mapped_device as a key.
232  */
233 static struct dm_thin_pool_table {
234 	struct mutex mutex;
235 	struct list_head pools;
236 } dm_thin_pool_table;
237 
238 static void pool_table_init(void)
239 {
240 	mutex_init(&dm_thin_pool_table.mutex);
241 	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
242 }
243 
244 static void __pool_table_insert(struct pool *pool)
245 {
246 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
247 	list_add(&pool->list, &dm_thin_pool_table.pools);
248 }
249 
250 static void __pool_table_remove(struct pool *pool)
251 {
252 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
253 	list_del(&pool->list);
254 }
255 
256 static struct pool *__pool_table_lookup(struct mapped_device *md)
257 {
258 	struct pool *pool = NULL, *tmp;
259 
260 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
261 
262 	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
263 		if (tmp->pool_md == md) {
264 			pool = tmp;
265 			break;
266 		}
267 	}
268 
269 	return pool;
270 }
271 
272 static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
273 {
274 	struct pool *pool = NULL, *tmp;
275 
276 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
277 
278 	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
279 		if (tmp->md_dev == md_dev) {
280 			pool = tmp;
281 			break;
282 		}
283 	}
284 
285 	return pool;
286 }
287 
288 /*----------------------------------------------------------------*/
289 
290 struct dm_thin_endio_hook {
291 	struct thin_c *tc;
292 	struct dm_deferred_entry *shared_read_entry;
293 	struct dm_deferred_entry *all_io_entry;
294 	struct dm_thin_new_mapping *overwrite_mapping;
295 };
296 
297 static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
298 {
299 	struct bio *bio;
300 	struct bio_list bios;
301 
302 	bio_list_init(&bios);
303 	bio_list_merge(&bios, master);
304 	bio_list_init(master);
305 
306 	while ((bio = bio_list_pop(&bios))) {
307 		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
308 
309 		if (h->tc == tc)
310 			bio_endio(bio, DM_ENDIO_REQUEUE);
311 		else
312 			bio_list_add(master, bio);
313 	}
314 }
315 
316 static void requeue_io(struct thin_c *tc)
317 {
318 	struct pool *pool = tc->pool;
319 	unsigned long flags;
320 
321 	spin_lock_irqsave(&pool->lock, flags);
322 	__requeue_bio_list(tc, &pool->deferred_bios);
323 	__requeue_bio_list(tc, &pool->retry_on_resume_list);
324 	spin_unlock_irqrestore(&pool->lock, flags);
325 }
326 
327 /*
328  * This section of code contains the logic for processing a thin device's IO.
329  * Much of the code depends on pool object resources (lists, workqueues, etc.)
330  * but most is exclusively called from the thin target rather than the thin-pool
331  * target.
332  */
333 
334 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
335 {
336 	sector_t block_nr = bio->bi_sector;
337 
338 	if (tc->pool->sectors_per_block_shift < 0)
339 		(void) sector_div(block_nr, tc->pool->sectors_per_block);
340 	else
341 		block_nr >>= tc->pool->sectors_per_block_shift;
342 
343 	return block_nr;
344 }
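/*
 * Example with illustrative numbers: a 64KiB block size gives
 * sectors_per_block = 128 and sectors_per_block_shift = 7, so a bio at
 * sector 1000 belongs to virtual block 1000 >> 7 = 7.  A non-power-of-two
 * block size has shift == -1 and falls back to sector_div().
 */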
345 
346 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
347 {
348 	struct pool *pool = tc->pool;
349 	sector_t bi_sector = bio->bi_sector;
350 
351 	bio->bi_bdev = tc->pool_dev->bdev;
352 	if (tc->pool->sectors_per_block_shift < 0)
353 		bio->bi_sector = (block * pool->sectors_per_block) +
354 				 sector_div(bi_sector, pool->sectors_per_block);
355 	else
356 		bio->bi_sector = (block << pool->sectors_per_block_shift) |
357 				(bi_sector & (pool->sectors_per_block - 1));
358 }
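/*
 * Continuing the example above: if virtual block 7 maps to data block 42,
 * remap() rewrites the sector to (42 << 7) | (1000 & 127) = 5376 + 104 = 5480
 * on the pool's data device.
 */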
359 
360 static void remap_to_origin(struct thin_c *tc, struct bio *bio)
361 {
362 	bio->bi_bdev = tc->origin_dev->bdev;
363 }
364 
365 static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
366 {
367 	return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
368 		dm_thin_changed_this_transaction(tc->td);
369 }
370 
371 static void issue(struct thin_c *tc, struct bio *bio)
372 {
373 	struct pool *pool = tc->pool;
374 	unsigned long flags;
375 
376 	if (!bio_triggers_commit(tc, bio)) {
377 		generic_make_request(bio);
378 		return;
379 	}
380 
381 	/*
382 	 * Complete bio with an error if earlier I/O caused changes to
383 	 * the metadata that can't be committed e.g, due to I/O errors
384 	 * on the metadata device.
385 	 */
386 	if (dm_thin_aborted_changes(tc->td)) {
387 		bio_io_error(bio);
388 		return;
389 	}
390 
391 	/*
392 	 * Batch together any bios that trigger commits and then issue a
393 	 * single commit for them in process_deferred_bios().
394 	 */
395 	spin_lock_irqsave(&pool->lock, flags);
396 	bio_list_add(&pool->deferred_flush_bios, bio);
397 	spin_unlock_irqrestore(&pool->lock, flags);
398 }
399 
400 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
401 {
402 	remap_to_origin(tc, bio);
403 	issue(tc, bio);
404 }
405 
406 static void remap_and_issue(struct thin_c *tc, struct bio *bio,
407 			    dm_block_t block)
408 {
409 	remap(tc, bio, block);
410 	issue(tc, bio);
411 }
412 
413 /*
414  * wake_worker() is used when new work is queued and when pool_resume is
415  * ready to continue deferred IO processing.
416  */
417 static void wake_worker(struct pool *pool)
418 {
419 	queue_work(pool->wq, &pool->worker);
420 }
421 
422 /*----------------------------------------------------------------*/
423 
424 /*
425  * Bio endio functions.
426  */
427 struct dm_thin_new_mapping {
428 	struct list_head list;
429 
430 	unsigned quiesced:1;
431 	unsigned prepared:1;
432 	unsigned pass_discard:1;
433 
434 	struct thin_c *tc;
435 	dm_block_t virt_block;
436 	dm_block_t data_block;
437 	struct dm_bio_prison_cell *cell, *cell2;
438 	int err;
439 
440 	/*
441 	 * If the bio covers the whole area of a block then we can avoid
442 	 * zeroing or copying.  Instead this bio is hooked.  The bio will
443 	 * still be in the cell, so care has to be taken to avoid issuing
444 	 * the bio twice.
445 	 */
446 	struct bio *bio;
447 	bio_end_io_t *saved_bi_end_io;
448 };
449 
450 static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
451 {
452 	struct pool *pool = m->tc->pool;
453 
454 	if (m->quiesced && m->prepared) {
455 		list_add(&m->list, &pool->prepared_mappings);
456 		wake_worker(pool);
457 	}
458 }
459 
460 static void copy_complete(int read_err, unsigned long write_err, void *context)
461 {
462 	unsigned long flags;
463 	struct dm_thin_new_mapping *m = context;
464 	struct pool *pool = m->tc->pool;
465 
466 	m->err = read_err || write_err ? -EIO : 0;
467 
468 	spin_lock_irqsave(&pool->lock, flags);
469 	m->prepared = 1;
470 	__maybe_add_mapping(m);
471 	spin_unlock_irqrestore(&pool->lock, flags);
472 }
473 
474 static void overwrite_endio(struct bio *bio, int err)
475 {
476 	unsigned long flags;
477 	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
478 	struct dm_thin_new_mapping *m = h->overwrite_mapping;
479 	struct pool *pool = m->tc->pool;
480 
481 	m->err = err;
482 
483 	spin_lock_irqsave(&pool->lock, flags);
484 	m->prepared = 1;
485 	__maybe_add_mapping(m);
486 	spin_unlock_irqrestore(&pool->lock, flags);
487 }
488 
489 /*----------------------------------------------------------------*/
490 
491 /*
492  * Workqueue.
493  */
494 
495 /*
496  * Prepared mapping jobs.
497  */
498 
499 /*
500  * This sends the bios in the cell back to the deferred_bios list.
501  */
502 static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell,
503 		       dm_block_t data_block)
504 {
505 	struct pool *pool = tc->pool;
506 	unsigned long flags;
507 
508 	spin_lock_irqsave(&pool->lock, flags);
509 	dm_cell_release(cell, &pool->deferred_bios);
510 	spin_unlock_irqrestore(&pool->lock, flags);
511 
512 	wake_worker(pool);
513 }
514 
515 /*
516  * Same as cell_defer above, except it omits one particular detainee,
517  * a write bio that covers the block and has already been processed.
518  */
519 static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell)
520 {
521 	struct bio_list bios;
522 	struct pool *pool = tc->pool;
523 	unsigned long flags;
524 
525 	bio_list_init(&bios);
526 
527 	spin_lock_irqsave(&pool->lock, flags);
528 	dm_cell_release_no_holder(cell, &pool->deferred_bios);
529 	spin_unlock_irqrestore(&pool->lock, flags);
530 
531 	wake_worker(pool);
532 }
533 
534 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
535 {
536 	if (m->bio)
537 		m->bio->bi_end_io = m->saved_bi_end_io;
538 	dm_cell_error(m->cell);
539 	list_del(&m->list);
540 	mempool_free(m, m->tc->pool->mapping_pool);
541 }

542 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
543 {
544 	struct thin_c *tc = m->tc;
545 	struct bio *bio;
546 	int r;
547 
548 	bio = m->bio;
549 	if (bio)
550 		bio->bi_end_io = m->saved_bi_end_io;
551 
552 	if (m->err) {
553 		dm_cell_error(m->cell);
554 		goto out;
555 	}
556 
557 	/*
558 	 * Commit the prepared block into the mapping btree.
559 	 * Any I/O for this block arriving after this point will get
560 	 * remapped to it directly.
561 	 */
562 	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
563 	if (r) {
564 		DMERR("dm_thin_insert_block() failed");
565 		dm_cell_error(m->cell);
566 		goto out;
567 	}
568 
569 	/*
570 	 * Release any bios held while the block was being provisioned.
571 	 * If we are processing a write bio that completely covers the block,
572 	 * we already processed it so can ignore it now when processing
573 	 * the bios in the cell.
574 	 */
575 	if (bio) {
576 		cell_defer_except(tc, m->cell);
577 		bio_endio(bio, 0);
578 	} else
579 		cell_defer(tc, m->cell, m->data_block);
580 
581 out:
582 	list_del(&m->list);
583 	mempool_free(m, tc->pool->mapping_pool);
584 }
585 
586 static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
587 {
588 	struct thin_c *tc = m->tc;
589 
590 	bio_io_error(m->bio);
591 	cell_defer_except(tc, m->cell);
592 	cell_defer_except(tc, m->cell2);
593 	mempool_free(m, tc->pool->mapping_pool);
594 }
595 
596 static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
597 {
598 	struct thin_c *tc = m->tc;
599 
600 	if (m->pass_discard)
601 		remap_and_issue(tc, m->bio, m->data_block);
602 	else
603 		bio_endio(m->bio, 0);
604 
605 	cell_defer_except(tc, m->cell);
606 	cell_defer_except(tc, m->cell2);
607 	mempool_free(m, tc->pool->mapping_pool);
608 }
609 
610 static void process_prepared_discard(struct dm_thin_new_mapping *m)
611 {
612 	int r;
613 	struct thin_c *tc = m->tc;
614 
615 	r = dm_thin_remove_block(tc->td, m->virt_block);
616 	if (r)
617 		DMERR("dm_thin_remove_block() failed");
618 
619 	process_prepared_discard_passdown(m);
620 }
621 
622 static void process_prepared(struct pool *pool, struct list_head *head,
623 			     process_mapping_fn *fn)
624 {
625 	unsigned long flags;
626 	struct list_head maps;
627 	struct dm_thin_new_mapping *m, *tmp;
628 
629 	INIT_LIST_HEAD(&maps);
630 	spin_lock_irqsave(&pool->lock, flags);
631 	list_splice_init(head, &maps);
632 	spin_unlock_irqrestore(&pool->lock, flags);
633 
634 	list_for_each_entry_safe(m, tmp, &maps, list)
635 		(*fn)(m);
636 }
637 
638 /*
639  * Deferred bio jobs.
640  */
641 static int io_overlaps_block(struct pool *pool, struct bio *bio)
642 {
643 	return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
644 }
645 
646 static int io_overwrites_block(struct pool *pool, struct bio *bio)
647 {
648 	return (bio_data_dir(bio) == WRITE) &&
649 		io_overlaps_block(pool, bio);
650 }
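/*
 * e.g. with 128-sector blocks, a WRITE bio whose bi_size is exactly
 * 128 << SECTOR_SHIFT = 65536 bytes overwrites the whole block, so
 * schedule_copy()/schedule_zero() can skip the copy/zero step and hook the
 * bio itself instead (see overwrite_endio()).
 */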
651 
652 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
653 			       bio_end_io_t *fn)
654 {
655 	*save = bio->bi_end_io;
656 	bio->bi_end_io = fn;
657 }
658 
659 static int ensure_next_mapping(struct pool *pool)
660 {
661 	if (pool->next_mapping)
662 		return 0;
663 
664 	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
665 
666 	return pool->next_mapping ? 0 : -ENOMEM;
667 }
668 
669 static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
670 {
671 	struct dm_thin_new_mapping *r = pool->next_mapping;
672 
673 	BUG_ON(!pool->next_mapping);
674 
675 	pool->next_mapping = NULL;
676 
677 	return r;
678 }
679 
680 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
681 			  struct dm_dev *origin, dm_block_t data_origin,
682 			  dm_block_t data_dest,
683 			  struct dm_bio_prison_cell *cell, struct bio *bio)
684 {
685 	int r;
686 	struct pool *pool = tc->pool;
687 	struct dm_thin_new_mapping *m = get_next_mapping(pool);
688 
689 	INIT_LIST_HEAD(&m->list);
690 	m->quiesced = 0;
691 	m->prepared = 0;
692 	m->tc = tc;
693 	m->virt_block = virt_block;
694 	m->data_block = data_dest;
695 	m->cell = cell;
696 	m->err = 0;
697 	m->bio = NULL;
698 
699 	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
700 		m->quiesced = 1;
701 
702 	/*
703 	 * IO to pool_dev remaps to the pool target's data_dev.
704 	 *
705 	 * If the whole block of data is being overwritten, we can issue the
706 	 * bio immediately. Otherwise we use kcopyd to clone the data first.
707 	 */
708 	if (io_overwrites_block(pool, bio)) {
709 		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
710 
711 		h->overwrite_mapping = m;
712 		m->bio = bio;
713 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
714 		remap_and_issue(tc, bio, data_dest);
715 	} else {
716 		struct dm_io_region from, to;
717 
718 		from.bdev = origin->bdev;
719 		from.sector = data_origin * pool->sectors_per_block;
720 		from.count = pool->sectors_per_block;
721 
722 		to.bdev = tc->pool_dev->bdev;
723 		to.sector = data_dest * pool->sectors_per_block;
724 		to.count = pool->sectors_per_block;
725 
726 		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
727 				   0, copy_complete, m);
728 		if (r < 0) {
729 			mempool_free(m, pool->mapping_pool);
730 			DMERR("dm_kcopyd_copy() failed");
731 			dm_cell_error(cell);
732 		}
733 	}
734 }
735 
736 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
737 				   dm_block_t data_origin, dm_block_t data_dest,
738 				   struct dm_bio_prison_cell *cell, struct bio *bio)
739 {
740 	schedule_copy(tc, virt_block, tc->pool_dev,
741 		      data_origin, data_dest, cell, bio);
742 }
743 
744 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
745 				   dm_block_t data_dest,
746 				   struct dm_bio_prison_cell *cell, struct bio *bio)
747 {
748 	schedule_copy(tc, virt_block, tc->origin_dev,
749 		      virt_block, data_dest, cell, bio);
750 }
751 
752 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
753 			  dm_block_t data_block, struct dm_bio_prison_cell *cell,
754 			  struct bio *bio)
755 {
756 	struct pool *pool = tc->pool;
757 	struct dm_thin_new_mapping *m = get_next_mapping(pool);
758 
759 	INIT_LIST_HEAD(&m->list);
760 	m->quiesced = 1;
761 	m->prepared = 0;
762 	m->tc = tc;
763 	m->virt_block = virt_block;
764 	m->data_block = data_block;
765 	m->cell = cell;
766 	m->err = 0;
767 	m->bio = NULL;
768 
769 	/*
770 	 * If the whole block of data is being overwritten or we are not
771 	 * zeroing pre-existing data, we can issue the bio immediately.
772 	 * Otherwise we use kcopyd to zero the data first.
773 	 */
774 	if (!pool->pf.zero_new_blocks)
775 		process_prepared_mapping(m);
776 
777 	else if (io_overwrites_block(pool, bio)) {
778 		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
779 
780 		h->overwrite_mapping = m;
781 		m->bio = bio;
782 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
783 		remap_and_issue(tc, bio, data_block);
784 	} else {
785 		int r;
786 		struct dm_io_region to;
787 
788 		to.bdev = tc->pool_dev->bdev;
789 		to.sector = data_block * pool->sectors_per_block;
790 		to.count = pool->sectors_per_block;
791 
792 		r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
793 		if (r < 0) {
794 			mempool_free(m, pool->mapping_pool);
795 			DMERR("dm_kcopyd_zero() failed");
796 			dm_cell_error(cell);
797 		}
798 	}
799 }
800 
801 static int commit(struct pool *pool)
802 {
803 	int r;
804 
805 	r = dm_pool_commit_metadata(pool->pmd);
806 	if (r)
807 		DMERR("commit failed, error = %d", r);
808 
809 	return r;
810 }
811 
812 /*
813  * A non-zero return indicates read_only or fail_io mode.
814  * Many callers don't care about the return value.
815  */
816 static int commit_or_fallback(struct pool *pool)
817 {
818 	int r;
819 
820 	if (get_pool_mode(pool) != PM_WRITE)
821 		return -EINVAL;
822 
823 	r = commit(pool);
824 	if (r)
825 		set_pool_mode(pool, PM_READ_ONLY);
826 
827 	return r;
828 }
829 
830 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
831 {
832 	int r;
833 	dm_block_t free_blocks;
834 	unsigned long flags;
835 	struct pool *pool = tc->pool;
836 
837 	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
838 	if (r)
839 		return r;
840 
841 	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
842 		DMWARN("%s: reached low water mark, sending event.",
843 		       dm_device_name(pool->pool_md));
844 		spin_lock_irqsave(&pool->lock, flags);
845 		pool->low_water_triggered = 1;
846 		spin_unlock_irqrestore(&pool->lock, flags);
847 		dm_table_event(pool->ti->table);
848 	}
849 
850 	if (!free_blocks) {
851 		if (pool->no_free_space)
852 			return -ENOSPC;
853 		else {
854 			/*
855 			 * Try to commit to see if that will free up some
856 			 * more space.
857 			 */
858 			(void) commit_or_fallback(pool);
859 
860 			r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
861 			if (r)
862 				return r;
863 
864 			/*
865 			 * If we still have no space we set a flag to avoid
866 			 * doing all this checking and return -ENOSPC.
867 			 */
868 			if (!free_blocks) {
869 				DMWARN("%s: no free space available.",
870 				       dm_device_name(pool->pool_md));
871 				spin_lock_irqsave(&pool->lock, flags);
872 				pool->no_free_space = 1;
873 				spin_unlock_irqrestore(&pool->lock, flags);
874 				return -ENOSPC;
875 			}
876 		}
877 	}
878 
879 	r = dm_pool_alloc_data_block(pool->pmd, result);
880 	if (r)
881 		return r;
882 
883 	return 0;
884 }
885 
886 /*
887  * If we have run out of space, queue bios until the device is
888  * resumed, presumably after having been reloaded with more space.
889  */
890 static void retry_on_resume(struct bio *bio)
891 {
892 	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
893 	struct thin_c *tc = h->tc;
894 	struct pool *pool = tc->pool;
895 	unsigned long flags;
896 
897 	spin_lock_irqsave(&pool->lock, flags);
898 	bio_list_add(&pool->retry_on_resume_list, bio);
899 	spin_unlock_irqrestore(&pool->lock, flags);
900 }
901 
902 static void no_space(struct dm_bio_prison_cell *cell)
903 {
904 	struct bio *bio;
905 	struct bio_list bios;
906 
907 	bio_list_init(&bios);
908 	dm_cell_release(cell, &bios);
909 
910 	while ((bio = bio_list_pop(&bios)))
911 		retry_on_resume(bio);
912 }
913 
914 static void process_discard(struct thin_c *tc, struct bio *bio)
915 {
916 	int r;
917 	unsigned long flags;
918 	struct pool *pool = tc->pool;
919 	struct dm_bio_prison_cell *cell, *cell2;
920 	struct dm_cell_key key, key2;
921 	dm_block_t block = get_bio_block(tc, bio);
922 	struct dm_thin_lookup_result lookup_result;
923 	struct dm_thin_new_mapping *m;
924 
925 	build_virtual_key(tc->td, block, &key);
926 	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
927 		return;
928 
929 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
930 	switch (r) {
931 	case 0:
932 		/*
933 		 * Check nobody is fiddling with this pool block.  This can
934 		 * happen if someone's in the process of breaking sharing
935 		 * on this block.
936 		 */
937 		build_data_key(tc->td, lookup_result.block, &key2);
938 		if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
939 			dm_cell_release_singleton(cell, bio);
940 			break;
941 		}
942 
943 		if (io_overlaps_block(pool, bio)) {
944 			/*
945 			 * IO may still be going to the destination block.  We must
946 			 * quiesce before we can do the removal.
947 			 */
948 			m = get_next_mapping(pool);
949 			m->tc = tc;
950 			m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
951 			m->virt_block = block;
952 			m->data_block = lookup_result.block;
953 			m->cell = cell;
954 			m->cell2 = cell2;
955 			m->err = 0;
956 			m->bio = bio;
957 
958 			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
959 				spin_lock_irqsave(&pool->lock, flags);
960 				list_add(&m->list, &pool->prepared_discards);
961 				spin_unlock_irqrestore(&pool->lock, flags);
962 				wake_worker(pool);
963 			}
964 		} else {
965 			/*
966 			 * The DM core makes sure that the discard doesn't span
967 			 * a block boundary.  So we submit the discard of a
968 			 * partial block appropriately.
969 			 */
970 			dm_cell_release_singleton(cell, bio);
971 			dm_cell_release_singleton(cell2, bio);
972 			if ((!lookup_result.shared) && pool->pf.discard_passdown)
973 				remap_and_issue(tc, bio, lookup_result.block);
974 			else
975 				bio_endio(bio, 0);
976 		}
977 		break;
978 
979 	case -ENODATA:
980 		/*
981 		 * It isn't provisioned, just forget it.
982 		 */
983 		dm_cell_release_singleton(cell, bio);
984 		bio_endio(bio, 0);
985 		break;
986 
987 	default:
988 		DMERR("discard: find block unexpectedly returned %d", r);
989 		dm_cell_release_singleton(cell, bio);
990 		bio_io_error(bio);
991 		break;
992 	}
993 }
994 
995 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
996 			  struct dm_cell_key *key,
997 			  struct dm_thin_lookup_result *lookup_result,
998 			  struct dm_bio_prison_cell *cell)
999 {
1000 	int r;
1001 	dm_block_t data_block;
1002 
1003 	r = alloc_data_block(tc, &data_block);
1004 	switch (r) {
1005 	case 0:
1006 		schedule_internal_copy(tc, block, lookup_result->block,
1007 				       data_block, cell, bio);
1008 		break;
1009 
1010 	case -ENOSPC:
1011 		no_space(cell);
1012 		break;
1013 
1014 	default:
1015 		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1016 		dm_cell_error(cell);
1017 		break;
1018 	}
1019 }
1020 
1021 static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1022 			       dm_block_t block,
1023 			       struct dm_thin_lookup_result *lookup_result)
1024 {
1025 	struct dm_bio_prison_cell *cell;
1026 	struct pool *pool = tc->pool;
1027 	struct dm_cell_key key;
1028 
1029 	/*
1030 	 * If cell is already occupied, then sharing is already in the process
1031 	 * of being broken so we have nothing further to do here.
1032 	 */
1033 	build_data_key(tc->td, lookup_result->block, &key);
1034 	if (dm_bio_detain(pool->prison, &key, bio, &cell))
1035 		return;
1036 
1037 	if (bio_data_dir(bio) == WRITE && bio->bi_size)
1038 		break_sharing(tc, bio, block, &key, lookup_result, cell);
1039 	else {
1040 		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
1041 
1042 		h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1043 
1044 		dm_cell_release_singleton(cell, bio);
1045 		remap_and_issue(tc, bio, lookup_result->block);
1046 	}
1047 }
1048 
1049 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1050 			    struct dm_bio_prison_cell *cell)
1051 {
1052 	int r;
1053 	dm_block_t data_block;
1054 
1055 	/*
1056 	 * Remap empty bios (flushes) immediately, without provisioning.
1057 	 */
1058 	if (!bio->bi_size) {
1059 		dm_cell_release_singleton(cell, bio);
1060 		remap_and_issue(tc, bio, 0);
1061 		return;
1062 	}
1063 
1064 	/*
1065 	 * Fill read bios with zeroes and complete them immediately.
1066 	 */
1067 	if (bio_data_dir(bio) == READ) {
1068 		zero_fill_bio(bio);
1069 		dm_cell_release_singleton(cell, bio);
1070 		bio_endio(bio, 0);
1071 		return;
1072 	}
1073 
1074 	r = alloc_data_block(tc, &data_block);
1075 	switch (r) {
1076 	case 0:
1077 		if (tc->origin_dev)
1078 			schedule_external_copy(tc, block, data_block, cell, bio);
1079 		else
1080 			schedule_zero(tc, block, data_block, cell, bio);
1081 		break;
1082 
1083 	case -ENOSPC:
1084 		no_space(cell);
1085 		break;
1086 
1087 	default:
1088 		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1089 		set_pool_mode(tc->pool, PM_READ_ONLY);
1090 		dm_cell_error(cell);
1091 		break;
1092 	}
1093 }
1094 
1095 static void process_bio(struct thin_c *tc, struct bio *bio)
1096 {
1097 	int r;
1098 	dm_block_t block = get_bio_block(tc, bio);
1099 	struct dm_bio_prison_cell *cell;
1100 	struct dm_cell_key key;
1101 	struct dm_thin_lookup_result lookup_result;
1102 
1103 	/*
1104 	 * If cell is already occupied, then the block is already
1105 	 * being provisioned so we have nothing further to do here.
1106 	 */
1107 	build_virtual_key(tc->td, block, &key);
1108 	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
1109 		return;
1110 
1111 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1112 	switch (r) {
1113 	case 0:
1114 		/*
1115 		 * We can release this cell now.  This thread is the only
1116 		 * one that puts bios into a cell, and we know there were
1117 		 * no preceding bios.
1118 		 */
1119 		/*
1120 		 * TODO: this will probably have to change when discard goes
1121 		 * back in.
1122 		 */
1123 		dm_cell_release_singleton(cell, bio);
1124 
1125 		if (lookup_result.shared)
1126 			process_shared_bio(tc, bio, block, &lookup_result);
1127 		else
1128 			remap_and_issue(tc, bio, lookup_result.block);
1129 		break;
1130 
1131 	case -ENODATA:
1132 		if (bio_data_dir(bio) == READ && tc->origin_dev) {
1133 			dm_cell_release_singleton(cell, bio);
1134 			remap_to_origin_and_issue(tc, bio);
1135 		} else
1136 			provision_block(tc, bio, block, cell);
1137 		break;
1138 
1139 	default:
1140 		DMERR("dm_thin_find_block() failed, error = %d", r);
1141 		dm_cell_release_singleton(cell, bio);
1142 		bio_io_error(bio);
1143 		break;
1144 	}
1145 }
1146 
1147 static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1148 {
1149 	int r;
1150 	int rw = bio_data_dir(bio);
1151 	dm_block_t block = get_bio_block(tc, bio);
1152 	struct dm_thin_lookup_result lookup_result;
1153 
1154 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1155 	switch (r) {
1156 	case 0:
1157 		if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
1158 			bio_io_error(bio);
1159 		else
1160 			remap_and_issue(tc, bio, lookup_result.block);
1161 		break;
1162 
1163 	case -ENODATA:
1164 		if (rw != READ) {
1165 			bio_io_error(bio);
1166 			break;
1167 		}
1168 
1169 		if (tc->origin_dev) {
1170 			remap_to_origin_and_issue(tc, bio);
1171 			break;
1172 		}
1173 
1174 		zero_fill_bio(bio);
1175 		bio_endio(bio, 0);
1176 		break;
1177 
1178 	default:
1179 		DMERR("dm_thin_find_block() failed, error = %d", r);
1180 		bio_io_error(bio);
1181 		break;
1182 	}
1183 }
1184 
1185 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1186 {
1187 	bio_io_error(bio);
1188 }
1189 
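/*
 * Returns non-zero if at least COMMIT_PERIOD jiffies have passed since the
 * last commit (the first test below copes with the jiffies counter wrapping).
 */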
1190 static int need_commit_due_to_time(struct pool *pool)
1191 {
1192 	return jiffies < pool->last_commit_jiffies ||
1193 	       jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1194 }
1195 
1196 static void process_deferred_bios(struct pool *pool)
1197 {
1198 	unsigned long flags;
1199 	struct bio *bio;
1200 	struct bio_list bios;
1201 
1202 	bio_list_init(&bios);
1203 
1204 	spin_lock_irqsave(&pool->lock, flags);
1205 	bio_list_merge(&bios, &pool->deferred_bios);
1206 	bio_list_init(&pool->deferred_bios);
1207 	spin_unlock_irqrestore(&pool->lock, flags);
1208 
1209 	while ((bio = bio_list_pop(&bios))) {
1210 		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
1211 		struct thin_c *tc = h->tc;
1212 
1213 		/*
1214 		 * If we've got no free new_mapping structs, and processing
1215 		 * this bio might require one, we pause until there are some
1216 		 * prepared mappings to process.
1217 		 */
1218 		if (ensure_next_mapping(pool)) {
1219 			spin_lock_irqsave(&pool->lock, flags);
1220 			bio_list_merge(&pool->deferred_bios, &bios);
1221 			spin_unlock_irqrestore(&pool->lock, flags);
1222 
1223 			break;
1224 		}
1225 
1226 		if (bio->bi_rw & REQ_DISCARD)
1227 			pool->process_discard(tc, bio);
1228 		else
1229 			pool->process_bio(tc, bio);
1230 	}
1231 
1232 	/*
1233 	 * If there are any deferred flush bios, we must commit
1234 	 * the metadata before issuing them.
1235 	 */
1236 	bio_list_init(&bios);
1237 	spin_lock_irqsave(&pool->lock, flags);
1238 	bio_list_merge(&bios, &pool->deferred_flush_bios);
1239 	bio_list_init(&pool->deferred_flush_bios);
1240 	spin_unlock_irqrestore(&pool->lock, flags);
1241 
1242 	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
1243 		return;
1244 
1245 	if (commit_or_fallback(pool)) {
1246 		while ((bio = bio_list_pop(&bios)))
1247 			bio_io_error(bio);
1248 		return;
1249 	}
1250 	pool->last_commit_jiffies = jiffies;
1251 
1252 	while ((bio = bio_list_pop(&bios)))
1253 		generic_make_request(bio);
1254 }
1255 
1256 static void do_worker(struct work_struct *ws)
1257 {
1258 	struct pool *pool = container_of(ws, struct pool, worker);
1259 
1260 	process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1261 	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1262 	process_deferred_bios(pool);
1263 }
1264 
1265 /*
1266  * We want to commit periodically so that not too much
1267  * unwritten data builds up.
1268  */
1269 static void do_waker(struct work_struct *ws)
1270 {
1271 	struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1272 	wake_worker(pool);
1273 	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1274 }
1275 
1276 /*----------------------------------------------------------------*/
1277 
1278 static enum pool_mode get_pool_mode(struct pool *pool)
1279 {
1280 	return pool->pf.mode;
1281 }
1282 
1283 static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1284 {
1285 	int r;
1286 
1287 	pool->pf.mode = mode;
1288 
1289 	switch (mode) {
1290 	case PM_FAIL:
1291 		DMERR("switching pool to failure mode");
1292 		pool->process_bio = process_bio_fail;
1293 		pool->process_discard = process_bio_fail;
1294 		pool->process_prepared_mapping = process_prepared_mapping_fail;
1295 		pool->process_prepared_discard = process_prepared_discard_fail;
1296 		break;
1297 
1298 	case PM_READ_ONLY:
1299 		DMERR("switching pool to read-only mode");
1300 		r = dm_pool_abort_metadata(pool->pmd);
1301 		if (r) {
1302 			DMERR("aborting transaction failed");
1303 			set_pool_mode(pool, PM_FAIL);
1304 		} else {
1305 			dm_pool_metadata_read_only(pool->pmd);
1306 			pool->process_bio = process_bio_read_only;
1307 			pool->process_discard = process_discard;
1308 			pool->process_prepared_mapping = process_prepared_mapping_fail;
1309 			pool->process_prepared_discard = process_prepared_discard_passdown;
1310 		}
1311 		break;
1312 
1313 	case PM_WRITE:
1314 		pool->process_bio = process_bio;
1315 		pool->process_discard = process_discard;
1316 		pool->process_prepared_mapping = process_prepared_mapping;
1317 		pool->process_prepared_discard = process_prepared_discard;
1318 		break;
1319 	}
1320 }
1321 
1322 /*----------------------------------------------------------------*/
1323 
1324 /*
1325  * Mapping functions.
1326  */
1327 
1328 /*
1329  * Called only while mapping a thin bio to hand it over to the workqueue.
1330  */
1331 static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1332 {
1333 	unsigned long flags;
1334 	struct pool *pool = tc->pool;
1335 
1336 	spin_lock_irqsave(&pool->lock, flags);
1337 	bio_list_add(&pool->deferred_bios, bio);
1338 	spin_unlock_irqrestore(&pool->lock, flags);
1339 
1340 	wake_worker(pool);
1341 }
1342 
1343 static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
1344 {
1345 	struct pool *pool = tc->pool;
1346 	struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1347 
1348 	h->tc = tc;
1349 	h->shared_read_entry = NULL;
1350 	h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : dm_deferred_entry_inc(pool->all_io_ds);
1351 	h->overwrite_mapping = NULL;
1352 
1353 	return h;
1354 }
1355 
1356 /*
1357  * Non-blocking function called from the thin target's map function.
1358  */
1359 static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1360 			union map_info *map_context)
1361 {
1362 	int r;
1363 	struct thin_c *tc = ti->private;
1364 	dm_block_t block = get_bio_block(tc, bio);
1365 	struct dm_thin_device *td = tc->td;
1366 	struct dm_thin_lookup_result result;
1367 
1368 	map_context->ptr = thin_hook_bio(tc, bio);
1369 
1370 	if (get_pool_mode(tc->pool) == PM_FAIL) {
1371 		bio_io_error(bio);
1372 		return DM_MAPIO_SUBMITTED;
1373 	}
1374 
1375 	if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1376 		thin_defer_bio(tc, bio);
1377 		return DM_MAPIO_SUBMITTED;
1378 	}
1379 
1380 	r = dm_thin_find_block(td, block, 0, &result);
1381 
1382 	/*
1383 	 * Note that we defer readahead too.
1384 	 */
1385 	switch (r) {
1386 	case 0:
1387 		if (unlikely(result.shared)) {
1388 			/*
1389 			 * We have a race condition here between the
1390 			 * result.shared value returned by the lookup and
1391 			 * snapshot creation, which may cause new
1392 			 * sharing.
1393 			 *
1394 			 * To avoid this always quiesce the origin before
1395 			 * taking the snap.  You want to do this anyway to
1396 			 * ensure a consistent application view
1397 			 * (i.e. lockfs).
1398 			 *
1399 			 * More distant ancestors are irrelevant. The
1400 			 * shared flag will be set in their case.
1401 			 */
1402 			thin_defer_bio(tc, bio);
1403 			r = DM_MAPIO_SUBMITTED;
1404 		} else {
1405 			remap(tc, bio, result.block);
1406 			r = DM_MAPIO_REMAPPED;
1407 		}
1408 		break;
1409 
1410 	case -ENODATA:
1411 		if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1412 			/*
1413 			 * This block isn't provisioned, and we have no way
1414 			 * of doing so.  Just error it.
1415 			 */
1416 			bio_io_error(bio);
1417 			r = DM_MAPIO_SUBMITTED;
1418 			break;
1419 		}
1420 		/* fall through */
1421 
1422 	case -EWOULDBLOCK:
1423 		/*
1424 		 * In future, the failed dm_thin_find_block above could
1425 		 * provide the hint to load the metadata into cache.
1426 		 */
1427 		thin_defer_bio(tc, bio);
1428 		r = DM_MAPIO_SUBMITTED;
1429 		break;
1430 
1431 	default:
1432 		/*
1433 		 * Must always call bio_io_error on failure.
1434 		 * dm_thin_find_block can fail with -EINVAL if the
1435 		 * pool is switched to fail-io mode.
1436 		 */
1437 		bio_io_error(bio);
1438 		r = DM_MAPIO_SUBMITTED;
1439 		break;
1440 	}
1441 
1442 	return r;
1443 }
1444 
1445 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1446 {
1447 	int r;
1448 	unsigned long flags;
1449 	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1450 
1451 	spin_lock_irqsave(&pt->pool->lock, flags);
1452 	r = !bio_list_empty(&pt->pool->retry_on_resume_list);
1453 	spin_unlock_irqrestore(&pt->pool->lock, flags);
1454 
1455 	if (!r) {
1456 		struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1457 		r = bdi_congested(&q->backing_dev_info, bdi_bits);
1458 	}
1459 
1460 	return r;
1461 }
1462 
1463 static void __requeue_bios(struct pool *pool)
1464 {
1465 	bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
1466 	bio_list_init(&pool->retry_on_resume_list);
1467 }
1468 
1469 /*----------------------------------------------------------------
1470  * Binding of control targets to a pool object
1471  *--------------------------------------------------------------*/
1472 static bool data_dev_supports_discard(struct pool_c *pt)
1473 {
1474 	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1475 
1476 	return q && blk_queue_discard(q);
1477 }
1478 
1479 /*
1480  * If discard_passdown was enabled verify that the data device
1481  * supports discards.  Disable discard_passdown if not.
1482  */
1483 static void disable_passdown_if_not_supported(struct pool_c *pt)
1484 {
1485 	struct pool *pool = pt->pool;
1486 	struct block_device *data_bdev = pt->data_dev->bdev;
1487 	struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
1488 	sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
1489 	const char *reason = NULL;
1490 	char buf[BDEVNAME_SIZE];
1491 
1492 	if (!pt->adjusted_pf.discard_passdown)
1493 		return;
1494 
1495 	if (!data_dev_supports_discard(pt))
1496 		reason = "discard unsupported";
1497 
1498 	else if (data_limits->max_discard_sectors < pool->sectors_per_block)
1499 		reason = "max discard sectors smaller than a block";
1500 
1501 	else if (data_limits->discard_granularity > block_size)
1502 		reason = "discard granularity larger than a block";
1503 
1504 	else if (block_size & (data_limits->discard_granularity - 1))
1505 		reason = "discard granularity not a factor of block size";
1506 
1507 	if (reason) {
1508 		DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
1509 		pt->adjusted_pf.discard_passdown = false;
1510 	}
1511 }
1512 
1513 static int bind_control_target(struct pool *pool, struct dm_target *ti)
1514 {
1515 	struct pool_c *pt = ti->private;
1516 
1517 	/*
1518 	 * We want to make sure that degraded pools are never upgraded.
1519 	 */
1520 	enum pool_mode old_mode = pool->pf.mode;
1521 	enum pool_mode new_mode = pt->adjusted_pf.mode;
1522 
1523 	if (old_mode > new_mode)
1524 		new_mode = old_mode;
1525 
1526 	pool->ti = ti;
1527 	pool->low_water_blocks = pt->low_water_blocks;
1528 	pool->pf = pt->adjusted_pf;
1529 
1530 	set_pool_mode(pool, new_mode);
1531 
1532 	return 0;
1533 }
1534 
1535 static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1536 {
1537 	if (pool->ti == ti)
1538 		pool->ti = NULL;
1539 }
1540 
1541 /*----------------------------------------------------------------
1542  * Pool creation
1543  *--------------------------------------------------------------*/
1544 /* Initialize pool features. */
1545 static void pool_features_init(struct pool_features *pf)
1546 {
1547 	pf->mode = PM_WRITE;
1548 	pf->zero_new_blocks = true;
1549 	pf->discard_enabled = true;
1550 	pf->discard_passdown = true;
1551 }
1552 
1553 static void __pool_destroy(struct pool *pool)
1554 {
1555 	__pool_table_remove(pool);
1556 
1557 	if (dm_pool_metadata_close(pool->pmd) < 0)
1558 		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1559 
1560 	dm_bio_prison_destroy(pool->prison);
1561 	dm_kcopyd_client_destroy(pool->copier);
1562 
1563 	if (pool->wq)
1564 		destroy_workqueue(pool->wq);
1565 
1566 	if (pool->next_mapping)
1567 		mempool_free(pool->next_mapping, pool->mapping_pool);
1568 	mempool_destroy(pool->mapping_pool);
1569 	mempool_destroy(pool->endio_hook_pool);
1570 	dm_deferred_set_destroy(pool->shared_read_ds);
1571 	dm_deferred_set_destroy(pool->all_io_ds);
1572 	kfree(pool);
1573 }
1574 
1575 static struct kmem_cache *_new_mapping_cache;
1576 static struct kmem_cache *_endio_hook_cache;
1577 
1578 static struct pool *pool_create(struct mapped_device *pool_md,
1579 				struct block_device *metadata_dev,
1580 				unsigned long block_size,
1581 				int read_only, char **error)
1582 {
1583 	int r;
1584 	void *err_p;
1585 	struct pool *pool;
1586 	struct dm_pool_metadata *pmd;
1587 	bool format_device = !read_only;
1588 
1589 	pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
1590 	if (IS_ERR(pmd)) {
1591 		*error = "Error creating metadata object";
1592 		return (struct pool *)pmd;
1593 	}
1594 
1595 	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1596 	if (!pool) {
1597 		*error = "Error allocating memory for pool";
1598 		err_p = ERR_PTR(-ENOMEM);
1599 		goto bad_pool;
1600 	}
1601 
1602 	pool->pmd = pmd;
1603 	pool->sectors_per_block = block_size;
1604 	if (block_size & (block_size - 1))
1605 		pool->sectors_per_block_shift = -1;
1606 	else
1607 		pool->sectors_per_block_shift = __ffs(block_size);
1608 	pool->low_water_blocks = 0;
1609 	pool_features_init(&pool->pf);
1610 	pool->prison = dm_bio_prison_create(PRISON_CELLS);
1611 	if (!pool->prison) {
1612 		*error = "Error creating pool's bio prison";
1613 		err_p = ERR_PTR(-ENOMEM);
1614 		goto bad_prison;
1615 	}
1616 
1617 	pool->copier = dm_kcopyd_client_create();
1618 	if (IS_ERR(pool->copier)) {
1619 		r = PTR_ERR(pool->copier);
1620 		*error = "Error creating pool's kcopyd client";
1621 		err_p = ERR_PTR(r);
1622 		goto bad_kcopyd_client;
1623 	}
1624 
1625 	/*
1626 	 * Create singlethreaded workqueue that will service all devices
1627 	 * that use this metadata.
1628 	 */
1629 	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1630 	if (!pool->wq) {
1631 		*error = "Error creating pool's workqueue";
1632 		err_p = ERR_PTR(-ENOMEM);
1633 		goto bad_wq;
1634 	}
1635 
1636 	INIT_WORK(&pool->worker, do_worker);
1637 	INIT_DELAYED_WORK(&pool->waker, do_waker);
1638 	spin_lock_init(&pool->lock);
1639 	bio_list_init(&pool->deferred_bios);
1640 	bio_list_init(&pool->deferred_flush_bios);
1641 	INIT_LIST_HEAD(&pool->prepared_mappings);
1642 	INIT_LIST_HEAD(&pool->prepared_discards);
1643 	pool->low_water_triggered = 0;
1644 	pool->no_free_space = 0;
1645 	bio_list_init(&pool->retry_on_resume_list);
1646 
1647 	pool->shared_read_ds = dm_deferred_set_create();
1648 	if (!pool->shared_read_ds) {
1649 		*error = "Error creating pool's shared read deferred set";
1650 		err_p = ERR_PTR(-ENOMEM);
1651 		goto bad_shared_read_ds;
1652 	}
1653 
1654 	pool->all_io_ds = dm_deferred_set_create();
1655 	if (!pool->all_io_ds) {
1656 		*error = "Error creating pool's all io deferred set";
1657 		err_p = ERR_PTR(-ENOMEM);
1658 		goto bad_all_io_ds;
1659 	}
1660 
1661 	pool->next_mapping = NULL;
1662 	pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
1663 						      _new_mapping_cache);
1664 	if (!pool->mapping_pool) {
1665 		*error = "Error creating pool's mapping mempool";
1666 		err_p = ERR_PTR(-ENOMEM);
1667 		goto bad_mapping_pool;
1668 	}
1669 
1670 	pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE,
1671 							 _endio_hook_cache);
1672 	if (!pool->endio_hook_pool) {
1673 		*error = "Error creating pool's endio_hook mempool";
1674 		err_p = ERR_PTR(-ENOMEM);
1675 		goto bad_endio_hook_pool;
1676 	}
1677 	pool->ref_count = 1;
1678 	pool->last_commit_jiffies = jiffies;
1679 	pool->pool_md = pool_md;
1680 	pool->md_dev = metadata_dev;
1681 	__pool_table_insert(pool);
1682 
1683 	return pool;
1684 
1685 bad_endio_hook_pool:
1686 	mempool_destroy(pool->mapping_pool);
1687 bad_mapping_pool:
1688 	dm_deferred_set_destroy(pool->all_io_ds);
1689 bad_all_io_ds:
1690 	dm_deferred_set_destroy(pool->shared_read_ds);
1691 bad_shared_read_ds:
1692 	destroy_workqueue(pool->wq);
1693 bad_wq:
1694 	dm_kcopyd_client_destroy(pool->copier);
1695 bad_kcopyd_client:
1696 	dm_bio_prison_destroy(pool->prison);
1697 bad_prison:
1698 	kfree(pool);
1699 bad_pool:
1700 	if (dm_pool_metadata_close(pmd))
1701 		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1702 
1703 	return err_p;
1704 }
1705 
1706 static void __pool_inc(struct pool *pool)
1707 {
1708 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1709 	pool->ref_count++;
1710 }
1711 
1712 static void __pool_dec(struct pool *pool)
1713 {
1714 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1715 	BUG_ON(!pool->ref_count);
1716 	if (!--pool->ref_count)
1717 		__pool_destroy(pool);
1718 }
1719 
1720 static struct pool *__pool_find(struct mapped_device *pool_md,
1721 				struct block_device *metadata_dev,
1722 				unsigned long block_size, int read_only,
1723 				char **error, int *created)
1724 {
1725 	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1726 
1727 	if (pool) {
1728 		if (pool->pool_md != pool_md) {
1729 			*error = "metadata device already in use by a pool";
1730 			return ERR_PTR(-EBUSY);
1731 		}
1732 		__pool_inc(pool);
1733 
1734 	} else {
1735 		pool = __pool_table_lookup(pool_md);
1736 		if (pool) {
1737 			if (pool->md_dev != metadata_dev) {
1738 				*error = "different pool cannot replace a pool";
1739 				return ERR_PTR(-EINVAL);
1740 			}
1741 			__pool_inc(pool);
1742 
1743 		} else {
1744 			pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
1745 			*created = 1;
1746 		}
1747 	}
1748 
1749 	return pool;
1750 }
1751 
1752 /*----------------------------------------------------------------
1753  * Pool target methods
1754  *--------------------------------------------------------------*/
1755 static void pool_dtr(struct dm_target *ti)
1756 {
1757 	struct pool_c *pt = ti->private;
1758 
1759 	mutex_lock(&dm_thin_pool_table.mutex);
1760 
1761 	unbind_control_target(pt->pool, ti);
1762 	__pool_dec(pt->pool);
1763 	dm_put_device(ti, pt->metadata_dev);
1764 	dm_put_device(ti, pt->data_dev);
1765 	kfree(pt);
1766 
1767 	mutex_unlock(&dm_thin_pool_table.mutex);
1768 }
1769 
1770 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1771 			       struct dm_target *ti)
1772 {
1773 	int r;
1774 	unsigned argc;
1775 	const char *arg_name;
1776 
1777 	static struct dm_arg _args[] = {
1778 		{0, 3, "Invalid number of pool feature arguments"},
1779 	};
1780 
1781 	/*
1782 	 * No feature arguments supplied.
1783 	 */
1784 	if (!as->argc)
1785 		return 0;
1786 
1787 	r = dm_read_arg_group(_args, as, &argc, &ti->error);
1788 	if (r)
1789 		return -EINVAL;
1790 
1791 	while (argc && !r) {
1792 		arg_name = dm_shift_arg(as);
1793 		argc--;
1794 
1795 		if (!strcasecmp(arg_name, "skip_block_zeroing"))
1796 			pf->zero_new_blocks = false;
1797 
1798 		else if (!strcasecmp(arg_name, "ignore_discard"))
1799 			pf->discard_enabled = false;
1800 
1801 		else if (!strcasecmp(arg_name, "no_discard_passdown"))
1802 			pf->discard_passdown = false;
1803 
1804 		else if (!strcasecmp(arg_name, "read_only"))
1805 			pf->mode = PM_READ_ONLY;
1806 
1807 		else {
1808 			ti->error = "Unrecognised pool feature requested";
1809 			r = -EINVAL;
1810 			break;
1811 		}
1812 	}
1813 
1814 	return r;
1815 }
1816 
1817 /*
1818  * thin-pool <metadata dev> <data dev>
1819  *	     <data block size (sectors)>
1820  *	     <low water mark (blocks)>
1821  *	     [<#feature args> [<arg>]*]
1822  *
1823  * Optional feature arguments are:
1824  *	     skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1825  *	     ignore_discard: disable discard
1826  *	     no_discard_passdown: don't pass discards down to the data device
1827  */
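/*
 * Example of a pool table line (the device names and sizes are illustrative
 * only):
 *
 *	dmsetup create pool --table \
 *	    "0 20971520 thin-pool /dev/mapper/meta /dev/mapper/data 128 65536 1 skip_block_zeroing"
 *
 * i.e. a 10GiB pool built from 128-sector (64KiB) data blocks, with a low
 * water mark of 65536 blocks and block zeroing disabled.
 */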
1828 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1829 {
1830 	int r, pool_created = 0;
1831 	struct pool_c *pt;
1832 	struct pool *pool;
1833 	struct pool_features pf;
1834 	struct dm_arg_set as;
1835 	struct dm_dev *data_dev;
1836 	unsigned long block_size;
1837 	dm_block_t low_water_blocks;
1838 	struct dm_dev *metadata_dev;
1839 	sector_t metadata_dev_size;
1840 	char b[BDEVNAME_SIZE];
1841 
1842 	/*
1843 	 * FIXME Remove validation from scope of lock.
1844 	 */
1845 	mutex_lock(&dm_thin_pool_table.mutex);
1846 
1847 	if (argc < 4) {
1848 		ti->error = "Invalid argument count";
1849 		r = -EINVAL;
1850 		goto out_unlock;
1851 	}
1852 	as.argc = argc;
1853 	as.argv = argv;
1854 
1855 	r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
1856 	if (r) {
1857 		ti->error = "Error opening metadata block device";
1858 		goto out_unlock;
1859 	}
1860 
1861 	metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1862 	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
1863 		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1864 		       bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1865 
1866 	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1867 	if (r) {
1868 		ti->error = "Error getting data device";
1869 		goto out_metadata;
1870 	}
1871 
1872 	if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1873 	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1874 	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1875 	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1876 		ti->error = "Invalid block size";
1877 		r = -EINVAL;
1878 		goto out;
1879 	}
1880 
1881 	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
1882 		ti->error = "Invalid low water mark";
1883 		r = -EINVAL;
1884 		goto out;
1885 	}
1886 
1887 	/*
1888 	 * Set default pool features.
1889 	 */
1890 	pool_features_init(&pf);
1891 
1892 	dm_consume_args(&as, 4);
1893 	r = parse_pool_features(&as, &pf, ti);
1894 	if (r)
1895 		goto out;
1896 
1897 	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
1898 	if (!pt) {
1899 		r = -ENOMEM;
1900 		goto out;
1901 	}
1902 
1903 	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1904 			   block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
1905 	if (IS_ERR(pool)) {
1906 		r = PTR_ERR(pool);
1907 		goto out_free_pt;
1908 	}
1909 
1910 	/*
1911 	 * 'pool_created' reflects whether this is the first table load.
1912 	 * Top level discard support is not allowed to be changed after
1913 	 * initial load.  Changing it would require a pool reload to trigger
1914 	 * thin device changes.
1915 	 */
1916 	if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
1917 		ti->error = "Discard support cannot be disabled once enabled";
1918 		r = -EINVAL;
1919 		goto out_flags_changed;
1920 	}
1921 
1922 	pt->pool = pool;
1923 	pt->ti = ti;
1924 	pt->metadata_dev = metadata_dev;
1925 	pt->data_dev = data_dev;
1926 	pt->low_water_blocks = low_water_blocks;
1927 	pt->adjusted_pf = pt->requested_pf = pf;
1928 	ti->num_flush_requests = 1;
1929 
1930 	/*
1931 	 * Only need to enable discards if the pool should pass
1932 	 * them down to the data device.  The thin device's discard
1933 	 * processing will cause mappings to be removed from the btree.
1934 	 */
1935 	if (pf.discard_enabled && pf.discard_passdown) {
1936 		ti->num_discard_requests = 1;
1937 
1938 		/*
1939 		 * Setting 'discards_supported' circumvents the normal
1940 		 * stacking of discard limits (this keeps the pool and
1941 		 * thin devices' discard limits consistent).
1942 		 */
1943 		ti->discards_supported = true;
1944 		ti->discard_zeroes_data_unsupported = true;
1945 	}
1946 	ti->private = pt;
1947 
1948 	pt->callbacks.congested_fn = pool_is_congested;
1949 	dm_table_add_target_callbacks(ti->table, &pt->callbacks);
1950 
1951 	mutex_unlock(&dm_thin_pool_table.mutex);
1952 
1953 	return 0;
1954 
1955 out_flags_changed:
1956 	__pool_dec(pool);
1957 out_free_pt:
1958 	kfree(pt);
1959 out:
1960 	dm_put_device(ti, data_dev);
1961 out_metadata:
1962 	dm_put_device(ti, metadata_dev);
1963 out_unlock:
1964 	mutex_unlock(&dm_thin_pool_table.mutex);
1965 
1966 	return r;
1967 }
1968 
1969 static int pool_map(struct dm_target *ti, struct bio *bio,
1970 		    union map_info *map_context)
1971 {
1972 	int r;
1973 	struct pool_c *pt = ti->private;
1974 	struct pool *pool = pt->pool;
1975 	unsigned long flags;
1976 
1977 	/*
1978 	 * As this is a singleton target, ti->begin is always zero.
1979 	 */
1980 	spin_lock_irqsave(&pool->lock, flags);
1981 	bio->bi_bdev = pt->data_dev->bdev;
1982 	r = DM_MAPIO_REMAPPED;
1983 	spin_unlock_irqrestore(&pool->lock, flags);
1984 
1985 	return r;
1986 }
1987 
1988 /*
1989  * Retrieves the number of blocks of the data device from
1990  * the superblock and compares it to the actual device size,
1991  * thus resizing the data device in case it has grown.
1992  *
1993  * This copes both with the ctr opening a preallocated data device
1994  * and then being followed by a resume,
1995  * -and-
1996  * with the resume method being called on its own after userspace has
1997  * grown the data device in reaction to a table event.
1998  */
1999 static int pool_preresume(struct dm_target *ti)
2000 {
2001 	int r;
2002 	struct pool_c *pt = ti->private;
2003 	struct pool *pool = pt->pool;
2004 	sector_t data_size = ti->len;
2005 	dm_block_t sb_data_size;
2006 
2007 	/*
2008 	 * Take control of the pool object.
2009 	 */
2010 	r = bind_control_target(pool, ti);
2011 	if (r)
2012 		return r;
2013 
2014 	(void) sector_div(data_size, pool->sectors_per_block);
2015 
2016 	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2017 	if (r) {
2018 		DMERR("failed to retrieve data device size");
2019 		return r;
2020 	}
2021 
2022 	if (data_size < sb_data_size) {
2023 		DMERR("pool target too small, is %llu blocks (expected %llu)",
2024 		      (unsigned long long)data_size, sb_data_size);
2025 		return -EINVAL;
2026 
2027 	} else if (data_size > sb_data_size) {
2028 		r = dm_pool_resize_data_dev(pool->pmd, data_size);
2029 		if (r) {
2030 			DMERR("failed to resize data device");
2031 			/* FIXME Stricter than necessary: Rollback transaction instead here */
2032 			set_pool_mode(pool, PM_READ_ONLY);
2033 			return r;
2034 		}
2035 
2036 		(void) commit_or_fallback(pool);
2037 	}
2038 
2039 	return 0;
2040 }
2041 
2042 static void pool_resume(struct dm_target *ti)
2043 {
2044 	struct pool_c *pt = ti->private;
2045 	struct pool *pool = pt->pool;
2046 	unsigned long flags;
2047 
2048 	spin_lock_irqsave(&pool->lock, flags);
2049 	pool->low_water_triggered = 0;
2050 	pool->no_free_space = 0;
2051 	__requeue_bios(pool);
2052 	spin_unlock_irqrestore(&pool->lock, flags);
2053 
2054 	do_waker(&pool->waker.work);
2055 }
2056 
2057 static void pool_postsuspend(struct dm_target *ti)
2058 {
2059 	struct pool_c *pt = ti->private;
2060 	struct pool *pool = pt->pool;
2061 
2062 	cancel_delayed_work(&pool->waker);
2063 	flush_workqueue(pool->wq);
2064 	(void) commit_or_fallback(pool);
2065 }
2066 
2067 static int check_arg_count(unsigned argc, unsigned args_required)
2068 {
2069 	if (argc != args_required) {
2070 		DMWARN("Message received with %u arguments instead of %u.",
2071 		       argc, args_required);
2072 		return -EINVAL;
2073 	}
2074 
2075 	return 0;
2076 }
2077 
2078 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2079 {
2080 	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2081 	    *dev_id <= MAX_DEV_ID)
2082 		return 0;
2083 
2084 	if (warning)
2085 		DMWARN("Message received with invalid device id: %s", arg);
2086 
2087 	return -EINVAL;
2088 }
2089 
2090 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2091 {
2092 	dm_thin_id dev_id;
2093 	int r;
2094 
2095 	r = check_arg_count(argc, 2);
2096 	if (r)
2097 		return r;
2098 
2099 	r = read_dev_id(argv[1], &dev_id, 1);
2100 	if (r)
2101 		return r;
2102 
2103 	r = dm_pool_create_thin(pool->pmd, dev_id);
2104 	if (r) {
2105 		DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2106 		       argv[1]);
2107 		return r;
2108 	}
2109 
2110 	return 0;
2111 }
2112 
2113 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2114 {
2115 	dm_thin_id dev_id;
2116 	dm_thin_id origin_dev_id;
2117 	int r;
2118 
2119 	r = check_arg_count(argc, 3);
2120 	if (r)
2121 		return r;
2122 
2123 	r = read_dev_id(argv[1], &dev_id, 1);
2124 	if (r)
2125 		return r;
2126 
2127 	r = read_dev_id(argv[2], &origin_dev_id, 1);
2128 	if (r)
2129 		return r;
2130 
2131 	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2132 	if (r) {
2133 		DMWARN("Creation of new snapshot %s of device %s failed.",
2134 		       argv[1], argv[2]);
2135 		return r;
2136 	}
2137 
2138 	return 0;
2139 }
2140 
2141 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2142 {
2143 	dm_thin_id dev_id;
2144 	int r;
2145 
2146 	r = check_arg_count(argc, 2);
2147 	if (r)
2148 		return r;
2149 
2150 	r = read_dev_id(argv[1], &dev_id, 1);
2151 	if (r)
2152 		return r;
2153 
2154 	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2155 	if (r)
2156 		DMWARN("Deletion of thin device %s failed.", argv[1]);
2157 
2158 	return r;
2159 }
2160 
2161 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2162 {
2163 	dm_thin_id old_id, new_id;
2164 	int r;
2165 
2166 	r = check_arg_count(argc, 3);
2167 	if (r)
2168 		return r;
2169 
2170 	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2171 		DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2172 		return -EINVAL;
2173 	}
2174 
2175 	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2176 		DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2177 		return -EINVAL;
2178 	}
2179 
2180 	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2181 	if (r) {
2182 		DMWARN("Failed to change transaction id from %s to %s.",
2183 		       argv[1], argv[2]);
2184 		return r;
2185 	}
2186 
2187 	return 0;
2188 }
2189 
2190 static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2191 {
2192 	int r;
2193 
2194 	r = check_arg_count(argc, 1);
2195 	if (r)
2196 		return r;
2197 
2198 	(void) commit_or_fallback(pool);
2199 
2200 	r = dm_pool_reserve_metadata_snap(pool->pmd);
2201 	if (r)
2202 		DMWARN("reserve_metadata_snap message failed.");
2203 
2204 	return r;
2205 }
2206 
2207 static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2208 {
2209 	int r;
2210 
2211 	r = check_arg_count(argc, 1);
2212 	if (r)
2213 		return r;
2214 
2215 	r = dm_pool_release_metadata_snap(pool->pmd);
2216 	if (r)
2217 		DMWARN("release_metadata_snap message failed.");
2218 
2219 	return r;
2220 }
2221 
2222 /*
2223  * Messages supported:
2224  *   create_thin	<dev_id>
2225  *   create_snap	<dev_id> <origin_id>
2226  *   delete		<dev_id>
2228  *   set_transaction_id <current_trans_id> <new_trans_id>
2229  *   reserve_metadata_snap
2230  *   release_metadata_snap
2231  */
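/*
 * Illustrative only (the /dev/mapper/pool path and the device ids are
 * hypothetical): these messages are normally sent with dmsetup, e.g.
 *
 *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *   dmsetup message /dev/mapper/pool 0 "delete 1"
 */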
2232 static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2233 {
2234 	int r = -EINVAL;
2235 	struct pool_c *pt = ti->private;
2236 	struct pool *pool = pt->pool;
2237 
2238 	if (!strcasecmp(argv[0], "create_thin"))
2239 		r = process_create_thin_mesg(argc, argv, pool);
2240 
2241 	else if (!strcasecmp(argv[0], "create_snap"))
2242 		r = process_create_snap_mesg(argc, argv, pool);
2243 
2244 	else if (!strcasecmp(argv[0], "delete"))
2245 		r = process_delete_mesg(argc, argv, pool);
2246 
2247 	else if (!strcasecmp(argv[0], "set_transaction_id"))
2248 		r = process_set_transaction_id_mesg(argc, argv, pool);
2249 
2250 	else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
2251 		r = process_reserve_metadata_snap_mesg(argc, argv, pool);
2252 
2253 	else if (!strcasecmp(argv[0], "release_metadata_snap"))
2254 		r = process_release_metadata_snap_mesg(argc, argv, pool);
2255 
2256 	else
2257 		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2258 
2259 	if (!r)
2260 		(void) commit_or_fallback(pool);
2261 
2262 	return r;
2263 }
2264 
2265 static void emit_flags(struct pool_features *pf, char *result,
2266 		       unsigned sz, unsigned maxlen)
2267 {
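	/*
	 * Each feature that differs from the pool_features_init() defaults
	 * contributes one optional argument to the emitted table line.
	 */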
2268 	unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2269 		!pf->discard_passdown + (pf->mode == PM_READ_ONLY);
2270 	DMEMIT("%u ", count);
2271 
2272 	if (!pf->zero_new_blocks)
2273 		DMEMIT("skip_block_zeroing ");
2274 
2275 	if (!pf->discard_enabled)
2276 		DMEMIT("ignore_discard ");
2277 
2278 	if (!pf->discard_passdown)
2279 		DMEMIT("no_discard_passdown ");
2280 
2281 	if (pf->mode == PM_READ_ONLY)
2282 		DMEMIT("read_only ");
2283 }
2284 
2285 /*
2286  * Status line is:
2287  *    <transaction id> <used metadata blocks>/<total metadata blocks>
2288  *    <used data blocks>/<total data blocks> <held metadata root>
 *    ro|rw discard_passdown|no_discard_passdown
2289  */
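/*
 * For example (made-up values), a lightly used pool with no metadata
 * snapshot held might report:
 *
 *   "0 97/4096 128/1048576 - rw discard_passdown"
 */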
2290 static int pool_status(struct dm_target *ti, status_type_t type,
2291 		       unsigned status_flags, char *result, unsigned maxlen)
2292 {
2293 	int r;
2294 	unsigned sz = 0;
2295 	uint64_t transaction_id;
2296 	dm_block_t nr_free_blocks_data;
2297 	dm_block_t nr_free_blocks_metadata;
2298 	dm_block_t nr_blocks_data;
2299 	dm_block_t nr_blocks_metadata;
2300 	dm_block_t held_root;
2301 	char buf[BDEVNAME_SIZE];
2302 	char buf2[BDEVNAME_SIZE];
2303 	struct pool_c *pt = ti->private;
2304 	struct pool *pool = pt->pool;
2305 
2306 	switch (type) {
2307 	case STATUSTYPE_INFO:
2308 		if (get_pool_mode(pool) == PM_FAIL) {
2309 			DMEMIT("Fail");
2310 			break;
2311 		}
2312 
2313 		/* Commit to ensure statistics aren't out-of-date */
2314 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
2315 			(void) commit_or_fallback(pool);
2316 
2317 		r = dm_pool_get_metadata_transaction_id(pool->pmd,
2318 							&transaction_id);
2319 		if (r)
2320 			return r;
2321 
2322 		r = dm_pool_get_free_metadata_block_count(pool->pmd,
2323 							  &nr_free_blocks_metadata);
2324 		if (r)
2325 			return r;
2326 
2327 		r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2328 		if (r)
2329 			return r;
2330 
2331 		r = dm_pool_get_free_block_count(pool->pmd,
2332 						 &nr_free_blocks_data);
2333 		if (r)
2334 			return r;
2335 
2336 		r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2337 		if (r)
2338 			return r;
2339 
2340 		r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
2341 		if (r)
2342 			return r;
2343 
2344 		DMEMIT("%llu %llu/%llu %llu/%llu ",
2345 		       (unsigned long long)transaction_id,
2346 		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2347 		       (unsigned long long)nr_blocks_metadata,
2348 		       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2349 		       (unsigned long long)nr_blocks_data);
2350 
2351 		if (held_root)
2352 			DMEMIT("%llu ", held_root);
2353 		else
2354 			DMEMIT("- ");
2355 
2356 		if (pool->pf.mode == PM_READ_ONLY)
2357 			DMEMIT("ro ");
2358 		else
2359 			DMEMIT("rw ");
2360 
2361 		if (pool->pf.discard_enabled && pool->pf.discard_passdown)
2362 			DMEMIT("discard_passdown");
2363 		else
2364 			DMEMIT("no_discard_passdown");
2365 
2366 		break;
2367 
2368 	case STATUSTYPE_TABLE:
2369 		DMEMIT("%s %s %lu %llu ",
2370 		       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
2371 		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2372 		       (unsigned long)pool->sectors_per_block,
2373 		       (unsigned long long)pt->low_water_blocks);
2374 		emit_flags(&pt->requested_pf, result, sz, maxlen);
2375 		break;
2376 	}
2377 
2378 	return 0;
2379 }
2380 
2381 static int pool_iterate_devices(struct dm_target *ti,
2382 				iterate_devices_callout_fn fn, void *data)
2383 {
2384 	struct pool_c *pt = ti->private;
2385 
2386 	return fn(ti, pt->data_dev, 0, ti->len, data);
2387 }
2388 
2389 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2390 		      struct bio_vec *biovec, int max_size)
2391 {
2392 	struct pool_c *pt = ti->private;
2393 	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2394 
2395 	if (!q->merge_bvec_fn)
2396 		return max_size;
2397 
2398 	bvm->bi_bdev = pt->data_dev->bdev;
2399 
2400 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2401 }
2402 
2403 static bool block_size_is_power_of_two(struct pool *pool)
2404 {
2405 	return pool->sectors_per_block_shift >= 0;
2406 }
2407 
2408 static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
2409 {
2410 	struct pool *pool = pt->pool;
2411 	struct queue_limits *data_limits;
2412 
2413 	limits->max_discard_sectors = pool->sectors_per_block;
2414 
2415 	/*
2416 	 * discard_granularity is just a hint, and not enforced.
2417 	 */
2418 	if (pt->adjusted_pf.discard_passdown) {
2419 		data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
2420 		limits->discard_granularity = data_limits->discard_granularity;
2421 	} else if (block_size_is_power_of_two(pool))
2422 		limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2423 	else
2424 		/*
2425 		 * Use largest power of 2 that is a factor of sectors_per_block
2426 		 * but at least DATA_DEV_BLOCK_SIZE_MIN_SECTORS.  For example, a
 		 * 384-sector (192KB) block size gives a discard granularity of
 		 * 128 sectors (64KB).
2427 		 */
2428 		limits->discard_granularity = max(1 << (ffs(pool->sectors_per_block) - 1),
2429 						  DATA_DEV_BLOCK_SIZE_MIN_SECTORS) << SECTOR_SHIFT;
2430 }
2431 
2432 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2433 {
2434 	struct pool_c *pt = ti->private;
2435 	struct pool *pool = pt->pool;
2436 
2437 	blk_limits_io_min(limits, 0);
2438 	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2439 
2440 	/*
2441 	 * pt->adjusted_pf is a staging area for the actual features to use.
2442 	 * They get transferred to the live pool in bind_control_target()
2443 	 * called from pool_preresume().
2444 	 */
2445 	if (!pt->adjusted_pf.discard_enabled)
2446 		return;
2447 
2448 	disable_passdown_if_not_supported(pt);
2449 
2450 	set_discard_limits(pt, limits);
2451 }
2452 
2453 static struct target_type pool_target = {
2454 	.name = "thin-pool",
2455 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2456 		    DM_TARGET_IMMUTABLE,
2457 	.version = {1, 5, 0},
2458 	.module = THIS_MODULE,
2459 	.ctr = pool_ctr,
2460 	.dtr = pool_dtr,
2461 	.map = pool_map,
2462 	.postsuspend = pool_postsuspend,
2463 	.preresume = pool_preresume,
2464 	.resume = pool_resume,
2465 	.message = pool_message,
2466 	.status = pool_status,
2467 	.merge = pool_merge,
2468 	.iterate_devices = pool_iterate_devices,
2469 	.io_hints = pool_io_hints,
2470 };
2471 
2472 /*----------------------------------------------------------------
2473  * Thin target methods
2474  *--------------------------------------------------------------*/
2475 static void thin_dtr(struct dm_target *ti)
2476 {
2477 	struct thin_c *tc = ti->private;
2478 
2479 	mutex_lock(&dm_thin_pool_table.mutex);
2480 
2481 	__pool_dec(tc->pool);
2482 	dm_pool_close_thin_device(tc->td);
2483 	dm_put_device(ti, tc->pool_dev);
2484 	if (tc->origin_dev)
2485 		dm_put_device(ti, tc->origin_dev);
2486 	kfree(tc);
2487 
2488 	mutex_unlock(&dm_thin_pool_table.mutex);
2489 }
2490 
2491 /*
2492  * Thin target parameters:
2493  *
2494  * <pool_dev> <dev_id> [origin_dev]
2495  *
2496  * pool_dev: the path to the pool (e.g. /dev/mapper/my_pool)
2497  * dev_id: the internal device identifier
2498  * origin_dev: a device external to the pool that should act as the origin
2499  *
2500  * If the pool device has discards disabled, they get disabled for the thin
2501  * device as well.
2502  */
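/*
 * Illustrative only (path, length and device id are hypothetical):
 * activating thin device 0 of a pool as a 1GiB volume, e.g.
 *
 *   dmsetup create thin0 --table "0 2097152 thin /dev/mapper/pool 0"
 */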
2503 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2504 {
2505 	int r;
2506 	struct thin_c *tc;
2507 	struct dm_dev *pool_dev, *origin_dev;
2508 	struct mapped_device *pool_md;
2509 
2510 	mutex_lock(&dm_thin_pool_table.mutex);
2511 
2512 	if (argc != 2 && argc != 3) {
2513 		ti->error = "Invalid argument count";
2514 		r = -EINVAL;
2515 		goto out_unlock;
2516 	}
2517 
2518 	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2519 	if (!tc) {
2520 		ti->error = "Out of memory";
2521 		r = -ENOMEM;
2522 		goto out_unlock;
2523 	}
2524 
2525 	if (argc == 3) {
2526 		r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
2527 		if (r) {
2528 			ti->error = "Error opening origin device";
2529 			goto bad_origin_dev;
2530 		}
2531 		tc->origin_dev = origin_dev;
2532 	}
2533 
2534 	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2535 	if (r) {
2536 		ti->error = "Error opening pool device";
2537 		goto bad_pool_dev;
2538 	}
2539 	tc->pool_dev = pool_dev;
2540 
2541 	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
2542 		ti->error = "Invalid device id";
2543 		r = -EINVAL;
2544 		goto bad_common;
2545 	}
2546 
2547 	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2548 	if (!pool_md) {
2549 		ti->error = "Couldn't get pool mapped device";
2550 		r = -EINVAL;
2551 		goto bad_common;
2552 	}
2553 
2554 	tc->pool = __pool_table_lookup(pool_md);
2555 	if (!tc->pool) {
2556 		ti->error = "Couldn't find pool object";
2557 		r = -EINVAL;
2558 		goto bad_pool_lookup;
2559 	}
2560 	__pool_inc(tc->pool);
2561 
2562 	if (get_pool_mode(tc->pool) == PM_FAIL) {
2563 		ti->error = "Couldn't open thin device, Pool is in fail mode";
 		r = -EINVAL;
2564 		goto bad_thin_open;
2565 	}
2566 
2567 	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2568 	if (r) {
2569 		ti->error = "Couldn't open thin internal device";
2570 		goto bad_thin_open;
2571 	}
2572 
2573 	r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
2574 	if (r)
2575 		goto bad_thin_open;
2576 
2577 	ti->num_flush_requests = 1;
2578 	ti->flush_supported = true;
2579 
2580 	/* If the pool supports discards, pass them on. */
2581 	if (tc->pool->pf.discard_enabled) {
2582 		ti->discards_supported = true;
2583 		ti->num_discard_requests = 1;
2584 		ti->discard_zeroes_data_unsupported = true;
2585 		/* Discard requests must be split on a block boundary */
2586 		ti->split_discard_requests = true;
2587 	}
2588 
2589 	dm_put(pool_md);
2590 
2591 	mutex_unlock(&dm_thin_pool_table.mutex);
2592 
2593 	return 0;
2594 
2595 bad_thin_open:
2596 	__pool_dec(tc->pool);
2597 bad_pool_lookup:
2598 	dm_put(pool_md);
2599 bad_common:
2600 	dm_put_device(ti, tc->pool_dev);
2601 bad_pool_dev:
2602 	if (tc->origin_dev)
2603 		dm_put_device(ti, tc->origin_dev);
2604 bad_origin_dev:
2605 	kfree(tc);
2606 out_unlock:
2607 	mutex_unlock(&dm_thin_pool_table.mutex);
2608 
2609 	return r;
2610 }
2611 
2612 static int thin_map(struct dm_target *ti, struct bio *bio,
2613 		    union map_info *map_context)
2614 {
2615 	bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
2616 
2617 	return thin_bio_map(ti, bio, map_context);
2618 }
2619 
2620 static int thin_endio(struct dm_target *ti,
2621 		      struct bio *bio, int err,
2622 		      union map_info *map_context)
2623 {
2624 	unsigned long flags;
2625 	struct dm_thin_endio_hook *h = map_context->ptr;
2626 	struct list_head work;
2627 	struct dm_thin_new_mapping *m, *tmp;
2628 	struct pool *pool = h->tc->pool;
2629 
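	/*
	 * Drop this bio's entry from the shared-read deferred set: any new
	 * mappings that were waiting for such reads to drain are now marked
	 * quiesced and handed to the worker once fully prepared.
	 */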
2630 	if (h->shared_read_entry) {
2631 		INIT_LIST_HEAD(&work);
2632 		dm_deferred_entry_dec(h->shared_read_entry, &work);
2633 
2634 		spin_lock_irqsave(&pool->lock, flags);
2635 		list_for_each_entry_safe(m, tmp, &work, list) {
2636 			list_del(&m->list);
2637 			m->quiesced = 1;
2638 			__maybe_add_mapping(m);
2639 		}
2640 		spin_unlock_irqrestore(&pool->lock, flags);
2641 	}
2642 
2643 	if (h->all_io_entry) {
2644 		INIT_LIST_HEAD(&work);
2645 		dm_deferred_entry_dec(h->all_io_entry, &work);
2646 		spin_lock_irqsave(&pool->lock, flags);
2647 		list_for_each_entry_safe(m, tmp, &work, list)
2648 			list_add(&m->list, &pool->prepared_discards);
2649 		spin_unlock_irqrestore(&pool->lock, flags);
2650 	}
2651 
2652 	mempool_free(h, pool->endio_hook_pool);
2653 
2654 	return 0;
2655 }
2656 
2657 static void thin_postsuspend(struct dm_target *ti)
2658 {
2659 	if (dm_noflush_suspending(ti))
2660 		requeue_io((struct thin_c *)ti->private);
2661 }
2662 
2663 /*
2664  * <nr mapped sectors> <highest mapped sector>
2665  */
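/*
 * For example (made-up values): a thin device with 16 mapped 128-sector
 * blocks whose highest mapped block is block 31 would report "2048 4095".
 */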
2666 static int thin_status(struct dm_target *ti, status_type_t type,
2667 		       unsigned status_flags, char *result, unsigned maxlen)
2668 {
2669 	int r;
2670 	ssize_t sz = 0;
2671 	dm_block_t mapped, highest;
2672 	char buf[BDEVNAME_SIZE];
2673 	struct thin_c *tc = ti->private;
2674 
2675 	if (get_pool_mode(tc->pool) == PM_FAIL) {
2676 		DMEMIT("Fail");
2677 		return 0;
2678 	}
2679 
2680 	if (!tc->td)
2681 		DMEMIT("-");
2682 	else {
2683 		switch (type) {
2684 		case STATUSTYPE_INFO:
2685 			r = dm_thin_get_mapped_count(tc->td, &mapped);
2686 			if (r)
2687 				return r;
2688 
2689 			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
2690 			if (r < 0)
2691 				return r;
2692 
2693 			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
2694 			if (r)
2695 				DMEMIT("%llu", ((highest + 1) *
2696 						tc->pool->sectors_per_block) - 1);
2697 			else
2698 				DMEMIT("-");
2699 			break;
2700 
2701 		case STATUSTYPE_TABLE:
2702 			DMEMIT("%s %lu",
2703 			       format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2704 			       (unsigned long) tc->dev_id);
2705 			if (tc->origin_dev)
2706 				DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
2707 			break;
2708 		}
2709 	}
2710 
2711 	return 0;
2712 }
2713 
2714 static int thin_iterate_devices(struct dm_target *ti,
2715 				iterate_devices_callout_fn fn, void *data)
2716 {
2717 	sector_t blocks;
2718 	struct thin_c *tc = ti->private;
2719 	struct pool *pool = tc->pool;
2720 
2721 	/*
2722 	 * We can't call dm_pool_get_data_dev_size() since that blocks.  So
2723 	 * we follow a more convoluted path through to the pool's target.
2724 	 */
2725 	if (!pool->ti)
2726 		return 0;	/* nothing is bound */
2727 
2728 	blocks = pool->ti->len;
2729 	(void) sector_div(blocks, pool->sectors_per_block);
2730 	if (blocks)
2731 		return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
2732 
2733 	return 0;
2734 }
2735 
2736 /*
2737  * A thin device always inherits its queue limits from its pool.
2738  */
2739 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2740 {
2741 	struct thin_c *tc = ti->private;
2742 
2743 	*limits = bdev_get_queue(tc->pool_dev->bdev)->limits;
2744 }
2745 
2746 static struct target_type thin_target = {
2747 	.name = "thin",
2748 	.version = {1, 5, 0},
2749 	.module	= THIS_MODULE,
2750 	.ctr = thin_ctr,
2751 	.dtr = thin_dtr,
2752 	.map = thin_map,
2753 	.end_io = thin_endio,
2754 	.postsuspend = thin_postsuspend,
2755 	.status = thin_status,
2756 	.iterate_devices = thin_iterate_devices,
2757 	.io_hints = thin_io_hints,
2758 };
2759 
2760 /*----------------------------------------------------------------*/
2761 
2762 static int __init dm_thin_init(void)
2763 {
2764 	int r;
2765 
2766 	pool_table_init();
2767 
2768 	r = dm_register_target(&thin_target);
2769 	if (r)
2770 		return r;
2771 
2772 	r = dm_register_target(&pool_target);
2773 	if (r)
2774 		goto bad_pool_target;
2775 
2776 	r = -ENOMEM;
2777 
2778 	_new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
2779 	if (!_new_mapping_cache)
2780 		goto bad_new_mapping_cache;
2781 
2782 	_endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0);
2783 	if (!_endio_hook_cache)
2784 		goto bad_endio_hook_cache;
2785 
2786 	return 0;
2787 
2788 bad_endio_hook_cache:
2789 	kmem_cache_destroy(_new_mapping_cache);
2790 bad_new_mapping_cache:
2791 	dm_unregister_target(&pool_target);
2792 bad_pool_target:
2793 	dm_unregister_target(&thin_target);
2794 
2795 	return r;
2796 }
2797 
2798 static void dm_thin_exit(void)
2799 {
2800 	dm_unregister_target(&thin_target);
2801 	dm_unregister_target(&pool_target);
2802 
2803 	kmem_cache_destroy(_new_mapping_cache);
2804 	kmem_cache_destroy(_endio_hook_cache);
2805 }
2806 
2807 module_init(dm_thin_init);
2808 module_exit(dm_thin_exit);
2809 
2810 MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
2811 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2812 MODULE_LICENSE("GPL");
2813