xref: /openbmc/linux/drivers/md/dm-thin.c (revision d0b73b48)
1 /*
2  * Copyright (C) 2011-2012 Red Hat UK.
3  *
4  * This file is released under the GPL.
5  */
6 
7 #include "dm-thin-metadata.h"
8 #include "dm-bio-prison.h"
9 #include "dm.h"
10 
11 #include <linux/device-mapper.h>
12 #include <linux/dm-io.h>
13 #include <linux/dm-kcopyd.h>
14 #include <linux/list.h>
15 #include <linux/init.h>
16 #include <linux/module.h>
17 #include <linux/slab.h>
18 
19 #define	DM_MSG_PREFIX	"thin"
20 
21 /*
22  * Tunable constants
23  */
24 #define ENDIO_HOOK_POOL_SIZE 1024
25 #define MAPPING_POOL_SIZE 1024
26 #define PRISON_CELLS 1024
27 #define COMMIT_PERIOD HZ
28 
29 /*
30  * The block size of the device holding pool data must be
31  * between 64KB and 1GB.
32  */
33 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
34 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
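/*
 * For example, with 512-byte sectors (SECTOR_SHIFT == 9) these work out
 * to 128 and 2097152 sectors respectively; pool_ctr() additionally
 * requires the block size to be a multiple of
 * DATA_DEV_BLOCK_SIZE_MIN_SECTORS.
 */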
35 
36 /*
37  * Device id is restricted to 24 bits.
38  */
39 #define MAX_DEV_ID ((1 << 24) - 1)
40 
41 /*
42  * How do we handle breaking sharing of data blocks?
43  * =================================================
44  *
45  * We use a standard copy-on-write btree to store the mappings for the
46  * devices (note I'm talking about copy-on-write of the metadata here, not
47  * the data).  When you take an internal snapshot you clone the root node
48  * of the origin btree.  After this there is no concept of an origin or a
49  * snapshot.  They are just two device trees that happen to point to the
50  * same data blocks.
51  *
52  * When we get a write in we decide if it's to a shared data block using
53  * some timestamp magic.  If it is, we have to break sharing.
54  *
55  * Let's say we write to a shared block in what was the origin.  The
56  * steps are:
57  *
58  * i) plug further io to this physical block (see bio_prison code).
59  *
60  * ii) quiesce any read io to that shared data block, from all the
61  * devices that share this block (see dm_deferred_set code).
62  *
63  * iii) copy the data block to a newly allocated block.  This step can be
64  * skipped if the io covers the whole block (schedule_copy).
65  *
66  * iv) insert the new mapping into the origin's btree
67  * (process_prepared_mapping).  This act of inserting breaks some
68  * sharing of btree nodes between the two devices.  Breaking sharing only
69  * affects the btree of that specific device.  Btrees for the other
70  * devices that share the block never change.  The btree for the origin
71  * device as it was after the last commit is untouched, i.e. we're using
72  * persistent data structures in the functional programming sense.
73  *
74  * v) unplug io to this physical block, including the io that triggered
75  * the breaking of sharing.
76  *
77  * Steps (ii) and (iii) occur in parallel.
78  *
79  * The metadata _doesn't_ need to be committed before the io continues.  We
80  * get away with this because the io is always written to a _new_ block.
81  * If there's a crash, then:
82  *
83  * - The origin mapping will point to the old origin block (the shared
84  * one).  This will contain the data as it was before the io that triggered
85  * the breaking of sharing came in.
86  *
87  * - The snap mapping still points to the old block, just as it would
88  * after the commit.
89  *
90  * The downside of this scheme is that the timestamp magic isn't perfect:
91  * it will continue to think that a data block in the snapshot device is
92  * shared even after the write to the origin has broken sharing.  I suspect data
93  * blocks will typically be shared by many different devices, so we're
94  * breaking sharing n + 1 times, rather than n, where n is the number of
95  * devices that reference this data block.  At the moment I think the
96  * benefits far, far outweigh the disadvantages.
97  */
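/*
 * A simplified example of the above: an origin and its snapshot both map
 * virtual block 5 to data block 17 and a write to the origin's block 5
 * arrives.  (i) a bio_prison cell keyed on data block 17 holds back
 * further io; (ii) the shared-read deferred set quiesces reads already
 * in flight to block 17; (iii) unless the write covers the whole block,
 * kcopyd copies block 17 into a freshly allocated block, say 18;
 * (iv) the mapping 5 -> 18 is inserted into the origin's btree;
 * (v) the cell is released and the held bios, now remapped to block 18,
 * are issued.  The snapshot still maps 5 -> 17.
 */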
98 
99 /*----------------------------------------------------------------*/
100 
101 /*
102  * Key building.
103  */
104 static void build_data_key(struct dm_thin_device *td,
105 			   dm_block_t b, struct dm_cell_key *key)
106 {
107 	key->virtual = 0;
108 	key->dev = dm_thin_dev_id(td);
109 	key->block = b;
110 }
111 
112 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
113 			      struct dm_cell_key *key)
114 {
115 	key->virtual = 1;
116 	key->dev = dm_thin_dev_id(td);
117 	key->block = b;
118 }
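/*
 * Both key flavours carry the thin device id: a virtual key names a block
 * in a thin device's logical address space, while a data key names a
 * block on the pool's data device.  Bio prison cells built from these
 * keys are what serialise, for instance, a discard against a copy that is
 * breaking sharing of the same data block.
 */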
119 
120 /*----------------------------------------------------------------*/
121 
122 /*
123  * A pool device ties together a metadata device and a data device.  It
124  * also provides the interface for creating and destroying internal
125  * devices.
126  */
127 struct dm_thin_new_mapping;
128 
129 /*
130  * The pool runs in 3 modes, ordered by increasing degradation for comparison.
131  */
132 enum pool_mode {
133 	PM_WRITE,		/* metadata may be changed */
134 	PM_READ_ONLY,		/* metadata may not be changed */
135 	PM_FAIL,		/* all I/O fails */
136 };
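/*
 * For example, bind_control_target() relies on this ordering:
 * "if (old_mode > new_mode) new_mode = old_mode;" ensures that a pool
 * which has degraded to PM_READ_ONLY or PM_FAIL is never silently
 * upgraded back to PM_WRITE by a table reload.
 */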
137 
138 struct pool_features {
139 	enum pool_mode mode;
140 
141 	bool zero_new_blocks:1;
142 	bool discard_enabled:1;
143 	bool discard_passdown:1;
144 };
145 
146 struct thin_c;
147 typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
148 typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
149 
150 struct pool {
151 	struct list_head list;
152 	struct dm_target *ti;	/* Only set if a pool target is bound */
153 
154 	struct mapped_device *pool_md;
155 	struct block_device *md_dev;
156 	struct dm_pool_metadata *pmd;
157 
158 	dm_block_t low_water_blocks;
159 	uint32_t sectors_per_block;
160 	int sectors_per_block_shift;
161 
162 	struct pool_features pf;
163 	unsigned low_water_triggered:1;	/* A dm event has been sent */
164 	unsigned no_free_space:1;	/* A -ENOSPC warning has been issued */
165 
166 	struct dm_bio_prison *prison;
167 	struct dm_kcopyd_client *copier;
168 
169 	struct workqueue_struct *wq;
170 	struct work_struct worker;
171 	struct delayed_work waker;
172 
173 	unsigned long last_commit_jiffies;
174 	unsigned ref_count;
175 
176 	spinlock_t lock;
177 	struct bio_list deferred_bios;
178 	struct bio_list deferred_flush_bios;
179 	struct list_head prepared_mappings;
180 	struct list_head prepared_discards;
181 
182 	struct bio_list retry_on_resume_list;
183 
184 	struct dm_deferred_set *shared_read_ds;
185 	struct dm_deferred_set *all_io_ds;
186 
187 	struct dm_thin_new_mapping *next_mapping;
188 	mempool_t *mapping_pool;
189 
190 	process_bio_fn process_bio;
191 	process_bio_fn process_discard;
192 
193 	process_mapping_fn process_prepared_mapping;
194 	process_mapping_fn process_prepared_discard;
195 };
196 
197 static enum pool_mode get_pool_mode(struct pool *pool);
198 static void set_pool_mode(struct pool *pool, enum pool_mode mode);
199 
200 /*
201  * Target context for a pool.
202  */
203 struct pool_c {
204 	struct dm_target *ti;
205 	struct pool *pool;
206 	struct dm_dev *data_dev;
207 	struct dm_dev *metadata_dev;
208 	struct dm_target_callbacks callbacks;
209 
210 	dm_block_t low_water_blocks;
211 	struct pool_features requested_pf; /* Features requested during table load */
212 	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
213 };
214 
215 /*
216  * Target context for a thin.
217  */
218 struct thin_c {
219 	struct dm_dev *pool_dev;
220 	struct dm_dev *origin_dev;
221 	dm_thin_id dev_id;
222 
223 	struct pool *pool;
224 	struct dm_thin_device *td;
225 };
226 
227 /*----------------------------------------------------------------*/
228 
229 /*
230  * A global list of pools that uses a struct mapped_device as a key.
231  */
232 static struct dm_thin_pool_table {
233 	struct mutex mutex;
234 	struct list_head pools;
235 } dm_thin_pool_table;
236 
237 static void pool_table_init(void)
238 {
239 	mutex_init(&dm_thin_pool_table.mutex);
240 	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
241 }
242 
243 static void __pool_table_insert(struct pool *pool)
244 {
245 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
246 	list_add(&pool->list, &dm_thin_pool_table.pools);
247 }
248 
249 static void __pool_table_remove(struct pool *pool)
250 {
251 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
252 	list_del(&pool->list);
253 }
254 
255 static struct pool *__pool_table_lookup(struct mapped_device *md)
256 {
257 	struct pool *pool = NULL, *tmp;
258 
259 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
260 
261 	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
262 		if (tmp->pool_md == md) {
263 			pool = tmp;
264 			break;
265 		}
266 	}
267 
268 	return pool;
269 }
270 
271 static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
272 {
273 	struct pool *pool = NULL, *tmp;
274 
275 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
276 
277 	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
278 		if (tmp->md_dev == md_dev) {
279 			pool = tmp;
280 			break;
281 		}
282 	}
283 
284 	return pool;
285 }
286 
287 /*----------------------------------------------------------------*/
288 
289 struct dm_thin_endio_hook {
290 	struct thin_c *tc;
291 	struct dm_deferred_entry *shared_read_entry;
292 	struct dm_deferred_entry *all_io_entry;
293 	struct dm_thin_new_mapping *overwrite_mapping;
294 };
295 
296 static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
297 {
298 	struct bio *bio;
299 	struct bio_list bios;
300 
301 	bio_list_init(&bios);
302 	bio_list_merge(&bios, master);
303 	bio_list_init(master);
304 
305 	while ((bio = bio_list_pop(&bios))) {
306 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
307 
308 		if (h->tc == tc)
309 			bio_endio(bio, DM_ENDIO_REQUEUE);
310 		else
311 			bio_list_add(master, bio);
312 	}
313 }
314 
315 static void requeue_io(struct thin_c *tc)
316 {
317 	struct pool *pool = tc->pool;
318 	unsigned long flags;
319 
320 	spin_lock_irqsave(&pool->lock, flags);
321 	__requeue_bio_list(tc, &pool->deferred_bios);
322 	__requeue_bio_list(tc, &pool->retry_on_resume_list);
323 	spin_unlock_irqrestore(&pool->lock, flags);
324 }
325 
326 /*
327  * This section of code contains the logic for processing a thin device's IO.
328  * Much of the code depends on pool object resources (lists, workqueues, etc.)
329  * but most is exclusively called from the thin target rather than the thin-pool
330  * target.
331  */
332 
333 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
334 {
335 	sector_t block_nr = bio->bi_sector;
336 
337 	if (tc->pool->sectors_per_block_shift < 0)
338 		(void) sector_div(block_nr, tc->pool->sectors_per_block);
339 	else
340 		block_nr >>= tc->pool->sectors_per_block_shift;
341 
342 	return block_nr;
343 }
344 
345 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
346 {
347 	struct pool *pool = tc->pool;
348 	sector_t bi_sector = bio->bi_sector;
349 
350 	bio->bi_bdev = tc->pool_dev->bdev;
351 	if (tc->pool->sectors_per_block_shift < 0)
352 		bio->bi_sector = (block * pool->sectors_per_block) +
353 				 sector_div(bi_sector, pool->sectors_per_block);
354 	else
355 		bio->bi_sector = (block << pool->sectors_per_block_shift) |
356 				(bi_sector & (pool->sectors_per_block - 1));
357 }
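/*
 * Worked example, assuming 128-sector (64KB) blocks so that
 * sectors_per_block_shift == 7: a bio at sector 1000 of the thin device
 * falls in virtual block 1000 >> 7 == 7 at offset 1000 & 127 == 104.  If
 * that virtual block maps to data block 9, remap() sends the bio to
 * sector (9 << 7) | 104 == 1256 of the pool's data device.  For
 * non-power-of-two block sizes the sector_div() paths above compute the
 * same block/offset split.
 */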
358 
359 static void remap_to_origin(struct thin_c *tc, struct bio *bio)
360 {
361 	bio->bi_bdev = tc->origin_dev->bdev;
362 }
363 
364 static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
365 {
366 	return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
367 		dm_thin_changed_this_transaction(tc->td);
368 }
369 
370 static void inc_all_io_entry(struct pool *pool, struct bio *bio)
371 {
372 	struct dm_thin_endio_hook *h;
373 
374 	if (bio->bi_rw & REQ_DISCARD)
375 		return;
376 
377 	h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
378 	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
379 }
380 
381 static void issue(struct thin_c *tc, struct bio *bio)
382 {
383 	struct pool *pool = tc->pool;
384 	unsigned long flags;
385 
386 	if (!bio_triggers_commit(tc, bio)) {
387 		generic_make_request(bio);
388 		return;
389 	}
390 
391 	/*
392 	 * Complete bio with an error if earlier I/O caused changes to
393 	 * the metadata that can't be committed, e.g. due to I/O errors
394 	 * on the metadata device.
395 	 */
396 	if (dm_thin_aborted_changes(tc->td)) {
397 		bio_io_error(bio);
398 		return;
399 	}
400 
401 	/*
402 	 * Batch together any bios that trigger commits and then issue a
403 	 * single commit for them in process_deferred_bios().
404 	 */
405 	spin_lock_irqsave(&pool->lock, flags);
406 	bio_list_add(&pool->deferred_flush_bios, bio);
407 	spin_unlock_irqrestore(&pool->lock, flags);
408 }
409 
410 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
411 {
412 	remap_to_origin(tc, bio);
413 	issue(tc, bio);
414 }
415 
416 static void remap_and_issue(struct thin_c *tc, struct bio *bio,
417 			    dm_block_t block)
418 {
419 	remap(tc, bio, block);
420 	issue(tc, bio);
421 }
422 
423 /*
424  * wake_worker() is used when new work is queued and when pool_resume is
425  * ready to continue deferred IO processing.
426  */
427 static void wake_worker(struct pool *pool)
428 {
429 	queue_work(pool->wq, &pool->worker);
430 }
431 
432 /*----------------------------------------------------------------*/
433 
434 /*
435  * Bio endio functions.
436  */
437 struct dm_thin_new_mapping {
438 	struct list_head list;
439 
440 	unsigned quiesced:1;
441 	unsigned prepared:1;
442 	unsigned pass_discard:1;
443 
444 	struct thin_c *tc;
445 	dm_block_t virt_block;
446 	dm_block_t data_block;
447 	struct dm_bio_prison_cell *cell, *cell2;
448 	int err;
449 
450 	/*
451 	 * If the bio covers the whole area of a block then we can avoid
452 	 * zeroing or copying.  Instead this bio is hooked.  The bio will
453 	 * still be in the cell, so care has to be taken to avoid issuing
454 	 * the bio twice.
455 	 */
456 	struct bio *bio;
457 	bio_end_io_t *saved_bi_end_io;
458 };
459 
460 static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
461 {
462 	struct pool *pool = m->tc->pool;
463 
464 	if (m->quiesced && m->prepared) {
465 		list_add(&m->list, &pool->prepared_mappings);
466 		wake_worker(pool);
467 	}
468 }
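/*
 * A new mapping is handed to the worker only once both flags are set:
 * "prepared" when the kcopyd copy/zero or the overwriting bio completes,
 * and "quiesced" once no reads to the old shared block remain
 * (schedule_copy() sets it up front when there is nothing to wait for).
 */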
469 
470 static void copy_complete(int read_err, unsigned long write_err, void *context)
471 {
472 	unsigned long flags;
473 	struct dm_thin_new_mapping *m = context;
474 	struct pool *pool = m->tc->pool;
475 
476 	m->err = read_err || write_err ? -EIO : 0;
477 
478 	spin_lock_irqsave(&pool->lock, flags);
479 	m->prepared = 1;
480 	__maybe_add_mapping(m);
481 	spin_unlock_irqrestore(&pool->lock, flags);
482 }
483 
484 static void overwrite_endio(struct bio *bio, int err)
485 {
486 	unsigned long flags;
487 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
488 	struct dm_thin_new_mapping *m = h->overwrite_mapping;
489 	struct pool *pool = m->tc->pool;
490 
491 	m->err = err;
492 
493 	spin_lock_irqsave(&pool->lock, flags);
494 	m->prepared = 1;
495 	__maybe_add_mapping(m);
496 	spin_unlock_irqrestore(&pool->lock, flags);
497 }
498 
499 /*----------------------------------------------------------------*/
500 
501 /*
502  * Workqueue.
503  */
504 
505 /*
506  * Prepared mapping jobs.
507  */
508 
509 /*
510  * This sends the bios in the cell back to the deferred_bios list.
511  */
512 static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
513 {
514 	struct pool *pool = tc->pool;
515 	unsigned long flags;
516 
517 	spin_lock_irqsave(&pool->lock, flags);
518 	dm_cell_release(cell, &pool->deferred_bios);
519 	spin_unlock_irqrestore(&pool->lock, flags);
520 
521 	wake_worker(pool);
522 }
523 
524 /*
525  * Same as cell_defer except it omits the original holder of the cell.
526  */
527 static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
528 {
529 	struct pool *pool = tc->pool;
530 	unsigned long flags;
531 
532 	spin_lock_irqsave(&pool->lock, flags);
533 	dm_cell_release_no_holder(cell, &pool->deferred_bios);
534 	spin_unlock_irqrestore(&pool->lock, flags);
535 
536 	wake_worker(pool);
537 }
538 
539 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
540 {
541 	if (m->bio)
542 		m->bio->bi_end_io = m->saved_bi_end_io;
543 	dm_cell_error(m->cell);
544 	list_del(&m->list);
545 	mempool_free(m, m->tc->pool->mapping_pool);
546 }

547 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
548 {
549 	struct thin_c *tc = m->tc;
550 	struct bio *bio;
551 	int r;
552 
553 	bio = m->bio;
554 	if (bio)
555 		bio->bi_end_io = m->saved_bi_end_io;
556 
557 	if (m->err) {
558 		dm_cell_error(m->cell);
559 		goto out;
560 	}
561 
562 	/*
563 	 * Commit the prepared block into the mapping btree.
564 	 * Any I/O for this block arriving after this point will get
565 	 * remapped to it directly.
566 	 */
567 	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
568 	if (r) {
569 		DMERR_LIMIT("dm_thin_insert_block() failed");
570 		dm_cell_error(m->cell);
571 		goto out;
572 	}
573 
574 	/*
575 	 * Release any bios held while the block was being provisioned.
576 	 * If we are processing a write bio that completely covers the block,
577 	 * we have already processed it, so we can ignore it now when processing
578 	 * the bios in the cell.
579 	 */
580 	if (bio) {
581 		cell_defer_no_holder(tc, m->cell);
582 		bio_endio(bio, 0);
583 	} else
584 		cell_defer(tc, m->cell);
585 
586 out:
587 	list_del(&m->list);
588 	mempool_free(m, tc->pool->mapping_pool);
589 }
590 
591 static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
592 {
593 	struct thin_c *tc = m->tc;
594 
595 	bio_io_error(m->bio);
596 	cell_defer_no_holder(tc, m->cell);
597 	cell_defer_no_holder(tc, m->cell2);
598 	mempool_free(m, tc->pool->mapping_pool);
599 }
600 
601 static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
602 {
603 	struct thin_c *tc = m->tc;
604 
605 	inc_all_io_entry(tc->pool, m->bio);
606 	cell_defer_no_holder(tc, m->cell);
607 	cell_defer_no_holder(tc, m->cell2);
608 
609 	if (m->pass_discard)
610 		remap_and_issue(tc, m->bio, m->data_block);
611 	else
612 		bio_endio(m->bio, 0);
613 
614 	mempool_free(m, tc->pool->mapping_pool);
615 }
616 
617 static void process_prepared_discard(struct dm_thin_new_mapping *m)
618 {
619 	int r;
620 	struct thin_c *tc = m->tc;
621 
622 	r = dm_thin_remove_block(tc->td, m->virt_block);
623 	if (r)
624 		DMERR_LIMIT("dm_thin_remove_block() failed");
625 
626 	process_prepared_discard_passdown(m);
627 }
628 
629 static void process_prepared(struct pool *pool, struct list_head *head,
630 			     process_mapping_fn *fn)
631 {
632 	unsigned long flags;
633 	struct list_head maps;
634 	struct dm_thin_new_mapping *m, *tmp;
635 
636 	INIT_LIST_HEAD(&maps);
637 	spin_lock_irqsave(&pool->lock, flags);
638 	list_splice_init(head, &maps);
639 	spin_unlock_irqrestore(&pool->lock, flags);
640 
641 	list_for_each_entry_safe(m, tmp, &maps, list)
642 		(*fn)(m);
643 }
644 
645 /*
646  * Deferred bio jobs.
647  */
648 static int io_overlaps_block(struct pool *pool, struct bio *bio)
649 {
650 	return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
651 }
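/*
 * e.g. with 128-sector blocks a bio must carry exactly 128 << 9 == 64KB
 * of data to overlap a block; only such WRITE bios take the overwrite
 * fast path in schedule_copy()/schedule_zero() rather than going through
 * kcopyd.
 */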
652 
653 static int io_overwrites_block(struct pool *pool, struct bio *bio)
654 {
655 	return (bio_data_dir(bio) == WRITE) &&
656 		io_overlaps_block(pool, bio);
657 }
658 
659 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
660 			       bio_end_io_t *fn)
661 {
662 	*save = bio->bi_end_io;
663 	bio->bi_end_io = fn;
664 }
665 
666 static int ensure_next_mapping(struct pool *pool)
667 {
668 	if (pool->next_mapping)
669 		return 0;
670 
671 	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
672 
673 	return pool->next_mapping ? 0 : -ENOMEM;
674 }
675 
676 static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
677 {
678 	struct dm_thin_new_mapping *r = pool->next_mapping;
679 
680 	BUG_ON(!pool->next_mapping);
681 
682 	pool->next_mapping = NULL;
683 
684 	return r;
685 }
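/*
 * The worker calls ensure_next_mapping() before processing each deferred
 * bio, so a failed GFP_ATOMIC allocation degrades gracefully: the bios
 * are put back on the deferred list until mappings are returned to the
 * mempool.  That is why get_next_mapping() can simply BUG_ON() an empty
 * slot.
 */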
686 
687 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
688 			  struct dm_dev *origin, dm_block_t data_origin,
689 			  dm_block_t data_dest,
690 			  struct dm_bio_prison_cell *cell, struct bio *bio)
691 {
692 	int r;
693 	struct pool *pool = tc->pool;
694 	struct dm_thin_new_mapping *m = get_next_mapping(pool);
695 
696 	INIT_LIST_HEAD(&m->list);
697 	m->quiesced = 0;
698 	m->prepared = 0;
699 	m->tc = tc;
700 	m->virt_block = virt_block;
701 	m->data_block = data_dest;
702 	m->cell = cell;
703 	m->err = 0;
704 	m->bio = NULL;
705 
706 	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
707 		m->quiesced = 1;
708 
709 	/*
710 	 * IO to pool_dev remaps to the pool target's data_dev.
711 	 *
712 	 * If the whole block of data is being overwritten, we can issue the
713 	 * bio immediately. Otherwise we use kcopyd to clone the data first.
714 	 */
715 	if (io_overwrites_block(pool, bio)) {
716 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
717 
718 		h->overwrite_mapping = m;
719 		m->bio = bio;
720 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
721 		inc_all_io_entry(pool, bio);
722 		remap_and_issue(tc, bio, data_dest);
723 	} else {
724 		struct dm_io_region from, to;
725 
726 		from.bdev = origin->bdev;
727 		from.sector = data_origin * pool->sectors_per_block;
728 		from.count = pool->sectors_per_block;
729 
730 		to.bdev = tc->pool_dev->bdev;
731 		to.sector = data_dest * pool->sectors_per_block;
732 		to.count = pool->sectors_per_block;
733 
734 		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
735 				   0, copy_complete, m);
736 		if (r < 0) {
737 			mempool_free(m, pool->mapping_pool);
738 			DMERR_LIMIT("dm_kcopyd_copy() failed");
739 			dm_cell_error(cell);
740 		}
741 	}
742 }
743 
744 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
745 				   dm_block_t data_origin, dm_block_t data_dest,
746 				   struct dm_bio_prison_cell *cell, struct bio *bio)
747 {
748 	schedule_copy(tc, virt_block, tc->pool_dev,
749 		      data_origin, data_dest, cell, bio);
750 }
751 
752 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
753 				   dm_block_t data_dest,
754 				   struct dm_bio_prison_cell *cell, struct bio *bio)
755 {
756 	schedule_copy(tc, virt_block, tc->origin_dev,
757 		      virt_block, data_dest, cell, bio);
758 }
759 
760 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
761 			  dm_block_t data_block, struct dm_bio_prison_cell *cell,
762 			  struct bio *bio)
763 {
764 	struct pool *pool = tc->pool;
765 	struct dm_thin_new_mapping *m = get_next_mapping(pool);
766 
767 	INIT_LIST_HEAD(&m->list);
768 	m->quiesced = 1;
769 	m->prepared = 0;
770 	m->tc = tc;
771 	m->virt_block = virt_block;
772 	m->data_block = data_block;
773 	m->cell = cell;
774 	m->err = 0;
775 	m->bio = NULL;
776 
777 	/*
778 	 * If the whole block of data is being overwritten or we are not
779 	 * zeroing pre-existing data, we can issue the bio immediately.
780 	 * Otherwise we use kcopyd to zero the data first.
781 	 */
782 	if (!pool->pf.zero_new_blocks)
783 		process_prepared_mapping(m);
784 
785 	else if (io_overwrites_block(pool, bio)) {
786 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
787 
788 		h->overwrite_mapping = m;
789 		m->bio = bio;
790 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
791 		inc_all_io_entry(pool, bio);
792 		remap_and_issue(tc, bio, data_block);
793 	} else {
794 		int r;
795 		struct dm_io_region to;
796 
797 		to.bdev = tc->pool_dev->bdev;
798 		to.sector = data_block * pool->sectors_per_block;
799 		to.count = pool->sectors_per_block;
800 
801 		r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
802 		if (r < 0) {
803 			mempool_free(m, pool->mapping_pool);
804 			DMERR_LIMIT("dm_kcopyd_zero() failed");
805 			dm_cell_error(cell);
806 		}
807 	}
808 }
809 
810 static int commit(struct pool *pool)
811 {
812 	int r;
813 
814 	r = dm_pool_commit_metadata(pool->pmd);
815 	if (r)
816 		DMERR_LIMIT("commit failed: error = %d", r);
817 
818 	return r;
819 }
820 
821 /*
822  * A non-zero return indicates read_only or fail_io mode.
823  * Many callers don't care about the return value.
824  */
825 static int commit_or_fallback(struct pool *pool)
826 {
827 	int r;
828 
829 	if (get_pool_mode(pool) != PM_WRITE)
830 		return -EINVAL;
831 
832 	r = commit(pool);
833 	if (r)
834 		set_pool_mode(pool, PM_READ_ONLY);
835 
836 	return r;
837 }
838 
839 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
840 {
841 	int r;
842 	dm_block_t free_blocks;
843 	unsigned long flags;
844 	struct pool *pool = tc->pool;
845 
846 	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
847 	if (r)
848 		return r;
849 
850 	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
851 		DMWARN("%s: reached low water mark, sending event.",
852 		       dm_device_name(pool->pool_md));
853 		spin_lock_irqsave(&pool->lock, flags);
854 		pool->low_water_triggered = 1;
855 		spin_unlock_irqrestore(&pool->lock, flags);
856 		dm_table_event(pool->ti->table);
857 	}
858 
859 	if (!free_blocks) {
860 		if (pool->no_free_space)
861 			return -ENOSPC;
862 		else {
863 			/*
864 			 * Try to commit to see if that will free up some
865 			 * more space.
866 			 */
867 			(void) commit_or_fallback(pool);
868 
869 			r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
870 			if (r)
871 				return r;
872 
873 			/*
874 			 * If we still have no space we set a flag to avoid
875 			 * doing all this checking and return -ENOSPC.
876 			 */
877 			if (!free_blocks) {
878 				DMWARN("%s: no free space available.",
879 				       dm_device_name(pool->pool_md));
880 				spin_lock_irqsave(&pool->lock, flags);
881 				pool->no_free_space = 1;
882 				spin_unlock_irqrestore(&pool->lock, flags);
883 				return -ENOSPC;
884 			}
885 		}
886 	}
887 
888 	r = dm_pool_alloc_data_block(pool->pmd, result);
889 	if (r)
890 		return r;
891 
892 	return 0;
893 }
894 
895 /*
896  * If we have run out of space, queue bios until the device is
897  * resumed, presumably after having been reloaded with more space.
898  */
899 static void retry_on_resume(struct bio *bio)
900 {
901 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
902 	struct thin_c *tc = h->tc;
903 	struct pool *pool = tc->pool;
904 	unsigned long flags;
905 
906 	spin_lock_irqsave(&pool->lock, flags);
907 	bio_list_add(&pool->retry_on_resume_list, bio);
908 	spin_unlock_irqrestore(&pool->lock, flags);
909 }
910 
911 static void no_space(struct dm_bio_prison_cell *cell)
912 {
913 	struct bio *bio;
914 	struct bio_list bios;
915 
916 	bio_list_init(&bios);
917 	dm_cell_release(cell, &bios);
918 
919 	while ((bio = bio_list_pop(&bios)))
920 		retry_on_resume(bio);
921 }
922 
923 static void process_discard(struct thin_c *tc, struct bio *bio)
924 {
925 	int r;
926 	unsigned long flags;
927 	struct pool *pool = tc->pool;
928 	struct dm_bio_prison_cell *cell, *cell2;
929 	struct dm_cell_key key, key2;
930 	dm_block_t block = get_bio_block(tc, bio);
931 	struct dm_thin_lookup_result lookup_result;
932 	struct dm_thin_new_mapping *m;
933 
934 	build_virtual_key(tc->td, block, &key);
935 	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
936 		return;
937 
938 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
939 	switch (r) {
940 	case 0:
941 		/*
942 		 * Check nobody is fiddling with this pool block.  This can
943 		 * happen if someone's in the process of breaking sharing
944 		 * on this block.
945 		 */
946 		build_data_key(tc->td, lookup_result.block, &key2);
947 		if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
948 			cell_defer_no_holder(tc, cell);
949 			break;
950 		}
951 
952 		if (io_overlaps_block(pool, bio)) {
953 			/*
954 			 * IO may still be going to the destination block.  We must
955 			 * quiesce before we can do the removal.
956 			 */
957 			m = get_next_mapping(pool);
958 			m->tc = tc;
959 			m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
960 			m->virt_block = block;
961 			m->data_block = lookup_result.block;
962 			m->cell = cell;
963 			m->cell2 = cell2;
964 			m->err = 0;
965 			m->bio = bio;
966 
967 			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
968 				spin_lock_irqsave(&pool->lock, flags);
969 				list_add(&m->list, &pool->prepared_discards);
970 				spin_unlock_irqrestore(&pool->lock, flags);
971 				wake_worker(pool);
972 			}
973 		} else {
974 			inc_all_io_entry(pool, bio);
975 			cell_defer_no_holder(tc, cell);
976 			cell_defer_no_holder(tc, cell2);
977 
978 			/*
979 			 * The DM core makes sure that the discard doesn't span
980 			 * a block boundary, so a discard of a partial block can
981 			 * simply be passed down (or completed) without splitting it.
982 			 */
983 			if ((!lookup_result.shared) && pool->pf.discard_passdown)
984 				remap_and_issue(tc, bio, lookup_result.block);
985 			else
986 				bio_endio(bio, 0);
987 		}
988 		break;
989 
990 	case -ENODATA:
991 		/*
992 		 * It isn't provisioned, just forget it.
993 		 */
994 		cell_defer_no_holder(tc, cell);
995 		bio_endio(bio, 0);
996 		break;
997 
998 	default:
999 		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1000 			    __func__, r);
1001 		cell_defer_no_holder(tc, cell);
1002 		bio_io_error(bio);
1003 		break;
1004 	}
1005 }
1006 
1007 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1008 			  struct dm_cell_key *key,
1009 			  struct dm_thin_lookup_result *lookup_result,
1010 			  struct dm_bio_prison_cell *cell)
1011 {
1012 	int r;
1013 	dm_block_t data_block;
1014 
1015 	r = alloc_data_block(tc, &data_block);
1016 	switch (r) {
1017 	case 0:
1018 		schedule_internal_copy(tc, block, lookup_result->block,
1019 				       data_block, cell, bio);
1020 		break;
1021 
1022 	case -ENOSPC:
1023 		no_space(cell);
1024 		break;
1025 
1026 	default:
1027 		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1028 			    __func__, r);
1029 		dm_cell_error(cell);
1030 		break;
1031 	}
1032 }
1033 
1034 static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1035 			       dm_block_t block,
1036 			       struct dm_thin_lookup_result *lookup_result)
1037 {
1038 	struct dm_bio_prison_cell *cell;
1039 	struct pool *pool = tc->pool;
1040 	struct dm_cell_key key;
1041 
1042 	/*
1043 	 * If cell is already occupied, then sharing is already in the process
1044 	 * of being broken so we have nothing further to do here.
1045 	 */
1046 	build_data_key(tc->td, lookup_result->block, &key);
1047 	if (dm_bio_detain(pool->prison, &key, bio, &cell))
1048 		return;
1049 
1050 	if (bio_data_dir(bio) == WRITE && bio->bi_size)
1051 		break_sharing(tc, bio, block, &key, lookup_result, cell);
1052 	else {
1053 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1054 
1055 		h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1056 		inc_all_io_entry(pool, bio);
1057 		cell_defer_no_holder(tc, cell);
1058 
1059 		remap_and_issue(tc, bio, lookup_result->block);
1060 	}
1061 }
1062 
1063 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1064 			    struct dm_bio_prison_cell *cell)
1065 {
1066 	int r;
1067 	dm_block_t data_block;
1068 
1069 	/*
1070 	 * Remap empty bios (flushes) immediately, without provisioning.
1071 	 */
1072 	if (!bio->bi_size) {
1073 		inc_all_io_entry(tc->pool, bio);
1074 		cell_defer_no_holder(tc, cell);
1075 
1076 		remap_and_issue(tc, bio, 0);
1077 		return;
1078 	}
1079 
1080 	/*
1081 	 * Fill read bios with zeroes and complete them immediately.
1082 	 */
1083 	if (bio_data_dir(bio) == READ) {
1084 		zero_fill_bio(bio);
1085 		cell_defer_no_holder(tc, cell);
1086 		bio_endio(bio, 0);
1087 		return;
1088 	}
1089 
1090 	r = alloc_data_block(tc, &data_block);
1091 	switch (r) {
1092 	case 0:
1093 		if (tc->origin_dev)
1094 			schedule_external_copy(tc, block, data_block, cell, bio);
1095 		else
1096 			schedule_zero(tc, block, data_block, cell, bio);
1097 		break;
1098 
1099 	case -ENOSPC:
1100 		no_space(cell);
1101 		break;
1102 
1103 	default:
1104 		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1105 			    __func__, r);
1106 		set_pool_mode(tc->pool, PM_READ_ONLY);
1107 		dm_cell_error(cell);
1108 		break;
1109 	}
1110 }
1111 
1112 static void process_bio(struct thin_c *tc, struct bio *bio)
1113 {
1114 	int r;
1115 	dm_block_t block = get_bio_block(tc, bio);
1116 	struct dm_bio_prison_cell *cell;
1117 	struct dm_cell_key key;
1118 	struct dm_thin_lookup_result lookup_result;
1119 
1120 	/*
1121 	 * If cell is already occupied, then the block is already
1122 	 * being provisioned so we have nothing further to do here.
1123 	 */
1124 	build_virtual_key(tc->td, block, &key);
1125 	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
1126 		return;
1127 
1128 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1129 	switch (r) {
1130 	case 0:
1131 		if (lookup_result.shared) {
1132 			process_shared_bio(tc, bio, block, &lookup_result);
1133 			cell_defer_no_holder(tc, cell);
1134 		} else {
1135 			inc_all_io_entry(tc->pool, bio);
1136 			cell_defer_no_holder(tc, cell);
1137 
1138 			remap_and_issue(tc, bio, lookup_result.block);
1139 		}
1140 		break;
1141 
1142 	case -ENODATA:
1143 		if (bio_data_dir(bio) == READ && tc->origin_dev) {
1144 			inc_all_io_entry(tc->pool, bio);
1145 			cell_defer_no_holder(tc, cell);
1146 
1147 			remap_to_origin_and_issue(tc, bio);
1148 		} else
1149 			provision_block(tc, bio, block, cell);
1150 		break;
1151 
1152 	default:
1153 		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1154 			    __func__, r);
1155 		cell_defer_no_holder(tc, cell);
1156 		bio_io_error(bio);
1157 		break;
1158 	}
1159 }
1160 
1161 static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1162 {
1163 	int r;
1164 	int rw = bio_data_dir(bio);
1165 	dm_block_t block = get_bio_block(tc, bio);
1166 	struct dm_thin_lookup_result lookup_result;
1167 
1168 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1169 	switch (r) {
1170 	case 0:
1171 		if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
1172 			bio_io_error(bio);
1173 		else {
1174 			inc_all_io_entry(tc->pool, bio);
1175 			remap_and_issue(tc, bio, lookup_result.block);
1176 		}
1177 		break;
1178 
1179 	case -ENODATA:
1180 		if (rw != READ) {
1181 			bio_io_error(bio);
1182 			break;
1183 		}
1184 
1185 		if (tc->origin_dev) {
1186 			inc_all_io_entry(tc->pool, bio);
1187 			remap_to_origin_and_issue(tc, bio);
1188 			break;
1189 		}
1190 
1191 		zero_fill_bio(bio);
1192 		bio_endio(bio, 0);
1193 		break;
1194 
1195 	default:
1196 		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1197 			    __func__, r);
1198 		bio_io_error(bio);
1199 		break;
1200 	}
1201 }
1202 
1203 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1204 {
1205 	bio_io_error(bio);
1206 }
1207 
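/*
 * Returns non-zero once more than COMMIT_PERIOD (one second's worth of
 * jiffies) has passed since the last commit; the first test below also
 * forces a commit if jiffies has wrapped around last_commit_jiffies.
 */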
1208 static int need_commit_due_to_time(struct pool *pool)
1209 {
1210 	return jiffies < pool->last_commit_jiffies ||
1211 	       jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1212 }
1213 
1214 static void process_deferred_bios(struct pool *pool)
1215 {
1216 	unsigned long flags;
1217 	struct bio *bio;
1218 	struct bio_list bios;
1219 
1220 	bio_list_init(&bios);
1221 
1222 	spin_lock_irqsave(&pool->lock, flags);
1223 	bio_list_merge(&bios, &pool->deferred_bios);
1224 	bio_list_init(&pool->deferred_bios);
1225 	spin_unlock_irqrestore(&pool->lock, flags);
1226 
1227 	while ((bio = bio_list_pop(&bios))) {
1228 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1229 		struct thin_c *tc = h->tc;
1230 
1231 		/*
1232 		 * If we've got no free new_mapping structs, and processing
1233 		 * this bio might require one, we pause until there are some
1234 		 * prepared mappings to process.
1235 		 */
1236 		if (ensure_next_mapping(pool)) {
1237 			spin_lock_irqsave(&pool->lock, flags);
1238 			bio_list_merge(&pool->deferred_bios, &bios);
1239 			spin_unlock_irqrestore(&pool->lock, flags);
1240 
1241 			break;
1242 		}
1243 
1244 		if (bio->bi_rw & REQ_DISCARD)
1245 			pool->process_discard(tc, bio);
1246 		else
1247 			pool->process_bio(tc, bio);
1248 	}
1249 
1250 	/*
1251 	 * If there are any deferred flush bios, we must commit
1252 	 * the metadata before issuing them.
1253 	 */
1254 	bio_list_init(&bios);
1255 	spin_lock_irqsave(&pool->lock, flags);
1256 	bio_list_merge(&bios, &pool->deferred_flush_bios);
1257 	bio_list_init(&pool->deferred_flush_bios);
1258 	spin_unlock_irqrestore(&pool->lock, flags);
1259 
1260 	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
1261 		return;
1262 
1263 	if (commit_or_fallback(pool)) {
1264 		while ((bio = bio_list_pop(&bios)))
1265 			bio_io_error(bio);
1266 		return;
1267 	}
1268 	pool->last_commit_jiffies = jiffies;
1269 
1270 	while ((bio = bio_list_pop(&bios)))
1271 		generic_make_request(bio);
1272 }
1273 
1274 static void do_worker(struct work_struct *ws)
1275 {
1276 	struct pool *pool = container_of(ws, struct pool, worker);
1277 
1278 	process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1279 	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1280 	process_deferred_bios(pool);
1281 }
1282 
1283 /*
1284  * We want to commit periodically so that not too much
1285  * unwritten data builds up.
1286  */
1287 static void do_waker(struct work_struct *ws)
1288 {
1289 	struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1290 	wake_worker(pool);
1291 	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1292 }
1293 
1294 /*----------------------------------------------------------------*/
1295 
1296 static enum pool_mode get_pool_mode(struct pool *pool)
1297 {
1298 	return pool->pf.mode;
1299 }
1300 
1301 static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1302 {
1303 	int r;
1304 
1305 	pool->pf.mode = mode;
1306 
1307 	switch (mode) {
1308 	case PM_FAIL:
1309 		DMERR("switching pool to failure mode");
1310 		pool->process_bio = process_bio_fail;
1311 		pool->process_discard = process_bio_fail;
1312 		pool->process_prepared_mapping = process_prepared_mapping_fail;
1313 		pool->process_prepared_discard = process_prepared_discard_fail;
1314 		break;
1315 
1316 	case PM_READ_ONLY:
1317 		DMERR("switching pool to read-only mode");
1318 		r = dm_pool_abort_metadata(pool->pmd);
1319 		if (r) {
1320 			DMERR("aborting transaction failed");
1321 			set_pool_mode(pool, PM_FAIL);
1322 		} else {
1323 			dm_pool_metadata_read_only(pool->pmd);
1324 			pool->process_bio = process_bio_read_only;
1325 			pool->process_discard = process_discard;
1326 			pool->process_prepared_mapping = process_prepared_mapping_fail;
1327 			pool->process_prepared_discard = process_prepared_discard_passdown;
1328 		}
1329 		break;
1330 
1331 	case PM_WRITE:
1332 		pool->process_bio = process_bio;
1333 		pool->process_discard = process_discard;
1334 		pool->process_prepared_mapping = process_prepared_mapping;
1335 		pool->process_prepared_discard = process_prepared_discard;
1336 		break;
1337 	}
1338 }
1339 
1340 /*----------------------------------------------------------------*/
1341 
1342 /*
1343  * Mapping functions.
1344  */
1345 
1346 /*
1347  * Called only while mapping a thin bio to hand it over to the workqueue.
1348  */
1349 static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1350 {
1351 	unsigned long flags;
1352 	struct pool *pool = tc->pool;
1353 
1354 	spin_lock_irqsave(&pool->lock, flags);
1355 	bio_list_add(&pool->deferred_bios, bio);
1356 	spin_unlock_irqrestore(&pool->lock, flags);
1357 
1358 	wake_worker(pool);
1359 }
1360 
1361 static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
1362 {
1363 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1364 
1365 	h->tc = tc;
1366 	h->shared_read_entry = NULL;
1367 	h->all_io_entry = NULL;
1368 	h->overwrite_mapping = NULL;
1369 }
1370 
1371 /*
1372  * Non-blocking function called from the thin target's map function.
1373  */
1374 static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1375 {
1376 	int r;
1377 	struct thin_c *tc = ti->private;
1378 	dm_block_t block = get_bio_block(tc, bio);
1379 	struct dm_thin_device *td = tc->td;
1380 	struct dm_thin_lookup_result result;
1381 	struct dm_bio_prison_cell *cell1, *cell2;
1382 	struct dm_cell_key key;
1383 
1384 	thin_hook_bio(tc, bio);
1385 
1386 	if (get_pool_mode(tc->pool) == PM_FAIL) {
1387 		bio_io_error(bio);
1388 		return DM_MAPIO_SUBMITTED;
1389 	}
1390 
1391 	if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1392 		thin_defer_bio(tc, bio);
1393 		return DM_MAPIO_SUBMITTED;
1394 	}
1395 
1396 	r = dm_thin_find_block(td, block, 0, &result);
1397 
1398 	/*
1399 	 * Note that we defer readahead too.
1400 	 */
1401 	switch (r) {
1402 	case 0:
1403 		if (unlikely(result.shared)) {
1404 			/*
1405 			 * We have a race condition here between the
1406 			 * result.shared value returned by the lookup and
1407 			 * snapshot creation, which may cause new
1408 			 * sharing.
1409 			 *
1410 			 * To avoid this, always quiesce the origin before
1411 			 * taking the snap.  You want to do this anyway to
1412 			 * ensure a consistent application view
1413 			 * (i.e. lockfs).
1414 			 *
1415 			 * More distant ancestors are irrelevant. The
1416 			 * shared flag will be set in their case.
1417 			 */
1418 			thin_defer_bio(tc, bio);
1419 			return DM_MAPIO_SUBMITTED;
1420 		}
1421 
1422 		build_virtual_key(tc->td, block, &key);
1423 		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1))
1424 			return DM_MAPIO_SUBMITTED;
1425 
1426 		build_data_key(tc->td, result.block, &key);
1427 		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2)) {
1428 			cell_defer_no_holder(tc, cell1);
1429 			return DM_MAPIO_SUBMITTED;
1430 		}
1431 
1432 		inc_all_io_entry(tc->pool, bio);
1433 		cell_defer_no_holder(tc, cell2);
1434 		cell_defer_no_holder(tc, cell1);
1435 
1436 		remap(tc, bio, result.block);
1437 		return DM_MAPIO_REMAPPED;
1438 
1439 	case -ENODATA:
1440 		if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1441 			/*
1442 			 * This block isn't provisioned and, in read-only mode,
1443 			 * there is no way to provision it.  Just error the bio.
1444 			 */
1445 			bio_io_error(bio);
1446 			return DM_MAPIO_SUBMITTED;
1447 		}
1448 		/* fall through */
1449 
1450 	case -EWOULDBLOCK:
1451 		/*
1452 		 * In future, the failed dm_thin_find_block above could
1453 		 * provide the hint to load the metadata into cache.
1454 		 */
1455 		thin_defer_bio(tc, bio);
1456 		return DM_MAPIO_SUBMITTED;
1457 
1458 	default:
1459 		/*
1460 		 * Must always call bio_io_error on failure.
1461 		 * dm_thin_find_block can fail with -EINVAL if the
1462 		 * pool is switched to fail-io mode.
1463 		 */
1464 		bio_io_error(bio);
1465 		return DM_MAPIO_SUBMITTED;
1466 	}
1467 }
1468 
1469 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1470 {
1471 	int r;
1472 	unsigned long flags;
1473 	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1474 
1475 	spin_lock_irqsave(&pt->pool->lock, flags);
1476 	r = !bio_list_empty(&pt->pool->retry_on_resume_list);
1477 	spin_unlock_irqrestore(&pt->pool->lock, flags);
1478 
1479 	if (!r) {
1480 		struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1481 		r = bdi_congested(&q->backing_dev_info, bdi_bits);
1482 	}
1483 
1484 	return r;
1485 }
1486 
1487 static void __requeue_bios(struct pool *pool)
1488 {
1489 	bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
1490 	bio_list_init(&pool->retry_on_resume_list);
1491 }
1492 
1493 /*----------------------------------------------------------------
1494  * Binding of control targets to a pool object
1495  *--------------------------------------------------------------*/
1496 static bool data_dev_supports_discard(struct pool_c *pt)
1497 {
1498 	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1499 
1500 	return q && blk_queue_discard(q);
1501 }
1502 
1503 /*
1504  * If discard_passdown was enabled, verify that the data device
1505  * supports discards.  Disable discard_passdown if not.
1506  */
1507 static void disable_passdown_if_not_supported(struct pool_c *pt)
1508 {
1509 	struct pool *pool = pt->pool;
1510 	struct block_device *data_bdev = pt->data_dev->bdev;
1511 	struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
1512 	sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
1513 	const char *reason = NULL;
1514 	char buf[BDEVNAME_SIZE];
1515 
1516 	if (!pt->adjusted_pf.discard_passdown)
1517 		return;
1518 
1519 	if (!data_dev_supports_discard(pt))
1520 		reason = "discard unsupported";
1521 
1522 	else if (data_limits->max_discard_sectors < pool->sectors_per_block)
1523 		reason = "max discard sectors smaller than a block";
1524 
1525 	else if (data_limits->discard_granularity > block_size)
1526 		reason = "discard granularity larger than a block";
1527 
1528 	else if (block_size & (data_limits->discard_granularity - 1))
1529 		reason = "discard granularity not a factor of block size";
1530 
1531 	if (reason) {
1532 		DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
1533 		pt->adjusted_pf.discard_passdown = false;
1534 	}
1535 }
1536 
1537 static int bind_control_target(struct pool *pool, struct dm_target *ti)
1538 {
1539 	struct pool_c *pt = ti->private;
1540 
1541 	/*
1542 	 * We want to make sure that degraded pools are never upgraded.
1543 	 */
1544 	enum pool_mode old_mode = pool->pf.mode;
1545 	enum pool_mode new_mode = pt->adjusted_pf.mode;
1546 
1547 	if (old_mode > new_mode)
1548 		new_mode = old_mode;
1549 
1550 	pool->ti = ti;
1551 	pool->low_water_blocks = pt->low_water_blocks;
1552 	pool->pf = pt->adjusted_pf;
1553 
1554 	set_pool_mode(pool, new_mode);
1555 
1556 	return 0;
1557 }
1558 
1559 static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1560 {
1561 	if (pool->ti == ti)
1562 		pool->ti = NULL;
1563 }
1564 
1565 /*----------------------------------------------------------------
1566  * Pool creation
1567  *--------------------------------------------------------------*/
1568 /* Initialize pool features. */
1569 static void pool_features_init(struct pool_features *pf)
1570 {
1571 	pf->mode = PM_WRITE;
1572 	pf->zero_new_blocks = true;
1573 	pf->discard_enabled = true;
1574 	pf->discard_passdown = true;
1575 }
1576 
1577 static void __pool_destroy(struct pool *pool)
1578 {
1579 	__pool_table_remove(pool);
1580 
1581 	if (dm_pool_metadata_close(pool->pmd) < 0)
1582 		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1583 
1584 	dm_bio_prison_destroy(pool->prison);
1585 	dm_kcopyd_client_destroy(pool->copier);
1586 
1587 	if (pool->wq)
1588 		destroy_workqueue(pool->wq);
1589 
1590 	if (pool->next_mapping)
1591 		mempool_free(pool->next_mapping, pool->mapping_pool);
1592 	mempool_destroy(pool->mapping_pool);
1593 	dm_deferred_set_destroy(pool->shared_read_ds);
1594 	dm_deferred_set_destroy(pool->all_io_ds);
1595 	kfree(pool);
1596 }
1597 
1598 static struct kmem_cache *_new_mapping_cache;
1599 
1600 static struct pool *pool_create(struct mapped_device *pool_md,
1601 				struct block_device *metadata_dev,
1602 				unsigned long block_size,
1603 				int read_only, char **error)
1604 {
1605 	int r;
1606 	void *err_p;
1607 	struct pool *pool;
1608 	struct dm_pool_metadata *pmd;
1609 	bool format_device = read_only ? false : true;
1610 
1611 	pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
1612 	if (IS_ERR(pmd)) {
1613 		*error = "Error creating metadata object";
1614 		return (struct pool *)pmd;
1615 	}
1616 
1617 	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1618 	if (!pool) {
1619 		*error = "Error allocating memory for pool";
1620 		err_p = ERR_PTR(-ENOMEM);
1621 		goto bad_pool;
1622 	}
1623 
1624 	pool->pmd = pmd;
1625 	pool->sectors_per_block = block_size;
1626 	if (block_size & (block_size - 1))
1627 		pool->sectors_per_block_shift = -1;
1628 	else
1629 		pool->sectors_per_block_shift = __ffs(block_size);
1630 	pool->low_water_blocks = 0;
1631 	pool_features_init(&pool->pf);
1632 	pool->prison = dm_bio_prison_create(PRISON_CELLS);
1633 	if (!pool->prison) {
1634 		*error = "Error creating pool's bio prison";
1635 		err_p = ERR_PTR(-ENOMEM);
1636 		goto bad_prison;
1637 	}
1638 
1639 	pool->copier = dm_kcopyd_client_create();
1640 	if (IS_ERR(pool->copier)) {
1641 		r = PTR_ERR(pool->copier);
1642 		*error = "Error creating pool's kcopyd client";
1643 		err_p = ERR_PTR(r);
1644 		goto bad_kcopyd_client;
1645 	}
1646 
1647 	/*
1648 	 * Create a single-threaded workqueue that will service all devices
1649 	 * that use this metadata.
1650 	 */
1651 	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1652 	if (!pool->wq) {
1653 		*error = "Error creating pool's workqueue";
1654 		err_p = ERR_PTR(-ENOMEM);
1655 		goto bad_wq;
1656 	}
1657 
1658 	INIT_WORK(&pool->worker, do_worker);
1659 	INIT_DELAYED_WORK(&pool->waker, do_waker);
1660 	spin_lock_init(&pool->lock);
1661 	bio_list_init(&pool->deferred_bios);
1662 	bio_list_init(&pool->deferred_flush_bios);
1663 	INIT_LIST_HEAD(&pool->prepared_mappings);
1664 	INIT_LIST_HEAD(&pool->prepared_discards);
1665 	pool->low_water_triggered = 0;
1666 	pool->no_free_space = 0;
1667 	bio_list_init(&pool->retry_on_resume_list);
1668 
1669 	pool->shared_read_ds = dm_deferred_set_create();
1670 	if (!pool->shared_read_ds) {
1671 		*error = "Error creating pool's shared read deferred set";
1672 		err_p = ERR_PTR(-ENOMEM);
1673 		goto bad_shared_read_ds;
1674 	}
1675 
1676 	pool->all_io_ds = dm_deferred_set_create();
1677 	if (!pool->all_io_ds) {
1678 		*error = "Error creating pool's all io deferred set";
1679 		err_p = ERR_PTR(-ENOMEM);
1680 		goto bad_all_io_ds;
1681 	}
1682 
1683 	pool->next_mapping = NULL;
1684 	pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
1685 						      _new_mapping_cache);
1686 	if (!pool->mapping_pool) {
1687 		*error = "Error creating pool's mapping mempool";
1688 		err_p = ERR_PTR(-ENOMEM);
1689 		goto bad_mapping_pool;
1690 	}
1691 
1692 	pool->ref_count = 1;
1693 	pool->last_commit_jiffies = jiffies;
1694 	pool->pool_md = pool_md;
1695 	pool->md_dev = metadata_dev;
1696 	__pool_table_insert(pool);
1697 
1698 	return pool;
1699 
1700 bad_mapping_pool:
1701 	dm_deferred_set_destroy(pool->all_io_ds);
1702 bad_all_io_ds:
1703 	dm_deferred_set_destroy(pool->shared_read_ds);
1704 bad_shared_read_ds:
1705 	destroy_workqueue(pool->wq);
1706 bad_wq:
1707 	dm_kcopyd_client_destroy(pool->copier);
1708 bad_kcopyd_client:
1709 	dm_bio_prison_destroy(pool->prison);
1710 bad_prison:
1711 	kfree(pool);
1712 bad_pool:
1713 	if (dm_pool_metadata_close(pmd))
1714 		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1715 
1716 	return err_p;
1717 }
1718 
1719 static void __pool_inc(struct pool *pool)
1720 {
1721 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1722 	pool->ref_count++;
1723 }
1724 
1725 static void __pool_dec(struct pool *pool)
1726 {
1727 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1728 	BUG_ON(!pool->ref_count);
1729 	if (!--pool->ref_count)
1730 		__pool_destroy(pool);
1731 }
1732 
1733 static struct pool *__pool_find(struct mapped_device *pool_md,
1734 				struct block_device *metadata_dev,
1735 				unsigned long block_size, int read_only,
1736 				char **error, int *created)
1737 {
1738 	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1739 
1740 	if (pool) {
1741 		if (pool->pool_md != pool_md) {
1742 			*error = "metadata device already in use by a pool";
1743 			return ERR_PTR(-EBUSY);
1744 		}
1745 		__pool_inc(pool);
1746 
1747 	} else {
1748 		pool = __pool_table_lookup(pool_md);
1749 		if (pool) {
1750 			if (pool->md_dev != metadata_dev) {
1751 				*error = "different pool cannot replace a pool";
1752 				return ERR_PTR(-EINVAL);
1753 			}
1754 			__pool_inc(pool);
1755 
1756 		} else {
1757 			pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
1758 			*created = 1;
1759 		}
1760 	}
1761 
1762 	return pool;
1763 }
1764 
1765 /*----------------------------------------------------------------
1766  * Pool target methods
1767  *--------------------------------------------------------------*/
1768 static void pool_dtr(struct dm_target *ti)
1769 {
1770 	struct pool_c *pt = ti->private;
1771 
1772 	mutex_lock(&dm_thin_pool_table.mutex);
1773 
1774 	unbind_control_target(pt->pool, ti);
1775 	__pool_dec(pt->pool);
1776 	dm_put_device(ti, pt->metadata_dev);
1777 	dm_put_device(ti, pt->data_dev);
1778 	kfree(pt);
1779 
1780 	mutex_unlock(&dm_thin_pool_table.mutex);
1781 }
1782 
1783 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1784 			       struct dm_target *ti)
1785 {
1786 	int r;
1787 	unsigned argc;
1788 	const char *arg_name;
1789 
1790 	static struct dm_arg _args[] = {
1791 		{0, 3, "Invalid number of pool feature arguments"},
1792 	};
1793 
1794 	/*
1795 	 * No feature arguments supplied.
1796 	 */
1797 	if (!as->argc)
1798 		return 0;
1799 
1800 	r = dm_read_arg_group(_args, as, &argc, &ti->error);
1801 	if (r)
1802 		return -EINVAL;
1803 
1804 	while (argc && !r) {
1805 		arg_name = dm_shift_arg(as);
1806 		argc--;
1807 
1808 		if (!strcasecmp(arg_name, "skip_block_zeroing"))
1809 			pf->zero_new_blocks = false;
1810 
1811 		else if (!strcasecmp(arg_name, "ignore_discard"))
1812 			pf->discard_enabled = false;
1813 
1814 		else if (!strcasecmp(arg_name, "no_discard_passdown"))
1815 			pf->discard_passdown = false;
1816 
1817 		else if (!strcasecmp(arg_name, "read_only"))
1818 			pf->mode = PM_READ_ONLY;
1819 
1820 		else {
1821 			ti->error = "Unrecognised pool feature requested";
1822 			r = -EINVAL;
1823 			break;
1824 		}
1825 	}
1826 
1827 	return r;
1828 }
1829 
1830 /*
1831  * thin-pool <metadata dev> <data dev>
1832  *	     <data block size (sectors)>
1833  *	     <low water mark (blocks)>
1834  *	     [<#feature args> [<arg>]*]
1835  *
1836  * Optional feature arguments are:
1837  *	     skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1838  *	     ignore_discard: disable discard
1839  *	     no_discard_passdown: don't pass discards down to the data device
1840  */
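/*
 * For instance (device names purely illustrative), a table line of
 *
 *     0 419430400 thin-pool /dev/mapper/meta /dev/mapper/data 128 32768
 *
 * creates a pool with 64KB (128-sector) data blocks that raises a dm
 * event once the number of free data blocks drops to the 32768-block
 * low-water mark.  In addition to the features above,
 * parse_pool_features() also accepts "read_only", which opens the pool
 * without allowing changes to its metadata.
 */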
1841 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1842 {
1843 	int r, pool_created = 0;
1844 	struct pool_c *pt;
1845 	struct pool *pool;
1846 	struct pool_features pf;
1847 	struct dm_arg_set as;
1848 	struct dm_dev *data_dev;
1849 	unsigned long block_size;
1850 	dm_block_t low_water_blocks;
1851 	struct dm_dev *metadata_dev;
1852 	sector_t metadata_dev_size;
1853 	char b[BDEVNAME_SIZE];
1854 
1855 	/*
1856 	 * FIXME Remove validation from scope of lock.
1857 	 */
1858 	mutex_lock(&dm_thin_pool_table.mutex);
1859 
1860 	if (argc < 4) {
1861 		ti->error = "Invalid argument count";
1862 		r = -EINVAL;
1863 		goto out_unlock;
1864 	}
1865 	as.argc = argc;
1866 	as.argv = argv;
1867 
1868 	r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
1869 	if (r) {
1870 		ti->error = "Error opening metadata block device";
1871 		goto out_unlock;
1872 	}
1873 
1874 	metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1875 	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
1876 		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1877 		       bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1878 
1879 	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1880 	if (r) {
1881 		ti->error = "Error getting data device";
1882 		goto out_metadata;
1883 	}
1884 
1885 	if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1886 	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1887 	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1888 	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1889 		ti->error = "Invalid block size";
1890 		r = -EINVAL;
1891 		goto out;
1892 	}
1893 
1894 	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
1895 		ti->error = "Invalid low water mark";
1896 		r = -EINVAL;
1897 		goto out;
1898 	}
1899 
1900 	/*
1901 	 * Set default pool features.
1902 	 */
1903 	pool_features_init(&pf);
1904 
1905 	dm_consume_args(&as, 4);
1906 	r = parse_pool_features(&as, &pf, ti);
1907 	if (r)
1908 		goto out;
1909 
1910 	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
1911 	if (!pt) {
1912 		r = -ENOMEM;
1913 		goto out;
1914 	}
1915 
1916 	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1917 			   block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
1918 	if (IS_ERR(pool)) {
1919 		r = PTR_ERR(pool);
1920 		goto out_free_pt;
1921 	}
1922 
1923 	/*
1924 	 * 'pool_created' reflects whether this is the first table load.
1925 	 * Top level discard support is not allowed to be changed after
1926 	 * initial load.  This would require a pool reload to trigger thin
1927 	 * device changes.
1928 	 */
1929 	if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
1930 		ti->error = "Discard support cannot be disabled once enabled";
1931 		r = -EINVAL;
1932 		goto out_flags_changed;
1933 	}
1934 
1935 	pt->pool = pool;
1936 	pt->ti = ti;
1937 	pt->metadata_dev = metadata_dev;
1938 	pt->data_dev = data_dev;
1939 	pt->low_water_blocks = low_water_blocks;
1940 	pt->adjusted_pf = pt->requested_pf = pf;
1941 	ti->num_flush_requests = 1;
1942 
1943 	/*
1944 	 * Only need to enable discards if the pool should pass
1945 	 * them down to the data device.  The thin device's discard
1946 	 * processing will cause mappings to be removed from the btree.
1947 	 */
1948 	if (pf.discard_enabled && pf.discard_passdown) {
1949 		ti->num_discard_requests = 1;
1950 
1951 		/*
1952 		 * Setting 'discards_supported' circumvents the normal
1953 		 * stacking of discard limits (this keeps the pool and
1954 		 * thin devices' discard limits consistent).
1955 		 */
1956 		ti->discards_supported = true;
1957 		ti->discard_zeroes_data_unsupported = true;
1958 	}
1959 	ti->private = pt;
1960 
1961 	pt->callbacks.congested_fn = pool_is_congested;
1962 	dm_table_add_target_callbacks(ti->table, &pt->callbacks);
1963 
1964 	mutex_unlock(&dm_thin_pool_table.mutex);
1965 
1966 	return 0;
1967 
1968 out_flags_changed:
1969 	__pool_dec(pool);
1970 out_free_pt:
1971 	kfree(pt);
1972 out:
1973 	dm_put_device(ti, data_dev);
1974 out_metadata:
1975 	dm_put_device(ti, metadata_dev);
1976 out_unlock:
1977 	mutex_unlock(&dm_thin_pool_table.mutex);
1978 
1979 	return r;
1980 }
1981 
1982 static int pool_map(struct dm_target *ti, struct bio *bio)
1983 {
1984 	int r;
1985 	struct pool_c *pt = ti->private;
1986 	struct pool *pool = pt->pool;
1987 	unsigned long flags;
1988 
1989 	/*
1990 	 * As this is a singleton target, ti->begin is always zero.
1991 	 */
1992 	spin_lock_irqsave(&pool->lock, flags);
1993 	bio->bi_bdev = pt->data_dev->bdev;
1994 	r = DM_MAPIO_REMAPPED;
1995 	spin_unlock_irqrestore(&pool->lock, flags);
1996 
1997 	return r;
1998 }
1999 
2000 /*
2001  * Retrieves the number of blocks of the data device from
2002  * the superblock and compares it to the actual device size,
2003  * resizing the data device if it has grown.
2004  *
2005  * This copes both with opening a preallocated data device in the ctr,
2006  * followed by a resume,
2007  * -and-
2008  * with calling the resume method on its own after userspace has
2009  * grown the data device in reaction to a table event.
2010  */
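/*
 * Illustrative userspace workflow only (not part of this driver): after
 * growing the underlying data device, something like
 *
 *   dmsetup suspend pool
 *   dmsetup reload pool --table "0 <new length> thin-pool ..."
 *   dmsetup resume pool
 *
 * leads to this preresume method noticing the larger device and resizing
 * the pool's data space accordingly.
 */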
2011 static int pool_preresume(struct dm_target *ti)
2012 {
2013 	int r;
2014 	struct pool_c *pt = ti->private;
2015 	struct pool *pool = pt->pool;
2016 	sector_t data_size = ti->len;
2017 	dm_block_t sb_data_size;
2018 
2019 	/*
2020 	 * Take control of the pool object.
2021 	 */
2022 	r = bind_control_target(pool, ti);
2023 	if (r)
2024 		return r;
2025 
2026 	(void) sector_div(data_size, pool->sectors_per_block);
2027 
2028 	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2029 	if (r) {
2030 		DMERR("failed to retrieve data device size");
2031 		return r;
2032 	}
2033 
2034 	if (data_size < sb_data_size) {
2035 		DMERR("pool target too small, is %llu blocks (expected %llu)",
2036 		      (unsigned long long)data_size, sb_data_size);
2037 		return -EINVAL;
2038 
2039 	} else if (data_size > sb_data_size) {
2040 		r = dm_pool_resize_data_dev(pool->pmd, data_size);
2041 		if (r) {
2042 			DMERR("failed to resize data device");
2043 			/* FIXME Stricter than necessary: Rollback transaction instead here */
2044 			set_pool_mode(pool, PM_READ_ONLY);
2045 			return r;
2046 		}
2047 
2048 		(void) commit_or_fallback(pool);
2049 	}
2050 
2051 	return 0;
2052 }
2053 
2054 static void pool_resume(struct dm_target *ti)
2055 {
2056 	struct pool_c *pt = ti->private;
2057 	struct pool *pool = pt->pool;
2058 	unsigned long flags;
2059 
2060 	spin_lock_irqsave(&pool->lock, flags);
2061 	pool->low_water_triggered = 0;
2062 	pool->no_free_space = 0;
2063 	__requeue_bios(pool);
2064 	spin_unlock_irqrestore(&pool->lock, flags);
2065 
2066 	do_waker(&pool->waker.work);
2067 }
2068 
2069 static void pool_postsuspend(struct dm_target *ti)
2070 {
2071 	struct pool_c *pt = ti->private;
2072 	struct pool *pool = pt->pool;
2073 
2074 	cancel_delayed_work(&pool->waker);
2075 	flush_workqueue(pool->wq);
2076 	(void) commit_or_fallback(pool);
2077 }
2078 
2079 static int check_arg_count(unsigned argc, unsigned args_required)
2080 {
2081 	if (argc != args_required) {
2082 		DMWARN("Message received with %u arguments instead of %u.",
2083 		       argc, args_required);
2084 		return -EINVAL;
2085 	}
2086 
2087 	return 0;
2088 }
2089 
2090 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2091 {
2092 	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2093 	    *dev_id <= MAX_DEV_ID)
2094 		return 0;
2095 
2096 	if (warning)
2097 		DMWARN("Message received with invalid device id: %s", arg);
2098 
2099 	return -EINVAL;
2100 }
2101 
2102 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2103 {
2104 	dm_thin_id dev_id;
2105 	int r;
2106 
2107 	r = check_arg_count(argc, 2);
2108 	if (r)
2109 		return r;
2110 
2111 	r = read_dev_id(argv[1], &dev_id, 1);
2112 	if (r)
2113 		return r;
2114 
2115 	r = dm_pool_create_thin(pool->pmd, dev_id);
2116 	if (r) {
2117 		DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2118 		       argv[1]);
2119 		return r;
2120 	}
2121 
2122 	return 0;
2123 }
2124 
2125 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2126 {
2127 	dm_thin_id dev_id;
2128 	dm_thin_id origin_dev_id;
2129 	int r;
2130 
2131 	r = check_arg_count(argc, 3);
2132 	if (r)
2133 		return r;
2134 
2135 	r = read_dev_id(argv[1], &dev_id, 1);
2136 	if (r)
2137 		return r;
2138 
2139 	r = read_dev_id(argv[2], &origin_dev_id, 1);
2140 	if (r)
2141 		return r;
2142 
2143 	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2144 	if (r) {
2145 		DMWARN("Creation of new snapshot %s of device %s failed.",
2146 		       argv[1], argv[2]);
2147 		return r;
2148 	}
2149 
2150 	return 0;
2151 }
2152 
2153 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2154 {
2155 	dm_thin_id dev_id;
2156 	int r;
2157 
2158 	r = check_arg_count(argc, 2);
2159 	if (r)
2160 		return r;
2161 
2162 	r = read_dev_id(argv[1], &dev_id, 1);
2163 	if (r)
2164 		return r;
2165 
2166 	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2167 	if (r)
2168 		DMWARN("Deletion of thin device %s failed.", argv[1]);
2169 
2170 	return r;
2171 }
2172 
2173 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2174 {
2175 	dm_thin_id old_id, new_id;
2176 	int r;
2177 
2178 	r = check_arg_count(argc, 3);
2179 	if (r)
2180 		return r;
2181 
2182 	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2183 		DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2184 		return -EINVAL;
2185 	}
2186 
2187 	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2188 		DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2189 		return -EINVAL;
2190 	}
2191 
2192 	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2193 	if (r) {
2194 		DMWARN("Failed to change transaction id from %s to %s.",
2195 		       argv[1], argv[2]);
2196 		return r;
2197 	}
2198 
2199 	return 0;
2200 }
2201 
2202 static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2203 {
2204 	int r;
2205 
2206 	r = check_arg_count(argc, 1);
2207 	if (r)
2208 		return r;
2209 
2210 	(void) commit_or_fallback(pool);
2211 
2212 	r = dm_pool_reserve_metadata_snap(pool->pmd);
2213 	if (r)
2214 		DMWARN("reserve_metadata_snap message failed.");
2215 
2216 	return r;
2217 }
2218 
2219 static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2220 {
2221 	int r;
2222 
2223 	r = check_arg_count(argc, 1);
2224 	if (r)
2225 		return r;
2226 
2227 	r = dm_pool_release_metadata_snap(pool->pmd);
2228 	if (r)
2229 		DMWARN("release_metadata_snap message failed.");
2230 
2231 	return r;
2232 }
2233 
2234 /*
2235  * Messages supported:
2236  *   create_thin	<dev_id>
2237  *   create_snap	<dev_id> <origin_id>
2238  *   delete		<dev_id>
2240  *   set_transaction_id <current_trans_id> <new_trans_id>
2241  *   reserve_metadata_snap
2242  *   release_metadata_snap
2243  */
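/*
 * For illustration only (the pool name is hypothetical), these messages are
 * normally sent from userspace with dmsetup, e.g.:
 *
 *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *   dmsetup message /dev/mapper/pool 0 "delete 1"
 */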
2244 static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2245 {
2246 	int r = -EINVAL;
2247 	struct pool_c *pt = ti->private;
2248 	struct pool *pool = pt->pool;
2249 
2250 	if (!strcasecmp(argv[0], "create_thin"))
2251 		r = process_create_thin_mesg(argc, argv, pool);
2252 
2253 	else if (!strcasecmp(argv[0], "create_snap"))
2254 		r = process_create_snap_mesg(argc, argv, pool);
2255 
2256 	else if (!strcasecmp(argv[0], "delete"))
2257 		r = process_delete_mesg(argc, argv, pool);
2258 
2259 	else if (!strcasecmp(argv[0], "set_transaction_id"))
2260 		r = process_set_transaction_id_mesg(argc, argv, pool);
2261 
2262 	else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
2263 		r = process_reserve_metadata_snap_mesg(argc, argv, pool);
2264 
2265 	else if (!strcasecmp(argv[0], "release_metadata_snap"))
2266 		r = process_release_metadata_snap_mesg(argc, argv, pool);
2267 
2268 	else
2269 		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2270 
2271 	if (!r)
2272 		(void) commit_or_fallback(pool);
2273 
2274 	return r;
2275 }
2276 
2277 static void emit_flags(struct pool_features *pf, char *result,
2278 		       unsigned sz, unsigned maxlen)
2279 {
2280 	unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2281 		!pf->discard_passdown + (pf->mode == PM_READ_ONLY);
2282 	DMEMIT("%u ", count);
2283 
2284 	if (!pf->zero_new_blocks)
2285 		DMEMIT("skip_block_zeroing ");
2286 
2287 	if (!pf->discard_enabled)
2288 		DMEMIT("ignore_discard ");
2289 
2290 	if (!pf->discard_passdown)
2291 		DMEMIT("no_discard_passdown ");
2292 
2293 	if (pf->mode == PM_READ_ONLY)
2294 		DMEMIT("read_only ");
2295 }
2296 
2297 /*
2298  * Status line is:
2299  *    <transaction id> <used metadata blocks>/<total metadata blocks>
2300  *    <used data blocks>/<total data blocks> <held metadata root>
2301  */
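/*
 * A made-up example of the resulting STATUSTYPE_INFO output, just to show
 * the field layout produced below:
 *
 *   1 127/4096 5120/819200 - rw discard_passdown
 *
 * i.e. transaction id 1, 127 of 4096 metadata blocks used, 5120 of 819200
 * data blocks used, no held metadata root, read-write mode, discards passed
 * down to the data device.
 */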
2302 static int pool_status(struct dm_target *ti, status_type_t type,
2303 		       unsigned status_flags, char *result, unsigned maxlen)
2304 {
2305 	int r;
2306 	unsigned sz = 0;
2307 	uint64_t transaction_id;
2308 	dm_block_t nr_free_blocks_data;
2309 	dm_block_t nr_free_blocks_metadata;
2310 	dm_block_t nr_blocks_data;
2311 	dm_block_t nr_blocks_metadata;
2312 	dm_block_t held_root;
2313 	char buf[BDEVNAME_SIZE];
2314 	char buf2[BDEVNAME_SIZE];
2315 	struct pool_c *pt = ti->private;
2316 	struct pool *pool = pt->pool;
2317 
2318 	switch (type) {
2319 	case STATUSTYPE_INFO:
2320 		if (get_pool_mode(pool) == PM_FAIL) {
2321 			DMEMIT("Fail");
2322 			break;
2323 		}
2324 
2325 		/* Commit to ensure statistics aren't out-of-date */
2326 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
2327 			(void) commit_or_fallback(pool);
2328 
2329 		r = dm_pool_get_metadata_transaction_id(pool->pmd,
2330 							&transaction_id);
2331 		if (r)
2332 			return r;
2333 
2334 		r = dm_pool_get_free_metadata_block_count(pool->pmd,
2335 							  &nr_free_blocks_metadata);
2336 		if (r)
2337 			return r;
2338 
2339 		r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2340 		if (r)
2341 			return r;
2342 
2343 		r = dm_pool_get_free_block_count(pool->pmd,
2344 						 &nr_free_blocks_data);
2345 		if (r)
2346 			return r;
2347 
2348 		r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2349 		if (r)
2350 			return r;
2351 
2352 		r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
2353 		if (r)
2354 			return r;
2355 
2356 		DMEMIT("%llu %llu/%llu %llu/%llu ",
2357 		       (unsigned long long)transaction_id,
2358 		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2359 		       (unsigned long long)nr_blocks_metadata,
2360 		       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2361 		       (unsigned long long)nr_blocks_data);
2362 
2363 		if (held_root)
2364 			DMEMIT("%llu ", held_root);
2365 		else
2366 			DMEMIT("- ");
2367 
2368 		if (pool->pf.mode == PM_READ_ONLY)
2369 			DMEMIT("ro ");
2370 		else
2371 			DMEMIT("rw ");
2372 
2373 		if (!pool->pf.discard_enabled)
2374 			DMEMIT("ignore_discard");
2375 		else if (pool->pf.discard_passdown)
2376 			DMEMIT("discard_passdown");
2377 		else
2378 			DMEMIT("no_discard_passdown");
2379 
2380 		break;
2381 
2382 	case STATUSTYPE_TABLE:
2383 		DMEMIT("%s %s %lu %llu ",
2384 		       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
2385 		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2386 		       (unsigned long)pool->sectors_per_block,
2387 		       (unsigned long long)pt->low_water_blocks);
2388 		emit_flags(&pt->requested_pf, result, sz, maxlen);
2389 		break;
2390 	}
2391 
2392 	return 0;
2393 }
2394 
2395 static int pool_iterate_devices(struct dm_target *ti,
2396 				iterate_devices_callout_fn fn, void *data)
2397 {
2398 	struct pool_c *pt = ti->private;
2399 
2400 	return fn(ti, pt->data_dev, 0, ti->len, data);
2401 }
2402 
2403 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2404 		      struct bio_vec *biovec, int max_size)
2405 {
2406 	struct pool_c *pt = ti->private;
2407 	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2408 
2409 	if (!q->merge_bvec_fn)
2410 		return max_size;
2411 
2412 	bvm->bi_bdev = pt->data_dev->bdev;
2413 
2414 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2415 }
2416 
2417 static bool block_size_is_power_of_two(struct pool *pool)
2418 {
2419 	return pool->sectors_per_block_shift >= 0;
2420 }
2421 
2422 static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
2423 {
2424 	struct pool *pool = pt->pool;
2425 	struct queue_limits *data_limits;
2426 
2427 	limits->max_discard_sectors = pool->sectors_per_block;
2428 
2429 	/*
2430 	 * discard_granularity is just a hint, and not enforced.
2431 	 */
2432 	if (pt->adjusted_pf.discard_passdown) {
2433 		data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
2434 		limits->discard_granularity = data_limits->discard_granularity;
2435 	} else if (block_size_is_power_of_two(pool))
2436 		limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2437 	else
2438 		/*
2439 		 * Use largest power of 2 that is a factor of sectors_per_block
2440 		 * but at least DATA_DEV_BLOCK_SIZE_MIN_SECTORS.
2441 		 */
2442 		limits->discard_granularity = max(1 << (ffs(pool->sectors_per_block) - 1),
2443 						  DATA_DEV_BLOCK_SIZE_MIN_SECTORS) << SECTOR_SHIFT;
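	/*
	 * Worked example (illustrative numbers): for sectors_per_block = 384
	 * (a 192KB block), ffs(384) = 8, so 1 << 7 = 128 is the largest power
	 * of 2 dividing 384; max(128, 128) << SECTOR_SHIFT then yields a
	 * discard_granularity of 64KB.
	 */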
2444 }
2445 
2446 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2447 {
2448 	struct pool_c *pt = ti->private;
2449 	struct pool *pool = pt->pool;
2450 
2451 	blk_limits_io_min(limits, 0);
2452 	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2453 
2454 	/*
2455 	 * pt->adjusted_pf is a staging area for the actual features to use.
2456 	 * They get transferred to the live pool in bind_control_target()
2457 	 * called from pool_preresume().
2458 	 */
2459 	if (!pt->adjusted_pf.discard_enabled)
2460 		return;
2461 
2462 	disable_passdown_if_not_supported(pt);
2463 
2464 	set_discard_limits(pt, limits);
2465 }
2466 
2467 static struct target_type pool_target = {
2468 	.name = "thin-pool",
2469 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2470 		    DM_TARGET_IMMUTABLE,
2471 	.version = {1, 6, 0},
2472 	.module = THIS_MODULE,
2473 	.ctr = pool_ctr,
2474 	.dtr = pool_dtr,
2475 	.map = pool_map,
2476 	.postsuspend = pool_postsuspend,
2477 	.preresume = pool_preresume,
2478 	.resume = pool_resume,
2479 	.message = pool_message,
2480 	.status = pool_status,
2481 	.merge = pool_merge,
2482 	.iterate_devices = pool_iterate_devices,
2483 	.io_hints = pool_io_hints,
2484 };
2485 
2486 /*----------------------------------------------------------------
2487  * Thin target methods
2488  *--------------------------------------------------------------*/
2489 static void thin_dtr(struct dm_target *ti)
2490 {
2491 	struct thin_c *tc = ti->private;
2492 
2493 	mutex_lock(&dm_thin_pool_table.mutex);
2494 
2495 	__pool_dec(tc->pool);
2496 	dm_pool_close_thin_device(tc->td);
2497 	dm_put_device(ti, tc->pool_dev);
2498 	if (tc->origin_dev)
2499 		dm_put_device(ti, tc->origin_dev);
2500 	kfree(tc);
2501 
2502 	mutex_unlock(&dm_thin_pool_table.mutex);
2503 }
2504 
2505 /*
2506  * Thin target parameters:
2507  *
2508  * <pool_dev> <dev_id> [origin_dev]
2509  *
2510  * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
2511  * dev_id: the internal device identifier
2512  * origin_dev: a device external to the pool that should act as the origin
2513  *
2514  * If the pool device has discards disabled, they get disabled for the thin
2515  * device as well.
2516  */
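/*
 * Illustration only (device names and sizes are invented): a 2097152-sector
 * thin volume using internal device id 1 of an existing pool would be
 * created with a table line like
 *
 *   0 2097152 thin /dev/mapper/pool 1
 *
 * optionally followed by an external origin device:
 *
 *   0 2097152 thin /dev/mapper/pool 1 /dev/sdd
 */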
2517 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2518 {
2519 	int r;
2520 	struct thin_c *tc;
2521 	struct dm_dev *pool_dev, *origin_dev;
2522 	struct mapped_device *pool_md;
2523 
2524 	mutex_lock(&dm_thin_pool_table.mutex);
2525 
2526 	if (argc != 2 && argc != 3) {
2527 		ti->error = "Invalid argument count";
2528 		r = -EINVAL;
2529 		goto out_unlock;
2530 	}
2531 
2532 	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2533 	if (!tc) {
2534 		ti->error = "Out of memory";
2535 		r = -ENOMEM;
2536 		goto out_unlock;
2537 	}
2538 
2539 	if (argc == 3) {
2540 		r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
2541 		if (r) {
2542 			ti->error = "Error opening origin device";
2543 			goto bad_origin_dev;
2544 		}
2545 		tc->origin_dev = origin_dev;
2546 	}
2547 
2548 	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2549 	if (r) {
2550 		ti->error = "Error opening pool device";
2551 		goto bad_pool_dev;
2552 	}
2553 	tc->pool_dev = pool_dev;
2554 
2555 	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
2556 		ti->error = "Invalid device id";
2557 		r = -EINVAL;
2558 		goto bad_common;
2559 	}
2560 
2561 	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2562 	if (!pool_md) {
2563 		ti->error = "Couldn't get pool mapped device";
2564 		r = -EINVAL;
2565 		goto bad_common;
2566 	}
2567 
2568 	tc->pool = __pool_table_lookup(pool_md);
2569 	if (!tc->pool) {
2570 		ti->error = "Couldn't find pool object";
2571 		r = -EINVAL;
2572 		goto bad_pool_lookup;
2573 	}
2574 	__pool_inc(tc->pool);
2575 
2576 	if (get_pool_mode(tc->pool) == PM_FAIL) {
2577 		ti->error = "Couldn't open thin device, Pool is in fail mode";
		r = -EINVAL;
2578 		goto bad_thin_open;
2579 	}
2580 
2581 	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2582 	if (r) {
2583 		ti->error = "Couldn't open thin internal device";
2584 		goto bad_thin_open;
2585 	}
2586 
2587 	r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
2588 	if (r)
2589 		goto bad_thin_open;
2590 
2591 	ti->num_flush_requests = 1;
2592 	ti->flush_supported = true;
2593 	ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
2594 
2595 	/* If the pool supports discards, pass them on. */
2596 	if (tc->pool->pf.discard_enabled) {
2597 		ti->discards_supported = true;
2598 		ti->num_discard_requests = 1;
2599 		ti->discard_zeroes_data_unsupported = true;
2600 		/* Discard requests must be split on a block boundary */
2601 		ti->split_discard_requests = true;
2602 	}
2603 
2604 	dm_put(pool_md);
2605 
2606 	mutex_unlock(&dm_thin_pool_table.mutex);
2607 
2608 	return 0;
2609 
2610 bad_thin_open:
2611 	__pool_dec(tc->pool);
2612 bad_pool_lookup:
2613 	dm_put(pool_md);
2614 bad_common:
2615 	dm_put_device(ti, tc->pool_dev);
2616 bad_pool_dev:
2617 	if (tc->origin_dev)
2618 		dm_put_device(ti, tc->origin_dev);
2619 bad_origin_dev:
2620 	kfree(tc);
2621 out_unlock:
2622 	mutex_unlock(&dm_thin_pool_table.mutex);
2623 
2624 	return r;
2625 }
2626 
2627 static int thin_map(struct dm_target *ti, struct bio *bio)
2628 {
2629 	bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
2630 
2631 	return thin_bio_map(ti, bio);
2632 }
2633 
2634 static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
2635 {
2636 	unsigned long flags;
2637 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
2638 	struct list_head work;
2639 	struct dm_thin_new_mapping *m, *tmp;
2640 	struct pool *pool = h->tc->pool;
2641 
2642 	if (h->shared_read_entry) {
2643 		INIT_LIST_HEAD(&work);
2644 		dm_deferred_entry_dec(h->shared_read_entry, &work);
2645 
2646 		spin_lock_irqsave(&pool->lock, flags);
2647 		list_for_each_entry_safe(m, tmp, &work, list) {
2648 			list_del(&m->list);
2649 			m->quiesced = 1;
2650 			__maybe_add_mapping(m);
2651 		}
2652 		spin_unlock_irqrestore(&pool->lock, flags);
2653 	}
2654 
2655 	if (h->all_io_entry) {
2656 		INIT_LIST_HEAD(&work);
2657 		dm_deferred_entry_dec(h->all_io_entry, &work);
2658 		if (!list_empty(&work)) {
2659 			spin_lock_irqsave(&pool->lock, flags);
2660 			list_for_each_entry_safe(m, tmp, &work, list)
2661 				list_add(&m->list, &pool->prepared_discards);
2662 			spin_unlock_irqrestore(&pool->lock, flags);
2663 			wake_worker(pool);
2664 		}
2665 	}
2666 
2667 	return 0;
2668 }
2669 
2670 static void thin_postsuspend(struct dm_target *ti)
2671 {
2672 	if (dm_noflush_suspending(ti))
2673 		requeue_io((struct thin_c *)ti->private);
2674 }
2675 
2676 /*
2677  * <nr mapped sectors> <highest mapped sector>
2678  */
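/*
 * For example (invented numbers): with 128-sector blocks, 8192 mapped blocks
 * and a highest mapped block of 16383, the INFO output below would read
 *
 *   1048576 2097151
 */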
2679 static int thin_status(struct dm_target *ti, status_type_t type,
2680 		       unsigned status_flags, char *result, unsigned maxlen)
2681 {
2682 	int r;
2683 	ssize_t sz = 0;
2684 	dm_block_t mapped, highest;
2685 	char buf[BDEVNAME_SIZE];
2686 	struct thin_c *tc = ti->private;
2687 
2688 	if (get_pool_mode(tc->pool) == PM_FAIL) {
2689 		DMEMIT("Fail");
2690 		return 0;
2691 	}
2692 
2693 	if (!tc->td)
2694 		DMEMIT("-");
2695 	else {
2696 		switch (type) {
2697 		case STATUSTYPE_INFO:
2698 			r = dm_thin_get_mapped_count(tc->td, &mapped);
2699 			if (r)
2700 				return r;
2701 
2702 			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
2703 			if (r < 0)
2704 				return r;
2705 
2706 			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
2707 			if (r)
2708 				DMEMIT("%llu", ((highest + 1) *
2709 						tc->pool->sectors_per_block) - 1);
2710 			else
2711 				DMEMIT("-");
2712 			break;
2713 
2714 		case STATUSTYPE_TABLE:
2715 			DMEMIT("%s %lu",
2716 			       format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2717 			       (unsigned long) tc->dev_id);
2718 			if (tc->origin_dev)
2719 				DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
2720 			break;
2721 		}
2722 	}
2723 
2724 	return 0;
2725 }
2726 
2727 static int thin_iterate_devices(struct dm_target *ti,
2728 				iterate_devices_callout_fn fn, void *data)
2729 {
2730 	sector_t blocks;
2731 	struct thin_c *tc = ti->private;
2732 	struct pool *pool = tc->pool;
2733 
2734 	/*
2735 	 * We can't call dm_pool_get_data_dev_size() since that blocks.  So
2736 	 * we follow a more convoluted path through to the pool's target.
2737 	 */
2738 	if (!pool->ti)
2739 		return 0;	/* nothing is bound */
2740 
2741 	blocks = pool->ti->len;
2742 	(void) sector_div(blocks, pool->sectors_per_block);
2743 	if (blocks)
2744 		return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
2745 
2746 	return 0;
2747 }
2748 
2749 static struct target_type thin_target = {
2750 	.name = "thin",
2751 	.version = {1, 7, 0},
2752 	.module	= THIS_MODULE,
2753 	.ctr = thin_ctr,
2754 	.dtr = thin_dtr,
2755 	.map = thin_map,
2756 	.end_io = thin_endio,
2757 	.postsuspend = thin_postsuspend,
2758 	.status = thin_status,
2759 	.iterate_devices = thin_iterate_devices,
2760 };
2761 
2762 /*----------------------------------------------------------------*/
2763 
2764 static int __init dm_thin_init(void)
2765 {
2766 	int r;
2767 
2768 	pool_table_init();
2769 
2770 	r = dm_register_target(&thin_target);
2771 	if (r)
2772 		return r;
2773 
2774 	r = dm_register_target(&pool_target);
2775 	if (r)
2776 		goto bad_pool_target;
2777 
2778 	r = -ENOMEM;
2779 
2780 	_new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
2781 	if (!_new_mapping_cache)
2782 		goto bad_new_mapping_cache;
2783 
2784 	return 0;
2785 
2786 bad_new_mapping_cache:
2787 	dm_unregister_target(&pool_target);
2788 bad_pool_target:
2789 	dm_unregister_target(&thin_target);
2790 
2791 	return r;
2792 }
2793 
2794 static void dm_thin_exit(void)
2795 {
2796 	dm_unregister_target(&thin_target);
2797 	dm_unregister_target(&pool_target);
2798 
2799 	kmem_cache_destroy(_new_mapping_cache);
2800 }
2801 
2802 module_init(dm_thin_init);
2803 module_exit(dm_thin_exit);
2804 
2805 MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
2806 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2807 MODULE_LICENSE("GPL");
2808