xref: /openbmc/linux/drivers/md/dm-thin.c (revision a2fb4d78)
1 /*
2  * Copyright (C) 2011-2012 Red Hat UK.
3  *
4  * This file is released under the GPL.
5  */
6 
7 #include "dm-thin-metadata.h"
8 #include "dm-bio-prison.h"
9 #include "dm.h"
10 
11 #include <linux/device-mapper.h>
12 #include <linux/dm-io.h>
13 #include <linux/dm-kcopyd.h>
14 #include <linux/list.h>
15 #include <linux/init.h>
16 #include <linux/module.h>
17 #include <linux/slab.h>
18 
19 #define	DM_MSG_PREFIX	"thin"
20 
21 /*
22  * Tunable constants
23  */
24 #define ENDIO_HOOK_POOL_SIZE 1024
25 #define MAPPING_POOL_SIZE 1024
26 #define PRISON_CELLS 1024
27 #define COMMIT_PERIOD HZ
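/*
 * COMMIT_PERIOD: the waker kicks the worker every HZ jiffies (i.e. once a
 * second), so outstanding metadata changes get committed at least that
 * often; see do_waker() and need_commit_due_to_time().
 */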
28 
29 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
30 		"A percentage of time allocated for copy on write");
31 
32 /*
33  * The block size of the device holding pool data must be
34  * between 64KB and 1GB.
35  */
36 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
37 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
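/* i.e. 128 sectors and 2097152 sectors respectively (SECTOR_SHIFT is 9). */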
38 
39 /*
40  * Device id is restricted to 24 bits.
41  */
42 #define MAX_DEV_ID ((1 << 24) - 1)
43 
44 /*
45  * How do we handle breaking sharing of data blocks?
46  * =================================================
47  *
48  * We use a standard copy-on-write btree to store the mappings for the
49  * devices (note I'm talking about copy-on-write of the metadata here, not
50  * the data).  When you take an internal snapshot you clone the root node
51  * of the origin btree.  After this there is no concept of an origin or a
52  * snapshot.  They are just two device trees that happen to point to the
53  * same data blocks.
54  *
55  * When we get a write in we decide if it's to a shared data block using
56  * some timestamp magic.  If it is, we have to break sharing.
57  *
58  * Let's say we write to a shared block in what was the origin.  The
59  * steps are:
60  *
61  * i) plug further io to this physical block. (see bio_prison code).
62  *
63  * ii) quiesce any read io to that shared data block, obviously
64  * including all devices that share this block.  (see dm_deferred_set code)
65  *
66  * iii) copy the data block to a newly allocated block.  This step can be
67  * skipped if the io covers the whole block. (schedule_copy).
68  *
69  * iv) insert the new mapping into the origin's btree
70  * (process_prepared_mapping).  This act of inserting breaks some
71  * sharing of btree nodes between the two devices.  Breaking sharing only
72  * affects the btree of that specific device.  Btrees for the other
73  * devices that share the block never change.  The btree for the origin
74  * device as it was after the last commit is untouched, ie. we're using
75  * persistent data structures in the functional programming sense.
76  *
77  * v) unplug io to this physical block, including the io that triggered
78  * the breaking of sharing.
79  *
80  * Steps (ii) and (iii) occur in parallel.
81  *
82  * The metadata _doesn't_ need to be committed before the io continues.  We
83  * get away with this because the io is always written to a _new_ block.
84  * If there's a crash, then:
85  *
86  * - The origin mapping will point to the old origin block (the shared
87  * one).  This will contain the data as it was before the io that triggered
88  * the breaking of sharing came in.
89  *
90  * - The snap mapping still points to the old block, just as it would
91  * after the commit.
92  *
93  * The downside of this scheme is that the timestamp magic isn't perfect,
94  * and will continue to think the data block in the snapshot device is shared
95  * even after the write to the origin has broken sharing.  I suspect data
96  * blocks will typically be shared by many different devices, so we're
97  * breaking sharing n + 1 times, rather than n, where n is the number of
98  * devices that reference this data block.  At the moment I think the
99  * benefits far, far outweigh the disadvantages.
100  */
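/*
 * Worked example (hypothetical block numbers): thin device A and its
 * snapshot B both map some virtual block to data block 17.  A write to that
 * virtual block on A allocates a new data block, say 99, copies 17 -> 99
 * (skipped if the write covers the whole block), inserts the new mapping
 * into A's btree and then releases the held io.  B's btree still maps the
 * virtual block to 17.
 */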
101 
102 /*----------------------------------------------------------------*/
103 
104 /*
105  * Key building: data keys index (dev, data block); virtual keys index (dev, virtual block).
106  */
107 static void build_data_key(struct dm_thin_device *td,
108 			   dm_block_t b, struct dm_cell_key *key)
109 {
110 	key->virtual = 0;
111 	key->dev = dm_thin_dev_id(td);
112 	key->block = b;
113 }
114 
115 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
116 			      struct dm_cell_key *key)
117 {
118 	key->virtual = 1;
119 	key->dev = dm_thin_dev_id(td);
120 	key->block = b;
121 }
122 
123 /*----------------------------------------------------------------*/
124 
125 /*
126  * A pool device ties together a metadata device and a data device.  It
127  * also provides the interface for creating and destroying internal
128  * devices.
129  */
130 struct dm_thin_new_mapping;
131 
132 /*
133  * The pool runs in 4 modes, ordered from least to most degraded for comparisons.
134  */
135 enum pool_mode {
136 	PM_WRITE,		/* metadata may be changed */
137 	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */
138 	PM_READ_ONLY,		/* metadata may not be changed */
139 	PM_FAIL,		/* all I/O fails */
140 };
141 
142 struct pool_features {
143 	enum pool_mode mode;
144 
145 	bool zero_new_blocks:1;
146 	bool discard_enabled:1;
147 	bool discard_passdown:1;
148 	bool error_if_no_space:1;
149 };
150 
151 struct thin_c;
152 typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
153 typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
154 
155 struct pool {
156 	struct list_head list;
157 	struct dm_target *ti;	/* Only set if a pool target is bound */
158 
159 	struct mapped_device *pool_md;
160 	struct block_device *md_dev;
161 	struct dm_pool_metadata *pmd;
162 
163 	dm_block_t low_water_blocks;
164 	uint32_t sectors_per_block;
165 	int sectors_per_block_shift;
166 
167 	struct pool_features pf;
168 	bool low_water_triggered:1;	/* A dm event has been sent */
169 
170 	struct dm_bio_prison *prison;
171 	struct dm_kcopyd_client *copier;
172 
173 	struct workqueue_struct *wq;
174 	struct work_struct worker;
175 	struct delayed_work waker;
176 
177 	unsigned long last_commit_jiffies;
178 	unsigned ref_count;
179 
180 	spinlock_t lock;
181 	struct bio_list deferred_bios;
182 	struct bio_list deferred_flush_bios;
183 	struct list_head prepared_mappings;
184 	struct list_head prepared_discards;
185 
186 	struct bio_list retry_on_resume_list;
187 
188 	struct dm_deferred_set *shared_read_ds;
189 	struct dm_deferred_set *all_io_ds;
190 
191 	struct dm_thin_new_mapping *next_mapping;
192 	mempool_t *mapping_pool;
193 
194 	process_bio_fn process_bio;
195 	process_bio_fn process_discard;
196 
197 	process_mapping_fn process_prepared_mapping;
198 	process_mapping_fn process_prepared_discard;
199 };
200 
201 static enum pool_mode get_pool_mode(struct pool *pool);
202 static void metadata_operation_failed(struct pool *pool, const char *op, int r);
203 
204 /*
205  * Target context for a pool.
206  */
207 struct pool_c {
208 	struct dm_target *ti;
209 	struct pool *pool;
210 	struct dm_dev *data_dev;
211 	struct dm_dev *metadata_dev;
212 	struct dm_target_callbacks callbacks;
213 
214 	dm_block_t low_water_blocks;
215 	struct pool_features requested_pf; /* Features requested during table load */
216 	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
217 };
218 
219 /*
220  * Target context for a thin.
221  */
222 struct thin_c {
223 	struct dm_dev *pool_dev;
224 	struct dm_dev *origin_dev;
225 	dm_thin_id dev_id;
226 
227 	struct pool *pool;
228 	struct dm_thin_device *td;
229 	bool requeue_mode:1;
230 };
231 
232 /*----------------------------------------------------------------*/
233 
234 /*
235  * wake_worker() is used when new work is queued and when pool_resume is
236  * ready to continue deferred IO processing.
237  */
238 static void wake_worker(struct pool *pool)
239 {
240 	queue_work(pool->wq, &pool->worker);
241 }
242 
243 /*----------------------------------------------------------------*/
244 
245 static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
246 		      struct dm_bio_prison_cell **cell_result)
247 {
248 	int r;
249 	struct dm_bio_prison_cell *cell_prealloc;
250 
251 	/*
252 	 * Allocate a cell from the prison's mempool.
253 	 * This might block but it can't fail.
254 	 */
255 	cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
256 
257 	r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
258 	if (r)
259 		/*
260 		 * We reused an old cell; we can get rid of
261 		 * the new one.
262 		 */
263 		dm_bio_prison_free_cell(pool->prison, cell_prealloc);
264 
265 	return r;
266 }
267 
268 static void cell_release(struct pool *pool,
269 			 struct dm_bio_prison_cell *cell,
270 			 struct bio_list *bios)
271 {
272 	dm_cell_release(pool->prison, cell, bios);
273 	dm_bio_prison_free_cell(pool->prison, cell);
274 }
275 
276 static void cell_release_no_holder(struct pool *pool,
277 				   struct dm_bio_prison_cell *cell,
278 				   struct bio_list *bios)
279 {
280 	dm_cell_release_no_holder(pool->prison, cell, bios);
281 	dm_bio_prison_free_cell(pool->prison, cell);
282 }
283 
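/*
 * As cell_release_no_holder(), but the freed bios go onto the pool's
 * deferred list and the cell itself is left alone - used for the cells
 * thin_bio_map() allocates on the stack rather than from the prison mempool.
 */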
284 static void cell_defer_no_holder_no_free(struct thin_c *tc,
285 					 struct dm_bio_prison_cell *cell)
286 {
287 	struct pool *pool = tc->pool;
288 	unsigned long flags;
289 
290 	spin_lock_irqsave(&pool->lock, flags);
291 	dm_cell_release_no_holder(pool->prison, cell, &pool->deferred_bios);
292 	spin_unlock_irqrestore(&pool->lock, flags);
293 
294 	wake_worker(pool);
295 }
296 
297 static void cell_error(struct pool *pool,
298 		       struct dm_bio_prison_cell *cell)
299 {
300 	dm_cell_error(pool->prison, cell);
301 	dm_bio_prison_free_cell(pool->prison, cell);
302 }
303 
304 /*----------------------------------------------------------------*/
305 
306 /*
307  * A global list of pools that uses a struct mapped_device as a key.
308  */
309 static struct dm_thin_pool_table {
310 	struct mutex mutex;
311 	struct list_head pools;
312 } dm_thin_pool_table;
313 
314 static void pool_table_init(void)
315 {
316 	mutex_init(&dm_thin_pool_table.mutex);
317 	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
318 }
319 
320 static void __pool_table_insert(struct pool *pool)
321 {
322 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
323 	list_add(&pool->list, &dm_thin_pool_table.pools);
324 }
325 
326 static void __pool_table_remove(struct pool *pool)
327 {
328 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
329 	list_del(&pool->list);
330 }
331 
332 static struct pool *__pool_table_lookup(struct mapped_device *md)
333 {
334 	struct pool *pool = NULL, *tmp;
335 
336 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
337 
338 	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
339 		if (tmp->pool_md == md) {
340 			pool = tmp;
341 			break;
342 		}
343 	}
344 
345 	return pool;
346 }
347 
348 static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
349 {
350 	struct pool *pool = NULL, *tmp;
351 
352 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
353 
354 	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
355 		if (tmp->md_dev == md_dev) {
356 			pool = tmp;
357 			break;
358 		}
359 	}
360 
361 	return pool;
362 }
363 
364 /*----------------------------------------------------------------*/
365 
366 struct dm_thin_endio_hook {
367 	struct thin_c *tc;
368 	struct dm_deferred_entry *shared_read_entry;
369 	struct dm_deferred_entry *all_io_entry;
370 	struct dm_thin_new_mapping *overwrite_mapping;
371 };
372 
373 static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
374 {
375 	struct bio *bio;
376 	struct bio_list bios;
377 	unsigned long flags;
378 
379 	bio_list_init(&bios);
380 
381 	spin_lock_irqsave(&tc->pool->lock, flags);
382 	bio_list_merge(&bios, master);
383 	bio_list_init(master);
384 	spin_unlock_irqrestore(&tc->pool->lock, flags);
385 
386 	while ((bio = bio_list_pop(&bios))) {
387 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
388 
389 		if (h->tc == tc)
390 			bio_endio(bio, DM_ENDIO_REQUEUE);
391 		else
392 			bio_list_add(master, bio);
393 	}
394 }
395 
396 static void requeue_io(struct thin_c *tc)
397 {
398 	struct pool *pool = tc->pool;
399 
400 	requeue_bio_list(tc, &pool->deferred_bios);
401 	requeue_bio_list(tc, &pool->retry_on_resume_list);
402 }
403 
404 static void error_retry_list(struct pool *pool)
405 {
406 	struct bio *bio;
407 	unsigned long flags;
408 	struct bio_list bios;
409 
410 	bio_list_init(&bios);
411 
412 	spin_lock_irqsave(&pool->lock, flags);
413 	bio_list_merge(&bios, &pool->retry_on_resume_list);
414 	bio_list_init(&pool->retry_on_resume_list);
415 	spin_unlock_irqrestore(&pool->lock, flags);
416 
417 	while ((bio = bio_list_pop(&bios)))
418 		bio_io_error(bio);
419 }
420 
421 /*
422  * This section of code contains the logic for processing a thin device's IO.
423  * Much of the code depends on pool object resources (lists, workqueues, etc.)
424  * but most is exclusively called from the thin target rather than the thin-pool
425  * target.
426  */
427 
428 static bool block_size_is_power_of_two(struct pool *pool)
429 {
430 	return pool->sectors_per_block_shift >= 0;
431 }
432 
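/* Return the virtual block that contains the bio's starting sector. */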
433 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
434 {
435 	struct pool *pool = tc->pool;
436 	sector_t block_nr = bio->bi_iter.bi_sector;
437 
438 	if (block_size_is_power_of_two(pool))
439 		block_nr >>= pool->sectors_per_block_shift;
440 	else
441 		(void) sector_div(block_nr, pool->sectors_per_block);
442 
443 	return block_nr;
444 }
445 
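/*
 * Remap the bio onto the pool device:
 * sector = block * sectors_per_block + offset of the bio within the block.
 */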
446 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
447 {
448 	struct pool *pool = tc->pool;
449 	sector_t bi_sector = bio->bi_iter.bi_sector;
450 
451 	bio->bi_bdev = tc->pool_dev->bdev;
452 	if (block_size_is_power_of_two(pool))
453 		bio->bi_iter.bi_sector =
454 			(block << pool->sectors_per_block_shift) |
455 			(bi_sector & (pool->sectors_per_block - 1));
456 	else
457 		bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
458 				 sector_div(bi_sector, pool->sectors_per_block);
459 }
460 
461 static void remap_to_origin(struct thin_c *tc, struct bio *bio)
462 {
463 	bio->bi_bdev = tc->origin_dev->bdev;
464 }
465 
466 static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
467 {
468 	return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
469 		dm_thin_changed_this_transaction(tc->td);
470 }
471 
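/*
 * Track this bio in the all-io deferred set so that prepared discards only
 * run once in-flight io has quiesced.  Discard bios are skipped here; they
 * join the set via dm_deferred_set_add_work() in process_discard() instead.
 */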
472 static void inc_all_io_entry(struct pool *pool, struct bio *bio)
473 {
474 	struct dm_thin_endio_hook *h;
475 
476 	if (bio->bi_rw & REQ_DISCARD)
477 		return;
478 
479 	h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
480 	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
481 }
482 
483 static void issue(struct thin_c *tc, struct bio *bio)
484 {
485 	struct pool *pool = tc->pool;
486 	unsigned long flags;
487 
488 	if (!bio_triggers_commit(tc, bio)) {
489 		generic_make_request(bio);
490 		return;
491 	}
492 
493 	/*
494 	 * Complete bio with an error if earlier I/O caused changes to
495 	 * the metadata that can't be committed e.g, due to I/O errors
496 	 * on the metadata device.
497 	 */
498 	if (dm_thin_aborted_changes(tc->td)) {
499 		bio_io_error(bio);
500 		return;
501 	}
502 
503 	/*
504 	 * Batch together any bios that trigger commits and then issue a
505 	 * single commit for them in process_deferred_bios().
506 	 */
507 	spin_lock_irqsave(&pool->lock, flags);
508 	bio_list_add(&pool->deferred_flush_bios, bio);
509 	spin_unlock_irqrestore(&pool->lock, flags);
510 }
511 
512 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
513 {
514 	remap_to_origin(tc, bio);
515 	issue(tc, bio);
516 }
517 
518 static void remap_and_issue(struct thin_c *tc, struct bio *bio,
519 			    dm_block_t block)
520 {
521 	remap(tc, bio, block);
522 	issue(tc, bio);
523 }
524 
525 /*----------------------------------------------------------------*/
526 
527 /*
528  * Bio endio functions.
529  */
530 struct dm_thin_new_mapping {
531 	struct list_head list;
532 
533 	bool quiesced:1;
534 	bool prepared:1;
535 	bool pass_discard:1;
536 	bool definitely_not_shared:1;
537 
538 	int err;
539 	struct thin_c *tc;
540 	dm_block_t virt_block;
541 	dm_block_t data_block;
542 	struct dm_bio_prison_cell *cell, *cell2;
543 
544 	/*
545 	 * If the bio covers the whole area of a block then we can avoid
546 	 * zeroing or copying.  Instead this bio is hooked.  The bio will
547 	 * still be in the cell, so care has to be taken to avoid issuing
548 	 * the bio twice.
549 	 */
550 	struct bio *bio;
551 	bio_end_io_t *saved_bi_end_io;
552 };
553 
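/*
 * Once a mapping is both quiesced (no in-flight io to the old block) and
 * prepared (the copy/zero has completed), hand it to the worker.  Its
 * callers take pool->lock around this.
 */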
554 static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
555 {
556 	struct pool *pool = m->tc->pool;
557 
558 	if (m->quiesced && m->prepared) {
559 		list_add_tail(&m->list, &pool->prepared_mappings);
560 		wake_worker(pool);
561 	}
562 }
563 
564 static void copy_complete(int read_err, unsigned long write_err, void *context)
565 {
566 	unsigned long flags;
567 	struct dm_thin_new_mapping *m = context;
568 	struct pool *pool = m->tc->pool;
569 
570 	m->err = read_err || write_err ? -EIO : 0;
571 
572 	spin_lock_irqsave(&pool->lock, flags);
573 	m->prepared = true;
574 	__maybe_add_mapping(m);
575 	spin_unlock_irqrestore(&pool->lock, flags);
576 }
577 
578 static void overwrite_endio(struct bio *bio, int err)
579 {
580 	unsigned long flags;
581 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
582 	struct dm_thin_new_mapping *m = h->overwrite_mapping;
583 	struct pool *pool = m->tc->pool;
584 
585 	m->err = err;
586 
587 	spin_lock_irqsave(&pool->lock, flags);
588 	m->prepared = true;
589 	__maybe_add_mapping(m);
590 	spin_unlock_irqrestore(&pool->lock, flags);
591 }
592 
593 /*----------------------------------------------------------------*/
594 
595 /*
596  * Workqueue.
597  */
598 
599 /*
600  * Prepared mapping jobs.
601  */
602 
603 /*
604  * This sends the bios in the cell back to the deferred_bios list.
605  */
606 static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
607 {
608 	struct pool *pool = tc->pool;
609 	unsigned long flags;
610 
611 	spin_lock_irqsave(&pool->lock, flags);
612 	cell_release(pool, cell, &pool->deferred_bios);
613 	spin_unlock_irqrestore(&pool->lock, flags);
614 
615 	wake_worker(pool);
616 }
617 
618 /*
619  * Same as cell_defer above, except it omits the original holder of the cell.
620  */
621 static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
622 {
623 	struct pool *pool = tc->pool;
624 	unsigned long flags;
625 
626 	spin_lock_irqsave(&pool->lock, flags);
627 	cell_release_no_holder(pool, cell, &pool->deferred_bios);
628 	spin_unlock_irqrestore(&pool->lock, flags);
629 
630 	wake_worker(pool);
631 }
632 
633 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
634 {
635 	if (m->bio) {
636 		m->bio->bi_end_io = m->saved_bi_end_io;
637 		atomic_inc(&m->bio->bi_remaining);
638 	}
639 	cell_error(m->tc->pool, m->cell);
640 	list_del(&m->list);
641 	mempool_free(m, m->tc->pool->mapping_pool);
642 }
643 
644 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
645 {
646 	struct thin_c *tc = m->tc;
647 	struct pool *pool = tc->pool;
648 	struct bio *bio;
649 	int r;
650 
651 	bio = m->bio;
652 	if (bio) {
653 		bio->bi_end_io = m->saved_bi_end_io;
654 		atomic_inc(&bio->bi_remaining);
655 	}
656 
657 	if (m->err) {
658 		cell_error(pool, m->cell);
659 		goto out;
660 	}
661 
662 	/*
663 	 * Commit the prepared block into the mapping btree.
664 	 * Any I/O for this block arriving after this point will get
665 	 * remapped to it directly.
666 	 */
667 	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
668 	if (r) {
669 		metadata_operation_failed(pool, "dm_thin_insert_block", r);
670 		cell_error(pool, m->cell);
671 		goto out;
672 	}
673 
674 	/*
675 	 * Release any bios held while the block was being provisioned.
676 	 * If we are processing a write bio that completely covers the block,
677 	 * we already processed it so can ignore it now when processing
678 	 * the bios in the cell.
679 	 */
680 	if (bio) {
681 		cell_defer_no_holder(tc, m->cell);
682 		bio_endio(bio, 0);
683 	} else
684 		cell_defer(tc, m->cell);
685 
686 out:
687 	list_del(&m->list);
688 	mempool_free(m, pool->mapping_pool);
689 }
690 
691 static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
692 {
693 	struct thin_c *tc = m->tc;
694 
695 	bio_io_error(m->bio);
696 	cell_defer_no_holder(tc, m->cell);
697 	cell_defer_no_holder(tc, m->cell2);
698 	mempool_free(m, tc->pool->mapping_pool);
699 }
700 
701 static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
702 {
703 	struct thin_c *tc = m->tc;
704 
705 	inc_all_io_entry(tc->pool, m->bio);
706 	cell_defer_no_holder(tc, m->cell);
707 	cell_defer_no_holder(tc, m->cell2);
708 
709 	if (m->pass_discard) {
710 		if (m->definitely_not_shared)
711 			remap_and_issue(tc, m->bio, m->data_block);
712 		else {
713 			bool used = false;
714 			if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
715 				bio_endio(m->bio, 0);
716 			else
717 				remap_and_issue(tc, m->bio, m->data_block);
718 		}
719 	} else
720 		bio_endio(m->bio, 0);
721 
722 	mempool_free(m, tc->pool->mapping_pool);
723 }
724 
725 static void process_prepared_discard(struct dm_thin_new_mapping *m)
726 {
727 	int r;
728 	struct thin_c *tc = m->tc;
729 
730 	r = dm_thin_remove_block(tc->td, m->virt_block);
731 	if (r)
732 		DMERR_LIMIT("dm_thin_remove_block() failed");
733 
734 	process_prepared_discard_passdown(m);
735 }
736 
737 static void process_prepared(struct pool *pool, struct list_head *head,
738 			     process_mapping_fn *fn)
739 {
740 	unsigned long flags;
741 	struct list_head maps;
742 	struct dm_thin_new_mapping *m, *tmp;
743 
744 	INIT_LIST_HEAD(&maps);
745 	spin_lock_irqsave(&pool->lock, flags);
746 	list_splice_init(head, &maps);
747 	spin_unlock_irqrestore(&pool->lock, flags);
748 
749 	list_for_each_entry_safe(m, tmp, &maps, list)
750 		(*fn)(m);
751 }
752 
753 /*
754  * Deferred bio jobs.
755  */
756 static int io_overlaps_block(struct pool *pool, struct bio *bio)
757 {
758 	return bio->bi_iter.bi_size ==
759 		(pool->sectors_per_block << SECTOR_SHIFT);
760 }
761 
762 static int io_overwrites_block(struct pool *pool, struct bio *bio)
763 {
764 	return (bio_data_dir(bio) == WRITE) &&
765 		io_overlaps_block(pool, bio);
766 }
767 
768 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
769 			       bio_end_io_t *fn)
770 {
771 	*save = bio->bi_end_io;
772 	bio->bi_end_io = fn;
773 }
774 
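/*
 * Opportunistically pre-allocate the next mapping without sleeping.  If the
 * mempool is exhausted, the caller defers its bios until some prepared
 * mappings have been processed (see process_deferred_bios()).
 */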
775 static int ensure_next_mapping(struct pool *pool)
776 {
777 	if (pool->next_mapping)
778 		return 0;
779 
780 	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
781 
782 	return pool->next_mapping ? 0 : -ENOMEM;
783 }
784 
785 static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
786 {
787 	struct dm_thin_new_mapping *m = pool->next_mapping;
788 
789 	BUG_ON(!pool->next_mapping);
790 
791 	memset(m, 0, sizeof(struct dm_thin_new_mapping));
792 	INIT_LIST_HEAD(&m->list);
793 	m->bio = NULL;
794 
795 	pool->next_mapping = NULL;
796 
797 	return m;
798 }
799 
800 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
801 			  struct dm_dev *origin, dm_block_t data_origin,
802 			  dm_block_t data_dest,
803 			  struct dm_bio_prison_cell *cell, struct bio *bio)
804 {
805 	int r;
806 	struct pool *pool = tc->pool;
807 	struct dm_thin_new_mapping *m = get_next_mapping(pool);
808 
809 	m->tc = tc;
810 	m->virt_block = virt_block;
811 	m->data_block = data_dest;
812 	m->cell = cell;
813 
814 	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
815 		m->quiesced = true;
816 
817 	/*
818 	 * IO to pool_dev remaps to the pool target's data_dev.
819 	 *
820 	 * If the whole block of data is being overwritten, we can issue the
821 	 * bio immediately. Otherwise we use kcopyd to clone the data first.
822 	 */
823 	if (io_overwrites_block(pool, bio)) {
824 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
825 
826 		h->overwrite_mapping = m;
827 		m->bio = bio;
828 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
829 		inc_all_io_entry(pool, bio);
830 		remap_and_issue(tc, bio, data_dest);
831 	} else {
832 		struct dm_io_region from, to;
833 
834 		from.bdev = origin->bdev;
835 		from.sector = data_origin * pool->sectors_per_block;
836 		from.count = pool->sectors_per_block;
837 
838 		to.bdev = tc->pool_dev->bdev;
839 		to.sector = data_dest * pool->sectors_per_block;
840 		to.count = pool->sectors_per_block;
841 
842 		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
843 				   0, copy_complete, m);
844 		if (r < 0) {
845 			mempool_free(m, pool->mapping_pool);
846 			DMERR_LIMIT("dm_kcopyd_copy() failed");
847 			cell_error(pool, cell);
848 		}
849 	}
850 }
851 
852 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
853 				   dm_block_t data_origin, dm_block_t data_dest,
854 				   struct dm_bio_prison_cell *cell, struct bio *bio)
855 {
856 	schedule_copy(tc, virt_block, tc->pool_dev,
857 		      data_origin, data_dest, cell, bio);
858 }
859 
860 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
861 				   dm_block_t data_dest,
862 				   struct dm_bio_prison_cell *cell, struct bio *bio)
863 {
864 	schedule_copy(tc, virt_block, tc->origin_dev,
865 		      virt_block, data_dest, cell, bio);
866 }
867 
868 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
869 			  dm_block_t data_block, struct dm_bio_prison_cell *cell,
870 			  struct bio *bio)
871 {
872 	struct pool *pool = tc->pool;
873 	struct dm_thin_new_mapping *m = get_next_mapping(pool);
874 
875 	m->quiesced = true;
876 	m->prepared = false;
877 	m->tc = tc;
878 	m->virt_block = virt_block;
879 	m->data_block = data_block;
880 	m->cell = cell;
881 
882 	/*
883 	 * If the whole block of data is being overwritten or we are not
884 	 * zeroing pre-existing data, we can issue the bio immediately.
885 	 * Otherwise we use kcopyd to zero the data first.
886 	 */
887 	if (!pool->pf.zero_new_blocks)
888 		process_prepared_mapping(m);
889 
890 	else if (io_overwrites_block(pool, bio)) {
891 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
892 
893 		h->overwrite_mapping = m;
894 		m->bio = bio;
895 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
896 		inc_all_io_entry(pool, bio);
897 		remap_and_issue(tc, bio, data_block);
898 	} else {
899 		int r;
900 		struct dm_io_region to;
901 
902 		to.bdev = tc->pool_dev->bdev;
903 		to.sector = data_block * pool->sectors_per_block;
904 		to.count = pool->sectors_per_block;
905 
906 		r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
907 		if (r < 0) {
908 			mempool_free(m, pool->mapping_pool);
909 			DMERR_LIMIT("dm_kcopyd_zero() failed");
910 			cell_error(pool, cell);
911 		}
912 	}
913 }
914 
915 /*
916  * A non-zero return indicates read_only or fail_io mode.
917  * Many callers don't care about the return value.
918  */
919 static int commit(struct pool *pool)
920 {
921 	int r;
922 
923 	if (get_pool_mode(pool) != PM_WRITE)
924 		return -EINVAL;
925 
926 	r = dm_pool_commit_metadata(pool->pmd);
927 	if (r)
928 		metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
929 
930 	return r;
931 }
932 
933 static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
934 {
935 	unsigned long flags;
936 
937 	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
938 		DMWARN("%s: reached low water mark for data device: sending event.",
939 		       dm_device_name(pool->pool_md));
940 		spin_lock_irqsave(&pool->lock, flags);
941 		pool->low_water_triggered = true;
942 		spin_unlock_irqrestore(&pool->lock, flags);
943 		dm_table_event(pool->ti->table);
944 	}
945 }
946 
947 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
948 
949 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
950 {
951 	int r;
952 	dm_block_t free_blocks;
953 	struct pool *pool = tc->pool;
954 
955 	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
956 		return -EINVAL;
957 
958 	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
959 	if (r) {
960 		metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
961 		return r;
962 	}
963 
964 	check_low_water_mark(pool, free_blocks);
965 
966 	if (!free_blocks) {
967 		/*
968 		 * Try to commit to see if that will free up some
969 		 * more space.
970 		 */
971 		r = commit(pool);
972 		if (r)
973 			return r;
974 
975 		r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
976 		if (r) {
977 			metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
978 			return r;
979 		}
980 
981 		if (!free_blocks) {
982 			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
983 			return -ENOSPC;
984 		}
985 	}
986 
987 	r = dm_pool_alloc_data_block(pool->pmd, result);
988 	if (r) {
989 		metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
990 		return r;
991 	}
992 
993 	return 0;
994 }
995 
996 /*
997  * If we have run out of space, queue bios until the device is
998  * resumed, presumably after having been reloaded with more space.
999  */
1000 static void retry_on_resume(struct bio *bio)
1001 {
1002 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1003 	struct thin_c *tc = h->tc;
1004 	struct pool *pool = tc->pool;
1005 	unsigned long flags;
1006 
1007 	spin_lock_irqsave(&pool->lock, flags);
1008 	bio_list_add(&pool->retry_on_resume_list, bio);
1009 	spin_unlock_irqrestore(&pool->lock, flags);
1010 }
1011 
1012 static bool should_error_unserviceable_bio(struct pool *pool)
1013 {
1014 	enum pool_mode m = get_pool_mode(pool);
1015 
1016 	switch (m) {
1017 	case PM_WRITE:
1018 		/* Shouldn't get here */
1019 		DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1020 		return true;
1021 
1022 	case PM_OUT_OF_DATA_SPACE:
1023 		return pool->pf.error_if_no_space;
1024 
1025 	case PM_READ_ONLY:
1026 	case PM_FAIL:
1027 		return true;
1028 	default:
1029 		/* Shouldn't get here */
1030 		DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1031 		return true;
1032 	}
1033 }
1034 
1035 static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
1036 {
1037 	if (should_error_unserviceable_bio(pool))
1038 		bio_io_error(bio);
1039 	else
1040 		retry_on_resume(bio);
1041 }
1042 
1043 static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
1044 {
1045 	struct bio *bio;
1046 	struct bio_list bios;
1047 
1048 	if (should_error_unserviceable_bio(pool)) {
1049 		cell_error(pool, cell);
1050 		return;
1051 	}
1052 
1053 	bio_list_init(&bios);
1054 	cell_release(pool, cell, &bios);
1055 
1056 	if (should_error_unserviceable_bio(pool))
1057 		while ((bio = bio_list_pop(&bios)))
1058 			bio_io_error(bio);
1059 	else
1060 		while ((bio = bio_list_pop(&bios)))
1061 			retry_on_resume(bio);
1062 }
1063 
1064 static void process_discard(struct thin_c *tc, struct bio *bio)
1065 {
1066 	int r;
1067 	unsigned long flags;
1068 	struct pool *pool = tc->pool;
1069 	struct dm_bio_prison_cell *cell, *cell2;
1070 	struct dm_cell_key key, key2;
1071 	dm_block_t block = get_bio_block(tc, bio);
1072 	struct dm_thin_lookup_result lookup_result;
1073 	struct dm_thin_new_mapping *m;
1074 
1075 	build_virtual_key(tc->td, block, &key);
1076 	if (bio_detain(tc->pool, &key, bio, &cell))
1077 		return;
1078 
1079 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1080 	switch (r) {
1081 	case 0:
1082 		/*
1083 		 * Check nobody is fiddling with this pool block.  This can
1084 		 * happen if someone's in the process of breaking sharing
1085 		 * on this block.
1086 		 */
1087 		build_data_key(tc->td, lookup_result.block, &key2);
1088 		if (bio_detain(tc->pool, &key2, bio, &cell2)) {
1089 			cell_defer_no_holder(tc, cell);
1090 			break;
1091 		}
1092 
1093 		if (io_overlaps_block(pool, bio)) {
1094 			/*
1095 			 * IO may still be going to the destination block.  We must
1096 			 * quiesce before we can do the removal.
1097 			 */
1098 			m = get_next_mapping(pool);
1099 			m->tc = tc;
1100 			m->pass_discard = pool->pf.discard_passdown;
1101 			m->definitely_not_shared = !lookup_result.shared;
1102 			m->virt_block = block;
1103 			m->data_block = lookup_result.block;
1104 			m->cell = cell;
1105 			m->cell2 = cell2;
1106 			m->bio = bio;
1107 
1108 			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
1109 				spin_lock_irqsave(&pool->lock, flags);
1110 				list_add_tail(&m->list, &pool->prepared_discards);
1111 				spin_unlock_irqrestore(&pool->lock, flags);
1112 				wake_worker(pool);
1113 			}
1114 		} else {
1115 			inc_all_io_entry(pool, bio);
1116 			cell_defer_no_holder(tc, cell);
1117 			cell_defer_no_holder(tc, cell2);
1118 
1119 			/*
1120 			 * The DM core makes sure that the discard doesn't span
1121 			 * a block boundary, so a partial-block discard is either passed down
1122 			 * directly (if unshared and passdown enabled) or completed immediately.
1123 			 */
1124 			if ((!lookup_result.shared) && pool->pf.discard_passdown)
1125 				remap_and_issue(tc, bio, lookup_result.block);
1126 			else
1127 				bio_endio(bio, 0);
1128 		}
1129 		break;
1130 
1131 	case -ENODATA:
1132 		/*
1133 		 * It isn't provisioned, just forget it.
1134 		 */
1135 		cell_defer_no_holder(tc, cell);
1136 		bio_endio(bio, 0);
1137 		break;
1138 
1139 	default:
1140 		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1141 			    __func__, r);
1142 		cell_defer_no_holder(tc, cell);
1143 		bio_io_error(bio);
1144 		break;
1145 	}
1146 }
1147 
1148 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1149 			  struct dm_cell_key *key,
1150 			  struct dm_thin_lookup_result *lookup_result,
1151 			  struct dm_bio_prison_cell *cell)
1152 {
1153 	int r;
1154 	dm_block_t data_block;
1155 	struct pool *pool = tc->pool;
1156 
1157 	r = alloc_data_block(tc, &data_block);
1158 	switch (r) {
1159 	case 0:
1160 		schedule_internal_copy(tc, block, lookup_result->block,
1161 				       data_block, cell, bio);
1162 		break;
1163 
1164 	case -ENOSPC:
1165 		retry_bios_on_resume(pool, cell);
1166 		break;
1167 
1168 	default:
1169 		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1170 			    __func__, r);
1171 		cell_error(pool, cell);
1172 		break;
1173 	}
1174 }
1175 
1176 static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1177 			       dm_block_t block,
1178 			       struct dm_thin_lookup_result *lookup_result)
1179 {
1180 	struct dm_bio_prison_cell *cell;
1181 	struct pool *pool = tc->pool;
1182 	struct dm_cell_key key;
1183 
1184 	/*
1185 	 * If cell is already occupied, then sharing is already in the process
1186 	 * of being broken so we have nothing further to do here.
1187 	 */
1188 	build_data_key(tc->td, lookup_result->block, &key);
1189 	if (bio_detain(pool, &key, bio, &cell))
1190 		return;
1191 
1192 	if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size)
1193 		break_sharing(tc, bio, block, &key, lookup_result, cell);
1194 	else {
1195 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1196 
1197 		h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1198 		inc_all_io_entry(pool, bio);
1199 		cell_defer_no_holder(tc, cell);
1200 
1201 		remap_and_issue(tc, bio, lookup_result->block);
1202 	}
1203 }
1204 
1205 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1206 			    struct dm_bio_prison_cell *cell)
1207 {
1208 	int r;
1209 	dm_block_t data_block;
1210 	struct pool *pool = tc->pool;
1211 
1212 	/*
1213 	 * Remap empty bios (flushes) immediately, without provisioning.
1214 	 */
1215 	if (!bio->bi_iter.bi_size) {
1216 		inc_all_io_entry(pool, bio);
1217 		cell_defer_no_holder(tc, cell);
1218 
1219 		remap_and_issue(tc, bio, 0);
1220 		return;
1221 	}
1222 
1223 	/*
1224 	 * Fill read bios with zeroes and complete them immediately.
1225 	 */
1226 	if (bio_data_dir(bio) == READ) {
1227 		zero_fill_bio(bio);
1228 		cell_defer_no_holder(tc, cell);
1229 		bio_endio(bio, 0);
1230 		return;
1231 	}
1232 
1233 	r = alloc_data_block(tc, &data_block);
1234 	switch (r) {
1235 	case 0:
1236 		if (tc->origin_dev)
1237 			schedule_external_copy(tc, block, data_block, cell, bio);
1238 		else
1239 			schedule_zero(tc, block, data_block, cell, bio);
1240 		break;
1241 
1242 	case -ENOSPC:
1243 		retry_bios_on_resume(pool, cell);
1244 		break;
1245 
1246 	default:
1247 		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1248 			    __func__, r);
1249 		cell_error(pool, cell);
1250 		break;
1251 	}
1252 }
1253 
1254 static void process_bio(struct thin_c *tc, struct bio *bio)
1255 {
1256 	int r;
1257 	struct pool *pool = tc->pool;
1258 	dm_block_t block = get_bio_block(tc, bio);
1259 	struct dm_bio_prison_cell *cell;
1260 	struct dm_cell_key key;
1261 	struct dm_thin_lookup_result lookup_result;
1262 
1263 	/*
1264 	 * If cell is already occupied, then the block is already
1265 	 * being provisioned so we have nothing further to do here.
1266 	 */
1267 	build_virtual_key(tc->td, block, &key);
1268 	if (bio_detain(pool, &key, bio, &cell))
1269 		return;
1270 
1271 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1272 	switch (r) {
1273 	case 0:
1274 		if (lookup_result.shared) {
1275 			process_shared_bio(tc, bio, block, &lookup_result);
1276 			cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */
1277 		} else {
1278 			inc_all_io_entry(pool, bio);
1279 			cell_defer_no_holder(tc, cell);
1280 
1281 			remap_and_issue(tc, bio, lookup_result.block);
1282 		}
1283 		break;
1284 
1285 	case -ENODATA:
1286 		if (bio_data_dir(bio) == READ && tc->origin_dev) {
1287 			inc_all_io_entry(pool, bio);
1288 			cell_defer_no_holder(tc, cell);
1289 
1290 			remap_to_origin_and_issue(tc, bio);
1291 		} else
1292 			provision_block(tc, bio, block, cell);
1293 		break;
1294 
1295 	default:
1296 		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1297 			    __func__, r);
1298 		cell_defer_no_holder(tc, cell);
1299 		bio_io_error(bio);
1300 		break;
1301 	}
1302 }
1303 
1304 static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1305 {
1306 	int r;
1307 	int rw = bio_data_dir(bio);
1308 	dm_block_t block = get_bio_block(tc, bio);
1309 	struct dm_thin_lookup_result lookup_result;
1310 
1311 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1312 	switch (r) {
1313 	case 0:
1314 		if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size)
1315 			handle_unserviceable_bio(tc->pool, bio);
1316 		else {
1317 			inc_all_io_entry(tc->pool, bio);
1318 			remap_and_issue(tc, bio, lookup_result.block);
1319 		}
1320 		break;
1321 
1322 	case -ENODATA:
1323 		if (rw != READ) {
1324 			handle_unserviceable_bio(tc->pool, bio);
1325 			break;
1326 		}
1327 
1328 		if (tc->origin_dev) {
1329 			inc_all_io_entry(tc->pool, bio);
1330 			remap_to_origin_and_issue(tc, bio);
1331 			break;
1332 		}
1333 
1334 		zero_fill_bio(bio);
1335 		bio_endio(bio, 0);
1336 		break;
1337 
1338 	default:
1339 		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1340 			    __func__, r);
1341 		bio_io_error(bio);
1342 		break;
1343 	}
1344 }
1345 
1346 static void process_bio_success(struct thin_c *tc, struct bio *bio)
1347 {
1348 	bio_endio(bio, 0);
1349 }
1350 
1351 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1352 {
1353 	bio_io_error(bio);
1354 }
1355 
1356 /*
1357  * FIXME: should we also commit due to size of transaction, measured in
1358  * metadata blocks?
1359  */
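/*
 * The first comparison below is intended to cope with jiffies wrapping past
 * last_commit_jiffies.
 */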
1360 static int need_commit_due_to_time(struct pool *pool)
1361 {
1362 	return jiffies < pool->last_commit_jiffies ||
1363 	       jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1364 }
1365 
1366 static void process_deferred_bios(struct pool *pool)
1367 {
1368 	unsigned long flags;
1369 	struct bio *bio;
1370 	struct bio_list bios;
1371 
1372 	bio_list_init(&bios);
1373 
1374 	spin_lock_irqsave(&pool->lock, flags);
1375 	bio_list_merge(&bios, &pool->deferred_bios);
1376 	bio_list_init(&pool->deferred_bios);
1377 	spin_unlock_irqrestore(&pool->lock, flags);
1378 
1379 	while ((bio = bio_list_pop(&bios))) {
1380 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1381 		struct thin_c *tc = h->tc;
1382 
1383 		if (tc->requeue_mode) {
1384 			bio_endio(bio, DM_ENDIO_REQUEUE);
1385 			continue;
1386 		}
1387 
1388 		/*
1389 		 * If we've got no free new_mapping structs, and processing
1390 		 * this bio might require one, we pause until there are some
1391 		 * prepared mappings to process.
1392 		 */
1393 		if (ensure_next_mapping(pool)) {
1394 			spin_lock_irqsave(&pool->lock, flags);
1395 			bio_list_merge(&pool->deferred_bios, &bios);
1396 			spin_unlock_irqrestore(&pool->lock, flags);
1397 
1398 			break;
1399 		}
1400 
1401 		if (bio->bi_rw & REQ_DISCARD)
1402 			pool->process_discard(tc, bio);
1403 		else
1404 			pool->process_bio(tc, bio);
1405 	}
1406 
1407 	/*
1408 	 * If there are any deferred flush bios, we must commit
1409 	 * the metadata before issuing them.
1410 	 */
1411 	bio_list_init(&bios);
1412 	spin_lock_irqsave(&pool->lock, flags);
1413 	bio_list_merge(&bios, &pool->deferred_flush_bios);
1414 	bio_list_init(&pool->deferred_flush_bios);
1415 	spin_unlock_irqrestore(&pool->lock, flags);
1416 
1417 	if (bio_list_empty(&bios) &&
1418 	    !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
1419 		return;
1420 
1421 	if (commit(pool)) {
1422 		while ((bio = bio_list_pop(&bios)))
1423 			bio_io_error(bio);
1424 		return;
1425 	}
1426 	pool->last_commit_jiffies = jiffies;
1427 
1428 	while ((bio = bio_list_pop(&bios)))
1429 		generic_make_request(bio);
1430 }
1431 
1432 static void do_worker(struct work_struct *ws)
1433 {
1434 	struct pool *pool = container_of(ws, struct pool, worker);
1435 
1436 	process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1437 	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1438 	process_deferred_bios(pool);
1439 }
1440 
1441 /*
1442  * We want to commit periodically so that not too much
1443  * unwritten data builds up.
1444  */
1445 static void do_waker(struct work_struct *ws)
1446 {
1447 	struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1448 	wake_worker(pool);
1449 	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1450 }
1451 
1452 /*----------------------------------------------------------------*/
1453 
1454 struct noflush_work {
1455 	struct work_struct worker;
1456 	struct thin_c *tc;
1457 
1458 	atomic_t complete;
1459 	wait_queue_head_t wait;
1460 };
1461 
1462 static void complete_noflush_work(struct noflush_work *w)
1463 {
1464 	atomic_set(&w->complete, 1);
1465 	wake_up(&w->wait);
1466 }
1467 
1468 static void do_noflush_start(struct work_struct *ws)
1469 {
1470 	struct noflush_work *w = container_of(ws, struct noflush_work, worker);
1471 	w->tc->requeue_mode = true;
1472 	requeue_io(w->tc);
1473 	complete_noflush_work(w);
1474 }
1475 
1476 static void do_noflush_stop(struct work_struct *ws)
1477 {
1478 	struct noflush_work *w = container_of(ws, struct noflush_work, worker);
1479 	w->tc->requeue_mode = false;
1480 	complete_noflush_work(w);
1481 }
1482 
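/*
 * Run fn (do_noflush_start or do_noflush_stop) on the pool's workqueue and
 * wait for it to complete, so that changes to tc->requeue_mode are
 * serialised with the worker.
 */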
1483 static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
1484 {
1485 	struct noflush_work w;
1486 
1487 	INIT_WORK(&w.worker, fn);
1488 	w.tc = tc;
1489 	atomic_set(&w.complete, 0);
1490 	init_waitqueue_head(&w.wait);
1491 
1492 	queue_work(tc->pool->wq, &w.worker);
1493 
1494 	wait_event(w.wait, atomic_read(&w.complete));
1495 }
1496 
1497 /*----------------------------------------------------------------*/
1498 
1499 static enum pool_mode get_pool_mode(struct pool *pool)
1500 {
1501 	return pool->pf.mode;
1502 }
1503 
1504 static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
1505 {
1506 	dm_table_event(pool->ti->table);
1507 	DMINFO("%s: switching pool to %s mode",
1508 	       dm_device_name(pool->pool_md), new_mode);
1509 }
1510 
1511 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1512 {
1513 	struct pool_c *pt = pool->ti->private;
1514 	bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
1515 	enum pool_mode old_mode = get_pool_mode(pool);
1516 
1517 	/*
1518 	 * Never allow the pool to transition to PM_WRITE mode if user
1519 	 * intervention is required to verify metadata and data consistency.
1520 	 */
1521 	if (new_mode == PM_WRITE && needs_check) {
1522 		DMERR("%s: unable to switch pool to write mode until repaired.",
1523 		      dm_device_name(pool->pool_md));
1524 		if (old_mode != new_mode)
1525 			new_mode = old_mode;
1526 		else
1527 			new_mode = PM_READ_ONLY;
1528 	}
1529 	/*
1530 	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
1531  * not going to recover without a thin_repair.  So we never let the
1532 	 * pool move out of the old mode.
1533 	 */
1534 	if (old_mode == PM_FAIL)
1535 		new_mode = old_mode;
1536 
1537 	switch (new_mode) {
1538 	case PM_FAIL:
1539 		if (old_mode != new_mode)
1540 			notify_of_pool_mode_change(pool, "failure");
1541 		dm_pool_metadata_read_only(pool->pmd);
1542 		pool->process_bio = process_bio_fail;
1543 		pool->process_discard = process_bio_fail;
1544 		pool->process_prepared_mapping = process_prepared_mapping_fail;
1545 		pool->process_prepared_discard = process_prepared_discard_fail;
1546 
1547 		error_retry_list(pool);
1548 		break;
1549 
1550 	case PM_READ_ONLY:
1551 		if (old_mode != new_mode)
1552 			notify_of_pool_mode_change(pool, "read-only");
1553 		dm_pool_metadata_read_only(pool->pmd);
1554 		pool->process_bio = process_bio_read_only;
1555 		pool->process_discard = process_bio_success;
1556 		pool->process_prepared_mapping = process_prepared_mapping_fail;
1557 		pool->process_prepared_discard = process_prepared_discard_passdown;
1558 
1559 		error_retry_list(pool);
1560 		break;
1561 
1562 	case PM_OUT_OF_DATA_SPACE:
1563 		/*
1564 		 * Ideally we'd never hit this state; the low water mark
1565 		 * would trigger userland to extend the pool before we
1566 		 * completely run out of data space.  However, many small
1567 		 * IOs to unprovisioned space can consume data space at an
1568 		 * alarming rate.  Adjust your low water mark if you're
1569 		 * frequently seeing this mode.
1570 		 */
1571 		if (old_mode != new_mode)
1572 			notify_of_pool_mode_change(pool, "out-of-data-space");
1573 		pool->process_bio = process_bio_read_only;
1574 		pool->process_discard = process_discard;
1575 		pool->process_prepared_mapping = process_prepared_mapping;
1576 		pool->process_prepared_discard = process_prepared_discard_passdown;
1577 		break;
1578 
1579 	case PM_WRITE:
1580 		if (old_mode != new_mode)
1581 			notify_of_pool_mode_change(pool, "write");
1582 		dm_pool_metadata_read_write(pool->pmd);
1583 		pool->process_bio = process_bio;
1584 		pool->process_discard = process_discard;
1585 		pool->process_prepared_mapping = process_prepared_mapping;
1586 		pool->process_prepared_discard = process_prepared_discard;
1587 		break;
1588 	}
1589 
1590 	pool->pf.mode = new_mode;
1591 	/*
1592 	 * The pool mode may have changed, sync it so bind_control_target()
1593 	 * doesn't cause an unexpected mode transition on resume.
1594 	 */
1595 	pt->adjusted_pf.mode = new_mode;
1596 }
1597 
1598 static void abort_transaction(struct pool *pool)
1599 {
1600 	const char *dev_name = dm_device_name(pool->pool_md);
1601 
1602 	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1603 	if (dm_pool_abort_metadata(pool->pmd)) {
1604 		DMERR("%s: failed to abort metadata transaction", dev_name);
1605 		set_pool_mode(pool, PM_FAIL);
1606 	}
1607 
1608 	if (dm_pool_metadata_set_needs_check(pool->pmd)) {
1609 		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
1610 		set_pool_mode(pool, PM_FAIL);
1611 	}
1612 }
1613 
1614 static void metadata_operation_failed(struct pool *pool, const char *op, int r)
1615 {
1616 	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1617 		    dm_device_name(pool->pool_md), op, r);
1618 
1619 	abort_transaction(pool);
1620 	set_pool_mode(pool, PM_READ_ONLY);
1621 }
1622 
1623 /*----------------------------------------------------------------*/
1624 
1625 /*
1626  * Mapping functions.
1627  */
1628 
1629 /*
1630  * Called only while mapping a thin bio to hand it over to the workqueue.
1631  */
1632 static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1633 {
1634 	unsigned long flags;
1635 	struct pool *pool = tc->pool;
1636 
1637 	spin_lock_irqsave(&pool->lock, flags);
1638 	bio_list_add(&pool->deferred_bios, bio);
1639 	spin_unlock_irqrestore(&pool->lock, flags);
1640 
1641 	wake_worker(pool);
1642 }
1643 
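/* Initialise the per-bio hook data before the bio enters the thin map path. */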
1644 static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
1645 {
1646 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1647 
1648 	h->tc = tc;
1649 	h->shared_read_entry = NULL;
1650 	h->all_io_entry = NULL;
1651 	h->overwrite_mapping = NULL;
1652 }
1653 
1654 /*
1655  * Non-blocking function called from the thin target's map function.
1656  */
1657 static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1658 {
1659 	int r;
1660 	struct thin_c *tc = ti->private;
1661 	dm_block_t block = get_bio_block(tc, bio);
1662 	struct dm_thin_device *td = tc->td;
1663 	struct dm_thin_lookup_result result;
1664 	struct dm_bio_prison_cell cell1, cell2;
1665 	struct dm_bio_prison_cell *cell_result;
1666 	struct dm_cell_key key;
1667 
1668 	thin_hook_bio(tc, bio);
1669 
1670 	if (tc->requeue_mode) {
1671 		bio_endio(bio, DM_ENDIO_REQUEUE);
1672 		return DM_MAPIO_SUBMITTED;
1673 	}
1674 
1675 	if (get_pool_mode(tc->pool) == PM_FAIL) {
1676 		bio_io_error(bio);
1677 		return DM_MAPIO_SUBMITTED;
1678 	}
1679 
1680 	if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1681 		thin_defer_bio(tc, bio);
1682 		return DM_MAPIO_SUBMITTED;
1683 	}
1684 
1685 	r = dm_thin_find_block(td, block, 0, &result);
1686 
1687 	/*
1688 	 * Note that we defer readahead too.
1689 	 */
1690 	switch (r) {
1691 	case 0:
1692 		if (unlikely(result.shared)) {
1693 			/*
1694 			 * We have a race condition here between the
1695 			 * result.shared value returned by the lookup and
1696 			 * snapshot creation, which may cause new
1697 			 * sharing.
1698 			 *
1699 			 * To avoid this, always quiesce the origin before
1700 			 * taking the snap.  You want to do this anyway to
1701 			 * ensure a consistent application view
1702 			 * (i.e. lockfs).
1703 			 *
1704 			 * More distant ancestors are irrelevant. The
1705 			 * shared flag will be set in their case.
1706 			 */
1707 			thin_defer_bio(tc, bio);
1708 			return DM_MAPIO_SUBMITTED;
1709 		}
1710 
1711 		build_virtual_key(tc->td, block, &key);
1712 		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
1713 			return DM_MAPIO_SUBMITTED;
1714 
1715 		build_data_key(tc->td, result.block, &key);
1716 		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) {
1717 			cell_defer_no_holder_no_free(tc, &cell1);
1718 			return DM_MAPIO_SUBMITTED;
1719 		}
1720 
1721 		inc_all_io_entry(tc->pool, bio);
1722 		cell_defer_no_holder_no_free(tc, &cell2);
1723 		cell_defer_no_holder_no_free(tc, &cell1);
1724 
1725 		remap(tc, bio, result.block);
1726 		return DM_MAPIO_REMAPPED;
1727 
1728 	case -ENODATA:
1729 		if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1730 			/*
1731 			 * This block isn't provisioned, and we have no way
1732 			 * of doing so.
1733 			 */
1734 			handle_unserviceable_bio(tc->pool, bio);
1735 			return DM_MAPIO_SUBMITTED;
1736 		}
1737 		/* fall through */
1738 
1739 	case -EWOULDBLOCK:
1740 		/*
1741 		 * In future, the failed dm_thin_find_block above could
1742 		 * provide the hint to load the metadata into cache.
1743 		 */
1744 		thin_defer_bio(tc, bio);
1745 		return DM_MAPIO_SUBMITTED;
1746 
1747 	default:
1748 		/*
1749 		 * Must always call bio_io_error on failure.
1750 		 * dm_thin_find_block can fail with -EINVAL if the
1751 		 * pool is switched to fail-io mode.
1752 		 */
1753 		bio_io_error(bio);
1754 		return DM_MAPIO_SUBMITTED;
1755 	}
1756 }
1757 
1758 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1759 {
1760 	int r;
1761 	unsigned long flags;
1762 	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1763 
1764 	spin_lock_irqsave(&pt->pool->lock, flags);
1765 	r = !bio_list_empty(&pt->pool->retry_on_resume_list);
1766 	spin_unlock_irqrestore(&pt->pool->lock, flags);
1767 
1768 	if (!r) {
1769 		struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1770 		r = bdi_congested(&q->backing_dev_info, bdi_bits);
1771 	}
1772 
1773 	return r;
1774 }
1775 
1776 static void __requeue_bios(struct pool *pool)
1777 {
1778 	bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
1779 	bio_list_init(&pool->retry_on_resume_list);
1780 }
1781 
1782 /*----------------------------------------------------------------
1783  * Binding of control targets to a pool object
1784  *--------------------------------------------------------------*/
1785 static bool data_dev_supports_discard(struct pool_c *pt)
1786 {
1787 	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1788 
1789 	return q && blk_queue_discard(q);
1790 }
1791 
1792 static bool is_factor(sector_t block_size, uint32_t n)
1793 {
1794 	return !sector_div(block_size, n);
1795 }
1796 
1797 /*
1798  * If discard_passdown was enabled verify that the data device
1799  * supports discards.  Disable discard_passdown if not.
1800  */
1801 static void disable_passdown_if_not_supported(struct pool_c *pt)
1802 {
1803 	struct pool *pool = pt->pool;
1804 	struct block_device *data_bdev = pt->data_dev->bdev;
1805 	struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
1806 	sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
1807 	const char *reason = NULL;
1808 	char buf[BDEVNAME_SIZE];
1809 
1810 	if (!pt->adjusted_pf.discard_passdown)
1811 		return;
1812 
1813 	if (!data_dev_supports_discard(pt))
1814 		reason = "discard unsupported";
1815 
1816 	else if (data_limits->max_discard_sectors < pool->sectors_per_block)
1817 		reason = "max discard sectors smaller than a block";
1818 
1819 	else if (data_limits->discard_granularity > block_size)
1820 		reason = "discard granularity larger than a block";
1821 
1822 	else if (!is_factor(block_size, data_limits->discard_granularity))
1823 		reason = "discard granularity not a factor of block size";
1824 
1825 	if (reason) {
1826 		DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
1827 		pt->adjusted_pf.discard_passdown = false;
1828 	}
1829 }
1830 
1831 static int bind_control_target(struct pool *pool, struct dm_target *ti)
1832 {
1833 	struct pool_c *pt = ti->private;
1834 
1835 	/*
1836 	 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
1837 	 */
1838 	enum pool_mode old_mode = get_pool_mode(pool);
1839 	enum pool_mode new_mode = pt->adjusted_pf.mode;
1840 
1841 	/*
1842 	 * Don't change the pool's mode until set_pool_mode() below.
1843 	 * Otherwise the pool's process_* function pointers may
1844 	 * not match the desired pool mode.
1845 	 */
1846 	pt->adjusted_pf.mode = old_mode;
1847 
1848 	pool->ti = ti;
1849 	pool->pf = pt->adjusted_pf;
1850 	pool->low_water_blocks = pt->low_water_blocks;
1851 
1852 	set_pool_mode(pool, new_mode);
1853 
1854 	return 0;
1855 }
1856 
1857 static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1858 {
1859 	if (pool->ti == ti)
1860 		pool->ti = NULL;
1861 }
1862 
1863 /*----------------------------------------------------------------
1864  * Pool creation
1865  *--------------------------------------------------------------*/
1866 /* Initialize pool features. */
1867 static void pool_features_init(struct pool_features *pf)
1868 {
1869 	pf->mode = PM_WRITE;
1870 	pf->zero_new_blocks = true;
1871 	pf->discard_enabled = true;
1872 	pf->discard_passdown = true;
1873 	pf->error_if_no_space = false;
1874 }
1875 
1876 static void __pool_destroy(struct pool *pool)
1877 {
1878 	__pool_table_remove(pool);
1879 
1880 	if (dm_pool_metadata_close(pool->pmd) < 0)
1881 		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1882 
1883 	dm_bio_prison_destroy(pool->prison);
1884 	dm_kcopyd_client_destroy(pool->copier);
1885 
1886 	if (pool->wq)
1887 		destroy_workqueue(pool->wq);
1888 
1889 	if (pool->next_mapping)
1890 		mempool_free(pool->next_mapping, pool->mapping_pool);
1891 	mempool_destroy(pool->mapping_pool);
1892 	dm_deferred_set_destroy(pool->shared_read_ds);
1893 	dm_deferred_set_destroy(pool->all_io_ds);
1894 	kfree(pool);
1895 }
1896 
1897 static struct kmem_cache *_new_mapping_cache;
1898 
1899 static struct pool *pool_create(struct mapped_device *pool_md,
1900 				struct block_device *metadata_dev,
1901 				unsigned long block_size,
1902 				int read_only, char **error)
1903 {
1904 	int r;
1905 	void *err_p;
1906 	struct pool *pool;
1907 	struct dm_pool_metadata *pmd;
1908 	bool format_device = read_only ? false : true;
1909 
1910 	pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
1911 	if (IS_ERR(pmd)) {
1912 		*error = "Error creating metadata object";
1913 		return (struct pool *)pmd;
1914 	}
1915 
1916 	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1917 	if (!pool) {
1918 		*error = "Error allocating memory for pool";
1919 		err_p = ERR_PTR(-ENOMEM);
1920 		goto bad_pool;
1921 	}
1922 
1923 	pool->pmd = pmd;
1924 	pool->sectors_per_block = block_size;
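	/*
	 * A power-of-two block size lets later block arithmetic use a
	 * shift instead of a 64-bit division; e.g. (illustrative) a
	 * block_size of 1024 sectors gives a shift of 10, while a
	 * non-power-of-two size leaves the shift at -1.
	 */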
1925 	if (block_size & (block_size - 1))
1926 		pool->sectors_per_block_shift = -1;
1927 	else
1928 		pool->sectors_per_block_shift = __ffs(block_size);
1929 	pool->low_water_blocks = 0;
1930 	pool_features_init(&pool->pf);
1931 	pool->prison = dm_bio_prison_create(PRISON_CELLS);
1932 	if (!pool->prison) {
1933 		*error = "Error creating pool's bio prison";
1934 		err_p = ERR_PTR(-ENOMEM);
1935 		goto bad_prison;
1936 	}
1937 
1938 	pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1939 	if (IS_ERR(pool->copier)) {
1940 		r = PTR_ERR(pool->copier);
1941 		*error = "Error creating pool's kcopyd client";
1942 		err_p = ERR_PTR(r);
1943 		goto bad_kcopyd_client;
1944 	}
1945 
1946 	/*
1947 	 * Create a single-threaded workqueue that will service all devices
1948 	 * that use this metadata.
1949 	 */
1950 	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1951 	if (!pool->wq) {
1952 		*error = "Error creating pool's workqueue";
1953 		err_p = ERR_PTR(-ENOMEM);
1954 		goto bad_wq;
1955 	}
1956 
1957 	INIT_WORK(&pool->worker, do_worker);
1958 	INIT_DELAYED_WORK(&pool->waker, do_waker);
1959 	spin_lock_init(&pool->lock);
1960 	bio_list_init(&pool->deferred_bios);
1961 	bio_list_init(&pool->deferred_flush_bios);
1962 	INIT_LIST_HEAD(&pool->prepared_mappings);
1963 	INIT_LIST_HEAD(&pool->prepared_discards);
1964 	pool->low_water_triggered = false;
1965 	bio_list_init(&pool->retry_on_resume_list);
1966 
1967 	pool->shared_read_ds = dm_deferred_set_create();
1968 	if (!pool->shared_read_ds) {
1969 		*error = "Error creating pool's shared read deferred set";
1970 		err_p = ERR_PTR(-ENOMEM);
1971 		goto bad_shared_read_ds;
1972 	}
1973 
1974 	pool->all_io_ds = dm_deferred_set_create();
1975 	if (!pool->all_io_ds) {
1976 		*error = "Error creating pool's all io deferred set";
1977 		err_p = ERR_PTR(-ENOMEM);
1978 		goto bad_all_io_ds;
1979 	}
1980 
1981 	pool->next_mapping = NULL;
1982 	pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
1983 						      _new_mapping_cache);
1984 	if (!pool->mapping_pool) {
1985 		*error = "Error creating pool's mapping mempool";
1986 		err_p = ERR_PTR(-ENOMEM);
1987 		goto bad_mapping_pool;
1988 	}
1989 
1990 	pool->ref_count = 1;
1991 	pool->last_commit_jiffies = jiffies;
1992 	pool->pool_md = pool_md;
1993 	pool->md_dev = metadata_dev;
1994 	__pool_table_insert(pool);
1995 
1996 	return pool;
1997 
1998 bad_mapping_pool:
1999 	dm_deferred_set_destroy(pool->all_io_ds);
2000 bad_all_io_ds:
2001 	dm_deferred_set_destroy(pool->shared_read_ds);
2002 bad_shared_read_ds:
2003 	destroy_workqueue(pool->wq);
2004 bad_wq:
2005 	dm_kcopyd_client_destroy(pool->copier);
2006 bad_kcopyd_client:
2007 	dm_bio_prison_destroy(pool->prison);
2008 bad_prison:
2009 	kfree(pool);
2010 bad_pool:
2011 	if (dm_pool_metadata_close(pmd))
2012 		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2013 
2014 	return err_p;
2015 }
2016 
2017 static void __pool_inc(struct pool *pool)
2018 {
2019 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2020 	pool->ref_count++;
2021 }
2022 
2023 static void __pool_dec(struct pool *pool)
2024 {
2025 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2026 	BUG_ON(!pool->ref_count);
2027 	if (!--pool->ref_count)
2028 		__pool_destroy(pool);
2029 }
2030 
2031 static struct pool *__pool_find(struct mapped_device *pool_md,
2032 				struct block_device *metadata_dev,
2033 				unsigned long block_size, int read_only,
2034 				char **error, int *created)
2035 {
2036 	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
2037 
2038 	if (pool) {
2039 		if (pool->pool_md != pool_md) {
2040 			*error = "metadata device already in use by a pool";
2041 			return ERR_PTR(-EBUSY);
2042 		}
2043 		__pool_inc(pool);
2044 
2045 	} else {
2046 		pool = __pool_table_lookup(pool_md);
2047 		if (pool) {
2048 			if (pool->md_dev != metadata_dev) {
2049 				*error = "different pool cannot replace a pool";
2050 				return ERR_PTR(-EINVAL);
2051 			}
2052 			__pool_inc(pool);
2053 
2054 		} else {
2055 			pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
2056 			*created = 1;
2057 		}
2058 	}
2059 
2060 	return pool;
2061 }
2062 
2063 /*----------------------------------------------------------------
2064  * Pool target methods
2065  *--------------------------------------------------------------*/
2066 static void pool_dtr(struct dm_target *ti)
2067 {
2068 	struct pool_c *pt = ti->private;
2069 
2070 	mutex_lock(&dm_thin_pool_table.mutex);
2071 
2072 	unbind_control_target(pt->pool, ti);
2073 	__pool_dec(pt->pool);
2074 	dm_put_device(ti, pt->metadata_dev);
2075 	dm_put_device(ti, pt->data_dev);
2076 	kfree(pt);
2077 
2078 	mutex_unlock(&dm_thin_pool_table.mutex);
2079 }
2080 
2081 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
2082 			       struct dm_target *ti)
2083 {
2084 	int r;
2085 	unsigned argc;
2086 	const char *arg_name;
2087 
2088 	static struct dm_arg _args[] = {
2089 		{0, 4, "Invalid number of pool feature arguments"},
2090 	};
2091 
2092 	/*
2093 	 * No feature arguments supplied.
2094 	 */
2095 	if (!as->argc)
2096 		return 0;
2097 
2098 	r = dm_read_arg_group(_args, as, &argc, &ti->error);
2099 	if (r)
2100 		return -EINVAL;
2101 
2102 	while (argc && !r) {
2103 		arg_name = dm_shift_arg(as);
2104 		argc--;
2105 
2106 		if (!strcasecmp(arg_name, "skip_block_zeroing"))
2107 			pf->zero_new_blocks = false;
2108 
2109 		else if (!strcasecmp(arg_name, "ignore_discard"))
2110 			pf->discard_enabled = false;
2111 
2112 		else if (!strcasecmp(arg_name, "no_discard_passdown"))
2113 			pf->discard_passdown = false;
2114 
2115 		else if (!strcasecmp(arg_name, "read_only"))
2116 			pf->mode = PM_READ_ONLY;
2117 
2118 		else if (!strcasecmp(arg_name, "error_if_no_space"))
2119 			pf->error_if_no_space = true;
2120 
2121 		else {
2122 			ti->error = "Unrecognised pool feature requested";
2123 			r = -EINVAL;
2124 			break;
2125 		}
2126 	}
2127 
2128 	return r;
2129 }
2130 
2131 static void metadata_low_callback(void *context)
2132 {
2133 	struct pool *pool = context;
2134 
2135 	DMWARN("%s: reached low water mark for metadata device: sending event.",
2136 	       dm_device_name(pool->pool_md));
2137 
2138 	dm_table_event(pool->ti->table);
2139 }
2140 
2141 static sector_t get_dev_size(struct block_device *bdev)
2142 {
2143 	return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
2144 }
2145 
2146 static void warn_if_metadata_device_too_big(struct block_device *bdev)
2147 {
2148 	sector_t metadata_dev_size = get_dev_size(bdev);
2149 	char buffer[BDEVNAME_SIZE];
2150 
2151 	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
2152 		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2153 		       bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
2154 }
2155 
2156 static sector_t get_metadata_dev_size(struct block_device *bdev)
2157 {
2158 	sector_t metadata_dev_size = get_dev_size(bdev);
2159 
2160 	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
2161 		metadata_dev_size = THIN_METADATA_MAX_SECTORS;
2162 
2163 	return metadata_dev_size;
2164 }
2165 
2166 static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
2167 {
2168 	sector_t metadata_dev_size = get_metadata_dev_size(bdev);
2169 
2170 	sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
2171 
2172 	return metadata_dev_size;
2173 }
2174 
2175 /*
2176  * When a metadata threshold is crossed a dm event is triggered, and
2177  * userland should respond by growing the metadata device.  We could let
2178  * userland set the threshold, like we do with the data threshold, but I'm
2179  * not sure they know enough to do this well.
2180  */
2181 static dm_block_t calc_metadata_threshold(struct pool_c *pt)
2182 {
2183 	/*
2184 	 * 4M is ample for all ops with the possible exception of thin
2185 	 * device deletion which is harmless if it fails (just retry the
2186 	 * delete after you've grown the device).
2187 	 */
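	/*
	 * Worked example (sizes hypothetical): a metadata device holding
	 * 2048 metadata blocks gives quarter = 512, so the threshold is
	 * 512 blocks; for devices above 4096 blocks the 1024 block (4M)
	 * cap below wins.
	 */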
2188 	dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
2189 	return min((dm_block_t)1024ULL /* 4M */, quarter);
2190 }
2191 
2192 /*
2193  * thin-pool <metadata dev> <data dev>
2194  *	     <data block size (sectors)>
2195  *	     <low water mark (blocks)>
2196  *	     [<#feature args> [<arg>]*]
2197  *
2198  * Optional feature arguments are:
2199  *	     skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
2200  *	     ignore_discard: disable discard
2201  *	     no_discard_passdown: don't pass discards down to the data device
2202  *	     read_only: Don't allow any changes to be made to the pool metadata.
2203  *	     error_if_no_space: error IOs, instead of queueing, if no space.
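 *
 * For illustration only (device names and sizes hypothetical), a pool
 * table line might look like:
 *
 *   0 20971520 thin-pool /dev/mapper/meta /dev/mapper/data 128 32768
 *	1 skip_block_zeroing
 *
 * i.e. 128-sector (64KB) data blocks, a low water mark of 32768 blocks
 * and one feature argument.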
2204  */
2205 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2206 {
2207 	int r, pool_created = 0;
2208 	struct pool_c *pt;
2209 	struct pool *pool;
2210 	struct pool_features pf;
2211 	struct dm_arg_set as;
2212 	struct dm_dev *data_dev;
2213 	unsigned long block_size;
2214 	dm_block_t low_water_blocks;
2215 	struct dm_dev *metadata_dev;
2216 	fmode_t metadata_mode;
2217 
2218 	/*
2219 	 * FIXME Remove validation from scope of lock.
2220 	 */
2221 	mutex_lock(&dm_thin_pool_table.mutex);
2222 
2223 	if (argc < 4) {
2224 		ti->error = "Invalid argument count";
2225 		r = -EINVAL;
2226 		goto out_unlock;
2227 	}
2228 
2229 	as.argc = argc;
2230 	as.argv = argv;
2231 
2232 	/*
2233 	 * Set default pool features.
2234 	 */
2235 	pool_features_init(&pf);
2236 
2237 	dm_consume_args(&as, 4);
2238 	r = parse_pool_features(&as, &pf, ti);
2239 	if (r)
2240 		goto out_unlock;
2241 
2242 	metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
2243 	r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
2244 	if (r) {
2245 		ti->error = "Error opening metadata block device";
2246 		goto out_unlock;
2247 	}
2248 	warn_if_metadata_device_too_big(metadata_dev->bdev);
2249 
2250 	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
2251 	if (r) {
2252 		ti->error = "Error getting data device";
2253 		goto out_metadata;
2254 	}
2255 
2256 	if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
2257 	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2258 	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2259 	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2260 		ti->error = "Invalid block size";
2261 		r = -EINVAL;
2262 		goto out;
2263 	}
2264 
2265 	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
2266 		ti->error = "Invalid low water mark";
2267 		r = -EINVAL;
2268 		goto out;
2269 	}
2270 
2271 	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
2272 	if (!pt) {
2273 		r = -ENOMEM;
2274 		goto out;
2275 	}
2276 
2277 	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
2278 			   block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
2279 	if (IS_ERR(pool)) {
2280 		r = PTR_ERR(pool);
2281 		goto out_free_pt;
2282 	}
2283 
2284 	/*
2285 	 * 'pool_created' reflects whether this is the first table load.
2286 	 * Top level discard support is not allowed to be changed after the
2287 	 * initial load: changing it would require a pool reload to propagate
2288 	 * the change to the thin devices.
2289 	 */
2290 	if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
2291 		ti->error = "Discard support cannot be disabled once enabled";
2292 		r = -EINVAL;
2293 		goto out_flags_changed;
2294 	}
2295 
2296 	pt->pool = pool;
2297 	pt->ti = ti;
2298 	pt->metadata_dev = metadata_dev;
2299 	pt->data_dev = data_dev;
2300 	pt->low_water_blocks = low_water_blocks;
2301 	pt->adjusted_pf = pt->requested_pf = pf;
2302 	ti->num_flush_bios = 1;
2303 
2304 	/*
2305 	 * Only need to enable discards if the pool should pass
2306 	 * them down to the data device.  The thin device's discard
2307 	 * processing will cause mappings to be removed from the btree.
2308 	 */
2309 	ti->discard_zeroes_data_unsupported = true;
2310 	if (pf.discard_enabled && pf.discard_passdown) {
2311 		ti->num_discard_bios = 1;
2312 
2313 		/*
2314 		 * Setting 'discards_supported' circumvents the normal
2315 		 * stacking of discard limits (this keeps the pool and
2316 		 * thin devices' discard limits consistent).
2317 		 */
2318 		ti->discards_supported = true;
2319 	}
2320 	ti->private = pt;
2321 
2322 	r = dm_pool_register_metadata_threshold(pt->pool->pmd,
2323 						calc_metadata_threshold(pt),
2324 						metadata_low_callback,
2325 						pool);
2326 	if (r)
2327 		goto out_free_pt;
2328 
2329 	pt->callbacks.congested_fn = pool_is_congested;
2330 	dm_table_add_target_callbacks(ti->table, &pt->callbacks);
2331 
2332 	mutex_unlock(&dm_thin_pool_table.mutex);
2333 
2334 	return 0;
2335 
2336 out_flags_changed:
2337 	__pool_dec(pool);
2338 out_free_pt:
2339 	kfree(pt);
2340 out:
2341 	dm_put_device(ti, data_dev);
2342 out_metadata:
2343 	dm_put_device(ti, metadata_dev);
2344 out_unlock:
2345 	mutex_unlock(&dm_thin_pool_table.mutex);
2346 
2347 	return r;
2348 }
2349 
2350 static int pool_map(struct dm_target *ti, struct bio *bio)
2351 {
2352 	int r;
2353 	struct pool_c *pt = ti->private;
2354 	struct pool *pool = pt->pool;
2355 	unsigned long flags;
2356 
2357 	/*
2358 	 * As this is a singleton target, ti->begin is always zero.
2359 	 */
2360 	spin_lock_irqsave(&pool->lock, flags);
2361 	bio->bi_bdev = pt->data_dev->bdev;
2362 	r = DM_MAPIO_REMAPPED;
2363 	spin_unlock_irqrestore(&pool->lock, flags);
2364 
2365 	return r;
2366 }
2367 
2368 static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
2369 {
2370 	int r;
2371 	struct pool_c *pt = ti->private;
2372 	struct pool *pool = pt->pool;
2373 	sector_t data_size = ti->len;
2374 	dm_block_t sb_data_size;
2375 
2376 	*need_commit = false;
2377 
2378 	(void) sector_div(data_size, pool->sectors_per_block);
2379 
2380 	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2381 	if (r) {
2382 		DMERR("%s: failed to retrieve data device size",
2383 		      dm_device_name(pool->pool_md));
2384 		return r;
2385 	}
2386 
2387 	if (data_size < sb_data_size) {
2388 		DMERR("%s: pool target (%llu blocks) too small: expected %llu",
2389 		      dm_device_name(pool->pool_md),
2390 		      (unsigned long long)data_size, sb_data_size);
2391 		return -EINVAL;
2392 
2393 	} else if (data_size > sb_data_size) {
2394 		if (dm_pool_metadata_needs_check(pool->pmd)) {
2395 			DMERR("%s: unable to grow the data device until repaired.",
2396 			      dm_device_name(pool->pool_md));
2397 			return 0;
2398 		}
2399 
2400 		if (sb_data_size)
2401 			DMINFO("%s: growing the data device from %llu to %llu blocks",
2402 			       dm_device_name(pool->pool_md),
2403 			       sb_data_size, (unsigned long long)data_size);
2404 		r = dm_pool_resize_data_dev(pool->pmd, data_size);
2405 		if (r) {
2406 			metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
2407 			return r;
2408 		}
2409 
2410 		*need_commit = true;
2411 	}
2412 
2413 	return 0;
2414 }
2415 
2416 static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
2417 {
2418 	int r;
2419 	struct pool_c *pt = ti->private;
2420 	struct pool *pool = pt->pool;
2421 	dm_block_t metadata_dev_size, sb_metadata_dev_size;
2422 
2423 	*need_commit = false;
2424 
2425 	metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
2426 
2427 	r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
2428 	if (r) {
2429 		DMERR("%s: failed to retrieve metadata device size",
2430 		      dm_device_name(pool->pool_md));
2431 		return r;
2432 	}
2433 
2434 	if (metadata_dev_size < sb_metadata_dev_size) {
2435 		DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
2436 		      dm_device_name(pool->pool_md),
2437 		      metadata_dev_size, sb_metadata_dev_size);
2438 		return -EINVAL;
2439 
2440 	} else if (metadata_dev_size > sb_metadata_dev_size) {
2441 		if (dm_pool_metadata_needs_check(pool->pmd)) {
2442 			DMERR("%s: unable to grow the metadata device until repaired.",
2443 			      dm_device_name(pool->pool_md));
2444 			return 0;
2445 		}
2446 
2447 		warn_if_metadata_device_too_big(pool->md_dev);
2448 		DMINFO("%s: growing the metadata device from %llu to %llu blocks",
2449 		       dm_device_name(pool->pool_md),
2450 		       sb_metadata_dev_size, metadata_dev_size);
2451 		r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
2452 		if (r) {
2453 			metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
2454 			return r;
2455 		}
2456 
2457 		*need_commit = true;
2458 	}
2459 
2460 	return 0;
2461 }
2462 
2463 /*
2464  * Retrieves the number of data-device blocks recorded in the
2465  * superblock and compares it to the actual device size, resizing
2466  * the data device if it has grown.
2467  *
2468  * This copes both with the ctr opening a preallocated data device
2469  * that is then resumed,
2470  * -and-
2471  * with the resume method being called on its own after userspace
2472  * has grown the data device in reaction to a table event.
2473  */
2474 static int pool_preresume(struct dm_target *ti)
2475 {
2476 	int r;
2477 	bool need_commit1, need_commit2;
2478 	struct pool_c *pt = ti->private;
2479 	struct pool *pool = pt->pool;
2480 
2481 	/*
2482 	 * Take control of the pool object.
2483 	 */
2484 	r = bind_control_target(pool, ti);
2485 	if (r)
2486 		return r;
2487 
2488 	r = maybe_resize_data_dev(ti, &need_commit1);
2489 	if (r)
2490 		return r;
2491 
2492 	r = maybe_resize_metadata_dev(ti, &need_commit2);
2493 	if (r)
2494 		return r;
2495 
2496 	if (need_commit1 || need_commit2)
2497 		(void) commit(pool);
2498 
2499 	return 0;
2500 }
2501 
2502 static void pool_resume(struct dm_target *ti)
2503 {
2504 	struct pool_c *pt = ti->private;
2505 	struct pool *pool = pt->pool;
2506 	unsigned long flags;
2507 
2508 	spin_lock_irqsave(&pool->lock, flags);
2509 	pool->low_water_triggered = false;
2510 	__requeue_bios(pool);
2511 	spin_unlock_irqrestore(&pool->lock, flags);
2512 
2513 	do_waker(&pool->waker.work);
2514 }
2515 
2516 static void pool_postsuspend(struct dm_target *ti)
2517 {
2518 	struct pool_c *pt = ti->private;
2519 	struct pool *pool = pt->pool;
2520 
2521 	cancel_delayed_work(&pool->waker);
2522 	flush_workqueue(pool->wq);
2523 	(void) commit(pool);
2524 }
2525 
2526 static int check_arg_count(unsigned argc, unsigned args_required)
2527 {
2528 	if (argc != args_required) {
2529 		DMWARN("Message received with %u arguments instead of %u.",
2530 		       argc, args_required);
2531 		return -EINVAL;
2532 	}
2533 
2534 	return 0;
2535 }
2536 
2537 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2538 {
2539 	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2540 	    *dev_id <= MAX_DEV_ID)
2541 		return 0;
2542 
2543 	if (warning)
2544 		DMWARN("Message received with invalid device id: %s", arg);
2545 
2546 	return -EINVAL;
2547 }
2548 
2549 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2550 {
2551 	dm_thin_id dev_id;
2552 	int r;
2553 
2554 	r = check_arg_count(argc, 2);
2555 	if (r)
2556 		return r;
2557 
2558 	r = read_dev_id(argv[1], &dev_id, 1);
2559 	if (r)
2560 		return r;
2561 
2562 	r = dm_pool_create_thin(pool->pmd, dev_id);
2563 	if (r) {
2564 		DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2565 		       argv[1]);
2566 		return r;
2567 	}
2568 
2569 	return 0;
2570 }
2571 
2572 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2573 {
2574 	dm_thin_id dev_id;
2575 	dm_thin_id origin_dev_id;
2576 	int r;
2577 
2578 	r = check_arg_count(argc, 3);
2579 	if (r)
2580 		return r;
2581 
2582 	r = read_dev_id(argv[1], &dev_id, 1);
2583 	if (r)
2584 		return r;
2585 
2586 	r = read_dev_id(argv[2], &origin_dev_id, 1);
2587 	if (r)
2588 		return r;
2589 
2590 	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2591 	if (r) {
2592 		DMWARN("Creation of new snapshot %s of device %s failed.",
2593 		       argv[1], argv[2]);
2594 		return r;
2595 	}
2596 
2597 	return 0;
2598 }
2599 
2600 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2601 {
2602 	dm_thin_id dev_id;
2603 	int r;
2604 
2605 	r = check_arg_count(argc, 2);
2606 	if (r)
2607 		return r;
2608 
2609 	r = read_dev_id(argv[1], &dev_id, 1);
2610 	if (r)
2611 		return r;
2612 
2613 	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2614 	if (r)
2615 		DMWARN("Deletion of thin device %s failed.", argv[1]);
2616 
2617 	return r;
2618 }
2619 
2620 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2621 {
2622 	dm_thin_id old_id, new_id;
2623 	int r;
2624 
2625 	r = check_arg_count(argc, 3);
2626 	if (r)
2627 		return r;
2628 
2629 	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2630 		DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2631 		return -EINVAL;
2632 	}
2633 
2634 	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2635 		DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2636 		return -EINVAL;
2637 	}
2638 
2639 	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2640 	if (r) {
2641 		DMWARN("Failed to change transaction id from %s to %s.",
2642 		       argv[1], argv[2]);
2643 		return r;
2644 	}
2645 
2646 	return 0;
2647 }
2648 
2649 static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2650 {
2651 	int r;
2652 
2653 	r = check_arg_count(argc, 1);
2654 	if (r)
2655 		return r;
2656 
2657 	(void) commit(pool);
2658 
2659 	r = dm_pool_reserve_metadata_snap(pool->pmd);
2660 	if (r)
2661 		DMWARN("reserve_metadata_snap message failed.");
2662 
2663 	return r;
2664 }
2665 
2666 static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2667 {
2668 	int r;
2669 
2670 	r = check_arg_count(argc, 1);
2671 	if (r)
2672 		return r;
2673 
2674 	r = dm_pool_release_metadata_snap(pool->pmd);
2675 	if (r)
2676 		DMWARN("release_metadata_snap message failed.");
2677 
2678 	return r;
2679 }
2680 
2681 /*
2682  * Messages supported:
2683  *   create_thin	<dev_id>
2684  *   create_snap	<dev_id> <origin_id>
2685  *   delete		<dev_id>
2687  *   set_transaction_id <current_trans_id> <new_trans_id>
2688  *   reserve_metadata_snap
2689  *   release_metadata_snap
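 *
 * Messages are typically sent with dmsetup; for illustration only
 * (pool name hypothetical):
 *
 *   dmsetup message /dev/mapper/pool 0 create_thin 0
 *   dmsetup message /dev/mapper/pool 0 create_snap 1 0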
2690  */
2691 static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2692 {
2693 	int r = -EINVAL;
2694 	struct pool_c *pt = ti->private;
2695 	struct pool *pool = pt->pool;
2696 
2697 	if (!strcasecmp(argv[0], "create_thin"))
2698 		r = process_create_thin_mesg(argc, argv, pool);
2699 
2700 	else if (!strcasecmp(argv[0], "create_snap"))
2701 		r = process_create_snap_mesg(argc, argv, pool);
2702 
2703 	else if (!strcasecmp(argv[0], "delete"))
2704 		r = process_delete_mesg(argc, argv, pool);
2705 
2706 	else if (!strcasecmp(argv[0], "set_transaction_id"))
2707 		r = process_set_transaction_id_mesg(argc, argv, pool);
2708 
2709 	else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
2710 		r = process_reserve_metadata_snap_mesg(argc, argv, pool);
2711 
2712 	else if (!strcasecmp(argv[0], "release_metadata_snap"))
2713 		r = process_release_metadata_snap_mesg(argc, argv, pool);
2714 
2715 	else
2716 		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2717 
2718 	if (!r)
2719 		(void) commit(pool);
2720 
2721 	return r;
2722 }
2723 
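/*
 * Emit the non-default pool features as "<count> <flag>...".  For
 * example (illustrative), the default feature set emits just "0 ",
 * while a pool created with skip_block_zeroing and no_discard_passdown
 * emits "2 skip_block_zeroing no_discard_passdown ".
 */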
2724 static void emit_flags(struct pool_features *pf, char *result,
2725 		       unsigned sz, unsigned maxlen)
2726 {
2727 	unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2728 		!pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
2729 		pf->error_if_no_space;
2730 	DMEMIT("%u ", count);
2731 
2732 	if (!pf->zero_new_blocks)
2733 		DMEMIT("skip_block_zeroing ");
2734 
2735 	if (!pf->discard_enabled)
2736 		DMEMIT("ignore_discard ");
2737 
2738 	if (!pf->discard_passdown)
2739 		DMEMIT("no_discard_passdown ");
2740 
2741 	if (pf->mode == PM_READ_ONLY)
2742 		DMEMIT("read_only ");
2743 
2744 	if (pf->error_if_no_space)
2745 		DMEMIT("error_if_no_space ");
2746 }
2747 
2748 /*
2749  * Status line is:
2750  *    <transaction id> <used metadata blocks>/<total metadata blocks>
2751  *    <used data blocks>/<total data blocks> <held metadata root>
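 *
 * A purely illustrative STATUSTYPE_INFO line for a healthy pool might
 * read:
 *
 *   0 141/32768 90/131072 - rw discard_passdown queue_if_no_space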
2752  */
2753 static void pool_status(struct dm_target *ti, status_type_t type,
2754 			unsigned status_flags, char *result, unsigned maxlen)
2755 {
2756 	int r;
2757 	unsigned sz = 0;
2758 	uint64_t transaction_id;
2759 	dm_block_t nr_free_blocks_data;
2760 	dm_block_t nr_free_blocks_metadata;
2761 	dm_block_t nr_blocks_data;
2762 	dm_block_t nr_blocks_metadata;
2763 	dm_block_t held_root;
2764 	char buf[BDEVNAME_SIZE];
2765 	char buf2[BDEVNAME_SIZE];
2766 	struct pool_c *pt = ti->private;
2767 	struct pool *pool = pt->pool;
2768 
2769 	switch (type) {
2770 	case STATUSTYPE_INFO:
2771 		if (get_pool_mode(pool) == PM_FAIL) {
2772 			DMEMIT("Fail");
2773 			break;
2774 		}
2775 
2776 		/* Commit to ensure statistics aren't out-of-date */
2777 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
2778 			(void) commit(pool);
2779 
2780 		r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
2781 		if (r) {
2782 			DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
2783 			      dm_device_name(pool->pool_md), r);
2784 			goto err;
2785 		}
2786 
2787 		r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
2788 		if (r) {
2789 			DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
2790 			      dm_device_name(pool->pool_md), r);
2791 			goto err;
2792 		}
2793 
2794 		r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2795 		if (r) {
2796 			DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
2797 			      dm_device_name(pool->pool_md), r);
2798 			goto err;
2799 		}
2800 
2801 		r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
2802 		if (r) {
2803 			DMERR("%s: dm_pool_get_free_block_count returned %d",
2804 			      dm_device_name(pool->pool_md), r);
2805 			goto err;
2806 		}
2807 
2808 		r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2809 		if (r) {
2810 			DMERR("%s: dm_pool_get_data_dev_size returned %d",
2811 			      dm_device_name(pool->pool_md), r);
2812 			goto err;
2813 		}
2814 
2815 		r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
2816 		if (r) {
2817 			DMERR("%s: dm_pool_get_metadata_snap returned %d",
2818 			      dm_device_name(pool->pool_md), r);
2819 			goto err;
2820 		}
2821 
2822 		DMEMIT("%llu %llu/%llu %llu/%llu ",
2823 		       (unsigned long long)transaction_id,
2824 		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2825 		       (unsigned long long)nr_blocks_metadata,
2826 		       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2827 		       (unsigned long long)nr_blocks_data);
2828 
2829 		if (held_root)
2830 			DMEMIT("%llu ", held_root);
2831 		else
2832 			DMEMIT("- ");
2833 
2834 		if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
2835 			DMEMIT("out_of_data_space ");
2836 		else if (pool->pf.mode == PM_READ_ONLY)
2837 			DMEMIT("ro ");
2838 		else
2839 			DMEMIT("rw ");
2840 
2841 		if (!pool->pf.discard_enabled)
2842 			DMEMIT("ignore_discard ");
2843 		else if (pool->pf.discard_passdown)
2844 			DMEMIT("discard_passdown ");
2845 		else
2846 			DMEMIT("no_discard_passdown ");
2847 
2848 		if (pool->pf.error_if_no_space)
2849 			DMEMIT("error_if_no_space ");
2850 		else
2851 			DMEMIT("queue_if_no_space ");
2852 
2853 		break;
2854 
2855 	case STATUSTYPE_TABLE:
2856 		DMEMIT("%s %s %lu %llu ",
2857 		       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
2858 		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2859 		       (unsigned long)pool->sectors_per_block,
2860 		       (unsigned long long)pt->low_water_blocks);
2861 		emit_flags(&pt->requested_pf, result, sz, maxlen);
2862 		break;
2863 	}
2864 	return;
2865 
2866 err:
2867 	DMEMIT("Error");
2868 }
2869 
2870 static int pool_iterate_devices(struct dm_target *ti,
2871 				iterate_devices_callout_fn fn, void *data)
2872 {
2873 	struct pool_c *pt = ti->private;
2874 
2875 	return fn(ti, pt->data_dev, 0, ti->len, data);
2876 }
2877 
2878 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2879 		      struct bio_vec *biovec, int max_size)
2880 {
2881 	struct pool_c *pt = ti->private;
2882 	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2883 
2884 	if (!q->merge_bvec_fn)
2885 		return max_size;
2886 
2887 	bvm->bi_bdev = pt->data_dev->bdev;
2888 
2889 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2890 }
2891 
2892 static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
2893 {
2894 	struct pool *pool = pt->pool;
2895 	struct queue_limits *data_limits;
2896 
2897 	limits->max_discard_sectors = pool->sectors_per_block;
2898 
2899 	/*
2900 	 * discard_granularity is just a hint, and not enforced.
2901 	 */
2902 	if (pt->adjusted_pf.discard_passdown) {
2903 		data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
2904 		limits->discard_granularity = data_limits->discard_granularity;
2905 	} else
2906 		limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2907 }
2908 
2909 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2910 {
2911 	struct pool_c *pt = ti->private;
2912 	struct pool *pool = pt->pool;
2913 	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
2914 
2915 	/*
2916 	 * If the system-determined stacked limits are compatible with the
2917 	 * pool's blocksize (io_opt is a factor), do not override them.
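	 * e.g. (illustrative numbers): with 1024-sector blocks a stacked
	 * io_opt of 2048 sectors is kept, while 1536 sectors would be
	 * overridden below.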
2918 	 */
2919 	if (io_opt_sectors < pool->sectors_per_block ||
2920 	    do_div(io_opt_sectors, pool->sectors_per_block)) {
2921 		blk_limits_io_min(limits, 0);
2922 		blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2923 	}
2924 
2925 	/*
2926 	 * pt->adjusted_pf is a staging area for the actual features to use.
2927 	 * They get transferred to the live pool in bind_control_target()
2928 	 * called from pool_preresume().
2929 	 */
2930 	if (!pt->adjusted_pf.discard_enabled) {
2931 		/*
2932 		 * Must explicitly disallow stacking discard limits otherwise the
2933 		 * block layer will stack them if the pool's data device has support.
2934 		 * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the
2935 		 * user to see that, so make sure to set all discard limits to 0.
2936 		 */
2937 		limits->discard_granularity = 0;
2938 		return;
2939 	}
2940 
2941 	disable_passdown_if_not_supported(pt);
2942 
2943 	set_discard_limits(pt, limits);
2944 }
2945 
2946 static struct target_type pool_target = {
2947 	.name = "thin-pool",
2948 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2949 		    DM_TARGET_IMMUTABLE,
2950 	.version = {1, 11, 0},
2951 	.module = THIS_MODULE,
2952 	.ctr = pool_ctr,
2953 	.dtr = pool_dtr,
2954 	.map = pool_map,
2955 	.postsuspend = pool_postsuspend,
2956 	.preresume = pool_preresume,
2957 	.resume = pool_resume,
2958 	.message = pool_message,
2959 	.status = pool_status,
2960 	.merge = pool_merge,
2961 	.iterate_devices = pool_iterate_devices,
2962 	.io_hints = pool_io_hints,
2963 };
2964 
2965 /*----------------------------------------------------------------
2966  * Thin target methods
2967  *--------------------------------------------------------------*/
2968 static void thin_dtr(struct dm_target *ti)
2969 {
2970 	struct thin_c *tc = ti->private;
2971 
2972 	mutex_lock(&dm_thin_pool_table.mutex);
2973 
2974 	__pool_dec(tc->pool);
2975 	dm_pool_close_thin_device(tc->td);
2976 	dm_put_device(ti, tc->pool_dev);
2977 	if (tc->origin_dev)
2978 		dm_put_device(ti, tc->origin_dev);
2979 	kfree(tc);
2980 
2981 	mutex_unlock(&dm_thin_pool_table.mutex);
2982 }
2983 
2984 /*
2985  * Thin target parameters:
2986  *
2987  * <pool_dev> <dev_id> [origin_dev]
2988  *
2989  * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
2990  * dev_id: the internal device identifier
2991  * origin_dev: a device external to the pool that should act as the origin
2992  *
2993  * If the pool device has discards disabled, they get disabled for the thin
2994  * device as well.
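 *
 * For illustration only (names and sizes hypothetical), after sending a
 * "create_thin 0" message to the pool, a thin table line might be:
 *
 *   0 2097152 thin /dev/mapper/pool 0
 *
 * which presents a 1GB thin device backed by internal device id 0.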
2995  */
2996 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2997 {
2998 	int r;
2999 	struct thin_c *tc;
3000 	struct dm_dev *pool_dev, *origin_dev;
3001 	struct mapped_device *pool_md;
3002 
3003 	mutex_lock(&dm_thin_pool_table.mutex);
3004 
3005 	if (argc != 2 && argc != 3) {
3006 		ti->error = "Invalid argument count";
3007 		r = -EINVAL;
3008 		goto out_unlock;
3009 	}
3010 
3011 	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
3012 	if (!tc) {
3013 		ti->error = "Out of memory";
3014 		r = -ENOMEM;
3015 		goto out_unlock;
3016 	}
3017 
3018 	if (argc == 3) {
3019 		r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
3020 		if (r) {
3021 			ti->error = "Error opening origin device";
3022 			goto bad_origin_dev;
3023 		}
3024 		tc->origin_dev = origin_dev;
3025 	}
3026 
3027 	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
3028 	if (r) {
3029 		ti->error = "Error opening pool device";
3030 		goto bad_pool_dev;
3031 	}
3032 	tc->pool_dev = pool_dev;
3033 
3034 	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
3035 		ti->error = "Invalid device id";
3036 		r = -EINVAL;
3037 		goto bad_common;
3038 	}
3039 
3040 	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
3041 	if (!pool_md) {
3042 		ti->error = "Couldn't get pool mapped device";
3043 		r = -EINVAL;
3044 		goto bad_common;
3045 	}
3046 
3047 	tc->pool = __pool_table_lookup(pool_md);
3048 	if (!tc->pool) {
3049 		ti->error = "Couldn't find pool object";
3050 		r = -EINVAL;
3051 		goto bad_pool_lookup;
3052 	}
3053 	__pool_inc(tc->pool);
3054 
3055 	if (get_pool_mode(tc->pool) == PM_FAIL) {
3056 		ti->error = "Couldn't open thin device, Pool is in fail mode";
3057 		r = -EINVAL;
3058 		goto bad_thin_open;
3059 	}
3060 
3061 	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
3062 	if (r) {
3063 		ti->error = "Couldn't open thin internal device";
3064 		goto bad_thin_open;
3065 	}
3066 
3067 	r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
3068 	if (r)
3069 		goto bad_target_max_io_len;
3070 
3071 	ti->num_flush_bios = 1;
3072 	ti->flush_supported = true;
3073 	ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
3074 
3075 	/* If the pool supports discards, pass them on. */
3076 	ti->discard_zeroes_data_unsupported = true;
3077 	if (tc->pool->pf.discard_enabled) {
3078 		ti->discards_supported = true;
3079 		ti->num_discard_bios = 1;
3080 		/* Discard bios must be split on a block boundary */
3081 		ti->split_discard_bios = true;
3082 	}
3083 
3084 	dm_put(pool_md);
3085 
3086 	mutex_unlock(&dm_thin_pool_table.mutex);
3087 
3088 	return 0;
3089 
3090 bad_target_max_io_len:
3091 	dm_pool_close_thin_device(tc->td);
3092 bad_thin_open:
3093 	__pool_dec(tc->pool);
3094 bad_pool_lookup:
3095 	dm_put(pool_md);
3096 bad_common:
3097 	dm_put_device(ti, tc->pool_dev);
3098 bad_pool_dev:
3099 	if (tc->origin_dev)
3100 		dm_put_device(ti, tc->origin_dev);
3101 bad_origin_dev:
3102 	kfree(tc);
3103 out_unlock:
3104 	mutex_unlock(&dm_thin_pool_table.mutex);
3105 
3106 	return r;
3107 }
3108 
3109 static int thin_map(struct dm_target *ti, struct bio *bio)
3110 {
3111 	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
3112 
3113 	return thin_bio_map(ti, bio);
3114 }
3115 
3116 static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
3117 {
3118 	unsigned long flags;
3119 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
3120 	struct list_head work;
3121 	struct dm_thin_new_mapping *m, *tmp;
3122 	struct pool *pool = h->tc->pool;
3123 
3124 	if (h->shared_read_entry) {
3125 		INIT_LIST_HEAD(&work);
3126 		dm_deferred_entry_dec(h->shared_read_entry, &work);
3127 
3128 		spin_lock_irqsave(&pool->lock, flags);
3129 		list_for_each_entry_safe(m, tmp, &work, list) {
3130 			list_del(&m->list);
3131 			m->quiesced = true;
3132 			__maybe_add_mapping(m);
3133 		}
3134 		spin_unlock_irqrestore(&pool->lock, flags);
3135 	}
3136 
3137 	if (h->all_io_entry) {
3138 		INIT_LIST_HEAD(&work);
3139 		dm_deferred_entry_dec(h->all_io_entry, &work);
3140 		if (!list_empty(&work)) {
3141 			spin_lock_irqsave(&pool->lock, flags);
3142 			list_for_each_entry_safe(m, tmp, &work, list)
3143 				list_add_tail(&m->list, &pool->prepared_discards);
3144 			spin_unlock_irqrestore(&pool->lock, flags);
3145 			wake_worker(pool);
3146 		}
3147 	}
3148 
3149 	return 0;
3150 }
3151 
3152 static void thin_presuspend(struct dm_target *ti)
3153 {
3154 	struct thin_c *tc = ti->private;
3155 
3156 	if (dm_noflush_suspending(ti))
3157 		noflush_work(tc, do_noflush_start);
3158 }
3159 
3160 static void thin_postsuspend(struct dm_target *ti)
3161 {
3162 	struct thin_c *tc = ti->private;
3163 
3164 	/*
3165 	 * The dm_noflush_suspending flag has been cleared by now, so
3166 	 * unfortunately we must always run this.
3167 	 */
3168 	noflush_work(tc, do_noflush_stop);
3169 }
3170 
3171 /*
3172  * <nr mapped sectors> <highest mapped sector>
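 *
 * e.g. (illustrative) with 128-sector blocks, 8 mapped blocks and a
 * highest mapped block of 15 this emits "1024 2047".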
3173  */
3174 static void thin_status(struct dm_target *ti, status_type_t type,
3175 			unsigned status_flags, char *result, unsigned maxlen)
3176 {
3177 	int r;
3178 	ssize_t sz = 0;
3179 	dm_block_t mapped, highest;
3180 	char buf[BDEVNAME_SIZE];
3181 	struct thin_c *tc = ti->private;
3182 
3183 	if (get_pool_mode(tc->pool) == PM_FAIL) {
3184 		DMEMIT("Fail");
3185 		return;
3186 	}
3187 
3188 	if (!tc->td)
3189 		DMEMIT("-");
3190 	else {
3191 		switch (type) {
3192 		case STATUSTYPE_INFO:
3193 			r = dm_thin_get_mapped_count(tc->td, &mapped);
3194 			if (r) {
3195 				DMERR("dm_thin_get_mapped_count returned %d", r);
3196 				goto err;
3197 			}
3198 
3199 			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
3200 			if (r < 0) {
3201 				DMERR("dm_thin_get_highest_mapped_block returned %d", r);
3202 				goto err;
3203 			}
3204 
3205 			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
3206 			if (r)
3207 				DMEMIT("%llu", ((highest + 1) *
3208 						tc->pool->sectors_per_block) - 1);
3209 			else
3210 				DMEMIT("-");
3211 			break;
3212 
3213 		case STATUSTYPE_TABLE:
3214 			DMEMIT("%s %lu",
3215 			       format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
3216 			       (unsigned long) tc->dev_id);
3217 			if (tc->origin_dev)
3218 				DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
3219 			break;
3220 		}
3221 	}
3222 
3223 	return;
3224 
3225 err:
3226 	DMEMIT("Error");
3227 }
3228 
3229 static int thin_iterate_devices(struct dm_target *ti,
3230 				iterate_devices_callout_fn fn, void *data)
3231 {
3232 	sector_t blocks;
3233 	struct thin_c *tc = ti->private;
3234 	struct pool *pool = tc->pool;
3235 
3236 	/*
3237 	 * We can't call dm_pool_get_data_dev_size() since that blocks.  So
3238 	 * we follow a more convoluted path through to the pool's target.
3239 	 */
3240 	if (!pool->ti)
3241 		return 0;	/* nothing is bound */
3242 
3243 	blocks = pool->ti->len;
3244 	(void) sector_div(blocks, pool->sectors_per_block);
3245 	if (blocks)
3246 		return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
3247 
3248 	return 0;
3249 }
3250 
3251 static struct target_type thin_target = {
3252 	.name = "thin",
3253 	.version = {1, 11, 0},
3254 	.module	= THIS_MODULE,
3255 	.ctr = thin_ctr,
3256 	.dtr = thin_dtr,
3257 	.map = thin_map,
3258 	.end_io = thin_endio,
3259 	.presuspend = thin_presuspend,
3260 	.postsuspend = thin_postsuspend,
3261 	.status = thin_status,
3262 	.iterate_devices = thin_iterate_devices,
3263 };
3264 
3265 /*----------------------------------------------------------------*/
3266 
3267 static int __init dm_thin_init(void)
3268 {
3269 	int r;
3270 
3271 	pool_table_init();
3272 
3273 	r = dm_register_target(&thin_target);
3274 	if (r)
3275 		return r;
3276 
3277 	r = dm_register_target(&pool_target);
3278 	if (r)
3279 		goto bad_pool_target;
3280 
3281 	r = -ENOMEM;
3282 
3283 	_new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
3284 	if (!_new_mapping_cache)
3285 		goto bad_new_mapping_cache;
3286 
3287 	return 0;
3288 
3289 bad_new_mapping_cache:
3290 	dm_unregister_target(&pool_target);
3291 bad_pool_target:
3292 	dm_unregister_target(&thin_target);
3293 
3294 	return r;
3295 }
3296 
3297 static void dm_thin_exit(void)
3298 {
3299 	dm_unregister_target(&thin_target);
3300 	dm_unregister_target(&pool_target);
3301 
3302 	kmem_cache_destroy(_new_mapping_cache);
3303 }
3304 
3305 module_init(dm_thin_init);
3306 module_exit(dm_thin_exit);
3307 
3308 MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
3309 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3310 MODULE_LICENSE("GPL");
3311