xref: /openbmc/linux/drivers/md/dm-thin.c (revision d2999e1b)
1 /*
2  * Copyright (C) 2011-2012 Red Hat UK.
3  *
4  * This file is released under the GPL.
5  */
6 
7 #include "dm-thin-metadata.h"
8 #include "dm-bio-prison.h"
9 #include "dm.h"
10 
11 #include <linux/device-mapper.h>
12 #include <linux/dm-io.h>
13 #include <linux/dm-kcopyd.h>
14 #include <linux/list.h>
15 #include <linux/rculist.h>
16 #include <linux/init.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/rbtree.h>
20 
21 #define	DM_MSG_PREFIX	"thin"
22 
23 /*
24  * Tunable constants
25  */
26 #define ENDIO_HOOK_POOL_SIZE 1024
27 #define MAPPING_POOL_SIZE 1024
28 #define PRISON_CELLS 1024
29 #define COMMIT_PERIOD HZ
30 #define NO_SPACE_TIMEOUT_SECS 60
31 
32 static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;
33 
34 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
35 		"A percentage of time allocated for copy on write");
36 
37 /*
38  * The block size of the device holding pool data must be
39  * between 64KB and 1GB.
40  */
41 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
42 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
43 
44 /*
45  * Device id is restricted to 24 bits.
46  */
47 #define MAX_DEV_ID ((1 << 24) - 1)
48 
49 /*
50  * How do we handle breaking sharing of data blocks?
51  * =================================================
52  *
53  * We use a standard copy-on-write btree to store the mappings for the
54  * devices (note I'm talking about copy-on-write of the metadata here, not
55  * the data).  When you take an internal snapshot you clone the root node
56  * of the origin btree.  After this there is no concept of an origin or a
57  * snapshot.  They are just two device trees that happen to point to the
58  * same data blocks.
59  *
60  * When we get a write in we decide if it's to a shared data block using
61  * some timestamp magic.  If it is, we have to break sharing.
62  *
63  * Let's say we write to a shared block in what was the origin.  The
64  * steps are:
65  *
66  * i) plug further io to this physical block. (see bio_prison code).
67  *
68  * ii) quiesce any read io to that shared data block.  Obviously
69  * including all devices that share this block.  (see dm_deferred_set code)
70  *
71  * iii) copy the data block to a newly allocated block.  This step can be
72  * skipped if the io covers the whole block. (schedule_copy).
73  *
74  * iv) insert the new mapping into the origin's btree
75  * (process_prepared_mapping).  This act of inserting breaks some
76  * sharing of btree nodes between the two devices.  Breaking sharing only
77  * affects the btree of that specific device.  Btrees for the other
78  * devices that share the block never change.  The btree for the origin
79  * device as it was after the last commit is untouched, ie. we're using
80  * persistent data structures in the functional programming sense.
81  *
82  * v) unplug io to this physical block, including the io that triggered
83  * the breaking of sharing.
84  *
85  * Steps (ii) and (iii) occur in parallel.
86  *
87  * The metadata _doesn't_ need to be committed before the io continues.  We
88  * get away with this because the io is always written to a _new_ block.
89  * If there's a crash, then:
90  *
91  * - The origin mapping will point to the old origin block (the shared
92  * one).  This will contain the data as it was before the io that triggered
93  * the breaking of sharing came in.
94  *
95  * - The snap mapping still points to the old block.  As it would after
96  * the commit.
97  *
98  * The downside of this scheme is that the timestamp magic isn't perfect:
99  * it will continue to think the data block in the snapshot device is shared
100  * even after the write to the origin has broken sharing.  I suspect data
101  * blocks will typically be shared by many different devices, so we're
102  * breaking sharing n + 1 times, rather than n, where n is the number of
103  * devices that reference this data block.  At the moment I think the
104  * benefits far, far outweigh the disadvantages.
105  */
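
/*
 * In the code below, completion of step (ii) is tracked by the 'quiesced'
 * flag and completion of step (iii) by the 'prepared' flag of
 * struct dm_thin_new_mapping; __maybe_add_mapping() only hands the mapping
 * to the worker for step (iv) once both are set.
 */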
106 
107 /*----------------------------------------------------------------*/
108 
109 /*
110  * Key building.
111  */
112 static void build_data_key(struct dm_thin_device *td,
113 			   dm_block_t b, struct dm_cell_key *key)
114 {
115 	key->virtual = 0;
116 	key->dev = dm_thin_dev_id(td);
117 	key->block = b;
118 }
119 
120 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
121 			      struct dm_cell_key *key)
122 {
123 	key->virtual = 1;
124 	key->dev = dm_thin_dev_id(td);
125 	key->block = b;
126 }
127 
128 /*----------------------------------------------------------------*/
129 
130 /*
131  * A pool device ties together a metadata device and a data device.  It
132  * also provides the interface for creating and destroying internal
133  * devices.
134  */
135 struct dm_thin_new_mapping;
136 
137 /*
138  * The pool runs in 4 modes, ordered from least to most degraded for comparisons.
139  */
140 enum pool_mode {
141 	PM_WRITE,		/* metadata may be changed */
142 	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */
143 	PM_READ_ONLY,		/* metadata may not be changed */
144 	PM_FAIL,		/* all I/O fails */
145 };
146 
147 struct pool_features {
148 	enum pool_mode mode;
149 
150 	bool zero_new_blocks:1;
151 	bool discard_enabled:1;
152 	bool discard_passdown:1;
153 	bool error_if_no_space:1;
154 };
155 
156 struct thin_c;
157 typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
158 typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
159 
160 struct pool {
161 	struct list_head list;
162 	struct dm_target *ti;	/* Only set if a pool target is bound */
163 
164 	struct mapped_device *pool_md;
165 	struct block_device *md_dev;
166 	struct dm_pool_metadata *pmd;
167 
168 	dm_block_t low_water_blocks;
169 	uint32_t sectors_per_block;
170 	int sectors_per_block_shift;
171 
172 	struct pool_features pf;
173 	bool low_water_triggered:1;	/* A dm event has been sent */
174 
175 	struct dm_bio_prison *prison;
176 	struct dm_kcopyd_client *copier;
177 
178 	struct workqueue_struct *wq;
179 	struct work_struct worker;
180 	struct delayed_work waker;
181 	struct delayed_work no_space_timeout;
182 
183 	unsigned long last_commit_jiffies;
184 	unsigned ref_count;
185 
186 	spinlock_t lock;
187 	struct bio_list deferred_flush_bios;
188 	struct list_head prepared_mappings;
189 	struct list_head prepared_discards;
190 	struct list_head active_thins;
191 
192 	struct dm_deferred_set *shared_read_ds;
193 	struct dm_deferred_set *all_io_ds;
194 
195 	struct dm_thin_new_mapping *next_mapping;
196 	mempool_t *mapping_pool;
197 
198 	process_bio_fn process_bio;
199 	process_bio_fn process_discard;
200 
201 	process_mapping_fn process_prepared_mapping;
202 	process_mapping_fn process_prepared_discard;
203 };
204 
205 static enum pool_mode get_pool_mode(struct pool *pool);
206 static void metadata_operation_failed(struct pool *pool, const char *op, int r);
207 
208 /*
209  * Target context for a pool.
210  */
211 struct pool_c {
212 	struct dm_target *ti;
213 	struct pool *pool;
214 	struct dm_dev *data_dev;
215 	struct dm_dev *metadata_dev;
216 	struct dm_target_callbacks callbacks;
217 
218 	dm_block_t low_water_blocks;
219 	struct pool_features requested_pf; /* Features requested during table load */
220 	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
221 };
222 
223 /*
224  * Target context for a thin.
225  */
226 struct thin_c {
227 	struct list_head list;
228 	struct dm_dev *pool_dev;
229 	struct dm_dev *origin_dev;
230 	dm_thin_id dev_id;
231 
232 	struct pool *pool;
233 	struct dm_thin_device *td;
234 	bool requeue_mode:1;
235 	spinlock_t lock;
236 	struct bio_list deferred_bio_list;
237 	struct bio_list retry_on_resume_list;
238 	struct rb_root sort_bio_list; /* sorted list of deferred bios */
239 
240 	/*
241 	 * Ensures the thin is not destroyed until the worker has finished
242 	 * iterating the active_thins list.
243 	 */
244 	atomic_t refcount;
245 	struct completion can_destroy;
246 };
247 
248 /*----------------------------------------------------------------*/
249 
250 /*
251  * wake_worker() is used when new work is queued and when pool_resume is
252  * ready to continue deferred IO processing.
253  */
254 static void wake_worker(struct pool *pool)
255 {
256 	queue_work(pool->wq, &pool->worker);
257 }
258 
259 /*----------------------------------------------------------------*/
260 
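/*
 * Try to take ownership of the region described by @key.  Returns 0 if the
 * bio is now the exclusive holder of a freshly allocated cell, or non-zero
 * if the region was already detained, in which case the bio has been queued
 * on the existing cell.
 */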
261 static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
262 		      struct dm_bio_prison_cell **cell_result)
263 {
264 	int r;
265 	struct dm_bio_prison_cell *cell_prealloc;
266 
267 	/*
268 	 * Allocate a cell from the prison's mempool.
269 	 * This might block but it can't fail.
270 	 */
271 	cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
272 
273 	r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
274 	if (r)
275 		/*
276 		 * We reused an old cell; we can get rid of
277 		 * the new one.
278 		 */
279 		dm_bio_prison_free_cell(pool->prison, cell_prealloc);
280 
281 	return r;
282 }
283 
284 static void cell_release(struct pool *pool,
285 			 struct dm_bio_prison_cell *cell,
286 			 struct bio_list *bios)
287 {
288 	dm_cell_release(pool->prison, cell, bios);
289 	dm_bio_prison_free_cell(pool->prison, cell);
290 }
291 
292 static void cell_release_no_holder(struct pool *pool,
293 				   struct dm_bio_prison_cell *cell,
294 				   struct bio_list *bios)
295 {
296 	dm_cell_release_no_holder(pool->prison, cell, bios);
297 	dm_bio_prison_free_cell(pool->prison, cell);
298 }
299 
300 static void cell_defer_no_holder_no_free(struct thin_c *tc,
301 					 struct dm_bio_prison_cell *cell)
302 {
303 	struct pool *pool = tc->pool;
304 	unsigned long flags;
305 
306 	spin_lock_irqsave(&tc->lock, flags);
307 	dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list);
308 	spin_unlock_irqrestore(&tc->lock, flags);
309 
310 	wake_worker(pool);
311 }
312 
313 static void cell_error_with_code(struct pool *pool,
314 				 struct dm_bio_prison_cell *cell, int error_code)
315 {
316 	dm_cell_error(pool->prison, cell, error_code);
317 	dm_bio_prison_free_cell(pool->prison, cell);
318 }
319 
320 static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
321 {
322 	cell_error_with_code(pool, cell, -EIO);
323 }
324 
325 /*----------------------------------------------------------------*/
326 
327 /*
328  * A global list of pools that uses a struct mapped_device as a key.
329  */
330 static struct dm_thin_pool_table {
331 	struct mutex mutex;
332 	struct list_head pools;
333 } dm_thin_pool_table;
334 
335 static void pool_table_init(void)
336 {
337 	mutex_init(&dm_thin_pool_table.mutex);
338 	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
339 }
340 
341 static void __pool_table_insert(struct pool *pool)
342 {
343 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
344 	list_add(&pool->list, &dm_thin_pool_table.pools);
345 }
346 
347 static void __pool_table_remove(struct pool *pool)
348 {
349 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
350 	list_del(&pool->list);
351 }
352 
353 static struct pool *__pool_table_lookup(struct mapped_device *md)
354 {
355 	struct pool *pool = NULL, *tmp;
356 
357 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
358 
359 	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
360 		if (tmp->pool_md == md) {
361 			pool = tmp;
362 			break;
363 		}
364 	}
365 
366 	return pool;
367 }
368 
369 static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
370 {
371 	struct pool *pool = NULL, *tmp;
372 
373 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
374 
375 	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
376 		if (tmp->md_dev == md_dev) {
377 			pool = tmp;
378 			break;
379 		}
380 	}
381 
382 	return pool;
383 }
384 
385 /*----------------------------------------------------------------*/
386 
387 struct dm_thin_endio_hook {
388 	struct thin_c *tc;
389 	struct dm_deferred_entry *shared_read_entry;
390 	struct dm_deferred_entry *all_io_entry;
391 	struct dm_thin_new_mapping *overwrite_mapping;
392 	struct rb_node rb_node;
393 };
394 
395 static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
396 {
397 	struct bio *bio;
398 	struct bio_list bios;
399 	unsigned long flags;
400 
401 	bio_list_init(&bios);
402 
403 	spin_lock_irqsave(&tc->lock, flags);
404 	bio_list_merge(&bios, master);
405 	bio_list_init(master);
406 	spin_unlock_irqrestore(&tc->lock, flags);
407 
408 	while ((bio = bio_list_pop(&bios)))
409 		bio_endio(bio, DM_ENDIO_REQUEUE);
410 }
411 
412 static void requeue_io(struct thin_c *tc)
413 {
414 	requeue_bio_list(tc, &tc->deferred_bio_list);
415 	requeue_bio_list(tc, &tc->retry_on_resume_list);
416 }
417 
418 static void error_thin_retry_list(struct thin_c *tc)
419 {
420 	struct bio *bio;
421 	unsigned long flags;
422 	struct bio_list bios;
423 
424 	bio_list_init(&bios);
425 
426 	spin_lock_irqsave(&tc->lock, flags);
427 	bio_list_merge(&bios, &tc->retry_on_resume_list);
428 	bio_list_init(&tc->retry_on_resume_list);
429 	spin_unlock_irqrestore(&tc->lock, flags);
430 
431 	while ((bio = bio_list_pop(&bios)))
432 		bio_io_error(bio);
433 }
434 
435 static void error_retry_list(struct pool *pool)
436 {
437 	struct thin_c *tc;
438 
439 	rcu_read_lock();
440 	list_for_each_entry_rcu(tc, &pool->active_thins, list)
441 		error_thin_retry_list(tc);
442 	rcu_read_unlock();
443 }
444 
445 /*
446  * This section of code contains the logic for processing a thin device's IO.
447  * Much of the code depends on pool object resources (lists, workqueues, etc)
448  * but most is exclusively called from the thin target rather than the thin-pool
449  * target.
450  */
451 
452 static bool block_size_is_power_of_two(struct pool *pool)
453 {
454 	return pool->sectors_per_block_shift >= 0;
455 }
456 
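/*
 * Map a bio's starting sector within the thin device to the corresponding
 * virtual block number.  For example, with a 64KB (128 sector) block size,
 * sector 300 lands in block 2.
 */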
457 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
458 {
459 	struct pool *pool = tc->pool;
460 	sector_t block_nr = bio->bi_iter.bi_sector;
461 
462 	if (block_size_is_power_of_two(pool))
463 		block_nr >>= pool->sectors_per_block_shift;
464 	else
465 		(void) sector_div(block_nr, pool->sectors_per_block);
466 
467 	return block_nr;
468 }
469 
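/*
 * Redirect a bio to the pool's data device, placing it at the given data
 * block while preserving its offset within the block.  For example, with
 * 128-sector blocks, block 5 and an input sector of 300 yield data-device
 * sector 5 * 128 + (300 & 127) = 684.
 */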
470 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
471 {
472 	struct pool *pool = tc->pool;
473 	sector_t bi_sector = bio->bi_iter.bi_sector;
474 
475 	bio->bi_bdev = tc->pool_dev->bdev;
476 	if (block_size_is_power_of_two(pool))
477 		bio->bi_iter.bi_sector =
478 			(block << pool->sectors_per_block_shift) |
479 			(bi_sector & (pool->sectors_per_block - 1));
480 	else
481 		bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
482 				 sector_div(bi_sector, pool->sectors_per_block);
483 }
484 
485 static void remap_to_origin(struct thin_c *tc, struct bio *bio)
486 {
487 	bio->bi_bdev = tc->origin_dev->bdev;
488 }
489 
490 static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
491 {
492 	return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
493 		dm_thin_changed_this_transaction(tc->td);
494 }
495 
496 static void inc_all_io_entry(struct pool *pool, struct bio *bio)
497 {
498 	struct dm_thin_endio_hook *h;
499 
500 	if (bio->bi_rw & REQ_DISCARD)
501 		return;
502 
503 	h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
504 	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
505 }
506 
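/*
 * Send a bio to the data device.  Bios that would require a metadata commit
 * (FLUSH/FUA while the transaction has changed) are batched on
 * deferred_flush_bios and issued after the commit in process_deferred_bios().
 */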
507 static void issue(struct thin_c *tc, struct bio *bio)
508 {
509 	struct pool *pool = tc->pool;
510 	unsigned long flags;
511 
512 	if (!bio_triggers_commit(tc, bio)) {
513 		generic_make_request(bio);
514 		return;
515 	}
516 
517 	/*
518 	 * Complete bio with an error if earlier I/O caused changes to
519 	 * the metadata that can't be committed, e.g. due to I/O errors
520 	 * on the metadata device.
521 	 */
522 	if (dm_thin_aborted_changes(tc->td)) {
523 		bio_io_error(bio);
524 		return;
525 	}
526 
527 	/*
528 	 * Batch together any bios that trigger commits and then issue a
529 	 * single commit for them in process_deferred_bios().
530 	 */
531 	spin_lock_irqsave(&pool->lock, flags);
532 	bio_list_add(&pool->deferred_flush_bios, bio);
533 	spin_unlock_irqrestore(&pool->lock, flags);
534 }
535 
536 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
537 {
538 	remap_to_origin(tc, bio);
539 	issue(tc, bio);
540 }
541 
542 static void remap_and_issue(struct thin_c *tc, struct bio *bio,
543 			    dm_block_t block)
544 {
545 	remap(tc, bio, block);
546 	issue(tc, bio);
547 }
548 
549 /*----------------------------------------------------------------*/
550 
551 /*
552  * Bio endio functions.
553  */
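/*
 * Describes a data block that is being zeroed, copied or overwritten prior
 * to its new mapping being inserted into the metadata.
 */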
554 struct dm_thin_new_mapping {
555 	struct list_head list;
556 
557 	bool quiesced:1;
558 	bool prepared:1;
559 	bool pass_discard:1;
560 	bool definitely_not_shared:1;
561 
562 	int err;
563 	struct thin_c *tc;
564 	dm_block_t virt_block;
565 	dm_block_t data_block;
566 	struct dm_bio_prison_cell *cell, *cell2;
567 
568 	/*
569 	 * If the bio covers the whole area of a block then we can avoid
570 	 * zeroing or copying.  Instead this bio is hooked.  The bio will
571 	 * still be in the cell, so care has to be taken to avoid issuing
572 	 * the bio twice.
573 	 */
574 	struct bio *bio;
575 	bio_end_io_t *saved_bi_end_io;
576 };
577 
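/*
 * Once a mapping is both quiesced and prepared, queue it on the pool's
 * prepared_mappings list and wake the worker to insert it into the metadata.
 */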
578 static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
579 {
580 	struct pool *pool = m->tc->pool;
581 
582 	if (m->quiesced && m->prepared) {
583 		list_add_tail(&m->list, &pool->prepared_mappings);
584 		wake_worker(pool);
585 	}
586 }
587 
588 static void copy_complete(int read_err, unsigned long write_err, void *context)
589 {
590 	unsigned long flags;
591 	struct dm_thin_new_mapping *m = context;
592 	struct pool *pool = m->tc->pool;
593 
594 	m->err = read_err || write_err ? -EIO : 0;
595 
596 	spin_lock_irqsave(&pool->lock, flags);
597 	m->prepared = true;
598 	__maybe_add_mapping(m);
599 	spin_unlock_irqrestore(&pool->lock, flags);
600 }
601 
602 static void overwrite_endio(struct bio *bio, int err)
603 {
604 	unsigned long flags;
605 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
606 	struct dm_thin_new_mapping *m = h->overwrite_mapping;
607 	struct pool *pool = m->tc->pool;
608 
609 	m->err = err;
610 
611 	spin_lock_irqsave(&pool->lock, flags);
612 	m->prepared = true;
613 	__maybe_add_mapping(m);
614 	spin_unlock_irqrestore(&pool->lock, flags);
615 }
616 
617 /*----------------------------------------------------------------*/
618 
619 /*
620  * Workqueue.
621  */
622 
623 /*
624  * Prepared mapping jobs.
625  */
626 
627 /*
628  * This sends the bios in the cell back to the deferred_bios list.
629  */
630 static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
631 {
632 	struct pool *pool = tc->pool;
633 	unsigned long flags;
634 
635 	spin_lock_irqsave(&tc->lock, flags);
636 	cell_release(pool, cell, &tc->deferred_bio_list);
637 	spin_unlock_irqrestore(&tc->lock, flags);
638 
639 	wake_worker(pool);
640 }
641 
642 /*
643  * Same as cell_defer above, except it omits the original holder of the cell.
644  */
645 static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
646 {
647 	struct pool *pool = tc->pool;
648 	unsigned long flags;
649 
650 	spin_lock_irqsave(&tc->lock, flags);
651 	cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
652 	spin_unlock_irqrestore(&tc->lock, flags);
653 
654 	wake_worker(pool);
655 }
656 
657 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
658 {
659 	if (m->bio) {
660 		m->bio->bi_end_io = m->saved_bi_end_io;
661 		atomic_inc(&m->bio->bi_remaining);
662 	}
663 	cell_error(m->tc->pool, m->cell);
664 	list_del(&m->list);
665 	mempool_free(m, m->tc->pool->mapping_pool);
666 }
667 
668 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
669 {
670 	struct thin_c *tc = m->tc;
671 	struct pool *pool = tc->pool;
672 	struct bio *bio;
673 	int r;
674 
675 	bio = m->bio;
676 	if (bio) {
677 		bio->bi_end_io = m->saved_bi_end_io;
678 		atomic_inc(&bio->bi_remaining);
679 	}
680 
681 	if (m->err) {
682 		cell_error(pool, m->cell);
683 		goto out;
684 	}
685 
686 	/*
687 	 * Commit the prepared block into the mapping btree.
688 	 * Any I/O for this block arriving after this point will get
689 	 * remapped to it directly.
690 	 */
691 	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
692 	if (r) {
693 		metadata_operation_failed(pool, "dm_thin_insert_block", r);
694 		cell_error(pool, m->cell);
695 		goto out;
696 	}
697 
698 	/*
699 	 * Release any bios held while the block was being provisioned.
700 	 * If we are processing a write bio that completely covers the block,
701 	 * we already processed it so can ignore it now when processing
702 	 * the bios in the cell.
703 	 */
704 	if (bio) {
705 		cell_defer_no_holder(tc, m->cell);
706 		bio_endio(bio, 0);
707 	} else
708 		cell_defer(tc, m->cell);
709 
710 out:
711 	list_del(&m->list);
712 	mempool_free(m, pool->mapping_pool);
713 }
714 
715 static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
716 {
717 	struct thin_c *tc = m->tc;
718 
719 	bio_io_error(m->bio);
720 	cell_defer_no_holder(tc, m->cell);
721 	cell_defer_no_holder(tc, m->cell2);
722 	mempool_free(m, tc->pool->mapping_pool);
723 }
724 
725 static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
726 {
727 	struct thin_c *tc = m->tc;
728 
729 	inc_all_io_entry(tc->pool, m->bio);
730 	cell_defer_no_holder(tc, m->cell);
731 	cell_defer_no_holder(tc, m->cell2);
732 
733 	if (m->pass_discard)
734 		if (m->definitely_not_shared)
735 			remap_and_issue(tc, m->bio, m->data_block);
736 		else {
737 			bool used = false;
738 			if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
739 				bio_endio(m->bio, 0);
740 			else
741 				remap_and_issue(tc, m->bio, m->data_block);
742 		}
743 	else
744 		bio_endio(m->bio, 0);
745 
746 	mempool_free(m, tc->pool->mapping_pool);
747 }
748 
749 static void process_prepared_discard(struct dm_thin_new_mapping *m)
750 {
751 	int r;
752 	struct thin_c *tc = m->tc;
753 
754 	r = dm_thin_remove_block(tc->td, m->virt_block);
755 	if (r)
756 		DMERR_LIMIT("dm_thin_remove_block() failed");
757 
758 	process_prepared_discard_passdown(m);
759 }
760 
761 static void process_prepared(struct pool *pool, struct list_head *head,
762 			     process_mapping_fn *fn)
763 {
764 	unsigned long flags;
765 	struct list_head maps;
766 	struct dm_thin_new_mapping *m, *tmp;
767 
768 	INIT_LIST_HEAD(&maps);
769 	spin_lock_irqsave(&pool->lock, flags);
770 	list_splice_init(head, &maps);
771 	spin_unlock_irqrestore(&pool->lock, flags);
772 
773 	list_for_each_entry_safe(m, tmp, &maps, list)
774 		(*fn)(m);
775 }
776 
777 /*
778  * Deferred bio jobs.
779  */
780 static int io_overlaps_block(struct pool *pool, struct bio *bio)
781 {
782 	return bio->bi_iter.bi_size ==
783 		(pool->sectors_per_block << SECTOR_SHIFT);
784 }
785 
786 static int io_overwrites_block(struct pool *pool, struct bio *bio)
787 {
788 	return (bio_data_dir(bio) == WRITE) &&
789 		io_overlaps_block(pool, bio);
790 }
791 
792 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
793 			       bio_end_io_t *fn)
794 {
795 	*save = bio->bi_end_io;
796 	bio->bi_end_io = fn;
797 }
798 
799 static int ensure_next_mapping(struct pool *pool)
800 {
801 	if (pool->next_mapping)
802 		return 0;
803 
804 	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
805 
806 	return pool->next_mapping ? 0 : -ENOMEM;
807 }
808 
809 static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
810 {
811 	struct dm_thin_new_mapping *m = pool->next_mapping;
812 
813 	BUG_ON(!pool->next_mapping);
814 
815 	memset(m, 0, sizeof(struct dm_thin_new_mapping));
816 	INIT_LIST_HEAD(&m->list);
817 	m->bio = NULL;
818 
819 	pool->next_mapping = NULL;
820 
821 	return m;
822 }
823 
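/*
 * Arrange for data_origin to be copied to data_dest and the new mapping to
 * be inserted afterwards.  If the bio overwrites the whole block the copy is
 * skipped and the bio itself is issued to the new block instead.
 */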
824 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
825 			  struct dm_dev *origin, dm_block_t data_origin,
826 			  dm_block_t data_dest,
827 			  struct dm_bio_prison_cell *cell, struct bio *bio)
828 {
829 	int r;
830 	struct pool *pool = tc->pool;
831 	struct dm_thin_new_mapping *m = get_next_mapping(pool);
832 
833 	m->tc = tc;
834 	m->virt_block = virt_block;
835 	m->data_block = data_dest;
836 	m->cell = cell;
837 
838 	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
839 		m->quiesced = true;
840 
841 	/*
842 	 * IO to pool_dev remaps to the pool target's data_dev.
843 	 *
844 	 * If the whole block of data is being overwritten, we can issue the
845 	 * bio immediately. Otherwise we use kcopyd to clone the data first.
846 	 */
847 	if (io_overwrites_block(pool, bio)) {
848 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
849 
850 		h->overwrite_mapping = m;
851 		m->bio = bio;
852 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
853 		inc_all_io_entry(pool, bio);
854 		remap_and_issue(tc, bio, data_dest);
855 	} else {
856 		struct dm_io_region from, to;
857 
858 		from.bdev = origin->bdev;
859 		from.sector = data_origin * pool->sectors_per_block;
860 		from.count = pool->sectors_per_block;
861 
862 		to.bdev = tc->pool_dev->bdev;
863 		to.sector = data_dest * pool->sectors_per_block;
864 		to.count = pool->sectors_per_block;
865 
866 		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
867 				   0, copy_complete, m);
868 		if (r < 0) {
869 			mempool_free(m, pool->mapping_pool);
870 			DMERR_LIMIT("dm_kcopyd_copy() failed");
871 			cell_error(pool, cell);
872 		}
873 	}
874 }
875 
876 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
877 				   dm_block_t data_origin, dm_block_t data_dest,
878 				   struct dm_bio_prison_cell *cell, struct bio *bio)
879 {
880 	schedule_copy(tc, virt_block, tc->pool_dev,
881 		      data_origin, data_dest, cell, bio);
882 }
883 
884 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
885 				   dm_block_t data_dest,
886 				   struct dm_bio_prison_cell *cell, struct bio *bio)
887 {
888 	schedule_copy(tc, virt_block, tc->origin_dev,
889 		      virt_block, data_dest, cell, bio);
890 }
891 
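/*
 * Provision a freshly allocated block: zero it with kcopyd first unless
 * zeroing is disabled or the bio overwrites the whole block, in which case
 * the mapping can be inserted (or the bio issued) immediately.
 */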
892 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
893 			  dm_block_t data_block, struct dm_bio_prison_cell *cell,
894 			  struct bio *bio)
895 {
896 	struct pool *pool = tc->pool;
897 	struct dm_thin_new_mapping *m = get_next_mapping(pool);
898 
899 	m->quiesced = true;
900 	m->prepared = false;
901 	m->tc = tc;
902 	m->virt_block = virt_block;
903 	m->data_block = data_block;
904 	m->cell = cell;
905 
906 	/*
907 	 * If the whole block of data is being overwritten or we are not
908 	 * zeroing pre-existing data, we can issue the bio immediately.
909 	 * Otherwise we use kcopyd to zero the data first.
910 	 */
911 	if (!pool->pf.zero_new_blocks)
912 		process_prepared_mapping(m);
913 
914 	else if (io_overwrites_block(pool, bio)) {
915 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
916 
917 		h->overwrite_mapping = m;
918 		m->bio = bio;
919 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
920 		inc_all_io_entry(pool, bio);
921 		remap_and_issue(tc, bio, data_block);
922 	} else {
923 		int r;
924 		struct dm_io_region to;
925 
926 		to.bdev = tc->pool_dev->bdev;
927 		to.sector = data_block * pool->sectors_per_block;
928 		to.count = pool->sectors_per_block;
929 
930 		r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
931 		if (r < 0) {
932 			mempool_free(m, pool->mapping_pool);
933 			DMERR_LIMIT("dm_kcopyd_zero() failed");
934 			cell_error(pool, cell);
935 		}
936 	}
937 }
938 
939 /*
940  * A non-zero return indicates read_only or fail_io mode.
941  * Many callers don't care about the return value.
942  */
943 static int commit(struct pool *pool)
944 {
945 	int r;
946 
947 	if (get_pool_mode(pool) >= PM_READ_ONLY)
948 		return -EINVAL;
949 
950 	r = dm_pool_commit_metadata(pool->pmd);
951 	if (r)
952 		metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
953 
954 	return r;
955 }
956 
957 static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
958 {
959 	unsigned long flags;
960 
961 	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
962 		DMWARN("%s: reached low water mark for data device: sending event.",
963 		       dm_device_name(pool->pool_md));
964 		spin_lock_irqsave(&pool->lock, flags);
965 		pool->low_water_triggered = true;
966 		spin_unlock_irqrestore(&pool->lock, flags);
967 		dm_table_event(pool->ti->table);
968 	}
969 }
970 
971 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
972 
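/*
 * Allocate a new data block from the pool.  If no blocks appear free, commit
 * outstanding metadata first to release any freed blocks; if that still
 * leaves nothing, switch the pool to PM_OUT_OF_DATA_SPACE and return -ENOSPC.
 */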
973 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
974 {
975 	int r;
976 	dm_block_t free_blocks;
977 	struct pool *pool = tc->pool;
978 
979 	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
980 		return -EINVAL;
981 
982 	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
983 	if (r) {
984 		metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
985 		return r;
986 	}
987 
988 	check_low_water_mark(pool, free_blocks);
989 
990 	if (!free_blocks) {
991 		/*
992 		 * Try to commit to see if that will free up some
993 		 * more space.
994 		 */
995 		r = commit(pool);
996 		if (r)
997 			return r;
998 
999 		r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1000 		if (r) {
1001 			metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
1002 			return r;
1003 		}
1004 
1005 		if (!free_blocks) {
1006 			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
1007 			return -ENOSPC;
1008 		}
1009 	}
1010 
1011 	r = dm_pool_alloc_data_block(pool->pmd, result);
1012 	if (r) {
1013 		metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
1014 		return r;
1015 	}
1016 
1017 	return 0;
1018 }
1019 
1020 /*
1021  * If we have run out of space, queue bios until the device is
1022  * resumed, presumably after having been reloaded with more space.
1023  */
1024 static void retry_on_resume(struct bio *bio)
1025 {
1026 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1027 	struct thin_c *tc = h->tc;
1028 	unsigned long flags;
1029 
1030 	spin_lock_irqsave(&tc->lock, flags);
1031 	bio_list_add(&tc->retry_on_resume_list, bio);
1032 	spin_unlock_irqrestore(&tc->lock, flags);
1033 }
1034 
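/*
 * Decide how to treat a bio that cannot currently be serviced: returns 0 if
 * it should be retried on resume, or a negative error code to fail it with.
 */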
1035 static int should_error_unserviceable_bio(struct pool *pool)
1036 {
1037 	enum pool_mode m = get_pool_mode(pool);
1038 
1039 	switch (m) {
1040 	case PM_WRITE:
1041 		/* Shouldn't get here */
1042 		DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1043 		return -EIO;
1044 
1045 	case PM_OUT_OF_DATA_SPACE:
1046 		return pool->pf.error_if_no_space ? -ENOSPC : 0;
1047 
1048 	case PM_READ_ONLY:
1049 	case PM_FAIL:
1050 		return -EIO;
1051 	default:
1052 		/* Shouldn't get here */
1053 		DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1054 		return -EIO;
1055 	}
1056 }
1057 
1058 static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
1059 {
1060 	int error = should_error_unserviceable_bio(pool);
1061 
1062 	if (error)
1063 		bio_endio(bio, error);
1064 	else
1065 		retry_on_resume(bio);
1066 }
1067 
1068 static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
1069 {
1070 	struct bio *bio;
1071 	struct bio_list bios;
1072 	int error;
1073 
1074 	error = should_error_unserviceable_bio(pool);
1075 	if (error) {
1076 		cell_error_with_code(pool, cell, error);
1077 		return;
1078 	}
1079 
1080 	bio_list_init(&bios);
1081 	cell_release(pool, cell, &bios);
1082 
1083 	error = should_error_unserviceable_bio(pool);
1084 	if (error)
1085 		while ((bio = bio_list_pop(&bios)))
1086 			bio_endio(bio, error);
1087 	else
1088 		while ((bio = bio_list_pop(&bios)))
1089 			retry_on_resume(bio);
1090 }
1091 
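/*
 * Handle a deferred discard.  A discard covering a whole provisioned block
 * is quiesced and then unmapped (and possibly passed down); a partial-block
 * discard is passed down or completed without touching the mapping.
 */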
1092 static void process_discard(struct thin_c *tc, struct bio *bio)
1093 {
1094 	int r;
1095 	unsigned long flags;
1096 	struct pool *pool = tc->pool;
1097 	struct dm_bio_prison_cell *cell, *cell2;
1098 	struct dm_cell_key key, key2;
1099 	dm_block_t block = get_bio_block(tc, bio);
1100 	struct dm_thin_lookup_result lookup_result;
1101 	struct dm_thin_new_mapping *m;
1102 
1103 	build_virtual_key(tc->td, block, &key);
1104 	if (bio_detain(tc->pool, &key, bio, &cell))
1105 		return;
1106 
1107 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1108 	switch (r) {
1109 	case 0:
1110 		/*
1111 		 * Check nobody is fiddling with this pool block.  This can
1112 		 * happen if someone's in the process of breaking sharing
1113 		 * on this block.
1114 		 */
1115 		build_data_key(tc->td, lookup_result.block, &key2);
1116 		if (bio_detain(tc->pool, &key2, bio, &cell2)) {
1117 			cell_defer_no_holder(tc, cell);
1118 			break;
1119 		}
1120 
1121 		if (io_overlaps_block(pool, bio)) {
1122 			/*
1123 			 * IO may still be going to the destination block.  We must
1124 			 * quiesce before we can do the removal.
1125 			 */
1126 			m = get_next_mapping(pool);
1127 			m->tc = tc;
1128 			m->pass_discard = pool->pf.discard_passdown;
1129 			m->definitely_not_shared = !lookup_result.shared;
1130 			m->virt_block = block;
1131 			m->data_block = lookup_result.block;
1132 			m->cell = cell;
1133 			m->cell2 = cell2;
1134 			m->bio = bio;
1135 
1136 			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
1137 				spin_lock_irqsave(&pool->lock, flags);
1138 				list_add_tail(&m->list, &pool->prepared_discards);
1139 				spin_unlock_irqrestore(&pool->lock, flags);
1140 				wake_worker(pool);
1141 			}
1142 		} else {
1143 			inc_all_io_entry(pool, bio);
1144 			cell_defer_no_holder(tc, cell);
1145 			cell_defer_no_holder(tc, cell2);
1146 
1147 			/*
1148 			 * The DM core makes sure that the discard doesn't span
1149 			 * a block boundary, so this discard covers only part of one
1150 			 * block: pass it down or complete it; no unmapping is needed.
1151 			 */
1152 			if ((!lookup_result.shared) && pool->pf.discard_passdown)
1153 				remap_and_issue(tc, bio, lookup_result.block);
1154 			else
1155 				bio_endio(bio, 0);
1156 		}
1157 		break;
1158 
1159 	case -ENODATA:
1160 		/*
1161 		 * It isn't provisioned, just forget it.
1162 		 */
1163 		cell_defer_no_holder(tc, cell);
1164 		bio_endio(bio, 0);
1165 		break;
1166 
1167 	default:
1168 		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1169 			    __func__, r);
1170 		cell_defer_no_holder(tc, cell);
1171 		bio_io_error(bio);
1172 		break;
1173 	}
1174 }
1175 
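/*
 * Break sharing of a data block by allocating a private block for this thin
 * device and scheduling a copy of the shared data into it.
 */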
1176 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1177 			  struct dm_cell_key *key,
1178 			  struct dm_thin_lookup_result *lookup_result,
1179 			  struct dm_bio_prison_cell *cell)
1180 {
1181 	int r;
1182 	dm_block_t data_block;
1183 	struct pool *pool = tc->pool;
1184 
1185 	r = alloc_data_block(tc, &data_block);
1186 	switch (r) {
1187 	case 0:
1188 		schedule_internal_copy(tc, block, lookup_result->block,
1189 				       data_block, cell, bio);
1190 		break;
1191 
1192 	case -ENOSPC:
1193 		retry_bios_on_resume(pool, cell);
1194 		break;
1195 
1196 	default:
1197 		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1198 			    __func__, r);
1199 		cell_error(pool, cell);
1200 		break;
1201 	}
1202 }
1203 
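/*
 * Handle a bio that maps to a shared data block: writes break the sharing,
 * while reads are simply remapped to the shared block.
 */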
1204 static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1205 			       dm_block_t block,
1206 			       struct dm_thin_lookup_result *lookup_result)
1207 {
1208 	struct dm_bio_prison_cell *cell;
1209 	struct pool *pool = tc->pool;
1210 	struct dm_cell_key key;
1211 
1212 	/*
1213 	 * If cell is already occupied, then sharing is already in the process
1214 	 * of being broken so we have nothing further to do here.
1215 	 */
1216 	build_data_key(tc->td, lookup_result->block, &key);
1217 	if (bio_detain(pool, &key, bio, &cell))
1218 		return;
1219 
1220 	if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size)
1221 		break_sharing(tc, bio, block, &key, lookup_result, cell);
1222 	else {
1223 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1224 
1225 		h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1226 		inc_all_io_entry(pool, bio);
1227 		cell_defer_no_holder(tc, cell);
1228 
1229 		remap_and_issue(tc, bio, lookup_result->block);
1230 	}
1231 }
1232 
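/*
 * Handle a bio to an unprovisioned block: flushes are remapped straight
 * away, reads are zero-filled, and writes trigger allocation of a new block
 * (copied from the external origin if one is configured).
 */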
1233 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1234 			    struct dm_bio_prison_cell *cell)
1235 {
1236 	int r;
1237 	dm_block_t data_block;
1238 	struct pool *pool = tc->pool;
1239 
1240 	/*
1241 	 * Remap empty bios (flushes) immediately, without provisioning.
1242 	 */
1243 	if (!bio->bi_iter.bi_size) {
1244 		inc_all_io_entry(pool, bio);
1245 		cell_defer_no_holder(tc, cell);
1246 
1247 		remap_and_issue(tc, bio, 0);
1248 		return;
1249 	}
1250 
1251 	/*
1252 	 * Fill read bios with zeroes and complete them immediately.
1253 	 */
1254 	if (bio_data_dir(bio) == READ) {
1255 		zero_fill_bio(bio);
1256 		cell_defer_no_holder(tc, cell);
1257 		bio_endio(bio, 0);
1258 		return;
1259 	}
1260 
1261 	r = alloc_data_block(tc, &data_block);
1262 	switch (r) {
1263 	case 0:
1264 		if (tc->origin_dev)
1265 			schedule_external_copy(tc, block, data_block, cell, bio);
1266 		else
1267 			schedule_zero(tc, block, data_block, cell, bio);
1268 		break;
1269 
1270 	case -ENOSPC:
1271 		retry_bios_on_resume(pool, cell);
1272 		break;
1273 
1274 	default:
1275 		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1276 			    __func__, r);
1277 		cell_error(pool, cell);
1278 		break;
1279 	}
1280 }
1281 
1282 static void process_bio(struct thin_c *tc, struct bio *bio)
1283 {
1284 	int r;
1285 	struct pool *pool = tc->pool;
1286 	dm_block_t block = get_bio_block(tc, bio);
1287 	struct dm_bio_prison_cell *cell;
1288 	struct dm_cell_key key;
1289 	struct dm_thin_lookup_result lookup_result;
1290 
1291 	/*
1292 	 * If cell is already occupied, then the block is already
1293 	 * being provisioned so we have nothing further to do here.
1294 	 */
1295 	build_virtual_key(tc->td, block, &key);
1296 	if (bio_detain(pool, &key, bio, &cell))
1297 		return;
1298 
1299 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1300 	switch (r) {
1301 	case 0:
1302 		if (lookup_result.shared) {
1303 			process_shared_bio(tc, bio, block, &lookup_result);
1304 			cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */
1305 		} else {
1306 			inc_all_io_entry(pool, bio);
1307 			cell_defer_no_holder(tc, cell);
1308 
1309 			remap_and_issue(tc, bio, lookup_result.block);
1310 		}
1311 		break;
1312 
1313 	case -ENODATA:
1314 		if (bio_data_dir(bio) == READ && tc->origin_dev) {
1315 			inc_all_io_entry(pool, bio);
1316 			cell_defer_no_holder(tc, cell);
1317 
1318 			remap_to_origin_and_issue(tc, bio);
1319 		} else
1320 			provision_block(tc, bio, block, cell);
1321 		break;
1322 
1323 	default:
1324 		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1325 			    __func__, r);
1326 		cell_defer_no_holder(tc, cell);
1327 		bio_io_error(bio);
1328 		break;
1329 	}
1330 }
1331 
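/*
 * Used when the pool cannot allocate new blocks (read-only or
 * out-of-data-space mode): reads of provisioned blocks are still serviced,
 * while anything needing allocation or sharing to be broken is retried on
 * resume or errored.
 */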
1332 static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1333 {
1334 	int r;
1335 	int rw = bio_data_dir(bio);
1336 	dm_block_t block = get_bio_block(tc, bio);
1337 	struct dm_thin_lookup_result lookup_result;
1338 
1339 	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1340 	switch (r) {
1341 	case 0:
1342 		if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size)
1343 			handle_unserviceable_bio(tc->pool, bio);
1344 		else {
1345 			inc_all_io_entry(tc->pool, bio);
1346 			remap_and_issue(tc, bio, lookup_result.block);
1347 		}
1348 		break;
1349 
1350 	case -ENODATA:
1351 		if (rw != READ) {
1352 			handle_unserviceable_bio(tc->pool, bio);
1353 			break;
1354 		}
1355 
1356 		if (tc->origin_dev) {
1357 			inc_all_io_entry(tc->pool, bio);
1358 			remap_to_origin_and_issue(tc, bio);
1359 			break;
1360 		}
1361 
1362 		zero_fill_bio(bio);
1363 		bio_endio(bio, 0);
1364 		break;
1365 
1366 	default:
1367 		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1368 			    __func__, r);
1369 		bio_io_error(bio);
1370 		break;
1371 	}
1372 }
1373 
1374 static void process_bio_success(struct thin_c *tc, struct bio *bio)
1375 {
1376 	bio_endio(bio, 0);
1377 }
1378 
1379 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1380 {
1381 	bio_io_error(bio);
1382 }
1383 
1384 /*
1385  * FIXME: should we also commit due to size of transaction, measured in
1386  * metadata blocks?
1387  */
1388 static int need_commit_due_to_time(struct pool *pool)
1389 {
1390 	return jiffies < pool->last_commit_jiffies ||
1391 	       jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1392 }
1393 
1394 #define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
1395 #define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
1396 
1397 static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
1398 {
1399 	struct rb_node **rbp, *parent;
1400 	struct dm_thin_endio_hook *pbd;
1401 	sector_t bi_sector = bio->bi_iter.bi_sector;
1402 
1403 	rbp = &tc->sort_bio_list.rb_node;
1404 	parent = NULL;
1405 	while (*rbp) {
1406 		parent = *rbp;
1407 		pbd = thin_pbd(parent);
1408 
1409 		if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
1410 			rbp = &(*rbp)->rb_left;
1411 		else
1412 			rbp = &(*rbp)->rb_right;
1413 	}
1414 
1415 	pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1416 	rb_link_node(&pbd->rb_node, parent, rbp);
1417 	rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
1418 }
1419 
1420 static void __extract_sorted_bios(struct thin_c *tc)
1421 {
1422 	struct rb_node *node;
1423 	struct dm_thin_endio_hook *pbd;
1424 	struct bio *bio;
1425 
1426 	for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
1427 		pbd = thin_pbd(node);
1428 		bio = thin_bio(pbd);
1429 
1430 		bio_list_add(&tc->deferred_bio_list, bio);
1431 		rb_erase(&pbd->rb_node, &tc->sort_bio_list);
1432 	}
1433 
1434 	WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
1435 }
1436 
1437 static void __sort_thin_deferred_bios(struct thin_c *tc)
1438 {
1439 	struct bio *bio;
1440 	struct bio_list bios;
1441 
1442 	bio_list_init(&bios);
1443 	bio_list_merge(&bios, &tc->deferred_bio_list);
1444 	bio_list_init(&tc->deferred_bio_list);
1445 
1446 	/* Sort deferred_bio_list using rb-tree */
1447 	while ((bio = bio_list_pop(&bios)))
1448 		__thin_bio_rb_add(tc, bio);
1449 
1450 	/*
1451 	 * Transfer the sorted bios in sort_bio_list back to
1452 	 * deferred_bio_list to allow lockless submission of
1453 	 * all bios.
1454 	 */
1455 	__extract_sorted_bios(tc);
1456 }
1457 
1458 static void process_thin_deferred_bios(struct thin_c *tc)
1459 {
1460 	struct pool *pool = tc->pool;
1461 	unsigned long flags;
1462 	struct bio *bio;
1463 	struct bio_list bios;
1464 	struct blk_plug plug;
1465 
1466 	if (tc->requeue_mode) {
1467 		requeue_bio_list(tc, &tc->deferred_bio_list);
1468 		return;
1469 	}
1470 
1471 	bio_list_init(&bios);
1472 
1473 	spin_lock_irqsave(&tc->lock, flags);
1474 
1475 	if (bio_list_empty(&tc->deferred_bio_list)) {
1476 		spin_unlock_irqrestore(&tc->lock, flags);
1477 		return;
1478 	}
1479 
1480 	__sort_thin_deferred_bios(tc);
1481 
1482 	bio_list_merge(&bios, &tc->deferred_bio_list);
1483 	bio_list_init(&tc->deferred_bio_list);
1484 
1485 	spin_unlock_irqrestore(&tc->lock, flags);
1486 
1487 	blk_start_plug(&plug);
1488 	while ((bio = bio_list_pop(&bios))) {
1489 		/*
1490 		 * If we've got no free new_mapping structs, and processing
1491 		 * this bio might require one, we pause until there are some
1492 		 * prepared mappings to process.
1493 		 */
1494 		if (ensure_next_mapping(pool)) {
1495 			spin_lock_irqsave(&tc->lock, flags);
1496 			bio_list_add(&tc->deferred_bio_list, bio);
1497 			bio_list_merge(&tc->deferred_bio_list, &bios);
1498 			spin_unlock_irqrestore(&tc->lock, flags);
1499 			break;
1500 		}
1501 
1502 		if (bio->bi_rw & REQ_DISCARD)
1503 			pool->process_discard(tc, bio);
1504 		else
1505 			pool->process_bio(tc, bio);
1506 	}
1507 	blk_finish_plug(&plug);
1508 }
1509 
1510 static void thin_get(struct thin_c *tc);
1511 static void thin_put(struct thin_c *tc);
1512 
1513 /*
1514  * We can't hold rcu_read_lock() around code that can block.  So we
1515  * find a thin with the rcu lock held; bump a refcount; then drop
1516  * the lock.
1517  */
1518 static struct thin_c *get_first_thin(struct pool *pool)
1519 {
1520 	struct thin_c *tc = NULL;
1521 
1522 	rcu_read_lock();
1523 	if (!list_empty(&pool->active_thins)) {
1524 		tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
1525 		thin_get(tc);
1526 	}
1527 	rcu_read_unlock();
1528 
1529 	return tc;
1530 }
1531 
1532 static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
1533 {
1534 	struct thin_c *old_tc = tc;
1535 
1536 	rcu_read_lock();
1537 	list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
1538 		thin_get(tc);
1539 		thin_put(old_tc);
1540 		rcu_read_unlock();
1541 		return tc;
1542 	}
1543 	thin_put(old_tc);
1544 	rcu_read_unlock();
1545 
1546 	return NULL;
1547 }
1548 
1549 static void process_deferred_bios(struct pool *pool)
1550 {
1551 	unsigned long flags;
1552 	struct bio *bio;
1553 	struct bio_list bios;
1554 	struct thin_c *tc;
1555 
1556 	tc = get_first_thin(pool);
1557 	while (tc) {
1558 		process_thin_deferred_bios(tc);
1559 		tc = get_next_thin(pool, tc);
1560 	}
1561 
1562 	/*
1563 	 * If there are any deferred flush bios, we must commit
1564 	 * the metadata before issuing them.
1565 	 */
1566 	bio_list_init(&bios);
1567 	spin_lock_irqsave(&pool->lock, flags);
1568 	bio_list_merge(&bios, &pool->deferred_flush_bios);
1569 	bio_list_init(&pool->deferred_flush_bios);
1570 	spin_unlock_irqrestore(&pool->lock, flags);
1571 
1572 	if (bio_list_empty(&bios) &&
1573 	    !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
1574 		return;
1575 
1576 	if (commit(pool)) {
1577 		while ((bio = bio_list_pop(&bios)))
1578 			bio_io_error(bio);
1579 		return;
1580 	}
1581 	pool->last_commit_jiffies = jiffies;
1582 
1583 	while ((bio = bio_list_pop(&bios)))
1584 		generic_make_request(bio);
1585 }
1586 
1587 static void do_worker(struct work_struct *ws)
1588 {
1589 	struct pool *pool = container_of(ws, struct pool, worker);
1590 
1591 	process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1592 	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1593 	process_deferred_bios(pool);
1594 }
1595 
1596 /*
1597  * We want to commit periodically so that not too much
1598  * unwritten data builds up.
1599  */
1600 static void do_waker(struct work_struct *ws)
1601 {
1602 	struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1603 	wake_worker(pool);
1604 	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1605 }
1606 
1607 /*
1608  * We're holding onto IO to allow userland time to react.  After the
1609  * timeout either the pool will have been resized (and thus back in
1610  * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
1611  */
1612 static void do_no_space_timeout(struct work_struct *ws)
1613 {
1614 	struct pool *pool = container_of(to_delayed_work(ws), struct pool,
1615 					 no_space_timeout);
1616 
1617 	if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
1618 		set_pool_mode(pool, PM_READ_ONLY);
1619 }
1620 
1621 /*----------------------------------------------------------------*/
1622 
1623 struct pool_work {
1624 	struct work_struct worker;
1625 	struct completion complete;
1626 };
1627 
1628 static struct pool_work *to_pool_work(struct work_struct *ws)
1629 {
1630 	return container_of(ws, struct pool_work, worker);
1631 }
1632 
1633 static void pool_work_complete(struct pool_work *pw)
1634 {
1635 	complete(&pw->complete);
1636 }
1637 
1638 static void pool_work_wait(struct pool_work *pw, struct pool *pool,
1639 			   void (*fn)(struct work_struct *))
1640 {
1641 	INIT_WORK_ONSTACK(&pw->worker, fn);
1642 	init_completion(&pw->complete);
1643 	queue_work(pool->wq, &pw->worker);
1644 	wait_for_completion(&pw->complete);
1645 }
1646 
1647 /*----------------------------------------------------------------*/
1648 
1649 struct noflush_work {
1650 	struct pool_work pw;
1651 	struct thin_c *tc;
1652 };
1653 
1654 static struct noflush_work *to_noflush(struct work_struct *ws)
1655 {
1656 	return container_of(to_pool_work(ws), struct noflush_work, pw);
1657 }
1658 
1659 static void do_noflush_start(struct work_struct *ws)
1660 {
1661 	struct noflush_work *w = to_noflush(ws);
1662 	w->tc->requeue_mode = true;
1663 	requeue_io(w->tc);
1664 	pool_work_complete(&w->pw);
1665 }
1666 
1667 static void do_noflush_stop(struct work_struct *ws)
1668 {
1669 	struct noflush_work *w = to_noflush(ws);
1670 	w->tc->requeue_mode = false;
1671 	pool_work_complete(&w->pw);
1672 }
1673 
1674 static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
1675 {
1676 	struct noflush_work w;
1677 
1678 	w.tc = tc;
1679 	pool_work_wait(&w.pw, tc->pool, fn);
1680 }
1681 
1682 /*----------------------------------------------------------------*/
1683 
1684 static enum pool_mode get_pool_mode(struct pool *pool)
1685 {
1686 	return pool->pf.mode;
1687 }
1688 
1689 static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
1690 {
1691 	dm_table_event(pool->ti->table);
1692 	DMINFO("%s: switching pool to %s mode",
1693 	       dm_device_name(pool->pool_md), new_mode);
1694 }
1695 
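/*
 * Switch the pool between write, read-only, out-of-data-space and failure
 * modes, wiring up the bio and prepared-mapping handlers appropriate to the
 * new mode.  Transitions to PM_WRITE are refused while the metadata still
 * needs checking/repair.
 */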
1696 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1697 {
1698 	struct pool_c *pt = pool->ti->private;
1699 	bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
1700 	enum pool_mode old_mode = get_pool_mode(pool);
1701 	unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ;
1702 
1703 	/*
1704 	 * Never allow the pool to transition to PM_WRITE mode if user
1705 	 * intervention is required to verify metadata and data consistency.
1706 	 */
1707 	if (new_mode == PM_WRITE && needs_check) {
1708 		DMERR("%s: unable to switch pool to write mode until repaired.",
1709 		      dm_device_name(pool->pool_md));
1710 		if (old_mode != new_mode)
1711 			new_mode = old_mode;
1712 		else
1713 			new_mode = PM_READ_ONLY;
1714 	}
1715 	/*
1716 	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
1717 	 * not going to recover without a thin_repair.	So we never let the
1718 	 * pool move out of the old mode.
1719 	 */
1720 	if (old_mode == PM_FAIL)
1721 		new_mode = old_mode;
1722 
1723 	switch (new_mode) {
1724 	case PM_FAIL:
1725 		if (old_mode != new_mode)
1726 			notify_of_pool_mode_change(pool, "failure");
1727 		dm_pool_metadata_read_only(pool->pmd);
1728 		pool->process_bio = process_bio_fail;
1729 		pool->process_discard = process_bio_fail;
1730 		pool->process_prepared_mapping = process_prepared_mapping_fail;
1731 		pool->process_prepared_discard = process_prepared_discard_fail;
1732 
1733 		error_retry_list(pool);
1734 		break;
1735 
1736 	case PM_READ_ONLY:
1737 		if (old_mode != new_mode)
1738 			notify_of_pool_mode_change(pool, "read-only");
1739 		dm_pool_metadata_read_only(pool->pmd);
1740 		pool->process_bio = process_bio_read_only;
1741 		pool->process_discard = process_bio_success;
1742 		pool->process_prepared_mapping = process_prepared_mapping_fail;
1743 		pool->process_prepared_discard = process_prepared_discard_passdown;
1744 
1745 		error_retry_list(pool);
1746 		break;
1747 
1748 	case PM_OUT_OF_DATA_SPACE:
1749 		/*
1750 		 * Ideally we'd never hit this state; the low water mark
1751 		 * would trigger userland to extend the pool before we
1752 		 * completely run out of data space.  However, many small
1753 		 * IOs to unprovisioned space can consume data space at an
1754 		 * alarming rate.  Adjust your low water mark if you're
1755 		 * frequently seeing this mode.
1756 		 */
1757 		if (old_mode != new_mode)
1758 			notify_of_pool_mode_change(pool, "out-of-data-space");
1759 		pool->process_bio = process_bio_read_only;
1760 		pool->process_discard = process_discard;
1761 		pool->process_prepared_mapping = process_prepared_mapping;
1762 		pool->process_prepared_discard = process_prepared_discard_passdown;
1763 
1764 		if (!pool->pf.error_if_no_space && no_space_timeout)
1765 			queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
1766 		break;
1767 
1768 	case PM_WRITE:
1769 		if (old_mode != new_mode)
1770 			notify_of_pool_mode_change(pool, "write");
1771 		dm_pool_metadata_read_write(pool->pmd);
1772 		pool->process_bio = process_bio;
1773 		pool->process_discard = process_discard;
1774 		pool->process_prepared_mapping = process_prepared_mapping;
1775 		pool->process_prepared_discard = process_prepared_discard;
1776 		break;
1777 	}
1778 
1779 	pool->pf.mode = new_mode;
1780 	/*
1781 	 * The pool mode may have changed, sync it so bind_control_target()
1782 	 * doesn't cause an unexpected mode transition on resume.
1783 	 */
1784 	pt->adjusted_pf.mode = new_mode;
1785 }
1786 
1787 static void abort_transaction(struct pool *pool)
1788 {
1789 	const char *dev_name = dm_device_name(pool->pool_md);
1790 
1791 	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1792 	if (dm_pool_abort_metadata(pool->pmd)) {
1793 		DMERR("%s: failed to abort metadata transaction", dev_name);
1794 		set_pool_mode(pool, PM_FAIL);
1795 	}
1796 
1797 	if (dm_pool_metadata_set_needs_check(pool->pmd)) {
1798 		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
1799 		set_pool_mode(pool, PM_FAIL);
1800 	}
1801 }
1802 
1803 static void metadata_operation_failed(struct pool *pool, const char *op, int r)
1804 {
1805 	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1806 		    dm_device_name(pool->pool_md), op, r);
1807 
1808 	abort_transaction(pool);
1809 	set_pool_mode(pool, PM_READ_ONLY);
1810 }
1811 
1812 /*----------------------------------------------------------------*/
1813 
1814 /*
1815  * Mapping functions.
1816  */
1817 
1818 /*
1819  * Called only while mapping a thin bio to hand it over to the workqueue.
1820  */
1821 static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1822 {
1823 	unsigned long flags;
1824 	struct pool *pool = tc->pool;
1825 
1826 	spin_lock_irqsave(&tc->lock, flags);
1827 	bio_list_add(&tc->deferred_bio_list, bio);
1828 	spin_unlock_irqrestore(&tc->lock, flags);
1829 
1830 	wake_worker(pool);
1831 }
1832 
1833 static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
1834 {
1835 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1836 
1837 	h->tc = tc;
1838 	h->shared_read_entry = NULL;
1839 	h->all_io_entry = NULL;
1840 	h->overwrite_mapping = NULL;
1841 }
1842 
1843 /*
1844  * Non-blocking function called from the thin target's map function.
1845  */
1846 static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1847 {
1848 	int r;
1849 	struct thin_c *tc = ti->private;
1850 	dm_block_t block = get_bio_block(tc, bio);
1851 	struct dm_thin_device *td = tc->td;
1852 	struct dm_thin_lookup_result result;
1853 	struct dm_bio_prison_cell cell1, cell2;
1854 	struct dm_bio_prison_cell *cell_result;
1855 	struct dm_cell_key key;
1856 
1857 	thin_hook_bio(tc, bio);
1858 
1859 	if (tc->requeue_mode) {
1860 		bio_endio(bio, DM_ENDIO_REQUEUE);
1861 		return DM_MAPIO_SUBMITTED;
1862 	}
1863 
1864 	if (get_pool_mode(tc->pool) == PM_FAIL) {
1865 		bio_io_error(bio);
1866 		return DM_MAPIO_SUBMITTED;
1867 	}
1868 
1869 	if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1870 		thin_defer_bio(tc, bio);
1871 		return DM_MAPIO_SUBMITTED;
1872 	}
1873 
1874 	r = dm_thin_find_block(td, block, 0, &result);
1875 
1876 	/*
1877 	 * Note that we defer readahead too.
1878 	 */
1879 	switch (r) {
1880 	case 0:
1881 		if (unlikely(result.shared)) {
1882 			/*
1883 			 * We have a race condition here between the
1884 			 * result.shared value returned by the lookup and
1885 			 * snapshot creation, which may cause new
1886 			 * sharing.
1887 			 *
1888 			 * To avoid this always quiesce the origin before
1889 			 * taking the snap.  You want to do this anyway to
1890 			 * ensure a consistent application view
1891 			 * (i.e. lockfs).
1892 			 *
1893 			 * More distant ancestors are irrelevant. The
1894 			 * shared flag will be set in their case.
1895 			 */
1896 			thin_defer_bio(tc, bio);
1897 			return DM_MAPIO_SUBMITTED;
1898 		}
1899 
1900 		build_virtual_key(tc->td, block, &key);
1901 		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
1902 			return DM_MAPIO_SUBMITTED;
1903 
1904 		build_data_key(tc->td, result.block, &key);
1905 		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) {
1906 			cell_defer_no_holder_no_free(tc, &cell1);
1907 			return DM_MAPIO_SUBMITTED;
1908 		}
1909 
1910 		inc_all_io_entry(tc->pool, bio);
1911 		cell_defer_no_holder_no_free(tc, &cell2);
1912 		cell_defer_no_holder_no_free(tc, &cell1);
1913 
1914 		remap(tc, bio, result.block);
1915 		return DM_MAPIO_REMAPPED;
1916 
1917 	case -ENODATA:
1918 		if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1919 			/*
1920 			 * This block isn't provisioned, and we have no way
1921 			 * of doing so.
1922 			 */
1923 			handle_unserviceable_bio(tc->pool, bio);
1924 			return DM_MAPIO_SUBMITTED;
1925 		}
1926 		/* fall through */
1927 
1928 	case -EWOULDBLOCK:
1929 		/*
1930 		 * In future, the failed dm_thin_find_block above could
1931 		 * provide the hint to load the metadata into cache.
1932 		 */
1933 		thin_defer_bio(tc, bio);
1934 		return DM_MAPIO_SUBMITTED;
1935 
1936 	default:
1937 		/*
1938 		 * Must always call bio_io_error on failure.
1939 		 * dm_thin_find_block can fail with -EINVAL if the
1940 		 * pool is switched to fail-io mode.
1941 		 */
1942 		bio_io_error(bio);
1943 		return DM_MAPIO_SUBMITTED;
1944 	}
1945 }
1946 
1947 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1948 {
1949 	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1950 	struct request_queue *q;
1951 
1952 	if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE)
1953 		return 1;
1954 
1955 	q = bdev_get_queue(pt->data_dev->bdev);
1956 	return bdi_congested(&q->backing_dev_info, bdi_bits);
1957 }
1958 
1959 static void requeue_bios(struct pool *pool)
1960 {
1961 	unsigned long flags;
1962 	struct thin_c *tc;
1963 
1964 	rcu_read_lock();
1965 	list_for_each_entry_rcu(tc, &pool->active_thins, list) {
1966 		spin_lock_irqsave(&tc->lock, flags);
1967 		bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
1968 		bio_list_init(&tc->retry_on_resume_list);
1969 		spin_unlock_irqrestore(&tc->lock, flags);
1970 	}
1971 	rcu_read_unlock();
1972 }
1973 
1974 /*----------------------------------------------------------------
1975  * Binding of control targets to a pool object
1976  *--------------------------------------------------------------*/
1977 static bool data_dev_supports_discard(struct pool_c *pt)
1978 {
1979 	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1980 
1981 	return q && blk_queue_discard(q);
1982 }
1983 
1984 static bool is_factor(sector_t block_size, uint32_t n)
1985 {
1986 	return !sector_div(block_size, n);
1987 }
1988 
1989 /*
1990  * If discard_passdown was enabled verify that the data device
1991  * supports discards.  Disable discard_passdown if not.
1992  */
1993 static void disable_passdown_if_not_supported(struct pool_c *pt)
1994 {
1995 	struct pool *pool = pt->pool;
1996 	struct block_device *data_bdev = pt->data_dev->bdev;
1997 	struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
1998 	sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
1999 	const char *reason = NULL;
2000 	char buf[BDEVNAME_SIZE];
2001 
2002 	if (!pt->adjusted_pf.discard_passdown)
2003 		return;
2004 
2005 	if (!data_dev_supports_discard(pt))
2006 		reason = "discard unsupported";
2007 
2008 	else if (data_limits->max_discard_sectors < pool->sectors_per_block)
2009 		reason = "max discard sectors smaller than a block";
2010 
2011 	else if (data_limits->discard_granularity > block_size)
2012 		reason = "discard granularity larger than a block";
2013 
2014 	else if (!is_factor(block_size, data_limits->discard_granularity))
2015 		reason = "discard granularity not a factor of block size";
2016 
2017 	if (reason) {
2018 		DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
2019 		pt->adjusted_pf.discard_passdown = false;
2020 	}
2021 }
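/*
 * Illustrative example of the checks above (the figures are assumptions):
 * with a 64KB pool block, a data device advertising a discard_granularity
 * of 1MB hits "discard granularity larger than a block" and passdown is
 * disabled, whereas a 4KB granularity is accepted because it divides 64KB
 * evenly.
 */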
2022 
2023 static int bind_control_target(struct pool *pool, struct dm_target *ti)
2024 {
2025 	struct pool_c *pt = ti->private;
2026 
2027 	/*
2028 	 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
2029 	 */
2030 	enum pool_mode old_mode = get_pool_mode(pool);
2031 	enum pool_mode new_mode = pt->adjusted_pf.mode;
2032 
2033 	/*
2034 	 * Don't change the pool's mode until set_pool_mode() below.
2035 	 * Otherwise the pool's process_* function pointers may
2036 	 * not match the desired pool mode.
2037 	 */
2038 	pt->adjusted_pf.mode = old_mode;
2039 
2040 	pool->ti = ti;
2041 	pool->pf = pt->adjusted_pf;
2042 	pool->low_water_blocks = pt->low_water_blocks;
2043 
2044 	set_pool_mode(pool, new_mode);
2045 
2046 	return 0;
2047 }
2048 
2049 static void unbind_control_target(struct pool *pool, struct dm_target *ti)
2050 {
2051 	if (pool->ti == ti)
2052 		pool->ti = NULL;
2053 }
2054 
2055 /*----------------------------------------------------------------
2056  * Pool creation
2057  *--------------------------------------------------------------*/
2058 /* Initialize pool features. */
2059 static void pool_features_init(struct pool_features *pf)
2060 {
2061 	pf->mode = PM_WRITE;
2062 	pf->zero_new_blocks = true;
2063 	pf->discard_enabled = true;
2064 	pf->discard_passdown = true;
2065 	pf->error_if_no_space = false;
2066 }
2067 
2068 static void __pool_destroy(struct pool *pool)
2069 {
2070 	__pool_table_remove(pool);
2071 
2072 	if (dm_pool_metadata_close(pool->pmd) < 0)
2073 		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2074 
2075 	dm_bio_prison_destroy(pool->prison);
2076 	dm_kcopyd_client_destroy(pool->copier);
2077 
2078 	if (pool->wq)
2079 		destroy_workqueue(pool->wq);
2080 
2081 	if (pool->next_mapping)
2082 		mempool_free(pool->next_mapping, pool->mapping_pool);
2083 	mempool_destroy(pool->mapping_pool);
2084 	dm_deferred_set_destroy(pool->shared_read_ds);
2085 	dm_deferred_set_destroy(pool->all_io_ds);
2086 	kfree(pool);
2087 }
2088 
2089 static struct kmem_cache *_new_mapping_cache;
2090 
2091 static struct pool *pool_create(struct mapped_device *pool_md,
2092 				struct block_device *metadata_dev,
2093 				unsigned long block_size,
2094 				int read_only, char **error)
2095 {
2096 	int r;
2097 	void *err_p;
2098 	struct pool *pool;
2099 	struct dm_pool_metadata *pmd;
2100 	bool format_device = read_only ? false : true;
2101 
2102 	pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
2103 	if (IS_ERR(pmd)) {
2104 		*error = "Error creating metadata object";
2105 		return (struct pool *)pmd;
2106 	}
2107 
2108 	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
2109 	if (!pool) {
2110 		*error = "Error allocating memory for pool";
2111 		err_p = ERR_PTR(-ENOMEM);
2112 		goto bad_pool;
2113 	}
2114 
2115 	pool->pmd = pmd;
2116 	pool->sectors_per_block = block_size;
2117 	if (block_size & (block_size - 1))
2118 		pool->sectors_per_block_shift = -1;
2119 	else
2120 		pool->sectors_per_block_shift = __ffs(block_size);
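	/*
	 * Illustrative note (added, not from the original source): a
	 * 128-sector (64KB) block size is a power of two, so the shift above
	 * becomes __ffs(128) == 7 and block arithmetic elsewhere can use
	 * shifts; a non-power-of-two size leaves the shift at -1 and the
	 * callers fall back to sector_div().
	 */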
2121 	pool->low_water_blocks = 0;
2122 	pool_features_init(&pool->pf);
2123 	pool->prison = dm_bio_prison_create(PRISON_CELLS);
2124 	if (!pool->prison) {
2125 		*error = "Error creating pool's bio prison";
2126 		err_p = ERR_PTR(-ENOMEM);
2127 		goto bad_prison;
2128 	}
2129 
2130 	pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2131 	if (IS_ERR(pool->copier)) {
2132 		r = PTR_ERR(pool->copier);
2133 		*error = "Error creating pool's kcopyd client";
2134 		err_p = ERR_PTR(r);
2135 		goto bad_kcopyd_client;
2136 	}
2137 
2138 	/*
2139 	 * Create a single-threaded workqueue that will service all devices
2140 	 * that use this metadata device.
2141 	 */
2142 	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2143 	if (!pool->wq) {
2144 		*error = "Error creating pool's workqueue";
2145 		err_p = ERR_PTR(-ENOMEM);
2146 		goto bad_wq;
2147 	}
2148 
2149 	INIT_WORK(&pool->worker, do_worker);
2150 	INIT_DELAYED_WORK(&pool->waker, do_waker);
2151 	INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
2152 	spin_lock_init(&pool->lock);
2153 	bio_list_init(&pool->deferred_flush_bios);
2154 	INIT_LIST_HEAD(&pool->prepared_mappings);
2155 	INIT_LIST_HEAD(&pool->prepared_discards);
2156 	INIT_LIST_HEAD(&pool->active_thins);
2157 	pool->low_water_triggered = false;
2158 
2159 	pool->shared_read_ds = dm_deferred_set_create();
2160 	if (!pool->shared_read_ds) {
2161 		*error = "Error creating pool's shared read deferred set";
2162 		err_p = ERR_PTR(-ENOMEM);
2163 		goto bad_shared_read_ds;
2164 	}
2165 
2166 	pool->all_io_ds = dm_deferred_set_create();
2167 	if (!pool->all_io_ds) {
2168 		*error = "Error creating pool's all io deferred set";
2169 		err_p = ERR_PTR(-ENOMEM);
2170 		goto bad_all_io_ds;
2171 	}
2172 
2173 	pool->next_mapping = NULL;
2174 	pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
2175 						      _new_mapping_cache);
2176 	if (!pool->mapping_pool) {
2177 		*error = "Error creating pool's mapping mempool";
2178 		err_p = ERR_PTR(-ENOMEM);
2179 		goto bad_mapping_pool;
2180 	}
2181 
2182 	pool->ref_count = 1;
2183 	pool->last_commit_jiffies = jiffies;
2184 	pool->pool_md = pool_md;
2185 	pool->md_dev = metadata_dev;
2186 	__pool_table_insert(pool);
2187 
2188 	return pool;
2189 
2190 bad_mapping_pool:
2191 	dm_deferred_set_destroy(pool->all_io_ds);
2192 bad_all_io_ds:
2193 	dm_deferred_set_destroy(pool->shared_read_ds);
2194 bad_shared_read_ds:
2195 	destroy_workqueue(pool->wq);
2196 bad_wq:
2197 	dm_kcopyd_client_destroy(pool->copier);
2198 bad_kcopyd_client:
2199 	dm_bio_prison_destroy(pool->prison);
2200 bad_prison:
2201 	kfree(pool);
2202 bad_pool:
2203 	if (dm_pool_metadata_close(pmd))
2204 		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2205 
2206 	return err_p;
2207 }
2208 
2209 static void __pool_inc(struct pool *pool)
2210 {
2211 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2212 	pool->ref_count++;
2213 }
2214 
2215 static void __pool_dec(struct pool *pool)
2216 {
2217 	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2218 	BUG_ON(!pool->ref_count);
2219 	if (!--pool->ref_count)
2220 		__pool_destroy(pool);
2221 }
2222 
2223 static struct pool *__pool_find(struct mapped_device *pool_md,
2224 				struct block_device *metadata_dev,
2225 				unsigned long block_size, int read_only,
2226 				char **error, int *created)
2227 {
2228 	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
2229 
2230 	if (pool) {
2231 		if (pool->pool_md != pool_md) {
2232 			*error = "metadata device already in use by a pool";
2233 			return ERR_PTR(-EBUSY);
2234 		}
2235 		__pool_inc(pool);
2236 
2237 	} else {
2238 		pool = __pool_table_lookup(pool_md);
2239 		if (pool) {
2240 			if (pool->md_dev != metadata_dev) {
2241 				*error = "different pool cannot replace a pool";
2242 				return ERR_PTR(-EINVAL);
2243 			}
2244 			__pool_inc(pool);
2245 
2246 		} else {
2247 			pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
2248 			*created = 1;
2249 		}
2250 	}
2251 
2252 	return pool;
2253 }
2254 
2255 /*----------------------------------------------------------------
2256  * Pool target methods
2257  *--------------------------------------------------------------*/
2258 static void pool_dtr(struct dm_target *ti)
2259 {
2260 	struct pool_c *pt = ti->private;
2261 
2262 	mutex_lock(&dm_thin_pool_table.mutex);
2263 
2264 	unbind_control_target(pt->pool, ti);
2265 	__pool_dec(pt->pool);
2266 	dm_put_device(ti, pt->metadata_dev);
2267 	dm_put_device(ti, pt->data_dev);
2268 	kfree(pt);
2269 
2270 	mutex_unlock(&dm_thin_pool_table.mutex);
2271 }
2272 
2273 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
2274 			       struct dm_target *ti)
2275 {
2276 	int r;
2277 	unsigned argc;
2278 	const char *arg_name;
2279 
2280 	static struct dm_arg _args[] = {
2281 		{0, 4, "Invalid number of pool feature arguments"},
2282 	};
2283 
2284 	/*
2285 	 * No feature arguments supplied.
2286 	 */
2287 	if (!as->argc)
2288 		return 0;
2289 
2290 	r = dm_read_arg_group(_args, as, &argc, &ti->error);
2291 	if (r)
2292 		return -EINVAL;
2293 
2294 	while (argc && !r) {
2295 		arg_name = dm_shift_arg(as);
2296 		argc--;
2297 
2298 		if (!strcasecmp(arg_name, "skip_block_zeroing"))
2299 			pf->zero_new_blocks = false;
2300 
2301 		else if (!strcasecmp(arg_name, "ignore_discard"))
2302 			pf->discard_enabled = false;
2303 
2304 		else if (!strcasecmp(arg_name, "no_discard_passdown"))
2305 			pf->discard_passdown = false;
2306 
2307 		else if (!strcasecmp(arg_name, "read_only"))
2308 			pf->mode = PM_READ_ONLY;
2309 
2310 		else if (!strcasecmp(arg_name, "error_if_no_space"))
2311 			pf->error_if_no_space = true;
2312 
2313 		else {
2314 			ti->error = "Unrecognised pool feature requested";
2315 			r = -EINVAL;
2316 			break;
2317 		}
2318 	}
2319 
2320 	return r;
2321 }
2322 
2323 static void metadata_low_callback(void *context)
2324 {
2325 	struct pool *pool = context;
2326 
2327 	DMWARN("%s: reached low water mark for metadata device: sending event.",
2328 	       dm_device_name(pool->pool_md));
2329 
2330 	dm_table_event(pool->ti->table);
2331 }
2332 
2333 static sector_t get_dev_size(struct block_device *bdev)
2334 {
2335 	return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
2336 }
2337 
2338 static void warn_if_metadata_device_too_big(struct block_device *bdev)
2339 {
2340 	sector_t metadata_dev_size = get_dev_size(bdev);
2341 	char buffer[BDEVNAME_SIZE];
2342 
2343 	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
2344 		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2345 		       bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
2346 }
2347 
2348 static sector_t get_metadata_dev_size(struct block_device *bdev)
2349 {
2350 	sector_t metadata_dev_size = get_dev_size(bdev);
2351 
2352 	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
2353 		metadata_dev_size = THIN_METADATA_MAX_SECTORS;
2354 
2355 	return metadata_dev_size;
2356 }
2357 
2358 static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
2359 {
2360 	sector_t metadata_dev_size = get_metadata_dev_size(bdev);
2361 
2362 	sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
2363 
2364 	return metadata_dev_size;
2365 }
2366 
2367 /*
2368  * When a metadata threshold is crossed a dm event is triggered, and
2369  * userland should respond by growing the metadata device.  We could let
2370  * userland set the threshold, like we do with the data threshold, but I'm
2371  * not sure they know enough to do this well.
2372  */
2373 static dm_block_t calc_metadata_threshold(struct pool_c *pt)
2374 {
2375 	/*
2376 	 * 4M is ample for all ops with the possible exception of thin
2377 	 * device deletion which is harmless if it fails (just retry the
2378 	 * delete after you've grown the device).
2379 	 */
2380 	dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
2381 	return min((dm_block_t)1024ULL /* 4M */, quarter);
2382 }
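/*
 * Worked example (illustrative, assuming 4KB metadata blocks): a 2GB
 * metadata device holds 524288 blocks, so the quarter above is 131072
 * blocks and the threshold is min(1024, 131072) = 1024 blocks, i.e. the
 * 4M mentioned in the comment.  Only a metadata device smaller than 16MB
 * yields a threshold below 4M.
 */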
2383 
2384 /*
2385  * thin-pool <metadata dev> <data dev>
2386  *	     <data block size (sectors)>
2387  *	     <low water mark (blocks)>
2388  *	     [<#feature args> [<arg>]*]
2389  *
2390  * Optional feature arguments are:
2391  *	     skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
2392  *	     ignore_discard: disable discard
2393  *	     no_discard_passdown: don't pass discards down to the data device
2394  *	     read_only: Don't allow any changes to be made to the pool metadata.
2395  *	     error_if_no_space: error IOs, instead of queueing, if no space.
2396  */
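/*
 * Illustrative ctr usage (device names and sizes are assumptions, not part
 * of this file): a pool over a 200GB data device (419430400 sectors) with
 * 64KB (128-sector) blocks and a 32768-block low water mark, loaded from
 * userspace with dmsetup:
 *
 *   dmsetup create pool --table \
 *     "0 419430400 thin-pool /dev/sdb /dev/sdc 128 32768 1 skip_block_zeroing"
 */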
2397 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2398 {
2399 	int r, pool_created = 0;
2400 	struct pool_c *pt;
2401 	struct pool *pool;
2402 	struct pool_features pf;
2403 	struct dm_arg_set as;
2404 	struct dm_dev *data_dev;
2405 	unsigned long block_size;
2406 	dm_block_t low_water_blocks;
2407 	struct dm_dev *metadata_dev;
2408 	fmode_t metadata_mode;
2409 
2410 	/*
2411 	 * FIXME Remove validation from scope of lock.
2412 	 */
2413 	mutex_lock(&dm_thin_pool_table.mutex);
2414 
2415 	if (argc < 4) {
2416 		ti->error = "Invalid argument count";
2417 		r = -EINVAL;
2418 		goto out_unlock;
2419 	}
2420 
2421 	as.argc = argc;
2422 	as.argv = argv;
2423 
2424 	/*
2425 	 * Set default pool features.
2426 	 */
2427 	pool_features_init(&pf);
2428 
2429 	dm_consume_args(&as, 4);
2430 	r = parse_pool_features(&as, &pf, ti);
2431 	if (r)
2432 		goto out_unlock;
2433 
2434 	metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
2435 	r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
2436 	if (r) {
2437 		ti->error = "Error opening metadata block device";
2438 		goto out_unlock;
2439 	}
2440 	warn_if_metadata_device_too_big(metadata_dev->bdev);
2441 
2442 	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
2443 	if (r) {
2444 		ti->error = "Error getting data device";
2445 		goto out_metadata;
2446 	}
2447 
2448 	if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
2449 	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2450 	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2451 	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2452 		ti->error = "Invalid block size";
2453 		r = -EINVAL;
2454 		goto out;
2455 	}
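	/*
	 * Examples of the check above (illustrative): 128 sectors (64KB) and
	 * 2048 sectors (1MB) are accepted; 192 sectors (96KB) is rejected
	 * because it is not a multiple of DATA_DEV_BLOCK_SIZE_MIN_SECTORS.
	 */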
2456 
2457 	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
2458 		ti->error = "Invalid low water mark";
2459 		r = -EINVAL;
2460 		goto out;
2461 	}
2462 
2463 	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
2464 	if (!pt) {
2465 		r = -ENOMEM;
2466 		goto out;
2467 	}
2468 
2469 	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
2470 			   block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
2471 	if (IS_ERR(pool)) {
2472 		r = PTR_ERR(pool);
2473 		goto out_free_pt;
2474 	}
2475 
2476 	/*
2477 	 * 'pool_created' reflects whether this is the first table load.
2478 	 * Top level discard support is not allowed to be changed after
2479 	 * initial load.  This would require a pool reload to trigger thin
2480 	 * device changes.
2481 	 */
2482 	if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
2483 		ti->error = "Discard support cannot be disabled once enabled";
2484 		r = -EINVAL;
2485 		goto out_flags_changed;
2486 	}
2487 
2488 	pt->pool = pool;
2489 	pt->ti = ti;
2490 	pt->metadata_dev = metadata_dev;
2491 	pt->data_dev = data_dev;
2492 	pt->low_water_blocks = low_water_blocks;
2493 	pt->adjusted_pf = pt->requested_pf = pf;
2494 	ti->num_flush_bios = 1;
2495 
2496 	/*
2497 	 * Only need to enable discards if the pool should pass
2498 	 * them down to the data device.  The thin device's discard
2499 	 * processing will cause mappings to be removed from the btree.
2500 	 */
2501 	ti->discard_zeroes_data_unsupported = true;
2502 	if (pf.discard_enabled && pf.discard_passdown) {
2503 		ti->num_discard_bios = 1;
2504 
2505 		/*
2506 		 * Setting 'discards_supported' circumvents the normal
2507 		 * stacking of discard limits (this keeps the pool and
2508 		 * thin devices' discard limits consistent).
2509 		 */
2510 		ti->discards_supported = true;
2511 	}
2512 	ti->private = pt;
2513 
2514 	r = dm_pool_register_metadata_threshold(pt->pool->pmd,
2515 						calc_metadata_threshold(pt),
2516 						metadata_low_callback,
2517 						pool);
2518 	if (r)
2519 		goto out_free_pt;
2520 
2521 	pt->callbacks.congested_fn = pool_is_congested;
2522 	dm_table_add_target_callbacks(ti->table, &pt->callbacks);
2523 
2524 	mutex_unlock(&dm_thin_pool_table.mutex);
2525 
2526 	return 0;
2527 
2528 out_flags_changed:
2529 	__pool_dec(pool);
2530 out_free_pt:
2531 	kfree(pt);
2532 out:
2533 	dm_put_device(ti, data_dev);
2534 out_metadata:
2535 	dm_put_device(ti, metadata_dev);
2536 out_unlock:
2537 	mutex_unlock(&dm_thin_pool_table.mutex);
2538 
2539 	return r;
2540 }
2541 
2542 static int pool_map(struct dm_target *ti, struct bio *bio)
2543 {
2544 	int r;
2545 	struct pool_c *pt = ti->private;
2546 	struct pool *pool = pt->pool;
2547 	unsigned long flags;
2548 
2549 	/*
2550 	 * As this is a singleton target, ti->begin is always zero.
2551 	 */
2552 	spin_lock_irqsave(&pool->lock, flags);
2553 	bio->bi_bdev = pt->data_dev->bdev;
2554 	r = DM_MAPIO_REMAPPED;
2555 	spin_unlock_irqrestore(&pool->lock, flags);
2556 
2557 	return r;
2558 }
2559 
2560 static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
2561 {
2562 	int r;
2563 	struct pool_c *pt = ti->private;
2564 	struct pool *pool = pt->pool;
2565 	sector_t data_size = ti->len;
2566 	dm_block_t sb_data_size;
2567 
2568 	*need_commit = false;
2569 
2570 	(void) sector_div(data_size, pool->sectors_per_block);
2571 
2572 	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2573 	if (r) {
2574 		DMERR("%s: failed to retrieve data device size",
2575 		      dm_device_name(pool->pool_md));
2576 		return r;
2577 	}
2578 
2579 	if (data_size < sb_data_size) {
2580 		DMERR("%s: pool target (%llu blocks) too small: expected %llu",
2581 		      dm_device_name(pool->pool_md),
2582 		      (unsigned long long)data_size, sb_data_size);
2583 		return -EINVAL;
2584 
2585 	} else if (data_size > sb_data_size) {
2586 		if (dm_pool_metadata_needs_check(pool->pmd)) {
2587 			DMERR("%s: unable to grow the data device until repaired.",
2588 			      dm_device_name(pool->pool_md));
2589 			return 0;
2590 		}
2591 
2592 		if (sb_data_size)
2593 			DMINFO("%s: growing the data device from %llu to %llu blocks",
2594 			       dm_device_name(pool->pool_md),
2595 			       sb_data_size, (unsigned long long)data_size);
2596 		r = dm_pool_resize_data_dev(pool->pmd, data_size);
2597 		if (r) {
2598 			metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
2599 			return r;
2600 		}
2601 
2602 		*need_commit = true;
2603 	}
2604 
2605 	return 0;
2606 }
2607 
2608 static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
2609 {
2610 	int r;
2611 	struct pool_c *pt = ti->private;
2612 	struct pool *pool = pt->pool;
2613 	dm_block_t metadata_dev_size, sb_metadata_dev_size;
2614 
2615 	*need_commit = false;
2616 
2617 	metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
2618 
2619 	r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
2620 	if (r) {
2621 		DMERR("%s: failed to retrieve metadata device size",
2622 		      dm_device_name(pool->pool_md));
2623 		return r;
2624 	}
2625 
2626 	if (metadata_dev_size < sb_metadata_dev_size) {
2627 		DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
2628 		      dm_device_name(pool->pool_md),
2629 		      metadata_dev_size, sb_metadata_dev_size);
2630 		return -EINVAL;
2631 
2632 	} else if (metadata_dev_size > sb_metadata_dev_size) {
2633 		if (dm_pool_metadata_needs_check(pool->pmd)) {
2634 			DMERR("%s: unable to grow the metadata device until repaired.",
2635 			      dm_device_name(pool->pool_md));
2636 			return 0;
2637 		}
2638 
2639 		warn_if_metadata_device_too_big(pool->md_dev);
2640 		DMINFO("%s: growing the metadata device from %llu to %llu blocks",
2641 		       dm_device_name(pool->pool_md),
2642 		       sb_metadata_dev_size, metadata_dev_size);
2643 		r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
2644 		if (r) {
2645 			metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
2646 			return r;
2647 		}
2648 
2649 		*need_commit = true;
2650 	}
2651 
2652 	return 0;
2653 }
2654 
2655 /*
2656  * Retrieves the number of blocks of the data device from
2657  * the superblock and compares it to the actual device size,
2658  * resizing the data device if it has grown.
2659  *
2660  * This both copes with opening preallocated data devices in the ctr
2661  * being followed by a resume
2662  * -and-
2663  * calling the resume method individually after userspace has
2664  * grown the data device in reaction to a table event.
2665  */
2666 static int pool_preresume(struct dm_target *ti)
2667 {
2668 	int r;
2669 	bool need_commit1, need_commit2;
2670 	struct pool_c *pt = ti->private;
2671 	struct pool *pool = pt->pool;
2672 
2673 	/*
2674 	 * Take control of the pool object.
2675 	 */
2676 	r = bind_control_target(pool, ti);
2677 	if (r)
2678 		return r;
2679 
2680 	r = maybe_resize_data_dev(ti, &need_commit1);
2681 	if (r)
2682 		return r;
2683 
2684 	r = maybe_resize_metadata_dev(ti, &need_commit2);
2685 	if (r)
2686 		return r;
2687 
2688 	if (need_commit1 || need_commit2)
2689 		(void) commit(pool);
2690 
2691 	return 0;
2692 }
2693 
2694 static void pool_resume(struct dm_target *ti)
2695 {
2696 	struct pool_c *pt = ti->private;
2697 	struct pool *pool = pt->pool;
2698 	unsigned long flags;
2699 
2700 	spin_lock_irqsave(&pool->lock, flags);
2701 	pool->low_water_triggered = false;
2702 	spin_unlock_irqrestore(&pool->lock, flags);
2703 	requeue_bios(pool);
2704 
2705 	do_waker(&pool->waker.work);
2706 }
2707 
2708 static void pool_postsuspend(struct dm_target *ti)
2709 {
2710 	struct pool_c *pt = ti->private;
2711 	struct pool *pool = pt->pool;
2712 
2713 	cancel_delayed_work(&pool->waker);
2714 	cancel_delayed_work(&pool->no_space_timeout);
2715 	flush_workqueue(pool->wq);
2716 	(void) commit(pool);
2717 }
2718 
2719 static int check_arg_count(unsigned argc, unsigned args_required)
2720 {
2721 	if (argc != args_required) {
2722 		DMWARN("Message received with %u arguments instead of %u.",
2723 		       argc, args_required);
2724 		return -EINVAL;
2725 	}
2726 
2727 	return 0;
2728 }
2729 
2730 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2731 {
2732 	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2733 	    *dev_id <= MAX_DEV_ID)
2734 		return 0;
2735 
2736 	if (warning)
2737 		DMWARN("Message received with invalid device id: %s", arg);
2738 
2739 	return -EINVAL;
2740 }
2741 
2742 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2743 {
2744 	dm_thin_id dev_id;
2745 	int r;
2746 
2747 	r = check_arg_count(argc, 2);
2748 	if (r)
2749 		return r;
2750 
2751 	r = read_dev_id(argv[1], &dev_id, 1);
2752 	if (r)
2753 		return r;
2754 
2755 	r = dm_pool_create_thin(pool->pmd, dev_id);
2756 	if (r) {
2757 		DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2758 		       argv[1]);
2759 		return r;
2760 	}
2761 
2762 	return 0;
2763 }
2764 
2765 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2766 {
2767 	dm_thin_id dev_id;
2768 	dm_thin_id origin_dev_id;
2769 	int r;
2770 
2771 	r = check_arg_count(argc, 3);
2772 	if (r)
2773 		return r;
2774 
2775 	r = read_dev_id(argv[1], &dev_id, 1);
2776 	if (r)
2777 		return r;
2778 
2779 	r = read_dev_id(argv[2], &origin_dev_id, 1);
2780 	if (r)
2781 		return r;
2782 
2783 	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2784 	if (r) {
2785 		DMWARN("Creation of new snapshot %s of device %s failed.",
2786 		       argv[1], argv[2]);
2787 		return r;
2788 	}
2789 
2790 	return 0;
2791 }
2792 
2793 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2794 {
2795 	dm_thin_id dev_id;
2796 	int r;
2797 
2798 	r = check_arg_count(argc, 2);
2799 	if (r)
2800 		return r;
2801 
2802 	r = read_dev_id(argv[1], &dev_id, 1);
2803 	if (r)
2804 		return r;
2805 
2806 	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2807 	if (r)
2808 		DMWARN("Deletion of thin device %s failed.", argv[1]);
2809 
2810 	return r;
2811 }
2812 
2813 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2814 {
2815 	dm_thin_id old_id, new_id;
2816 	int r;
2817 
2818 	r = check_arg_count(argc, 3);
2819 	if (r)
2820 		return r;
2821 
2822 	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2823 		DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2824 		return -EINVAL;
2825 	}
2826 
2827 	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2828 		DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2829 		return -EINVAL;
2830 	}
2831 
2832 	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2833 	if (r) {
2834 		DMWARN("Failed to change transaction id from %s to %s.",
2835 		       argv[1], argv[2]);
2836 		return r;
2837 	}
2838 
2839 	return 0;
2840 }
2841 
2842 static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2843 {
2844 	int r;
2845 
2846 	r = check_arg_count(argc, 1);
2847 	if (r)
2848 		return r;
2849 
2850 	(void) commit(pool);
2851 
2852 	r = dm_pool_reserve_metadata_snap(pool->pmd);
2853 	if (r)
2854 		DMWARN("reserve_metadata_snap message failed.");
2855 
2856 	return r;
2857 }
2858 
2859 static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2860 {
2861 	int r;
2862 
2863 	r = check_arg_count(argc, 1);
2864 	if (r)
2865 		return r;
2866 
2867 	r = dm_pool_release_metadata_snap(pool->pmd);
2868 	if (r)
2869 		DMWARN("release_metadata_snap message failed.");
2870 
2871 	return r;
2872 }
2873 
2874 /*
2875  * Messages supported:
2876  *   create_thin	<dev_id>
2877  *   create_snap	<dev_id> <origin_id>
2878  *   delete		<dev_id>
2880  *   set_transaction_id <current_trans_id> <new_trans_id>
2881  *   reserve_metadata_snap
2882  *   release_metadata_snap
2883  */
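/*
 * Illustrative usage from userspace (the pool name is an assumption):
 *
 *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *   dmsetup message /dev/mapper/pool 0 "delete 1"
 */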
2884 static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2885 {
2886 	int r = -EINVAL;
2887 	struct pool_c *pt = ti->private;
2888 	struct pool *pool = pt->pool;
2889 
2890 	if (!strcasecmp(argv[0], "create_thin"))
2891 		r = process_create_thin_mesg(argc, argv, pool);
2892 
2893 	else if (!strcasecmp(argv[0], "create_snap"))
2894 		r = process_create_snap_mesg(argc, argv, pool);
2895 
2896 	else if (!strcasecmp(argv[0], "delete"))
2897 		r = process_delete_mesg(argc, argv, pool);
2898 
2899 	else if (!strcasecmp(argv[0], "set_transaction_id"))
2900 		r = process_set_transaction_id_mesg(argc, argv, pool);
2901 
2902 	else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
2903 		r = process_reserve_metadata_snap_mesg(argc, argv, pool);
2904 
2905 	else if (!strcasecmp(argv[0], "release_metadata_snap"))
2906 		r = process_release_metadata_snap_mesg(argc, argv, pool);
2907 
2908 	else
2909 		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2910 
2911 	if (!r)
2912 		(void) commit(pool);
2913 
2914 	return r;
2915 }
2916 
2917 static void emit_flags(struct pool_features *pf, char *result,
2918 		       unsigned sz, unsigned maxlen)
2919 {
2920 	unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2921 		!pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
2922 		pf->error_if_no_space;
2923 	DMEMIT("%u ", count);
2924 
2925 	if (!pf->zero_new_blocks)
2926 		DMEMIT("skip_block_zeroing ");
2927 
2928 	if (!pf->discard_enabled)
2929 		DMEMIT("ignore_discard ");
2930 
2931 	if (!pf->discard_passdown)
2932 		DMEMIT("no_discard_passdown ");
2933 
2934 	if (pf->mode == PM_READ_ONLY)
2935 		DMEMIT("read_only ");
2936 
2937 	if (pf->error_if_no_space)
2938 		DMEMIT("error_if_no_space ");
2939 }
2940 
2941 /*
2942  * Status line is:
2943  *    <transaction id> <used metadata sectors>/<total metadata sectors>
2944  *    <used data sectors>/<total data sectors> <held metadata root>
2945  */
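/*
 * Illustrative STATUSTYPE_INFO output (all values are assumptions), shown
 * here with the "<start> <length> <target>" prefix that dmsetup status
 * prepends:
 *
 *   0 419430400 thin-pool 1 117/4096 0/3276800 - rw discard_passdown queue_if_no_space
 */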
2946 static void pool_status(struct dm_target *ti, status_type_t type,
2947 			unsigned status_flags, char *result, unsigned maxlen)
2948 {
2949 	int r;
2950 	unsigned sz = 0;
2951 	uint64_t transaction_id;
2952 	dm_block_t nr_free_blocks_data;
2953 	dm_block_t nr_free_blocks_metadata;
2954 	dm_block_t nr_blocks_data;
2955 	dm_block_t nr_blocks_metadata;
2956 	dm_block_t held_root;
2957 	char buf[BDEVNAME_SIZE];
2958 	char buf2[BDEVNAME_SIZE];
2959 	struct pool_c *pt = ti->private;
2960 	struct pool *pool = pt->pool;
2961 
2962 	switch (type) {
2963 	case STATUSTYPE_INFO:
2964 		if (get_pool_mode(pool) == PM_FAIL) {
2965 			DMEMIT("Fail");
2966 			break;
2967 		}
2968 
2969 		/* Commit to ensure statistics aren't out-of-date */
2970 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
2971 			(void) commit(pool);
2972 
2973 		r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
2974 		if (r) {
2975 			DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
2976 			      dm_device_name(pool->pool_md), r);
2977 			goto err;
2978 		}
2979 
2980 		r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
2981 		if (r) {
2982 			DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
2983 			      dm_device_name(pool->pool_md), r);
2984 			goto err;
2985 		}
2986 
2987 		r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2988 		if (r) {
2989 			DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
2990 			      dm_device_name(pool->pool_md), r);
2991 			goto err;
2992 		}
2993 
2994 		r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
2995 		if (r) {
2996 			DMERR("%s: dm_pool_get_free_block_count returned %d",
2997 			      dm_device_name(pool->pool_md), r);
2998 			goto err;
2999 		}
3000 
3001 		r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
3002 		if (r) {
3003 			DMERR("%s: dm_pool_get_data_dev_size returned %d",
3004 			      dm_device_name(pool->pool_md), r);
3005 			goto err;
3006 		}
3007 
3008 		r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
3009 		if (r) {
3010 			DMERR("%s: dm_pool_get_metadata_snap returned %d",
3011 			      dm_device_name(pool->pool_md), r);
3012 			goto err;
3013 		}
3014 
3015 		DMEMIT("%llu %llu/%llu %llu/%llu ",
3016 		       (unsigned long long)transaction_id,
3017 		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3018 		       (unsigned long long)nr_blocks_metadata,
3019 		       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
3020 		       (unsigned long long)nr_blocks_data);
3021 
3022 		if (held_root)
3023 			DMEMIT("%llu ", held_root);
3024 		else
3025 			DMEMIT("- ");
3026 
3027 		if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
3028 			DMEMIT("out_of_data_space ");
3029 		else if (pool->pf.mode == PM_READ_ONLY)
3030 			DMEMIT("ro ");
3031 		else
3032 			DMEMIT("rw ");
3033 
3034 		if (!pool->pf.discard_enabled)
3035 			DMEMIT("ignore_discard ");
3036 		else if (pool->pf.discard_passdown)
3037 			DMEMIT("discard_passdown ");
3038 		else
3039 			DMEMIT("no_discard_passdown ");
3040 
3041 		if (pool->pf.error_if_no_space)
3042 			DMEMIT("error_if_no_space ");
3043 		else
3044 			DMEMIT("queue_if_no_space ");
3045 
3046 		break;
3047 
3048 	case STATUSTYPE_TABLE:
3049 		DMEMIT("%s %s %lu %llu ",
3050 		       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
3051 		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
3052 		       (unsigned long)pool->sectors_per_block,
3053 		       (unsigned long long)pt->low_water_blocks);
3054 		emit_flags(&pt->requested_pf, result, sz, maxlen);
3055 		break;
3056 	}
3057 	return;
3058 
3059 err:
3060 	DMEMIT("Error");
3061 }
3062 
3063 static int pool_iterate_devices(struct dm_target *ti,
3064 				iterate_devices_callout_fn fn, void *data)
3065 {
3066 	struct pool_c *pt = ti->private;
3067 
3068 	return fn(ti, pt->data_dev, 0, ti->len, data);
3069 }
3070 
3071 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
3072 		      struct bio_vec *biovec, int max_size)
3073 {
3074 	struct pool_c *pt = ti->private;
3075 	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
3076 
3077 	if (!q->merge_bvec_fn)
3078 		return max_size;
3079 
3080 	bvm->bi_bdev = pt->data_dev->bdev;
3081 
3082 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3083 }
3084 
3085 static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
3086 {
3087 	struct pool *pool = pt->pool;
3088 	struct queue_limits *data_limits;
3089 
3090 	limits->max_discard_sectors = pool->sectors_per_block;
3091 
3092 	/*
3093 	 * discard_granularity is just a hint, and not enforced.
3094 	 */
3095 	if (pt->adjusted_pf.discard_passdown) {
3096 		data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
3097 		limits->discard_granularity = max(data_limits->discard_granularity,
3098 						  pool->sectors_per_block << SECTOR_SHIFT);
3099 	} else
3100 		limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
3101 }
3102 
3103 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
3104 {
3105 	struct pool_c *pt = ti->private;
3106 	struct pool *pool = pt->pool;
3107 	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3108 
3109 	/*
3110 	 * If the system-determined stacked limits are compatible with the
3111 	 * pool's blocksize (io_opt is a factor) do not override them.
3112 	 */
3113 	if (io_opt_sectors < pool->sectors_per_block ||
3114 	    do_div(io_opt_sectors, pool->sectors_per_block)) {
3115 		blk_limits_io_min(limits, 0);
3116 		blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
3117 	}
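	/*
	 * Example of the override above (figures are assumptions): with 64KB
	 * pool blocks, a stacked io_opt of 512KB is left alone because it is
	 * a multiple of the block size, while an io_opt of 96KB is replaced
	 * with io_min 0 and io_opt 64KB.
	 */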
3118 
3119 	/*
3120 	 * pt->adjusted_pf is a staging area for the actual features to use.
3121 	 * They get transferred to the live pool in bind_control_target()
3122 	 * called from pool_preresume().
3123 	 */
3124 	if (!pt->adjusted_pf.discard_enabled) {
3125 		/*
3126 		 * Must explicitly disallow stacking discard limits otherwise the
3127 		 * block layer will stack them if pool's data device has support.
3128 		 * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the
3129 		 * user to see that, so make sure to set all discard limits to 0.
3130 		 */
3131 		limits->discard_granularity = 0;
3132 		return;
3133 	}
3134 
3135 	disable_passdown_if_not_supported(pt);
3136 
3137 	set_discard_limits(pt, limits);
3138 }
3139 
3140 static struct target_type pool_target = {
3141 	.name = "thin-pool",
3142 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
3143 		    DM_TARGET_IMMUTABLE,
3144 	.version = {1, 12, 0},
3145 	.module = THIS_MODULE,
3146 	.ctr = pool_ctr,
3147 	.dtr = pool_dtr,
3148 	.map = pool_map,
3149 	.postsuspend = pool_postsuspend,
3150 	.preresume = pool_preresume,
3151 	.resume = pool_resume,
3152 	.message = pool_message,
3153 	.status = pool_status,
3154 	.merge = pool_merge,
3155 	.iterate_devices = pool_iterate_devices,
3156 	.io_hints = pool_io_hints,
3157 };
3158 
3159 /*----------------------------------------------------------------
3160  * Thin target methods
3161  *--------------------------------------------------------------*/
3162 static void thin_get(struct thin_c *tc)
3163 {
3164 	atomic_inc(&tc->refcount);
3165 }
3166 
3167 static void thin_put(struct thin_c *tc)
3168 {
3169 	if (atomic_dec_and_test(&tc->refcount))
3170 		complete(&tc->can_destroy);
3171 }
3172 
3173 static void thin_dtr(struct dm_target *ti)
3174 {
3175 	struct thin_c *tc = ti->private;
3176 	unsigned long flags;
3177 
3178 	thin_put(tc);
3179 	wait_for_completion(&tc->can_destroy);
3180 
3181 	spin_lock_irqsave(&tc->pool->lock, flags);
3182 	list_del_rcu(&tc->list);
3183 	spin_unlock_irqrestore(&tc->pool->lock, flags);
3184 	synchronize_rcu();
3185 
3186 	mutex_lock(&dm_thin_pool_table.mutex);
3187 
3188 	__pool_dec(tc->pool);
3189 	dm_pool_close_thin_device(tc->td);
3190 	dm_put_device(ti, tc->pool_dev);
3191 	if (tc->origin_dev)
3192 		dm_put_device(ti, tc->origin_dev);
3193 	kfree(tc);
3194 
3195 	mutex_unlock(&dm_thin_pool_table.mutex);
3196 }
3197 
3198 /*
3199  * Thin target parameters:
3200  *
3201  * <pool_dev> <dev_id> [origin_dev]
3202  *
3203  * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
3204  * dev_id: the internal device identifier
3205  * origin_dev: a device external to the pool that should act as the origin
3206  *
3207  * If the pool device has discards disabled, they get disabled for the thin
3208  * device as well.
3209  */
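/*
 * Illustrative activation (names and sizes are assumptions): a 1GB thin
 * volume mapped to device id 0, which must previously have been created
 * with a create_thin pool message:
 *
 *   dmsetup create thin0 --table "0 2097152 thin /dev/mapper/pool 0"
 */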
3210 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3211 {
3212 	int r;
3213 	struct thin_c *tc;
3214 	struct dm_dev *pool_dev, *origin_dev;
3215 	struct mapped_device *pool_md;
3216 	unsigned long flags;
3217 
3218 	mutex_lock(&dm_thin_pool_table.mutex);
3219 
3220 	if (argc != 2 && argc != 3) {
3221 		ti->error = "Invalid argument count";
3222 		r = -EINVAL;
3223 		goto out_unlock;
3224 	}
3225 
3226 	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
3227 	if (!tc) {
3228 		ti->error = "Out of memory";
3229 		r = -ENOMEM;
3230 		goto out_unlock;
3231 	}
3232 	spin_lock_init(&tc->lock);
3233 	bio_list_init(&tc->deferred_bio_list);
3234 	bio_list_init(&tc->retry_on_resume_list);
3235 	tc->sort_bio_list = RB_ROOT;
3236 
3237 	if (argc == 3) {
3238 		r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
3239 		if (r) {
3240 			ti->error = "Error opening origin device";
3241 			goto bad_origin_dev;
3242 		}
3243 		tc->origin_dev = origin_dev;
3244 	}
3245 
3246 	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
3247 	if (r) {
3248 		ti->error = "Error opening pool device";
3249 		goto bad_pool_dev;
3250 	}
3251 	tc->pool_dev = pool_dev;
3252 
3253 	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
3254 		ti->error = "Invalid device id";
3255 		r = -EINVAL;
3256 		goto bad_common;
3257 	}
3258 
3259 	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
3260 	if (!pool_md) {
3261 		ti->error = "Couldn't get pool mapped device";
3262 		r = -EINVAL;
3263 		goto bad_common;
3264 	}
3265 
3266 	tc->pool = __pool_table_lookup(pool_md);
3267 	if (!tc->pool) {
3268 		ti->error = "Couldn't find pool object";
3269 		r = -EINVAL;
3270 		goto bad_pool_lookup;
3271 	}
3272 	__pool_inc(tc->pool);
3273 
3274 	if (get_pool_mode(tc->pool) == PM_FAIL) {
3275 		ti->error = "Couldn't open thin device: pool is in fail mode";
3276 		r = -EINVAL;
3277 		goto bad_thin_open;
3278 	}
3279 
3280 	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
3281 	if (r) {
3282 		ti->error = "Couldn't open thin internal device";
3283 		goto bad_thin_open;
3284 	}
3285 
3286 	r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
3287 	if (r)
3288 		goto bad_target_max_io_len;
3289 
3290 	ti->num_flush_bios = 1;
3291 	ti->flush_supported = true;
3292 	ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
3293 
3294 	/* In case the pool supports discards, pass them on. */
3295 	ti->discard_zeroes_data_unsupported = true;
3296 	if (tc->pool->pf.discard_enabled) {
3297 		ti->discards_supported = true;
3298 		ti->num_discard_bios = 1;
3299 		/* Discard bios must be split on a block boundary */
3300 		ti->split_discard_bios = true;
3301 	}
3302 
3303 	dm_put(pool_md);
3304 
3305 	mutex_unlock(&dm_thin_pool_table.mutex);
3306 
3307 	atomic_set(&tc->refcount, 1);
3308 	init_completion(&tc->can_destroy);
3309 
3310 	spin_lock_irqsave(&tc->pool->lock, flags);
3311 	list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
3312 	spin_unlock_irqrestore(&tc->pool->lock, flags);
3313 	/*
3314 	 * This synchronize_rcu() call is needed here otherwise we risk a
3315 	 * wake_worker() call finding no bios to process (because the newly
3316 	 * added tc isn't yet visible).  So this reduces latency since we
3317 	 * aren't then dependent on the periodic commit to wake_worker().
3318 	 */
3319 	synchronize_rcu();
3320 
3321 	return 0;
3322 
3323 bad_target_max_io_len:
3324 	dm_pool_close_thin_device(tc->td);
3325 bad_thin_open:
3326 	__pool_dec(tc->pool);
3327 bad_pool_lookup:
3328 	dm_put(pool_md);
3329 bad_common:
3330 	dm_put_device(ti, tc->pool_dev);
3331 bad_pool_dev:
3332 	if (tc->origin_dev)
3333 		dm_put_device(ti, tc->origin_dev);
3334 bad_origin_dev:
3335 	kfree(tc);
3336 out_unlock:
3337 	mutex_unlock(&dm_thin_pool_table.mutex);
3338 
3339 	return r;
3340 }
3341 
3342 static int thin_map(struct dm_target *ti, struct bio *bio)
3343 {
3344 	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
3345 
3346 	return thin_bio_map(ti, bio);
3347 }
3348 
3349 static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
3350 {
3351 	unsigned long flags;
3352 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
3353 	struct list_head work;
3354 	struct dm_thin_new_mapping *m, *tmp;
3355 	struct pool *pool = h->tc->pool;
3356 
3357 	if (h->shared_read_entry) {
3358 		INIT_LIST_HEAD(&work);
3359 		dm_deferred_entry_dec(h->shared_read_entry, &work);
3360 
3361 		spin_lock_irqsave(&pool->lock, flags);
3362 		list_for_each_entry_safe(m, tmp, &work, list) {
3363 			list_del(&m->list);
3364 			m->quiesced = true;
3365 			__maybe_add_mapping(m);
3366 		}
3367 		spin_unlock_irqrestore(&pool->lock, flags);
3368 	}
3369 
3370 	if (h->all_io_entry) {
3371 		INIT_LIST_HEAD(&work);
3372 		dm_deferred_entry_dec(h->all_io_entry, &work);
3373 		if (!list_empty(&work)) {
3374 			spin_lock_irqsave(&pool->lock, flags);
3375 			list_for_each_entry_safe(m, tmp, &work, list)
3376 				list_add_tail(&m->list, &pool->prepared_discards);
3377 			spin_unlock_irqrestore(&pool->lock, flags);
3378 			wake_worker(pool);
3379 		}
3380 	}
3381 
3382 	return 0;
3383 }
3384 
3385 static void thin_presuspend(struct dm_target *ti)
3386 {
3387 	struct thin_c *tc = ti->private;
3388 
3389 	if (dm_noflush_suspending(ti))
3390 		noflush_work(tc, do_noflush_start);
3391 }
3392 
3393 static void thin_postsuspend(struct dm_target *ti)
3394 {
3395 	struct thin_c *tc = ti->private;
3396 
3397 	/*
3398 	 * The dm_noflush_suspending flag has been cleared by now, so
3399 	 * unfortunately we must always run this.
3400 	 */
3401 	noflush_work(tc, do_noflush_stop);
3402 }
3403 
3404 /*
3405  * <nr mapped sectors> <highest mapped sector>
3406  */
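/*
 * Illustrative STATUSTYPE_INFO output (figures are assumptions): with
 * 128-sector blocks, 16 mapped blocks and a highest mapped block of 16,
 * the target emits "2048 2175", i.e. 16 * 128 mapped sectors and
 * ((16 + 1) * 128) - 1 for the highest mapped sector.
 */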
3407 static void thin_status(struct dm_target *ti, status_type_t type,
3408 			unsigned status_flags, char *result, unsigned maxlen)
3409 {
3410 	int r;
3411 	ssize_t sz = 0;
3412 	dm_block_t mapped, highest;
3413 	char buf[BDEVNAME_SIZE];
3414 	struct thin_c *tc = ti->private;
3415 
3416 	if (get_pool_mode(tc->pool) == PM_FAIL) {
3417 		DMEMIT("Fail");
3418 		return;
3419 	}
3420 
3421 	if (!tc->td)
3422 		DMEMIT("-");
3423 	else {
3424 		switch (type) {
3425 		case STATUSTYPE_INFO:
3426 			r = dm_thin_get_mapped_count(tc->td, &mapped);
3427 			if (r) {
3428 				DMERR("dm_thin_get_mapped_count returned %d", r);
3429 				goto err;
3430 			}
3431 
3432 			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
3433 			if (r < 0) {
3434 				DMERR("dm_thin_get_highest_mapped_block returned %d", r);
3435 				goto err;
3436 			}
3437 
3438 			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
3439 			if (r)
3440 				DMEMIT("%llu", ((highest + 1) *
3441 						tc->pool->sectors_per_block) - 1);
3442 			else
3443 				DMEMIT("-");
3444 			break;
3445 
3446 		case STATUSTYPE_TABLE:
3447 			DMEMIT("%s %lu",
3448 			       format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
3449 			       (unsigned long) tc->dev_id);
3450 			if (tc->origin_dev)
3451 				DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
3452 			break;
3453 		}
3454 	}
3455 
3456 	return;
3457 
3458 err:
3459 	DMEMIT("Error");
3460 }
3461 
3462 static int thin_iterate_devices(struct dm_target *ti,
3463 				iterate_devices_callout_fn fn, void *data)
3464 {
3465 	sector_t blocks;
3466 	struct thin_c *tc = ti->private;
3467 	struct pool *pool = tc->pool;
3468 
3469 	/*
3470 	 * We can't call dm_pool_get_data_dev_size() since that blocks.  So
3471 	 * we follow a more convoluted path through to the pool's target.
3472 	 */
3473 	if (!pool->ti)
3474 		return 0;	/* nothing is bound */
3475 
3476 	blocks = pool->ti->len;
3477 	(void) sector_div(blocks, pool->sectors_per_block);
3478 	if (blocks)
3479 		return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
3480 
3481 	return 0;
3482 }
3483 
3484 static struct target_type thin_target = {
3485 	.name = "thin",
3486 	.version = {1, 12, 0},
3487 	.module	= THIS_MODULE,
3488 	.ctr = thin_ctr,
3489 	.dtr = thin_dtr,
3490 	.map = thin_map,
3491 	.end_io = thin_endio,
3492 	.presuspend = thin_presuspend,
3493 	.postsuspend = thin_postsuspend,
3494 	.status = thin_status,
3495 	.iterate_devices = thin_iterate_devices,
3496 };
3497 
3498 /*----------------------------------------------------------------*/
3499 
3500 static int __init dm_thin_init(void)
3501 {
3502 	int r;
3503 
3504 	pool_table_init();
3505 
3506 	r = dm_register_target(&thin_target);
3507 	if (r)
3508 		return r;
3509 
3510 	r = dm_register_target(&pool_target);
3511 	if (r)
3512 		goto bad_pool_target;
3513 
3514 	r = -ENOMEM;
3515 
3516 	_new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
3517 	if (!_new_mapping_cache)
3518 		goto bad_new_mapping_cache;
3519 
3520 	return 0;
3521 
3522 bad_new_mapping_cache:
3523 	dm_unregister_target(&pool_target);
3524 bad_pool_target:
3525 	dm_unregister_target(&thin_target);
3526 
3527 	return r;
3528 }
3529 
3530 static void dm_thin_exit(void)
3531 {
3532 	dm_unregister_target(&thin_target);
3533 	dm_unregister_target(&pool_target);
3534 
3535 	kmem_cache_destroy(_new_mapping_cache);
3536 }
3537 
3538 module_init(dm_thin_init);
3539 module_exit(dm_thin_exit);
3540 
3541 module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR);
3542 MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
3543 
3544 MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
3545 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3546 MODULE_LICENSE("GPL");
3547