xref: /openbmc/linux/drivers/md/dm.c (revision 62f75c2f3244553b1290447abd1f1e6b1144d3e9)
1 /*
2  * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3  * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4  *
5  * This file is released under the GPL.
6  */
7 
8 #include "dm.h"
9 #include "dm-bio-list.h"
10 
11 #include <linux/init.h>
12 #include <linux/module.h>
13 #include <linux/mutex.h>
14 #include <linux/moduleparam.h>
15 #include <linux/blkpg.h>
16 #include <linux/bio.h>
17 #include <linux/buffer_head.h>
18 #include <linux/mempool.h>
19 #include <linux/slab.h>
20 #include <linux/idr.h>
21 #include <linux/hdreg.h>
22 #include <linux/blktrace_api.h>
23 
24 static const char *_name = DM_NAME;
25 
26 static unsigned int major = 0;
27 static unsigned int _major = 0;
28 
29 /*
30  * One of these is allocated per bio.
31  */
32 struct dm_io {
33 	struct mapped_device *md;
34 	int error;
35 	struct bio *bio;
36 	atomic_t io_count;
37 	unsigned long start_time;
38 };
39 
40 /*
41  * One of these is allocated per target within a bio.  Hopefully
42  * this will be simplified out one day.
43  */
44 struct target_io {
45 	struct dm_io *io;
46 	struct dm_target *ti;
47 	union map_info info;
48 };
49 
50 union map_info *dm_get_mapinfo(struct bio *bio)
51 {
52 	if (bio && bio->bi_private)
53 		return &((struct target_io *)bio->bi_private)->info;
54 	return NULL;
55 }
56 
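/*
 * Placeholder stored in the minor idr while a mapped_device is being
 * set up: alloc_dev() swaps it for the real md pointer once the device
 * is fully initialised (see the idr_replace() there), and dm_put()
 * puts it back just before the device is freed.
 */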
57 #define MINOR_ALLOCED ((void *)-1)
58 
59 /*
60  * Bits for the md->flags field.
61  */
62 #define DMF_BLOCK_IO 0
63 #define DMF_SUSPENDED 1
64 #define DMF_FROZEN 2
65 
66 struct mapped_device {
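	/*
	 * io_lock serialises bio submission against suspend (writers set
	 * DMF_BLOCK_IO and manipulate 'deferred'), suspend_lock serialises
	 * suspend/resume and table swaps, and map_lock protects the 'map'
	 * pointer itself.
	 */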
67 	struct rw_semaphore io_lock;
68 	struct semaphore suspend_lock;
69 	rwlock_t map_lock;
70 	atomic_t holders;
71 
72 	unsigned long flags;
73 
74 	request_queue_t *queue;
75 	struct gendisk *disk;
76 	char name[16];
77 
78 	void *interface_ptr;
79 
80 	/*
81 	 * A list of ios that arrived while we were suspended.
82 	 */
83 	atomic_t pending;
84 	wait_queue_head_t wait;
85 	struct bio_list deferred;
86 
87 	/*
88 	 * The current mapping.
89 	 */
90 	struct dm_table *map;
91 
92 	/*
93 	 * io objects are allocated from here.
94 	 */
95 	mempool_t *io_pool;
96 	mempool_t *tio_pool;
97 
98 	/*
99 	 * Event handling.
100 	 */
101 	atomic_t event_nr;
102 	wait_queue_head_t eventq;
103 
104 	/*
105 	 * freeze/thaw support requires holding onto a super block
106 	 */
107 	struct super_block *frozen_sb;
108 	struct block_device *suspended_bdev;
109 
110 	/* forced geometry settings */
111 	struct hd_geometry geometry;
112 };
113 
114 #define MIN_IOS 256
115 static kmem_cache_t *_io_cache;
116 static kmem_cache_t *_tio_cache;
117 
118 static struct bio_set *dm_set;
119 
120 static int __init local_init(void)
121 {
122 	int r;
123 
124 	dm_set = bioset_create(16, 16, 4);
125 	if (!dm_set)
126 		return -ENOMEM;
127 
128 	/* allocate a slab for the dm_ios */
129 	_io_cache = kmem_cache_create("dm_io",
130 				      sizeof(struct dm_io), 0, 0, NULL, NULL);
131 	if (!_io_cache) {
		bioset_free(dm_set);
132 		return -ENOMEM;
	}
133 
134 	/* allocate a slab for the target ios */
135 	_tio_cache = kmem_cache_create("dm_tio", sizeof(struct target_io),
136 				       0, 0, NULL, NULL);
137 	if (!_tio_cache) {
138 		kmem_cache_destroy(_io_cache);
		bioset_free(dm_set);
139 		return -ENOMEM;
140 	}
141 
142 	_major = major;
143 	r = register_blkdev(_major, _name);
144 	if (r < 0) {
145 		kmem_cache_destroy(_tio_cache);
146 		kmem_cache_destroy(_io_cache);
		bioset_free(dm_set);
147 		return r;
148 	}
149 
150 	if (!_major)
151 		_major = r;
152 
153 	return 0;
154 }
155 
156 static void local_exit(void)
157 {
158 	kmem_cache_destroy(_tio_cache);
159 	kmem_cache_destroy(_io_cache);
160 
161 	bioset_free(dm_set);
162 
163 	if (unregister_blkdev(_major, _name) < 0)
164 		DMERR("unregister_blkdev failed");
165 
166 	_major = 0;
167 
168 	DMINFO("cleaned up");
169 }
170 
171 static int (*_inits[])(void) __initdata = {
172 	local_init,
173 	dm_target_init,
174 	dm_linear_init,
175 	dm_stripe_init,
176 	dm_interface_init,
177 };
178 
179 static void (*_exits[])(void) = {
180 	local_exit,
181 	dm_target_exit,
182 	dm_linear_exit,
183 	dm_stripe_exit,
184 	dm_interface_exit,
185 };
186 
187 static int __init dm_init(void)
188 {
189 	const int count = ARRAY_SIZE(_inits);
190 
191 	int r, i;
192 
193 	for (i = 0; i < count; i++) {
194 		r = _inits[i]();
195 		if (r)
196 			goto bad;
197 	}
198 
199 	return 0;
200 
201       bad:
202 	while (i--)
203 		_exits[i]();
204 
205 	return r;
206 }
207 
208 static void __exit dm_exit(void)
209 {
210 	int i = ARRAY_SIZE(_exits);
211 
212 	while (i--)
213 		_exits[i]();
214 }
215 
216 /*
217  * Block device functions
218  */
219 static int dm_blk_open(struct inode *inode, struct file *file)
220 {
221 	struct mapped_device *md;
222 
223 	md = inode->i_bdev->bd_disk->private_data;
224 	dm_get(md);
225 	return 0;
226 }
227 
228 static int dm_blk_close(struct inode *inode, struct file *file)
229 {
230 	struct mapped_device *md;
231 
232 	md = inode->i_bdev->bd_disk->private_data;
233 	dm_put(md);
234 	return 0;
235 }
236 
237 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
238 {
239 	struct mapped_device *md = bdev->bd_disk->private_data;
240 
241 	return dm_get_geometry(md, geo);
242 }
243 
244 static inline struct dm_io *alloc_io(struct mapped_device *md)
245 {
246 	return mempool_alloc(md->io_pool, GFP_NOIO);
247 }
248 
249 static inline void free_io(struct mapped_device *md, struct dm_io *io)
250 {
251 	mempool_free(io, md->io_pool);
252 }
253 
254 static inline struct target_io *alloc_tio(struct mapped_device *md)
255 {
256 	return mempool_alloc(md->tio_pool, GFP_NOIO);
257 }
258 
259 static inline void free_tio(struct mapped_device *md, struct target_io *tio)
260 {
261 	mempool_free(tio, md->tio_pool);
262 }
263 
264 static void start_io_acct(struct dm_io *io)
265 {
266 	struct mapped_device *md = io->md;
267 
268 	io->start_time = jiffies;
269 
270 	preempt_disable();
271 	disk_round_stats(dm_disk(md));
272 	preempt_enable();
273 	dm_disk(md)->in_flight = atomic_inc_return(&md->pending);
274 }
275 
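/*
 * Returns non-zero when the last in-flight io for this device completes,
 * so that dec_pending() can wake anyone sleeping in dm_suspend().
 */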
276 static int end_io_acct(struct dm_io *io)
277 {
278 	struct mapped_device *md = io->md;
279 	struct bio *bio = io->bio;
280 	unsigned long duration = jiffies - io->start_time;
281 	int pending;
282 	int rw = bio_data_dir(bio);
283 
284 	preempt_disable();
285 	disk_round_stats(dm_disk(md));
286 	preempt_enable();
287 	dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending);
288 
289 	disk_stat_add(dm_disk(md), ticks[rw], duration);
290 
291 	return !pending;
292 }
293 
294 /*
295  * Add the bio to the list of deferred io.
296  */
297 static int queue_io(struct mapped_device *md, struct bio *bio)
298 {
299 	down_write(&md->io_lock);
300 
301 	if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
302 		up_write(&md->io_lock);
303 		return 1;
304 	}
305 
306 	bio_list_add(&md->deferred, bio);
307 
308 	up_write(&md->io_lock);
309 	return 0;		/* deferred successfully */
310 }
311 
312 /*
313  * Everyone (including functions in this file) should use this
314  * function to access the md->map field, and make sure they call
315  * dm_table_put() when finished.
316  */
317 struct dm_table *dm_get_table(struct mapped_device *md)
318 {
319 	struct dm_table *t;
320 
321 	read_lock(&md->map_lock);
322 	t = md->map;
323 	if (t)
324 		dm_table_get(t);
325 	read_unlock(&md->map_lock);
326 
327 	return t;
328 }
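
/*
 * Typical use of the reference returned by dm_get_table(), as in
 * dm_unplug_all() below:
 *
 *	struct dm_table *map = dm_get_table(md);
 *	if (map) {
 *		... use the table ...
 *		dm_table_put(map);
 *	}
 */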
329 
330 /*
331  * Get the geometry associated with a dm device
332  */
333 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
334 {
335 	*geo = md->geometry;
336 
337 	return 0;
338 }
339 
340 /*
341  * Set the geometry of a device.
342  */
343 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
344 {
345 	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
346 
347 	if (geo->start > sz) {
348 		DMWARN("Start sector is beyond the geometry limits.");
349 		return -EINVAL;
350 	}
351 
352 	md->geometry = *geo;
353 
354 	return 0;
355 }
356 
357 /*-----------------------------------------------------------------
358  * CRUD START:
359  *   A more elegant solution is in the works that uses the queue
360  *   merge fn, unfortunately there are a couple of changes to
361  *   the block layer that I want to make for this.  So in the
362  *   interests of getting something for people to use I give
363  *   you this clearly demarcated crap.
364  *---------------------------------------------------------------*/
365 
366 /*
367  * Decrements the number of outstanding ios that a bio has been
368  * cloned into, completing the original io if necessary.
369  */
370 static void dec_pending(struct dm_io *io, int error)
371 {
372 	if (error)
373 		io->error = error;
374 
375 	if (atomic_dec_and_test(&io->io_count)) {
376 		if (end_io_acct(io))
377 			/* nudge anyone waiting on suspend queue */
378 			wake_up(&io->md->wait);
379 
380 		blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);
381 
382 		bio_endio(io->bio, io->bio->bi_size, io->error);
383 		free_io(io->md, io);
384 	}
385 }
386 
387 static int clone_endio(struct bio *bio, unsigned int done, int error)
388 {
389 	int r = 0;
390 	struct target_io *tio = bio->bi_private;
391 	struct dm_io *io = tio->io;
392 	dm_endio_fn endio = tio->ti->type->end_io;
393 
394 	if (bio->bi_size)
395 		return 1;
396 
397 	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
398 		error = -EIO;
399 
400 	if (endio) {
401 		r = endio(tio->ti, bio, error, &tio->info);
402 		if (r < 0)
403 			error = r;
404 
405 		else if (r > 0)
406 			/* the target wants another shot at the io */
407 			return 1;
408 	}
409 
410 	free_tio(io->md, tio);
411 	dec_pending(io, error);
412 	bio_put(bio);
413 	return r;
414 }
415 
416 static sector_t max_io_len(struct mapped_device *md,
417 			   sector_t sector, struct dm_target *ti)
418 {
419 	sector_t offset = sector - ti->begin;
420 	sector_t len = ti->len - offset;
421 
422 	/*
423 	 * Does the target need to split even further?
424 	 */
425 	if (ti->split_io) {
426 		sector_t boundary;
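		/*
		 * Round 'offset' up to the next split_io boundary and clamp
		 * 'len' so the io never crosses it; the mask arithmetic here
		 * assumes ti->split_io is a power of two.
		 */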
427 		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
428 			   - offset;
429 		if (len > boundary)
430 			len = boundary;
431 	}
432 
433 	return len;
434 }
435 
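/*
 * Hand a clone over to its target's map function.  The target returns
 * > 0 if it remapped the clone (dispatch it with generic_make_request),
 * 0 if it has taken ownership of the io, or < 0 on error.
 */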
436 static void __map_bio(struct dm_target *ti, struct bio *clone,
437 		      struct target_io *tio)
438 {
439 	int r;
440 	sector_t sector;
441 
442 	/*
443 	 * Sanity checks.
444 	 */
445 	BUG_ON(!clone->bi_size);
446 
447 	clone->bi_end_io = clone_endio;
448 	clone->bi_private = tio;
449 
450 	/*
451 	 * Map the clone.  If r == 0 we don't need to do
452 	 * anything, the target has assumed ownership of
453 	 * this io.
454 	 */
455 	atomic_inc(&tio->io->io_count);
456 	sector = clone->bi_sector;
457 	r = ti->type->map(ti, clone, &tio->info);
458 	if (r > 0) {
459 		/* the bio has been remapped so dispatch it */
460 
461 		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
462 				    tio->io->bio->bi_bdev->bd_dev, sector,
463 				    clone->bi_sector);
464 
465 		generic_make_request(clone);
466 	}
467 
468 	else if (r < 0) {
469 		/* error the io and bail out */
470 		struct dm_io *io = tio->io;
471 		free_tio(tio->io->md, tio);
472 		dec_pending(io, r);
473 		bio_put(clone);
474 	}
475 }
476 
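/*
 * Per-bio splitting state: 'sector' and 'sector_count' track how much of
 * the original bio is still to be mapped, and 'idx' is the next bvec to
 * clone.
 */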
477 struct clone_info {
478 	struct mapped_device *md;
479 	struct dm_table *map;
480 	struct bio *bio;
481 	struct dm_io *io;
482 	sector_t sector;
483 	sector_t sector_count;
484 	unsigned short idx;
485 };
486 
487 static void dm_bio_destructor(struct bio *bio)
488 {
489 	bio_free(bio, dm_set);
490 }
491 
492 /*
493  * Creates a little bio that just does part of a bvec.
494  */
495 static struct bio *split_bvec(struct bio *bio, sector_t sector,
496 			      unsigned short idx, unsigned int offset,
497 			      unsigned int len)
498 {
499 	struct bio *clone;
500 	struct bio_vec *bv = bio->bi_io_vec + idx;
501 
502 	clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set);
503 	clone->bi_destructor = dm_bio_destructor;
504 	*clone->bi_io_vec = *bv;
505 
506 	clone->bi_sector = sector;
507 	clone->bi_bdev = bio->bi_bdev;
508 	clone->bi_rw = bio->bi_rw;
509 	clone->bi_vcnt = 1;
510 	clone->bi_size = to_bytes(len);
511 	clone->bi_io_vec->bv_offset = offset;
512 	clone->bi_io_vec->bv_len = clone->bi_size;
513 
514 	return clone;
515 }
516 
517 /*
518  * Creates a bio that consists of a range of complete bvecs.
519  */
520 static struct bio *clone_bio(struct bio *bio, sector_t sector,
521 			     unsigned short idx, unsigned short bv_count,
522 			     unsigned int len)
523 {
524 	struct bio *clone;
525 
526 	clone = bio_clone(bio, GFP_NOIO);
527 	clone->bi_sector = sector;
528 	clone->bi_idx = idx;
529 	clone->bi_vcnt = idx + bv_count;
530 	clone->bi_size = to_bytes(len);
531 	clone->bi_flags &= ~(1 << BIO_SEG_VALID);
532 
533 	return clone;
534 }
535 
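/*
 * Map the chunk of the bio starting at ci->sector.  Three cases: the rest
 * of the bio fits within one target, a run of whole bvecs fits, or a
 * single bvec has to be split across targets with split_bvec().
 */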
536 static void __clone_and_map(struct clone_info *ci)
537 {
538 	struct bio *clone, *bio = ci->bio;
539 	struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
540 	sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
541 	struct target_io *tio;
542 
543 	/*
544 	 * Allocate a target io object.
545 	 */
546 	tio = alloc_tio(ci->md);
547 	tio->io = ci->io;
548 	tio->ti = ti;
549 	memset(&tio->info, 0, sizeof(tio->info));
550 
551 	if (ci->sector_count <= max) {
552 		/*
553 		 * Optimise for the simple case where we can do all of
554 		 * the remaining io with a single clone.
555 		 */
556 		clone = clone_bio(bio, ci->sector, ci->idx,
557 				  bio->bi_vcnt - ci->idx, ci->sector_count);
558 		__map_bio(ti, clone, tio);
559 		ci->sector_count = 0;
560 
561 	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
562 		/*
563 		 * There are some bvecs that don't span targets.
564 		 * Do as many of these as possible.
565 		 */
566 		int i;
567 		sector_t remaining = max;
568 		sector_t bv_len;
569 
570 		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
571 			bv_len = to_sector(bio->bi_io_vec[i].bv_len);
572 
573 			if (bv_len > remaining)
574 				break;
575 
576 			remaining -= bv_len;
577 			len += bv_len;
578 		}
579 
580 		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
581 		__map_bio(ti, clone, tio);
582 
583 		ci->sector += len;
584 		ci->sector_count -= len;
585 		ci->idx = i;
586 
587 	} else {
588 		/*
589 		 * Handle a bvec that must be split between two or more targets.
590 		 */
591 		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
592 		sector_t remaining = to_sector(bv->bv_len);
593 		unsigned int offset = 0;
594 
595 		do {
596 			if (offset) {
597 				ti = dm_table_find_target(ci->map, ci->sector);
598 				max = max_io_len(ci->md, ci->sector, ti);
599 
600 				tio = alloc_tio(ci->md);
601 				tio->io = ci->io;
602 				tio->ti = ti;
603 				memset(&tio->info, 0, sizeof(tio->info));
604 			}
605 
606 			len = min(remaining, max);
607 
608 			clone = split_bvec(bio, ci->sector, ci->idx,
609 					   bv->bv_offset + offset, len);
610 
611 			__map_bio(ti, clone, tio);
612 
613 			ci->sector += len;
614 			ci->sector_count -= len;
615 			offset += to_bytes(len);
616 		} while (remaining -= len);
617 
618 		ci->idx++;
619 	}
620 }
621 
622 /*
623  * Split the bio into several clones.
624  */
625 static void __split_bio(struct mapped_device *md, struct bio *bio)
626 {
627 	struct clone_info ci;
628 
629 	ci.map = dm_get_table(md);
630 	if (!ci.map) {
631 		bio_io_error(bio, bio->bi_size);
632 		return;
633 	}
634 
635 	ci.md = md;
636 	ci.bio = bio;
637 	ci.io = alloc_io(md);
638 	ci.io->error = 0;
639 	atomic_set(&ci.io->io_count, 1);
640 	ci.io->bio = bio;
641 	ci.io->md = md;
642 	ci.sector = bio->bi_sector;
643 	ci.sector_count = bio_sectors(bio);
644 	ci.idx = bio->bi_idx;
645 
646 	start_io_acct(ci.io);
647 	while (ci.sector_count)
648 		__clone_and_map(&ci);
649 
650 	/* drop the extra reference count */
651 	dec_pending(ci.io, 0);
652 	dm_table_put(ci.map);
653 }
654 /*-----------------------------------------------------------------
655  * CRUD END
656  *---------------------------------------------------------------*/
657 
658 /*
659  * The request function that just remaps the bio built up by
660  * dm_merge_bvec.
661  */
662 static int dm_request(request_queue_t *q, struct bio *bio)
663 {
664 	int r;
665 	int rw = bio_data_dir(bio);
666 	struct mapped_device *md = q->queuedata;
667 
668 	down_read(&md->io_lock);
669 
670 	disk_stat_inc(dm_disk(md), ios[rw]);
671 	disk_stat_add(dm_disk(md), sectors[rw], bio_sectors(bio));
672 
673 	/*
674 	 * If we're suspended we have to queue
675 	 * this io for later.
676 	 */
677 	while (test_bit(DMF_BLOCK_IO, &md->flags)) {
678 		up_read(&md->io_lock);
679 
680 		if (bio_rw(bio) == READA) {
681 			bio_io_error(bio, bio->bi_size);
682 			return 0;
683 		}
684 
685 		r = queue_io(md, bio);
686 		if (r < 0) {
687 			bio_io_error(bio, bio->bi_size);
688 			return 0;
689 
690 		} else if (r == 0)
691 			return 0;	/* deferred successfully */
692 
693 		/*
694 		 * We're in a while loop, because someone could suspend
695 		 * before we get to the following read lock.
696 		 */
697 		down_read(&md->io_lock);
698 	}
699 
700 	__split_bio(md, bio);
701 	up_read(&md->io_lock);
702 	return 0;
703 }
704 
705 static int dm_flush_all(request_queue_t *q, struct gendisk *disk,
706 			sector_t *error_sector)
707 {
708 	struct mapped_device *md = q->queuedata;
709 	struct dm_table *map = dm_get_table(md);
710 	int ret = -ENXIO;
711 
712 	if (map) {
713 		ret = dm_table_flush_all(map);
714 		dm_table_put(map);
715 	}
716 
717 	return ret;
718 }
719 
720 static void dm_unplug_all(request_queue_t *q)
721 {
722 	struct mapped_device *md = q->queuedata;
723 	struct dm_table *map = dm_get_table(md);
724 
725 	if (map) {
726 		dm_table_unplug_all(map);
727 		dm_table_put(map);
728 	}
729 }
730 
731 static int dm_any_congested(void *congested_data, int bdi_bits)
732 {
733 	int r;
734 	struct mapped_device *md = (struct mapped_device *) congested_data;
735 	struct dm_table *map = dm_get_table(md);
736 
737 	if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
738 		r = bdi_bits;
739 	else
740 		r = dm_table_any_congested(map, bdi_bits);
741 
742 	dm_table_put(map);
743 	return r;
744 }
745 
746 /*-----------------------------------------------------------------
747  * An IDR is used to keep track of allocated minor numbers.
748  *---------------------------------------------------------------*/
749 static DEFINE_MUTEX(_minor_lock);
750 static DEFINE_IDR(_minor_idr);
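
/*
 * Minor numbers use the two-step idr API: idr_pre_get() preallocates with
 * GFP_KERNEL outside _minor_lock (it returns 0 on failure), then
 * idr_get_new()/idr_get_new_above() insert MINOR_ALLOCED under the lock.
 */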
751 
752 static void free_minor(unsigned int minor)
753 {
754 	mutex_lock(&_minor_lock);
755 	idr_remove(&_minor_idr, minor);
756 	mutex_unlock(&_minor_lock);
757 }
758 
759 /*
760  * See if the device with a specific minor # is free.
761  */
762 static int specific_minor(struct mapped_device *md, unsigned int minor)
763 {
764 	int r, m;
765 
766 	if (minor >= (1 << MINORBITS))
767 		return -EINVAL;
768 
769 	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
770 	if (!r)
771 		return -ENOMEM;
772 
773 	mutex_lock(&_minor_lock);
774 
775 	if (idr_find(&_minor_idr, minor)) {
776 		r = -EBUSY;
777 		goto out;
778 	}
779 
780 	r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
781 	if (r)
782 		goto out;
783 
784 	if (m != minor) {
785 		idr_remove(&_minor_idr, m);
786 		r = -EBUSY;
787 		goto out;
788 	}
789 
790 out:
791 	mutex_unlock(&_minor_lock);
792 	return r;
793 }
794 
795 static int next_free_minor(struct mapped_device *md, unsigned int *minor)
796 {
797 	int r;
798 	unsigned int m;
799 
800 	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
801 	if (!r)
802 		return -ENOMEM;
803 
804 	mutex_lock(&_minor_lock);
805 
806 	r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
807 	if (r)
808 		goto out;
810 
811 	if (m >= (1 << MINORBITS)) {
812 		idr_remove(&_minor_idr, m);
813 		r = -ENOSPC;
814 		goto out;
815 	}
816 
817 	*minor = m;
818 
819 out:
820 	mutex_unlock(&_minor_lock);
821 	return r;
822 }
823 
824 static struct block_device_operations dm_blk_dops;
825 
826 /*
827  * Allocate and initialise a blank device with a given minor.
828  */
829 static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
830 {
831 	int r;
832 	struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
833 	void *old_md;
834 
835 	if (!md) {
836 		DMWARN("unable to allocate device, out of memory.");
837 		return NULL;
838 	}
839 
840 	/* get a minor number for the dev */
841 	r = persistent ? specific_minor(md, minor) : next_free_minor(md, &minor);
842 	if (r < 0)
843 		goto bad1;
844 
845 	memset(md, 0, sizeof(*md));
846 	init_rwsem(&md->io_lock);
847 	init_MUTEX(&md->suspend_lock);
848 	rwlock_init(&md->map_lock);
849 	atomic_set(&md->holders, 1);
850 	atomic_set(&md->event_nr, 0);
851 
852 	md->queue = blk_alloc_queue(GFP_KERNEL);
853 	if (!md->queue)
854 		goto bad1_free_minor;
855 
856 	md->queue->queuedata = md;
857 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
858 	md->queue->backing_dev_info.congested_data = md;
859 	blk_queue_make_request(md->queue, dm_request);
860 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
861 	md->queue->unplug_fn = dm_unplug_all;
862 	md->queue->issue_flush_fn = dm_flush_all;
863 
864 	md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
865 	if (!md->io_pool)
866 		goto bad2;
867 
868 	md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
869 	if (!md->tio_pool)
870 		goto bad3;
871 
872 	md->disk = alloc_disk(1);
873 	if (!md->disk)
874 		goto bad4;
875 
876 	md->disk->major = _major;
877 	md->disk->first_minor = minor;
878 	md->disk->fops = &dm_blk_dops;
879 	md->disk->queue = md->queue;
880 	md->disk->private_data = md;
881 	sprintf(md->disk->disk_name, "dm-%d", minor);
882 	add_disk(md->disk);
883 	format_dev_t(md->name, MKDEV(_major, minor));
884 
885 	atomic_set(&md->pending, 0);
886 	init_waitqueue_head(&md->wait);
887 	init_waitqueue_head(&md->eventq);
888 
889 	/* Populate the mapping, nobody knows we exist yet */
890 	mutex_lock(&_minor_lock);
891 	old_md = idr_replace(&_minor_idr, md, minor);
892 	mutex_unlock(&_minor_lock);
893 
894 	BUG_ON(old_md != MINOR_ALLOCED);
895 
896 	return md;
897 
898  bad4:
899 	mempool_destroy(md->tio_pool);
900  bad3:
901 	mempool_destroy(md->io_pool);
902  bad2:
903 	blk_cleanup_queue(md->queue);
 bad1_free_minor:
904 	free_minor(minor);
905  bad1:
906 	kfree(md);
907 	return NULL;
908 }
909 
910 static void free_dev(struct mapped_device *md)
911 {
912 	unsigned int minor = md->disk->first_minor;
913 
914 	if (md->suspended_bdev) {
915 		thaw_bdev(md->suspended_bdev, NULL);
916 		bdput(md->suspended_bdev);
917 	}
918 	mempool_destroy(md->tio_pool);
919 	mempool_destroy(md->io_pool);
920 	del_gendisk(md->disk);
921 	free_minor(minor);
922 	put_disk(md->disk);
923 	blk_cleanup_queue(md->queue);
924 	kfree(md);
925 }
926 
927 /*
928  * Bind a table to the device.
929  */
930 static void event_callback(void *context)
931 {
932 	struct mapped_device *md = (struct mapped_device *) context;
933 
934 	atomic_inc(&md->event_nr);
935 	wake_up(&md->eventq);
936 }
937 
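/*
 * Keep the gendisk capacity and the suspended bdev's inode size in sync
 * with the new table size; the inode is updated under its i_mutex.
 */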
938 static void __set_size(struct mapped_device *md, sector_t size)
939 {
940 	set_capacity(md->disk, size);
941 
942 	mutex_lock(&md->suspended_bdev->bd_inode->i_mutex);
943 	i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
944 	mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex);
945 }
946 
947 static int __bind(struct mapped_device *md, struct dm_table *t)
948 {
949 	request_queue_t *q = md->queue;
950 	sector_t size;
951 
952 	size = dm_table_get_size(t);
953 
954 	/*
955 	 * Wipe any geometry if the size of the table changed.
956 	 */
957 	if (size != get_capacity(md->disk))
958 		memset(&md->geometry, 0, sizeof(md->geometry));
959 
960 	__set_size(md, size);
961 	if (size == 0)
962 		return 0;
963 
964 	dm_table_get(t);
965 	dm_table_event_callback(t, event_callback, md);
966 
967 	write_lock(&md->map_lock);
968 	md->map = t;
969 	dm_table_set_restrictions(t, q);
970 	write_unlock(&md->map_lock);
971 
972 	return 0;
973 }
974 
975 static void __unbind(struct mapped_device *md)
976 {
977 	struct dm_table *map = md->map;
978 
979 	if (!map)
980 		return;
981 
982 	dm_table_event_callback(map, NULL, NULL);
983 	write_lock(&md->map_lock);
984 	md->map = NULL;
985 	write_unlock(&md->map_lock);
986 	dm_table_put(map);
987 }
988 
989 /*
990  * Constructor for a new device.
991  */
992 static int create_aux(unsigned int minor, int persistent,
993 		      struct mapped_device **result)
994 {
995 	struct mapped_device *md;
996 
997 	md = alloc_dev(minor, persistent);
998 	if (!md)
999 		return -ENXIO;
1000 
1001 	*result = md;
1002 	return 0;
1003 }
1004 
1005 int dm_create(struct mapped_device **result)
1006 {
1007 	return create_aux(0, 0, result);
1008 }
1009 
1010 int dm_create_with_minor(unsigned int minor, struct mapped_device **result)
1011 {
1012 	return create_aux(minor, 1, result);
1013 }
1014 
1015 static struct mapped_device *dm_find_md(dev_t dev)
1016 {
1017 	struct mapped_device *md;
1018 	unsigned minor = MINOR(dev);
1019 
1020 	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
1021 		return NULL;
1022 
1023 	mutex_lock(&_minor_lock);
1024 
1025 	md = idr_find(&_minor_idr, minor);
1026 	if (md && (md == MINOR_ALLOCED || (dm_disk(md)->first_minor != minor)))
1027 		md = NULL;
1028 
1029 	mutex_unlock(&_minor_lock);
1030 
1031 	return md;
1032 }
1033 
1034 struct mapped_device *dm_get_md(dev_t dev)
1035 {
1036 	struct mapped_device *md = dm_find_md(dev);
1037 
1038 	if (md)
1039 		dm_get(md);
1040 
1041 	return md;
1042 }
1043 
1044 void *dm_get_mdptr(struct mapped_device *md)
1045 {
1046 	return md->interface_ptr;
1047 }
1048 
1049 void dm_set_mdptr(struct mapped_device *md, void *ptr)
1050 {
1051 	md->interface_ptr = ptr;
1052 }
1053 
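/*
 * Reference counting: dm_blk_open() and dm_get_md() take references with
 * dm_get(); the final dm_put() drops the minor back to MINOR_ALLOCED,
 * runs the targets' suspend hooks if the device is not already suspended,
 * unbinds the table and frees the device.
 */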
1054 void dm_get(struct mapped_device *md)
1055 {
1056 	atomic_inc(&md->holders);
1057 }
1058 
1059 void dm_put(struct mapped_device *md)
1060 {
1061 	struct dm_table *map;
1062 
1063 	if (atomic_dec_and_test(&md->holders)) {
1064 		map = dm_get_table(md);
1065 		mutex_lock(&_minor_lock);
1066 		idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor);
1067 		mutex_unlock(&_minor_lock);
1068 		if (!dm_suspended(md)) {
1069 			dm_table_presuspend_targets(map);
1070 			dm_table_postsuspend_targets(map);
1071 		}
1072 		__unbind(md);
1073 		dm_table_put(map);
1074 		free_dev(md);
1075 	}
1076 }
1077 
1078 /*
1079  * Process the deferred bios
1080  */
1081 static void __flush_deferred_io(struct mapped_device *md, struct bio *c)
1082 {
1083 	struct bio *n;
1084 
1085 	while (c) {
1086 		n = c->bi_next;
1087 		c->bi_next = NULL;
1088 		__split_bio(md, c);
1089 		c = n;
1090 	}
1091 }
1092 
1093 /*
1094  * Swap in a new table (destroying old one).
1095  */
1096 int dm_swap_table(struct mapped_device *md, struct dm_table *table)
1097 {
1098 	int r = -EINVAL;
1099 
1100 	down(&md->suspend_lock);
1101 
1102 	/* device must be suspended */
1103 	if (!dm_suspended(md))
1104 		goto out;
1105 
1106 	__unbind(md);
1107 	r = __bind(md, table);
1108 
1109 out:
1110 	up(&md->suspend_lock);
1111 	return r;
1112 }
1113 
1114 /*
1115  * Functions to lock and unlock any filesystem running on the
1116  * device.
1117  */
1118 static int lock_fs(struct mapped_device *md)
1119 {
1120 	int r;
1121 
1122 	WARN_ON(md->frozen_sb);
1123 
1124 	md->frozen_sb = freeze_bdev(md->suspended_bdev);
1125 	if (IS_ERR(md->frozen_sb)) {
1126 		r = PTR_ERR(md->frozen_sb);
1127 		md->frozen_sb = NULL;
1128 		return r;
1129 	}
1130 
1131 	set_bit(DMF_FROZEN, &md->flags);
1132 
1133 	/* don't bdput right now; we don't want the bdev
1134 	 * to go away while it is locked.
1135 	 */
1136 	return 0;
1137 }
1138 
1139 static void unlock_fs(struct mapped_device *md)
1140 {
1141 	if (!test_bit(DMF_FROZEN, &md->flags))
1142 		return;
1143 
1144 	thaw_bdev(md->suspended_bdev, md->frozen_sb);
1145 	md->frozen_sb = NULL;
1146 	clear_bit(DMF_FROZEN, &md->flags);
1147 }
1148 
1149 /*
1150  * We need to be able to change a mapping table under a mounted
1151  * filesystem.  For example we might want to move some data in
1152  * the background.  Before the table can be swapped with
1153  * dm_swap_table, dm_suspend must be called to flush any in
1154  * flight bios and ensure that any further io gets deferred.
1155  */
1156 int dm_suspend(struct mapped_device *md, int do_lockfs)
1157 {
1158 	struct dm_table *map = NULL;
1159 	DECLARE_WAITQUEUE(wait, current);
1160 	struct bio *def;
1161 	int r = -EINVAL;
1162 
1163 	down(&md->suspend_lock);
1164 
1165 	if (dm_suspended(md))
1166 		goto out;
1167 
1168 	map = dm_get_table(md);
1169 
1170 	/* This does not get reverted if there's an error later. */
1171 	dm_table_presuspend_targets(map);
1172 
1173 	md->suspended_bdev = bdget_disk(md->disk, 0);
1174 	if (!md->suspended_bdev) {
1175 		DMWARN("bdget failed in dm_suspend");
1176 		r = -ENOMEM;
1177 		goto out;
1178 	}
1179 
1180 	/* Flush I/O to the device. */
1181 	if (do_lockfs) {
1182 		r = lock_fs(md);
1183 		if (r)
1184 			goto out;
1185 	}
1186 
1187 	/*
1188 	 * First we set the BLOCK_IO flag so no more ios will be mapped.
1189 	 */
1190 	down_write(&md->io_lock);
1191 	set_bit(DMF_BLOCK_IO, &md->flags);
1192 
1193 	add_wait_queue(&md->wait, &wait);
1194 	up_write(&md->io_lock);
1195 
1196 	/* unplug */
1197 	if (map)
1198 		dm_table_unplug_all(map);
1199 
1200 	/*
1201 	 * Then we wait for the already mapped ios to
1202 	 * complete.
1203 	 */
1204 	while (1) {
1205 		set_current_state(TASK_INTERRUPTIBLE);
1206 
1207 		if (!atomic_read(&md->pending) || signal_pending(current))
1208 			break;
1209 
1210 		io_schedule();
1211 	}
1212 	set_current_state(TASK_RUNNING);
1213 
1214 	down_write(&md->io_lock);
1215 	remove_wait_queue(&md->wait, &wait);
1216 
1217 	/* were we interrupted? */
1218 	r = -EINTR;
1219 	if (atomic_read(&md->pending)) {
1220 		clear_bit(DMF_BLOCK_IO, &md->flags);
1221 		def = bio_list_get(&md->deferred);
1222 		__flush_deferred_io(md, def);
1223 		up_write(&md->io_lock);
1224 		unlock_fs(md);
1225 		goto out;
1226 	}
1227 	up_write(&md->io_lock);
1228 
1229 	dm_table_postsuspend_targets(map);
1230 
1231 	set_bit(DMF_SUSPENDED, &md->flags);
1232 
1233 	r = 0;
1234 
1235 out:
1236 	if (r && md->suspended_bdev) {
1237 		bdput(md->suspended_bdev);
1238 		md->suspended_bdev = NULL;
1239 	}
1240 
1241 	dm_table_put(map);
1242 	up(&md->suspend_lock);
1243 	return r;
1244 }
1245 
1246 int dm_resume(struct mapped_device *md)
1247 {
1248 	int r = -EINVAL;
1249 	struct bio *def;
1250 	struct dm_table *map = NULL;
1251 
1252 	down(&md->suspend_lock);
1253 	if (!dm_suspended(md))
1254 		goto out;
1255 
1256 	map = dm_get_table(md);
1257 	if (!map || !dm_table_get_size(map))
1258 		goto out;
1259 
1260 	dm_table_resume_targets(map);
1261 
1262 	down_write(&md->io_lock);
1263 	clear_bit(DMF_BLOCK_IO, &md->flags);
1264 
1265 	def = bio_list_get(&md->deferred);
1266 	__flush_deferred_io(md, def);
1267 	up_write(&md->io_lock);
1268 
1269 	unlock_fs(md);
1270 
1271 	bdput(md->suspended_bdev);
1272 	md->suspended_bdev = NULL;
1273 
1274 	clear_bit(DMF_SUSPENDED, &md->flags);
1275 
1276 	dm_table_unplug_all(map);
1277 
1278 	r = 0;
1279 
1280 out:
1281 	dm_table_put(map);
1282 	up(&md->suspend_lock);
1283 
1284 	return r;
1285 }
1286 
1287 /*-----------------------------------------------------------------
1288  * Event notification.
1289  *---------------------------------------------------------------*/
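/*
 * event_callback() (wired up in __bind()) bumps event_nr and wakes eventq
 * whenever the table reports an event.  A typical caller does:
 *
 *	uint32_t ev = dm_get_event_nr(md);
 *	...
 *	dm_wait_event(md, ev);
 *
 * and is woken once the counter moves past the sampled value.
 */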
1290 uint32_t dm_get_event_nr(struct mapped_device *md)
1291 {
1292 	return atomic_read(&md->event_nr);
1293 }
1294 
1295 int dm_wait_event(struct mapped_device *md, int event_nr)
1296 {
1297 	return wait_event_interruptible(md->eventq,
1298 			(event_nr != atomic_read(&md->event_nr)));
1299 }
1300 
1301 /*
1302  * The gendisk is only valid as long as you have a reference
1303  * count on 'md'.
1304  */
1305 struct gendisk *dm_disk(struct mapped_device *md)
1306 {
1307 	return md->disk;
1308 }
1309 
1310 int dm_suspended(struct mapped_device *md)
1311 {
1312 	return test_bit(DMF_SUSPENDED, &md->flags);
1313 }
1314 
1315 static struct block_device_operations dm_blk_dops = {
1316 	.open = dm_blk_open,
1317 	.release = dm_blk_close,
1318 	.getgeo = dm_blk_getgeo,
1319 	.owner = THIS_MODULE
1320 };
1321 
1322 EXPORT_SYMBOL(dm_get_mapinfo);
1323 
1324 /*
1325  * module hooks
1326  */
1327 module_init(dm_init);
1328 module_exit(dm_exit);
1329 
1330 module_param(major, uint, 0);
1331 MODULE_PARM_DESC(major, "The major number of the device mapper");
1332 MODULE_DESCRIPTION(DM_NAME " driver");
1333 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
1334 MODULE_LICENSE("GPL");
1335