/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-list.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/blktrace_api.h>
#include <linux/smp_lock.h>

#define DM_MSG_PREFIX "core"

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_SPINLOCK(_minor_lock);
/*
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;
	int error;
	struct bio *bio;
	atomic_t io_count;
	unsigned long start_time;
};

/*
 * One of these is allocated per target within a bio. Hopefully
 * this will be simplified out one day.
 */
struct target_io {
	struct dm_io *io;
	struct dm_target *ti;
	union map_info info;
};

union map_info *dm_get_mapinfo(struct bio *bio)
{
	if (bio && bio->bi_private)
		return &((struct target_io *)bio->bi_private)->info;
	return NULL;
}

#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4

struct mapped_device {
	struct rw_semaphore io_lock;
	struct semaphore suspend_lock;
	rwlock_t map_lock;
	atomic_t holders;
	atomic_t open_count;

	unsigned long flags;

	request_queue_t *queue;
	struct gendisk *disk;
	char name[16];

	void *interface_ptr;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending;
	wait_queue_head_t wait;
	struct bio_list deferred;

	/*
	 * The current mapping.
	 */
	struct dm_table *map;

	/*
	 * io objects are allocated from here.
	 */
	mempool_t *io_pool;
	mempool_t *tio_pool;

	struct bio_set *bs;

	/*
	 * Event handling.
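	 * (event_nr is bumped and eventq is woken whenever the bound
	 * table reports an event; see event_callback() below.)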
	 */
	atomic_t event_nr;
	wait_queue_head_t eventq;

	/*
	 * freeze/thaw support requires holding onto a super block
	 */
	struct super_block *frozen_sb;
	struct block_device *suspended_bdev;

	/* forced geometry settings */
	struct hd_geometry geometry;
};

#define MIN_IOS 256
static struct kmem_cache *_io_cache;
static struct kmem_cache *_tio_cache;

static int __init local_init(void)
{
	int r;

	/* allocate a slab for the dm_ios */
	_io_cache = kmem_cache_create("dm_io",
				      sizeof(struct dm_io), 0, 0, NULL, NULL);
	if (!_io_cache)
		return -ENOMEM;

	/* allocate a slab for the target ios */
	_tio_cache = kmem_cache_create("dm_tio", sizeof(struct target_io),
				       0, 0, NULL, NULL);
	if (!_tio_cache) {
		kmem_cache_destroy(_io_cache);
		return -ENOMEM;
	}

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0) {
		kmem_cache_destroy(_tio_cache);
		kmem_cache_destroy(_io_cache);
		return r;
	}

	if (!_major)
		_major = r;

	return 0;
}

static void local_exit(void)
{
	kmem_cache_destroy(_tio_cache);
	kmem_cache_destroy(_io_cache);

	if (unregister_blkdev(_major, _name) < 0)
		DMERR("unregister_blkdev failed");

	_major = 0;

	DMINFO("cleaned up");
}

int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_interface_init,
};

void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_interface_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);

	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;

bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();
}

/*
 * Block device functions
 */
static int dm_blk_open(struct inode *inode, struct file *file)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = inode->i_bdev->bd_disk->private_data;
	if (!md)
		goto out;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    test_bit(DMF_DELETING, &md->flags)) {
		md = NULL;
		goto out;
	}

	dm_get(md);
	atomic_inc(&md->open_count);

out:
	spin_unlock(&_minor_lock);

	return md ? 0 : -ENXIO;
}

static int dm_blk_close(struct inode *inode, struct file *file)
{
	struct mapped_device *md;

	md = inode->i_bdev->bd_disk->private_data;
	atomic_dec(&md->open_count);
	dm_put(md);
	return 0;
}

int dm_open_count(struct mapped_device *md)
{
	return atomic_read(&md->open_count);
}

/*
 * Guarantees nothing is using the device before it's deleted.
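 * Callers (the device removal ioctl path, for instance) get -EBUSY back
 * if the device is still open; otherwise DMF_DELETING is set so that
 * subsequent opens fail.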
 */
int dm_lock_for_deletion(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md))
		r = -EBUSY;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mapped_device *md = bdev->bd_disk->private_data;

	return dm_get_geometry(md, geo);
}

static int dm_blk_ioctl(struct inode *inode, struct file *file,
			unsigned int cmd, unsigned long arg)
{
	struct mapped_device *md;
	struct dm_table *map;
	struct dm_target *tgt;
	int r = -ENOTTY;

	/* We don't really need this lock, but we do need 'inode'. */
	unlock_kernel();

	md = inode->i_bdev->bd_disk->private_data;

	map = dm_get_table(md);

	if (!map || !dm_table_get_size(map))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(map) != 1)
		goto out;

	tgt = dm_table_get_target(map, 0);

	if (dm_suspended(md)) {
		r = -EAGAIN;
		goto out;
	}

	if (tgt->type->ioctl)
		r = tgt->type->ioctl(tgt, inode, file, cmd, arg);

out:
	dm_table_put(map);

	lock_kernel();
	return r;
}

static inline struct dm_io *alloc_io(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_NOIO);
}

static inline void free_io(struct mapped_device *md, struct dm_io *io)
{
	mempool_free(io, md->io_pool);
}

static inline struct target_io *alloc_tio(struct mapped_device *md)
{
	return mempool_alloc(md->tio_pool, GFP_NOIO);
}

static inline void free_tio(struct mapped_device *md, struct target_io *tio)
{
	mempool_free(tio, md->tio_pool);
}

static void start_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;

	io->start_time = jiffies;

	preempt_disable();
	disk_round_stats(dm_disk(md));
	preempt_enable();
	dm_disk(md)->in_flight = atomic_inc_return(&md->pending);
}

static int end_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	unsigned long duration = jiffies - io->start_time;
	int pending;
	int rw = bio_data_dir(bio);

	preempt_disable();
	disk_round_stats(dm_disk(md));
	preempt_enable();
	dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending);

	disk_stat_add(dm_disk(md), ticks[rw], duration);

	return !pending;
}

/*
 * Add the bio to the list of deferred io.
 */
static int queue_io(struct mapped_device *md, struct bio *bio)
{
	down_write(&md->io_lock);

	if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
		up_write(&md->io_lock);
		return 1;
	}

	bio_list_add(&md->deferred, bio);

	up_write(&md->io_lock);
	return 0;		/* deferred successfully */
}

/*
 * Everyone (including functions in this file) should use this
 * function to access the md->map field, and make sure they call
 * dm_table_put() when finished.
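 *
 * A minimal sketch of the expected calling pattern (the caller-side
 * code below is illustrative only):
 *
 *	struct dm_table *map = dm_get_table(md);
 *	if (map) {
 *		... inspect or use the table ...
 *		dm_table_put(map);
 *	}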
 */
struct dm_table *dm_get_table(struct mapped_device *md)
{
	struct dm_table *t;

	read_lock(&md->map_lock);
	t = md->map;
	if (t)
		dm_table_get(t);
	read_unlock(&md->map_lock);

	return t;
}

/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMWARN("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}

/*-----------------------------------------------------------------
 * CRUD START:
 * A more elegant soln is in the works that uses the queue
 * merge fn, unfortunately there are a couple of changes to
 * the block layer that I want to make for this. So in the
 * interests of getting something for people to use I give
 * you this clearly demarcated crap.
 *---------------------------------------------------------------*/

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
static void dec_pending(struct dm_io *io, int error)
{
	if (error)
		io->error = error;

	if (atomic_dec_and_test(&io->io_count)) {
		if (end_io_acct(io))
			/* nudge anyone waiting on suspend queue */
			wake_up(&io->md->wait);

		blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);

		bio_endio(io->bio, io->bio->bi_size, io->error);
		free_io(io->md, io);
	}
}

static int clone_endio(struct bio *bio, unsigned int done, int error)
{
	int r = 0;
	struct target_io *tio = bio->bi_private;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;

	if (bio->bi_size)
		return 1;

	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
		error = -EIO;

	if (endio) {
		r = endio(tio->ti, bio, error, &tio->info);
		if (r < 0)
			error = r;
		else if (r > 0)
			/* the target wants another shot at the io */
			return 1;
	}

	dec_pending(tio->io, error);

	/*
	 * Store md for cleanup instead of tio which is about to get freed.
	 */
	bio->bi_private = md->bs;

	bio_put(bio);
	free_tio(md, tio);
	return r;
}

static sector_t max_io_len(struct mapped_device *md,
			   sector_t sector, struct dm_target *ti)
{
	sector_t offset = sector - ti->begin;
	sector_t len = ti->len - offset;

	/*
	 * Does the target need to split even further?
	 */
	if (ti->split_io) {
		sector_t boundary;
		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
			   - offset;
		if (len > boundary)
			len = boundary;
	}

	return len;
}

static void __map_bio(struct dm_target *ti, struct bio *clone,
		      struct target_io *tio)
{
	int r;
	sector_t sector;
	struct mapped_device *md;

	/*
	 * Sanity checks.
	 */
	BUG_ON(!clone->bi_size);

	clone->bi_end_io = clone_endio;
	clone->bi_private = tio;

	/*
	 * Map the clone. If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
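	 * If r > 0 the clone has been remapped and we dispatch it
	 * ourselves; if r < 0 the io is failed and the clone freed.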
	 */
	atomic_inc(&tio->io->io_count);
	sector = clone->bi_sector;
	r = ti->type->map(ti, clone, &tio->info);
	if (r > 0) {
		/* the bio has been remapped so dispatch it */

		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
				    tio->io->bio->bi_bdev->bd_dev, sector,
				    clone->bi_sector);

		generic_make_request(clone);
	} else if (r < 0) {
		/* error the io and bail out */
		md = tio->io->md;
		dec_pending(tio->io, r);
		/*
		 * Store bio_set for cleanup.
		 */
		clone->bi_private = md->bs;
		bio_put(clone);
		free_tio(md, tio);
	}
}

struct clone_info {
	struct mapped_device *md;
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	sector_t sector_count;
	unsigned short idx;
};

static void dm_bio_destructor(struct bio *bio)
{
	struct bio_set *bs = bio->bi_private;

	bio_free(bio, bs);
}

/*
 * Creates a little bio that just does part of a bvec.
 */
static struct bio *split_bvec(struct bio *bio, sector_t sector,
			      unsigned short idx, unsigned int offset,
			      unsigned int len, struct bio_set *bs)
{
	struct bio *clone;
	struct bio_vec *bv = bio->bi_io_vec + idx;

	clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
	clone->bi_destructor = dm_bio_destructor;
	*clone->bi_io_vec = *bv;

	clone->bi_sector = sector;
	clone->bi_bdev = bio->bi_bdev;
	clone->bi_rw = bio->bi_rw;
	clone->bi_vcnt = 1;
	clone->bi_size = to_bytes(len);
	clone->bi_io_vec->bv_offset = offset;
	clone->bi_io_vec->bv_len = clone->bi_size;

	return clone;
}

/*
 * Creates a bio that consists of a range of complete bvecs.
 */
static struct bio *clone_bio(struct bio *bio, sector_t sector,
			     unsigned short idx, unsigned short bv_count,
			     unsigned int len, struct bio_set *bs)
{
	struct bio *clone;

	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
	__bio_clone(clone, bio);
	clone->bi_destructor = dm_bio_destructor;
	clone->bi_sector = sector;
	clone->bi_idx = idx;
	clone->bi_vcnt = idx + bv_count;
	clone->bi_size = to_bytes(len);
	clone->bi_flags &= ~(1 << BIO_SEG_VALID);

	return clone;
}

static void __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
	sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
	struct target_io *tio;

	/*
	 * Allocate a target io object.
	 */
	tio = alloc_tio(ci->md);
	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));

	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		clone = clone_bio(bio, ci->sector, ci->idx,
				  bio->bi_vcnt - ci->idx, ci->sector_count,
				  ci->md->bs);
		__map_bio(ti, clone, tio);
		ci->sector_count = 0;

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
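		 * (The loop below batches whole bvecs into a single
		 * clone until the next bvec would cross the 'max'
		 * boundary for this target.)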
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
				  ci->md->bs);
		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = i;

	} else {
		/*
		 * Handle a bvec that must be split between two or more targets.
		 */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
		sector_t remaining = to_sector(bv->bv_len);
		unsigned int offset = 0;

		do {
			if (offset) {
				ti = dm_table_find_target(ci->map, ci->sector);
				max = max_io_len(ci->md, ci->sector, ti);

				tio = alloc_tio(ci->md);
				tio->io = ci->io;
				tio->ti = ti;
				memset(&tio->info, 0, sizeof(tio->info));
			}

			len = min(remaining, max);

			clone = split_bvec(bio, ci->sector, ci->idx,
					   bv->bv_offset + offset, len,
					   ci->md->bs);

			__map_bio(ti, clone, tio);

			ci->sector += len;
			ci->sector_count -= len;
			offset += to_bytes(len);
		} while (remaining -= len);

		ci->idx++;
	}
}

/*
 * Split the bio into several clones.
 */
static void __split_bio(struct mapped_device *md, struct bio *bio)
{
	struct clone_info ci;

	ci.map = dm_get_table(md);
	if (!ci.map) {
		bio_io_error(bio, bio->bi_size);
		return;
	}

	ci.md = md;
	ci.bio = bio;
	ci.io = alloc_io(md);
	ci.io->error = 0;
	atomic_set(&ci.io->io_count, 1);
	ci.io->bio = bio;
	ci.io->md = md;
	ci.sector = bio->bi_sector;
	ci.sector_count = bio_sectors(bio);
	ci.idx = bio->bi_idx;

	start_io_acct(ci.io);
	while (ci.sector_count)
		__clone_and_map(&ci);

	/* drop the extra reference count */
	dec_pending(ci.io, 0);
	dm_table_put(ci.map);
}
/*-----------------------------------------------------------------
 * CRUD END
 *---------------------------------------------------------------*/

/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 */
static int dm_request(request_queue_t *q, struct bio *bio)
{
	int r;
	int rw = bio_data_dir(bio);
	struct mapped_device *md = q->queuedata;

	down_read(&md->io_lock);

	disk_stat_inc(dm_disk(md), ios[rw]);
	disk_stat_add(dm_disk(md), sectors[rw], bio_sectors(bio));

	/*
	 * If we're suspended we have to queue
	 * this io for later.
	 */
	while (test_bit(DMF_BLOCK_IO, &md->flags)) {
		up_read(&md->io_lock);

		if (bio_rw(bio) == READA) {
			bio_io_error(bio, bio->bi_size);
			return 0;
		}

		r = queue_io(md, bio);
		if (r < 0) {
			bio_io_error(bio, bio->bi_size);
			return 0;

		} else if (r == 0)
			return 0;	/* deferred successfully */

		/*
		 * We're in a while loop, because someone could suspend
		 * before we get to the following read lock.
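		 * A return of 1 from queue_io() means the suspend was
		 * lifted before the bio could be deferred, so we retest
		 * DMF_BLOCK_IO under the lock and map it ourselves.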
		 */
		down_read(&md->io_lock);
	}

	__split_bio(md, bio);
	up_read(&md->io_lock);
	return 0;
}

static int dm_flush_all(request_queue_t *q, struct gendisk *disk,
			sector_t *error_sector)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);
	int ret = -ENXIO;

	if (map) {
		ret = dm_table_flush_all(map);
		dm_table_put(map);
	}

	return ret;
}

static void dm_unplug_all(request_queue_t *q)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);

	if (map) {
		dm_table_unplug_all(map);
		dm_table_put(map);
	}
}

static int dm_any_congested(void *congested_data, int bdi_bits)
{
	int r;
	struct mapped_device *md = (struct mapped_device *) congested_data;
	struct dm_table *map = dm_get_table(md);

	if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
		r = bdi_bits;
	else
		r = dm_table_any_congested(map, bdi_bits);

	dm_table_put(map);
	return r;
}

/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static DEFINE_IDR(_minor_idr);

static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	spin_unlock(&_minor_lock);
}

/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(struct mapped_device *md, int minor)
{
	int r, m;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r)
		return -ENOMEM;

	spin_lock(&_minor_lock);

	if (idr_find(&_minor_idr, minor)) {
		r = -EBUSY;
		goto out;
	}

	r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
	if (r)
		goto out;

	if (m != minor) {
		idr_remove(&_minor_idr, m);
		r = -EBUSY;
		goto out;
	}

out:
	spin_unlock(&_minor_lock);
	return r;
}

static int next_free_minor(struct mapped_device *md, int *minor)
{
	int r, m;

	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r)
		return -ENOMEM;

	spin_lock(&_minor_lock);

	r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
	if (r)
		goto out;

	if (m >= (1 << MINORBITS)) {
		idr_remove(&_minor_idr, m);
		r = -ENOSPC;
		goto out;
	}

	*minor = m;

out:
	spin_unlock(&_minor_lock);
	return r;
}

static struct block_device_operations dm_blk_dops;

/*
 * Allocate and initialise a blank device with a given minor.
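 * A minor of DM_ANY_MINOR means "pick the next free one" via
 * next_free_minor(); any other value is reserved exactly via
 * specific_minor().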
 */
static struct mapped_device *alloc_dev(int minor)
{
	int r;
	struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
	void *old_md;

	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

	if (!try_module_get(THIS_MODULE))
		goto bad0;

	/* get a minor number for the dev */
	if (minor == DM_ANY_MINOR)
		r = next_free_minor(md, &minor);
	else
		r = specific_minor(md, minor);
	if (r < 0)
		goto bad1;

	memset(md, 0, sizeof(*md));
	init_rwsem(&md->io_lock);
	init_MUTEX(&md->suspend_lock);
	rwlock_init(&md->map_lock);
	atomic_set(&md->holders, 1);
	atomic_set(&md->open_count, 0);
	atomic_set(&md->event_nr, 0);

	md->queue = blk_alloc_queue(GFP_KERNEL);
	if (!md->queue)
		goto bad1_free_minor;

	md->queue->queuedata = md;
	md->queue->backing_dev_info.congested_fn = dm_any_congested;
	md->queue->backing_dev_info.congested_data = md;
	blk_queue_make_request(md->queue, dm_request);
	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
	md->queue->unplug_fn = dm_unplug_all;
	md->queue->issue_flush_fn = dm_flush_all;

	md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
	if (!md->io_pool)
		goto bad2;

	md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
	if (!md->tio_pool)
		goto bad3;

	md->bs = bioset_create(16, 16, 4);
	if (!md->bs)
		goto bad_no_bioset;

	md->disk = alloc_disk(1);
	if (!md->disk)
		goto bad4;

	atomic_set(&md->pending, 0);
	init_waitqueue_head(&md->wait);
	init_waitqueue_head(&md->eventq);

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);
	add_disk(md->disk);
	format_dev_t(md->name, MKDEV(_major, minor));

	/* Populate the mapping, nobody knows we exist yet */
	spin_lock(&_minor_lock);
	old_md = idr_replace(&_minor_idr, md, minor);
	spin_unlock(&_minor_lock);

	BUG_ON(old_md != MINOR_ALLOCED);

	return md;

bad4:
	bioset_free(md->bs);
bad_no_bioset:
	mempool_destroy(md->tio_pool);
bad3:
	mempool_destroy(md->io_pool);
bad2:
	blk_cleanup_queue(md->queue);
bad1_free_minor:
	free_minor(minor);
bad1:
	module_put(THIS_MODULE);
bad0:
	kfree(md);
	return NULL;
}

static void free_dev(struct mapped_device *md)
{
	int minor = md->disk->first_minor;

	if (md->suspended_bdev) {
		thaw_bdev(md->suspended_bdev, NULL);
		bdput(md->suspended_bdev);
	}
	mempool_destroy(md->tio_pool);
	mempool_destroy(md->io_pool);
	bioset_free(md->bs);
	del_gendisk(md->disk);
	free_minor(minor);

	spin_lock(&_minor_lock);
	md->disk->private_data = NULL;
	spin_unlock(&_minor_lock);

	put_disk(md->disk);
	blk_cleanup_queue(md->queue);
	module_put(THIS_MODULE);
	kfree(md);
}

/*
 * Bind a table to the device.
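 * __bind() is only reached via dm_swap_table(), i.e. with the device
 * suspended; it publishes the new map under md->map_lock and lets the
 * table set the queue restrictions.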
 */
static void event_callback(void *context)
{
	struct mapped_device *md = (struct mapped_device *) context;

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
}

static void __set_size(struct mapped_device *md, sector_t size)
{
	set_capacity(md->disk, size);

	mutex_lock(&md->suspended_bdev->bd_inode->i_mutex);
	i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
	mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex);
}

static int __bind(struct mapped_device *md, struct dm_table *t)
{
	request_queue_t *q = md->queue;
	sector_t size;

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != get_capacity(md->disk))
		memset(&md->geometry, 0, sizeof(md->geometry));

	__set_size(md, size);
	if (size == 0)
		return 0;

	dm_table_get(t);
	dm_table_event_callback(t, event_callback, md);

	write_lock(&md->map_lock);
	md->map = t;
	dm_table_set_restrictions(t, q);
	write_unlock(&md->map_lock);

	return 0;
}

static void __unbind(struct mapped_device *md)
{
	struct dm_table *map = md->map;

	if (!map)
		return;

	dm_table_event_callback(map, NULL, NULL);
	write_lock(&md->map_lock);
	md->map = NULL;
	write_unlock(&md->map_lock);
	dm_table_put(map);
}

/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct mapped_device **result)
{
	struct mapped_device *md;

	md = alloc_dev(minor);
	if (!md)
		return -ENXIO;

	*result = md;
	return 0;
}

static struct mapped_device *dm_find_md(dev_t dev)
{
	struct mapped_device *md;
	unsigned minor = MINOR(dev);

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

	spin_lock(&_minor_lock);

	md = idr_find(&_minor_idr, minor);
	if (md && (md == MINOR_ALLOCED ||
		   (dm_disk(md)->first_minor != minor) ||
		   test_bit(DMF_FREEING, &md->flags))) {
		md = NULL;
		goto out;
	}

out:
	spin_unlock(&_minor_lock);

	return md;
}

struct mapped_device *dm_get_md(dev_t dev)
{
	struct mapped_device *md = dm_find_md(dev);

	if (md)
		dm_get(md);

	return md;
}

void *dm_get_mdptr(struct mapped_device *md)
{
	return md->interface_ptr;
}

void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}

void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
}

const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);

void dm_put(struct mapped_device *md)
{
	struct dm_table *map;

	BUG_ON(test_bit(DMF_FREEING, &md->flags));

	if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
		map = dm_get_table(md);
		idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor);
		set_bit(DMF_FREEING, &md->flags);
		spin_unlock(&_minor_lock);
		if (!dm_suspended(md)) {
			dm_table_presuspend_targets(map);
			dm_table_postsuspend_targets(map);
		}
		__unbind(md);
		dm_table_put(map);
		free_dev(md);
	}
}
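
/*
 * Note for callers of dm_get()/dm_get_md(): every reference taken must
 * be balanced by dm_put().  The final dm_put() tears the device down
 * (suspending targets if necessary, unbinding the table and freeing
 * md), so the mapped_device must not be touched afterwards.
 */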

/*
 * Process the deferred bios
 */
static void __flush_deferred_io(struct mapped_device *md, struct bio *c)
{
	struct bio *n;

	while (c) {
		n = c->bi_next;
		c->bi_next = NULL;
		__split_bio(md, c);
		c = n;
	}
}

/*
 * Swap in a new table (destroying old one).
 */
int dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	int r = -EINVAL;

	down(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended(md))
		goto out;

	__unbind(md);
	r = __bind(md, table);

out:
	up(&md->suspend_lock);
	return r;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->suspended_bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	/* don't bdput right now, we don't want the bdev
	 * to go away while it is locked.
	 */
	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->suspended_bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem. For example we might want to move some data in
 * the background. Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	DECLARE_WAITQUEUE(wait, current);
	struct bio *def;
	int r = -EINVAL;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;

	down(&md->suspend_lock);

	if (dm_suspended(md))
		goto out_unlock;

	map = dm_get_table(md);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	md->suspended_bdev = bdget_disk(md->disk, 0);
	if (!md->suspended_bdev) {
		DMWARN("bdget failed in dm_suspend");
		r = -ENOMEM;
		goto out;
	}

	/* Flush I/O to the device. */
	if (do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * First we set the BLOCK_IO flag so no more ios will be mapped.
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO, &md->flags);

	add_wait_queue(&md->wait, &wait);
	up_write(&md->io_lock);

	/* unplug */
	if (map)
		dm_table_unplug_all(map);

	/*
	 * Then we wait for the already mapped ios to
	 * complete.
	 */
	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);

		if (!atomic_read(&md->pending) || signal_pending(current))
			break;

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	down_write(&md->io_lock);
	remove_wait_queue(&md->wait, &wait);

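	/*
	 * If a signal interrupted the wait there may still be ios in
	 * flight; in that case the suspend is rolled back below:
	 * DMF_BLOCK_IO is cleared, the deferred bios are reissued and
	 * the filesystem is unfrozen again.
	 */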
	/* were we interrupted ? */
	r = -EINTR;
	if (atomic_read(&md->pending)) {
		clear_bit(DMF_BLOCK_IO, &md->flags);
		def = bio_list_get(&md->deferred);
		__flush_deferred_io(md, def);
		up_write(&md->io_lock);
		unlock_fs(md);
		goto out;
	}
	up_write(&md->io_lock);

	dm_table_postsuspend_targets(map);

	set_bit(DMF_SUSPENDED, &md->flags);

	r = 0;

out:
	if (r && md->suspended_bdev) {
		bdput(md->suspended_bdev);
		md->suspended_bdev = NULL;
	}

	dm_table_put(map);

out_unlock:
	up(&md->suspend_lock);
	return r;
}

int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct bio *def;
	struct dm_table *map = NULL;

	down(&md->suspend_lock);
	if (!dm_suspended(md))
		goto out;

	map = dm_get_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	down_write(&md->io_lock);
	clear_bit(DMF_BLOCK_IO, &md->flags);

	def = bio_list_get(&md->deferred);
	__flush_deferred_io(md, def);
	up_write(&md->io_lock);

	unlock_fs(md);

	bdput(md->suspended_bdev);
	md->suspended_bdev = NULL;

	clear_bit(DMF_SUSPENDED, &md->flags);

	dm_table_unplug_all(map);

	kobject_uevent(&md->disk->kobj, KOBJ_CHANGE);

	r = 0;

out:
	dm_table_put(map);
	up(&md->suspend_lock);

	return r;
}

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

int dm_suspended(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

static struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");