/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-list.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/blktrace_api.h>

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

/*
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;
	int error;
	struct bio *bio;
	atomic_t io_count;
	unsigned long start_time;
};

/*
 * One of these is allocated per target within a bio.  Hopefully
 * this will be simplified out one day.
 */
struct target_io {
	struct dm_io *io;
	struct dm_target *ti;
	union map_info info;
};

union map_info *dm_get_mapinfo(struct bio *bio)
{
	if (bio && bio->bi_private)
		return &((struct target_io *)bio->bi_private)->info;
	return NULL;
}

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2

struct mapped_device {
	struct rw_semaphore io_lock;
	struct semaphore suspend_lock;
	rwlock_t map_lock;
	atomic_t holders;

	unsigned long flags;

	request_queue_t *queue;
	struct gendisk *disk;
	char name[16];

	void *interface_ptr;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending;
	wait_queue_head_t wait;
	struct bio_list deferred;

	/*
	 * The current mapping.
	 */
	struct dm_table *map;

	/*
	 * io objects are allocated from here.
	 */
	mempool_t *io_pool;
	mempool_t *tio_pool;

	/*
	 * Event handling.
	 */
	atomic_t event_nr;
	wait_queue_head_t eventq;

	/*
	 * freeze/thaw support requires holding onto a super block
	 */
	struct super_block *frozen_sb;
	struct block_device *suspended_bdev;
};
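/*
 * MIN_IOS is the guaranteed reserve of the io/tio mempools created in
 * alloc_dev(), so that allocations on the map path (which use GFP_NOIO)
 * can make forward progress under memory pressure.
 */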
#define MIN_IOS 256
static kmem_cache_t *_io_cache;
static kmem_cache_t *_tio_cache;

static struct bio_set *dm_set;

static int __init local_init(void)
{
	int r;

	dm_set = bioset_create(16, 16, 4);
	if (!dm_set)
		return -ENOMEM;

	/* allocate a slab for the dm_ios */
	_io_cache = kmem_cache_create("dm_io",
				      sizeof(struct dm_io), 0, 0, NULL, NULL);
	if (!_io_cache)
		return -ENOMEM;

	/* allocate a slab for the target ios */
	_tio_cache = kmem_cache_create("dm_tio", sizeof(struct target_io),
				       0, 0, NULL, NULL);
	if (!_tio_cache) {
		kmem_cache_destroy(_io_cache);
		return -ENOMEM;
	}

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0) {
		kmem_cache_destroy(_tio_cache);
		kmem_cache_destroy(_io_cache);
		return r;
	}

	if (!_major)
		_major = r;

	return 0;
}

static void local_exit(void)
{
	kmem_cache_destroy(_tio_cache);
	kmem_cache_destroy(_io_cache);

	bioset_free(dm_set);

	if (unregister_blkdev(_major, _name) < 0)
		DMERR("unregister_blkdev failed");

	_major = 0;

	DMINFO("cleaned up");
}

int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_interface_init,
};

void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_interface_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);

	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;

bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();
}

/*
 * Block device functions
 */
static int dm_blk_open(struct inode *inode, struct file *file)
{
	struct mapped_device *md;

	md = inode->i_bdev->bd_disk->private_data;
	dm_get(md);
	return 0;
}

static int dm_blk_close(struct inode *inode, struct file *file)
{
	struct mapped_device *md;

	md = inode->i_bdev->bd_disk->private_data;
	dm_put(md);
	return 0;
}

static inline struct dm_io *alloc_io(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_NOIO);
}

static inline void free_io(struct mapped_device *md, struct dm_io *io)
{
	mempool_free(io, md->io_pool);
}

static inline struct target_io *alloc_tio(struct mapped_device *md)
{
	return mempool_alloc(md->tio_pool, GFP_NOIO);
}

static inline void free_tio(struct mapped_device *md, struct target_io *tio)
{
	mempool_free(tio, md->tio_pool);
}

static void start_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;

	io->start_time = jiffies;

	preempt_disable();
	disk_round_stats(dm_disk(md));
	preempt_enable();
	dm_disk(md)->in_flight = atomic_inc_return(&md->pending);
}

static int end_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	unsigned long duration = jiffies - io->start_time;
	int pending;
	int rw = bio_data_dir(bio);

	preempt_disable();
	disk_round_stats(dm_disk(md));
	preempt_enable();
	dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending);

	disk_stat_add(dm_disk(md), ticks[rw], duration);

	return !pending;
}
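/*
 * start_io_acct()/end_io_acct() bracket every dm_io: md->pending counts
 * the ios currently in flight (and is mirrored into the gendisk's
 * in_flight figure), and end_io_acct() returns true once the count
 * reaches zero, which dec_pending() uses to wake anyone waiting in
 * dm_suspend().
 */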
/*
 * Add the bio to the list of deferred io.
 */
static int queue_io(struct mapped_device *md, struct bio *bio)
{
	down_write(&md->io_lock);

	if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
		up_write(&md->io_lock);
		return 1;
	}

	bio_list_add(&md->deferred, bio);

	up_write(&md->io_lock);
	return 0;		/* deferred successfully */
}

/*
 * Everyone (including functions in this file) should use this
 * function to access the md->map field, and make sure they call
 * dm_table_put() when finished.
 */
struct dm_table *dm_get_table(struct mapped_device *md)
{
	struct dm_table *t;

	read_lock(&md->map_lock);
	t = md->map;
	if (t)
		dm_table_get(t);
	read_unlock(&md->map_lock);

	return t;
}
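/*
 * Typical usage (a sketch; see dm_flush_all() and dm_unplug_all() below):
 *
 *	struct dm_table *map = dm_get_table(md);
 *
 *	if (map) {
 *		... use the table ...
 *		dm_table_put(map);
 *	}
 */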
/*-----------------------------------------------------------------
 * CRUD START:
 * A more elegant soln is in the works that uses the queue
 * merge fn, unfortunately there are a couple of changes to
 * the block layer that I want to make for this.  So in the
 * interests of getting something for people to use I give
 * you this clearly demarcated crap.
 *---------------------------------------------------------------*/

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
static void dec_pending(struct dm_io *io, int error)
{
	if (error)
		io->error = error;

	if (atomic_dec_and_test(&io->io_count)) {
		if (end_io_acct(io))
			/* nudge anyone waiting on suspend queue */
			wake_up(&io->md->wait);

		blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);

		bio_endio(io->bio, io->bio->bi_size, io->error);
		free_io(io->md, io);
	}
}

static int clone_endio(struct bio *bio, unsigned int done, int error)
{
	int r = 0;
	struct target_io *tio = bio->bi_private;
	struct dm_io *io = tio->io;
	dm_endio_fn endio = tio->ti->type->end_io;

	if (bio->bi_size)
		return 1;

	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
		error = -EIO;

	if (endio) {
		r = endio(tio->ti, bio, error, &tio->info);
		if (r < 0)
			error = r;

		else if (r > 0)
			/* the target wants another shot at the io */
			return 1;
	}

	free_tio(io->md, tio);
	dec_pending(io, error);
	bio_put(bio);
	return r;
}

static sector_t max_io_len(struct mapped_device *md,
			   sector_t sector, struct dm_target *ti)
{
	sector_t offset = sector - ti->begin;
	sector_t len = ti->len - offset;

	/*
	 * Does the target need to split even further ?
	 */
	if (ti->split_io) {
		sector_t boundary;
		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
			   - offset;
		if (len > boundary)
			len = boundary;
	}

	return len;
}
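/*
 * Worked example (assuming ti->split_io is a power of two, e.g. a
 * stripe chunk size of 8 sectors): with offset == 5 the next chunk
 * boundary is ((5 + 8) & ~7) - 5 == 3, so at most 3 sectors are
 * issued before max_io_len() forces a split.
 */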
static void __map_bio(struct dm_target *ti, struct bio *clone,
		      struct target_io *tio)
{
	int r;
	sector_t sector;

	/*
	 * Sanity checks.
	 */
	BUG_ON(!clone->bi_size);

	clone->bi_end_io = clone_endio;
	clone->bi_private = tio;

	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&tio->io->io_count);
	sector = clone->bi_sector;
	r = ti->type->map(ti, clone, &tio->info);
	if (r > 0) {
		/* the bio has been remapped so dispatch it */

		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
				    tio->io->bio->bi_bdev->bd_dev, sector,
				    clone->bi_sector);

		generic_make_request(clone);
	}

	else if (r < 0) {
		/* error the io and bail out */
		struct dm_io *io = tio->io;
		free_tio(tio->io->md, tio);
		dec_pending(io, r);
		bio_put(clone);
	}
}

struct clone_info {
	struct mapped_device *md;
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	sector_t sector_count;
	unsigned short idx;
};

static void dm_bio_destructor(struct bio *bio)
{
	bio_free(bio, dm_set);
}

/*
 * Creates a little bio that just does part of a bvec.
 */
static struct bio *split_bvec(struct bio *bio, sector_t sector,
			      unsigned short idx, unsigned int offset,
			      unsigned int len)
{
	struct bio *clone;
	struct bio_vec *bv = bio->bi_io_vec + idx;

	clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set);
	clone->bi_destructor = dm_bio_destructor;
	*clone->bi_io_vec = *bv;

	clone->bi_sector = sector;
	clone->bi_bdev = bio->bi_bdev;
	clone->bi_rw = bio->bi_rw;
	clone->bi_vcnt = 1;
	clone->bi_size = to_bytes(len);
	clone->bi_io_vec->bv_offset = offset;
	clone->bi_io_vec->bv_len = clone->bi_size;

	return clone;
}

/*
 * Creates a bio that consists of a range of complete bvecs.
 */
static struct bio *clone_bio(struct bio *bio, sector_t sector,
			     unsigned short idx, unsigned short bv_count,
			     unsigned int len)
{
	struct bio *clone;

	clone = bio_clone(bio, GFP_NOIO);
	clone->bi_sector = sector;
	clone->bi_idx = idx;
	clone->bi_vcnt = idx + bv_count;
	clone->bi_size = to_bytes(len);
	clone->bi_flags &= ~(1 << BIO_SEG_VALID);

	return clone;
}

static void __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
	sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
	struct target_io *tio;

	/*
	 * Allocate a target io object.
	 */
	tio = alloc_tio(ci->md);
	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));

	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		clone = clone_bio(bio, ci->sector, ci->idx,
				  bio->bi_vcnt - ci->idx, ci->sector_count);
		__map_bio(ti, clone, tio);
		ci->sector_count = 0;

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = i;

	} else {
		/*
		 * Handle a bvec that must be split between two or more targets.
		 */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
		sector_t remaining = to_sector(bv->bv_len);
		unsigned int offset = 0;

		do {
			if (offset) {
				ti = dm_table_find_target(ci->map, ci->sector);
				max = max_io_len(ci->md, ci->sector, ti);

				tio = alloc_tio(ci->md);
				tio->io = ci->io;
				tio->ti = ti;
				memset(&tio->info, 0, sizeof(tio->info));
			}

			len = min(remaining, max);

			clone = split_bvec(bio, ci->sector, ci->idx,
					   bv->bv_offset + offset, len);

			__map_bio(ti, clone, tio);

			ci->sector += len;
			ci->sector_count -= len;
			offset += to_bytes(len);
		} while (remaining -= len);

		ci->idx++;
	}
}
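/*
 * In short, __clone_and_map() handles three cases per call:
 *   1) the remaining io fits within the current target: clone the rest
 *      of the bio in one go;
 *   2) whole bvecs fit: clone as many complete bvecs as the target's
 *      max_io_len() allows;
 *   3) a single bvec spans targets: carve it up with split_bvec(),
 *      allocating a fresh target_io for each piece after the first.
 */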
/*
 * Split the bio into several clones.
 */
static void __split_bio(struct mapped_device *md, struct bio *bio)
{
	struct clone_info ci;

	ci.map = dm_get_table(md);
	if (!ci.map) {
		bio_io_error(bio, bio->bi_size);
		return;
	}

	ci.md = md;
	ci.bio = bio;
	ci.io = alloc_io(md);
	ci.io->error = 0;
	atomic_set(&ci.io->io_count, 1);
	ci.io->bio = bio;
	ci.io->md = md;
	ci.sector = bio->bi_sector;
	ci.sector_count = bio_sectors(bio);
	ci.idx = bio->bi_idx;

	start_io_acct(ci.io);
	while (ci.sector_count)
		__clone_and_map(&ci);

	/* drop the extra reference count */
	dec_pending(ci.io, 0);
	dm_table_put(ci.map);
}
/*-----------------------------------------------------------------
 * CRUD END
 *---------------------------------------------------------------*/

/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 */
static int dm_request(request_queue_t *q, struct bio *bio)
{
	int r;
	int rw = bio_data_dir(bio);
	struct mapped_device *md = q->queuedata;

	down_read(&md->io_lock);

	disk_stat_inc(dm_disk(md), ios[rw]);
	disk_stat_add(dm_disk(md), sectors[rw], bio_sectors(bio));

	/*
	 * If we're suspended we have to queue
	 * this io for later.
	 */
	while (test_bit(DMF_BLOCK_IO, &md->flags)) {
		up_read(&md->io_lock);

		if (bio_rw(bio) == READA) {
			bio_io_error(bio, bio->bi_size);
			return 0;
		}

		r = queue_io(md, bio);
		if (r < 0) {
			bio_io_error(bio, bio->bi_size);
			return 0;

		} else if (r == 0)
			return 0;	/* deferred successfully */

		/*
		 * We're in a while loop, because someone could suspend
		 * before we get to the following read lock.
		 */
		down_read(&md->io_lock);
	}

	__split_bio(md, bio);
	up_read(&md->io_lock);
	return 0;
}
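/*
 * Note the deferral contract above: queue_io() returns 0 once the bio
 * is safely on md->deferred, and 1 if DMF_BLOCK_IO was cleared in the
 * meantime, in which case we retake io_lock and re-check the flag
 * before mapping the bio ourselves.
 */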
static int dm_flush_all(request_queue_t *q, struct gendisk *disk,
			sector_t *error_sector)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);
	int ret = -ENXIO;

	if (map) {
		ret = dm_table_flush_all(map);
		dm_table_put(map);
	}

	return ret;
}

static void dm_unplug_all(request_queue_t *q)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_table(md);

	if (map) {
		dm_table_unplug_all(map);
		dm_table_put(map);
	}
}

static int dm_any_congested(void *congested_data, int bdi_bits)
{
	int r;
	struct mapped_device *md = (struct mapped_device *) congested_data;
	struct dm_table *map = dm_get_table(md);

	if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
		r = bdi_bits;
	else
		r = dm_table_any_congested(map, bdi_bits);

	dm_table_put(map);
	return r;
}

/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static DECLARE_MUTEX(_minor_lock);
static DEFINE_IDR(_minor_idr);

static void free_minor(unsigned int minor)
{
	down(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	up(&_minor_lock);
}

/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(struct mapped_device *md, unsigned int minor)
{
	int r, m;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	down(&_minor_lock);

	if (idr_find(&_minor_idr, minor)) {
		r = -EBUSY;
		goto out;
	}

	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r) {
		r = -ENOMEM;
		goto out;
	}

	r = idr_get_new_above(&_minor_idr, md, minor, &m);
	if (r)
		goto out;

	if (m != minor) {
		idr_remove(&_minor_idr, m);
		r = -EBUSY;
		goto out;
	}

out:
	up(&_minor_lock);
	return r;
}

static int next_free_minor(struct mapped_device *md, unsigned int *minor)
{
	int r;
	unsigned int m;

	down(&_minor_lock);

	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r) {
		r = -ENOMEM;
		goto out;
	}

	r = idr_get_new(&_minor_idr, md, &m);
	if (r)
		goto out;

	if (m >= (1 << MINORBITS)) {
		idr_remove(&_minor_idr, m);
		r = -ENOSPC;
		goto out;
	}

	*minor = m;

out:
	up(&_minor_lock);
	return r;
}

static struct block_device_operations dm_blk_dops;

/*
 * Allocate and initialise a blank device with a given minor.
 */
static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
{
	int r;
	struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);

	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

	/* get a minor number for the dev */
	r = persistent ? specific_minor(md, minor) : next_free_minor(md, &minor);
	if (r < 0)
		goto bad1;

	memset(md, 0, sizeof(*md));
	init_rwsem(&md->io_lock);
	init_MUTEX(&md->suspend_lock);
	rwlock_init(&md->map_lock);
	atomic_set(&md->holders, 1);
	atomic_set(&md->event_nr, 0);

	md->queue = blk_alloc_queue(GFP_KERNEL);
	if (!md->queue)
		goto bad1;

	md->queue->queuedata = md;
	md->queue->backing_dev_info.congested_fn = dm_any_congested;
	md->queue->backing_dev_info.congested_data = md;
	blk_queue_make_request(md->queue, dm_request);
	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
	md->queue->unplug_fn = dm_unplug_all;
	md->queue->issue_flush_fn = dm_flush_all;

	md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
	if (!md->io_pool)
		goto bad2;

	md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
	if (!md->tio_pool)
		goto bad3;

	md->disk = alloc_disk(1);
	if (!md->disk)
		goto bad4;

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);
	add_disk(md->disk);
	format_dev_t(md->name, MKDEV(_major, minor));

	atomic_set(&md->pending, 0);
	init_waitqueue_head(&md->wait);
	init_waitqueue_head(&md->eventq);

	return md;

bad4:
	mempool_destroy(md->tio_pool);
bad3:
	mempool_destroy(md->io_pool);
bad2:
	blk_cleanup_queue(md->queue);
	free_minor(minor);
bad1:
	kfree(md);
	return NULL;
}

static void free_dev(struct mapped_device *md)
{
	unsigned int minor = md->disk->first_minor;

	if (md->suspended_bdev) {
		thaw_bdev(md->suspended_bdev, NULL);
		bdput(md->suspended_bdev);
	}
	mempool_destroy(md->tio_pool);
	mempool_destroy(md->io_pool);
	del_gendisk(md->disk);
	free_minor(minor);
	put_disk(md->disk);
	blk_cleanup_queue(md->queue);
	kfree(md);
}

/*
 * Bind a table to the device.
 */
static void event_callback(void *context)
{
	struct mapped_device *md = (struct mapped_device *) context;

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
}

static void __set_size(struct mapped_device *md, sector_t size)
{
	set_capacity(md->disk, size);

	mutex_lock(&md->suspended_bdev->bd_inode->i_mutex);
	i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
	mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex);
}

static int __bind(struct mapped_device *md, struct dm_table *t)
{
	request_queue_t *q = md->queue;
	sector_t size;

	size = dm_table_get_size(t);
	__set_size(md, size);
	if (size == 0)
		return 0;

	dm_table_get(t);
	dm_table_event_callback(t, event_callback, md);

	write_lock(&md->map_lock);
	md->map = t;
	dm_table_set_restrictions(t, q);
	write_unlock(&md->map_lock);

	return 0;
}

static void __unbind(struct mapped_device *md)
{
	struct dm_table *map = md->map;

	if (!map)
		return;

	dm_table_event_callback(map, NULL, NULL);
	write_lock(&md->map_lock);
	md->map = NULL;
	write_unlock(&md->map_lock);
	dm_table_put(map);
}

/*
 * Constructor for a new device.
 */
static int create_aux(unsigned int minor, int persistent,
		      struct mapped_device **result)
{
	struct mapped_device *md;

	md = alloc_dev(minor, persistent);
	if (!md)
		return -ENXIO;

	*result = md;
	return 0;
}

int dm_create(struct mapped_device **result)
{
	return create_aux(0, 0, result);
}

int dm_create_with_minor(unsigned int minor, struct mapped_device **result)
{
	return create_aux(minor, 1, result);
}

static struct mapped_device *dm_find_md(dev_t dev)
{
	struct mapped_device *md;
	unsigned minor = MINOR(dev);

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

	down(&_minor_lock);

	md = idr_find(&_minor_idr, minor);
	if (!md || (dm_disk(md)->first_minor != minor))
		md = NULL;

	up(&_minor_lock);

	return md;
}

struct mapped_device *dm_get_md(dev_t dev)
{
	struct mapped_device *md = dm_find_md(dev);

	if (md)
		dm_get(md);

	return md;
}

void *dm_get_mdptr(struct mapped_device *md)
{
	return md->interface_ptr;
}

void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}

void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
}

void dm_put(struct mapped_device *md)
{
	struct dm_table *map;

	if (atomic_dec_and_test(&md->holders)) {
		map = dm_get_table(md);
		if (!dm_suspended(md)) {
			dm_table_presuspend_targets(map);
			dm_table_postsuspend_targets(map);
		}
		__unbind(md);
		dm_table_put(map);
		free_dev(md);
	}
}

/*
 * Process the deferred bios
 */
static void __flush_deferred_io(struct mapped_device *md, struct bio *c)
{
	struct bio *n;

	while (c) {
		n = c->bi_next;
		c->bi_next = NULL;
		__split_bio(md, c);
		c = n;
	}
}

/*
 * Swap in a new table (destroying old one).
 */
int dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	int r = -EINVAL;

	down(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended(md))
		goto out;

	__unbind(md);
	r = __bind(md, table);

out:
	up(&md->suspend_lock);
	return r;
}
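/*
 * Rough sketch of how a caller (e.g. the ioctl interface) is expected
 * to replace a live table:
 *
 *	dm_suspend(md, 1);		flush in-flight io, defer new io
 *	dm_swap_table(md, new_table);
 *	dm_resume(md);			replay the deferred io
 */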
/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->suspended_bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	/* don't bdput right now, we don't want the bdev
	 * to go away while it is locked.
	 */
	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->suspended_bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
int dm_suspend(struct mapped_device *md, int do_lockfs)
{
	struct dm_table *map = NULL;
	DECLARE_WAITQUEUE(wait, current);
	struct bio *def;
	int r = -EINVAL;

	down(&md->suspend_lock);

	if (dm_suspended(md))
		goto out;

	map = dm_get_table(md);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	md->suspended_bdev = bdget_disk(md->disk, 0);
	if (!md->suspended_bdev) {
		DMWARN("bdget failed in dm_suspend");
		r = -ENOMEM;
		goto out;
	}

	/* Flush I/O to the device. */
	if (do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * First we set the BLOCK_IO flag so no more ios will be mapped.
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO, &md->flags);

	add_wait_queue(&md->wait, &wait);
	up_write(&md->io_lock);

	/* unplug */
	if (map)
		dm_table_unplug_all(map);

	/*
	 * Then we wait for the already mapped ios to
	 * complete.
	 */
	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);

		if (!atomic_read(&md->pending) || signal_pending(current))
			break;

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	down_write(&md->io_lock);
	remove_wait_queue(&md->wait, &wait);

	/* were we interrupted ? */
	r = -EINTR;
	if (atomic_read(&md->pending)) {
		clear_bit(DMF_BLOCK_IO, &md->flags);
		def = bio_list_get(&md->deferred);
		__flush_deferred_io(md, def);
		up_write(&md->io_lock);
		unlock_fs(md);
		goto out;
	}
	up_write(&md->io_lock);

	dm_table_postsuspend_targets(map);

	set_bit(DMF_SUSPENDED, &md->flags);

	r = 0;

out:
	if (r && md->suspended_bdev) {
		bdput(md->suspended_bdev);
		md->suspended_bdev = NULL;
	}

	dm_table_put(map);
	up(&md->suspend_lock);
	return r;
}

int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct bio *def;
	struct dm_table *map = NULL;

	down(&md->suspend_lock);
	if (!dm_suspended(md))
		goto out;

	map = dm_get_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	dm_table_resume_targets(map);

	down_write(&md->io_lock);
	clear_bit(DMF_BLOCK_IO, &md->flags);

	def = bio_list_get(&md->deferred);
	__flush_deferred_io(md, def);
	up_write(&md->io_lock);

	unlock_fs(md);

	bdput(md->suspended_bdev);
	md->suspended_bdev = NULL;

	clear_bit(DMF_SUSPENDED, &md->flags);

	dm_table_unplug_all(map);

	r = 0;

out:
	dm_table_put(map);
	up(&md->suspend_lock);

	return r;
}

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}
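/*
 * event_nr is bumped by event_callback() whenever the bound table
 * signals an event.  A caller typically samples the counter first and
 * then sleeps on it, e.g.:
 *
 *	uint32_t ev = dm_get_event_nr(md);
 *	...
 *	dm_wait_event(md, ev);		returns once event_nr has moved on
 */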
/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

int dm_suspended(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

static struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");