/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>

#include <trace/events/block.h>

#define DM_MSG_PREFIX "core"

#ifdef CONFIG_PRINTK
/*
 * ratelimit state to be used in DMXXX_LIMIT().
 */
DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
		       DEFAULT_RATELIMIT_INTERVAL,
		       DEFAULT_RATELIMIT_BURST);
EXPORT_SYMBOL(dm_ratelimit_state);
#endif

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_IDR(_minor_idr);

static DEFINE_SPINLOCK(_minor_lock);

static void do_deferred_remove(struct work_struct *w);

static DECLARE_WORK(deferred_remove_work, do_deferred_remove);

static struct workqueue_struct *deferred_remove_workqueue;

/*
 * For bio-based dm.
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;
	int error;
	atomic_t io_count;
	struct bio *bio;
	unsigned long start_time;
	spinlock_t endio_lock;
	struct dm_stats_aux stats_aux;
};

/*
 * For request-based dm.
 * One of these is allocated per request.
 */
struct dm_rq_target_io {
	struct mapped_device *md;
	struct dm_target *ti;
	struct request *orig, clone;
	int error;
	union map_info info;
};

/*
 * For request-based dm - the bio clones we allocate are embedded in these
 * structs.
 *
 * We allocate these with bio_alloc_bioset, using the front_pad parameter when
 * the bioset is created - this means the bio has to come at the end of the
 * struct.
 */
struct dm_rq_clone_bio_info {
	struct bio *orig;
	struct dm_rq_target_io *tio;
	struct bio clone;
};

union map_info *dm_get_rq_mapinfo(struct request *rq)
{
	if (rq && rq->end_io_data)
		return &((struct dm_rq_target_io *)rq->end_io_data)->info;
	return NULL;
}
EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);

#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_MERGE_IS_OPTIONAL 6
#define DMF_DEFERRED_REMOVE 7
#define DMF_SUSPENDED_INTERNALLY 8

/*
 * A dummy definition to make RCU happy.
 * struct dm_table should never be dereferenced in this file.
 */
struct dm_table {
	int undefined__;
};

/*
 * Work processed by per-device workqueue.
 */
struct mapped_device {
	struct srcu_struct io_barrier;
	struct mutex suspend_lock;
	atomic_t holders;
	atomic_t open_count;

	/*
	 * The current mapping.
	 * Use dm_get_live_table{_fast} or take suspend_lock for
	 * dereference.
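	 * Readers pin the table with dm_get_live_table() (SRCU on io_barrier)
	 * or dm_get_live_table_fast() (plain RCU); writers replace the
	 * pointer under suspend_lock and then call dm_sync_table().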
144 */ 145 struct dm_table __rcu *map; 146 147 struct list_head table_devices; 148 struct mutex table_devices_lock; 149 150 unsigned long flags; 151 152 struct request_queue *queue; 153 unsigned type; 154 /* Protect queue and type against concurrent access. */ 155 struct mutex type_lock; 156 157 struct target_type *immutable_target_type; 158 159 struct gendisk *disk; 160 char name[16]; 161 162 void *interface_ptr; 163 164 /* 165 * A list of ios that arrived while we were suspended. 166 */ 167 atomic_t pending[2]; 168 wait_queue_head_t wait; 169 struct work_struct work; 170 struct bio_list deferred; 171 spinlock_t deferred_lock; 172 173 /* 174 * Processing queue (flush) 175 */ 176 struct workqueue_struct *wq; 177 178 /* 179 * io objects are allocated from here. 180 */ 181 mempool_t *io_pool; 182 183 struct bio_set *bs; 184 185 /* 186 * Event handling. 187 */ 188 atomic_t event_nr; 189 wait_queue_head_t eventq; 190 atomic_t uevent_seq; 191 struct list_head uevent_list; 192 spinlock_t uevent_lock; /* Protect access to uevent_list */ 193 194 /* 195 * freeze/thaw support require holding onto a super block 196 */ 197 struct super_block *frozen_sb; 198 struct block_device *bdev; 199 200 /* forced geometry settings */ 201 struct hd_geometry geometry; 202 203 /* kobject and completion */ 204 struct dm_kobject_holder kobj_holder; 205 206 /* zero-length flush that will be cloned and submitted to targets */ 207 struct bio flush_bio; 208 209 /* the number of internal suspends */ 210 unsigned internal_suspend_count; 211 212 struct dm_stats stats; 213 }; 214 215 /* 216 * For mempools pre-allocation at the table loading time. 217 */ 218 struct dm_md_mempools { 219 mempool_t *io_pool; 220 struct bio_set *bs; 221 }; 222 223 struct table_device { 224 struct list_head list; 225 atomic_t count; 226 struct dm_dev dm_dev; 227 }; 228 229 #define RESERVED_BIO_BASED_IOS 16 230 #define RESERVED_REQUEST_BASED_IOS 256 231 #define RESERVED_MAX_IOS 1024 232 static struct kmem_cache *_io_cache; 233 static struct kmem_cache *_rq_tio_cache; 234 235 /* 236 * Bio-based DM's mempools' reserved IOs set by the user. 237 */ 238 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; 239 240 /* 241 * Request-based DM's mempools' reserved IOs set by the user. 
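 * Both reserved counts are sanitised by __dm_get_reserved_ios() below:
 * a value of 0 falls back to the default and anything above
 * RESERVED_MAX_IOS is clamped to that maximum.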
242 */ 243 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; 244 245 static unsigned __dm_get_reserved_ios(unsigned *reserved_ios, 246 unsigned def, unsigned max) 247 { 248 unsigned ios = ACCESS_ONCE(*reserved_ios); 249 unsigned modified_ios = 0; 250 251 if (!ios) 252 modified_ios = def; 253 else if (ios > max) 254 modified_ios = max; 255 256 if (modified_ios) { 257 (void)cmpxchg(reserved_ios, ios, modified_ios); 258 ios = modified_ios; 259 } 260 261 return ios; 262 } 263 264 unsigned dm_get_reserved_bio_based_ios(void) 265 { 266 return __dm_get_reserved_ios(&reserved_bio_based_ios, 267 RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS); 268 } 269 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); 270 271 unsigned dm_get_reserved_rq_based_ios(void) 272 { 273 return __dm_get_reserved_ios(&reserved_rq_based_ios, 274 RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS); 275 } 276 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); 277 278 static int __init local_init(void) 279 { 280 int r = -ENOMEM; 281 282 /* allocate a slab for the dm_ios */ 283 _io_cache = KMEM_CACHE(dm_io, 0); 284 if (!_io_cache) 285 return r; 286 287 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 288 if (!_rq_tio_cache) 289 goto out_free_io_cache; 290 291 r = dm_uevent_init(); 292 if (r) 293 goto out_free_rq_tio_cache; 294 295 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); 296 if (!deferred_remove_workqueue) { 297 r = -ENOMEM; 298 goto out_uevent_exit; 299 } 300 301 _major = major; 302 r = register_blkdev(_major, _name); 303 if (r < 0) 304 goto out_free_workqueue; 305 306 if (!_major) 307 _major = r; 308 309 return 0; 310 311 out_free_workqueue: 312 destroy_workqueue(deferred_remove_workqueue); 313 out_uevent_exit: 314 dm_uevent_exit(); 315 out_free_rq_tio_cache: 316 kmem_cache_destroy(_rq_tio_cache); 317 out_free_io_cache: 318 kmem_cache_destroy(_io_cache); 319 320 return r; 321 } 322 323 static void local_exit(void) 324 { 325 flush_scheduled_work(); 326 destroy_workqueue(deferred_remove_workqueue); 327 328 kmem_cache_destroy(_rq_tio_cache); 329 kmem_cache_destroy(_io_cache); 330 unregister_blkdev(_major, _name); 331 dm_uevent_exit(); 332 333 _major = 0; 334 335 DMINFO("cleaned up"); 336 } 337 338 static int (*_inits[])(void) __initdata = { 339 local_init, 340 dm_target_init, 341 dm_linear_init, 342 dm_stripe_init, 343 dm_io_init, 344 dm_kcopyd_init, 345 dm_interface_init, 346 dm_statistics_init, 347 }; 348 349 static void (*_exits[])(void) = { 350 local_exit, 351 dm_target_exit, 352 dm_linear_exit, 353 dm_stripe_exit, 354 dm_io_exit, 355 dm_kcopyd_exit, 356 dm_interface_exit, 357 dm_statistics_exit, 358 }; 359 360 static int __init dm_init(void) 361 { 362 const int count = ARRAY_SIZE(_inits); 363 364 int r, i; 365 366 for (i = 0; i < count; i++) { 367 r = _inits[i](); 368 if (r) 369 goto bad; 370 } 371 372 return 0; 373 374 bad: 375 while (i--) 376 _exits[i](); 377 378 return r; 379 } 380 381 static void __exit dm_exit(void) 382 { 383 int i = ARRAY_SIZE(_exits); 384 385 while (i--) 386 _exits[i](); 387 388 /* 389 * Should be empty by this point. 
390 */ 391 idr_destroy(&_minor_idr); 392 } 393 394 /* 395 * Block device functions 396 */ 397 int dm_deleting_md(struct mapped_device *md) 398 { 399 return test_bit(DMF_DELETING, &md->flags); 400 } 401 402 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 403 { 404 struct mapped_device *md; 405 406 spin_lock(&_minor_lock); 407 408 md = bdev->bd_disk->private_data; 409 if (!md) 410 goto out; 411 412 if (test_bit(DMF_FREEING, &md->flags) || 413 dm_deleting_md(md)) { 414 md = NULL; 415 goto out; 416 } 417 418 dm_get(md); 419 atomic_inc(&md->open_count); 420 421 out: 422 spin_unlock(&_minor_lock); 423 424 return md ? 0 : -ENXIO; 425 } 426 427 static void dm_blk_close(struct gendisk *disk, fmode_t mode) 428 { 429 struct mapped_device *md = disk->private_data; 430 431 spin_lock(&_minor_lock); 432 433 if (atomic_dec_and_test(&md->open_count) && 434 (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) 435 queue_work(deferred_remove_workqueue, &deferred_remove_work); 436 437 dm_put(md); 438 439 spin_unlock(&_minor_lock); 440 } 441 442 int dm_open_count(struct mapped_device *md) 443 { 444 return atomic_read(&md->open_count); 445 } 446 447 /* 448 * Guarantees nothing is using the device before it's deleted. 449 */ 450 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) 451 { 452 int r = 0; 453 454 spin_lock(&_minor_lock); 455 456 if (dm_open_count(md)) { 457 r = -EBUSY; 458 if (mark_deferred) 459 set_bit(DMF_DEFERRED_REMOVE, &md->flags); 460 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) 461 r = -EEXIST; 462 else 463 set_bit(DMF_DELETING, &md->flags); 464 465 spin_unlock(&_minor_lock); 466 467 return r; 468 } 469 470 int dm_cancel_deferred_remove(struct mapped_device *md) 471 { 472 int r = 0; 473 474 spin_lock(&_minor_lock); 475 476 if (test_bit(DMF_DELETING, &md->flags)) 477 r = -EBUSY; 478 else 479 clear_bit(DMF_DEFERRED_REMOVE, &md->flags); 480 481 spin_unlock(&_minor_lock); 482 483 return r; 484 } 485 486 static void do_deferred_remove(struct work_struct *w) 487 { 488 dm_deferred_remove(); 489 } 490 491 sector_t dm_get_size(struct mapped_device *md) 492 { 493 return get_capacity(md->disk); 494 } 495 496 struct request_queue *dm_get_md_queue(struct mapped_device *md) 497 { 498 return md->queue; 499 } 500 501 struct dm_stats *dm_get_stats(struct mapped_device *md) 502 { 503 return &md->stats; 504 } 505 506 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 507 { 508 struct mapped_device *md = bdev->bd_disk->private_data; 509 510 return dm_get_geometry(md, geo); 511 } 512 513 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 514 unsigned int cmd, unsigned long arg) 515 { 516 struct mapped_device *md = bdev->bd_disk->private_data; 517 int srcu_idx; 518 struct dm_table *map; 519 struct dm_target *tgt; 520 int r = -ENOTTY; 521 522 retry: 523 map = dm_get_live_table(md, &srcu_idx); 524 525 if (!map || !dm_table_get_size(map)) 526 goto out; 527 528 /* We only support devices that have a single target */ 529 if (dm_table_get_num_targets(map) != 1) 530 goto out; 531 532 tgt = dm_table_get_target(map, 0); 533 if (!tgt->type->ioctl) 534 goto out; 535 536 if (dm_suspended_md(md)) { 537 r = -EAGAIN; 538 goto out; 539 } 540 541 r = tgt->type->ioctl(tgt, cmd, arg); 542 543 out: 544 dm_put_live_table(md, srcu_idx); 545 546 if (r == -ENOTCONN) { 547 msleep(10); 548 goto retry; 549 } 550 551 return r; 552 } 553 554 static struct dm_io *alloc_io(struct mapped_device *md) 555 { 556 return 
mempool_alloc(md->io_pool, GFP_NOIO); 557 } 558 559 static void free_io(struct mapped_device *md, struct dm_io *io) 560 { 561 mempool_free(io, md->io_pool); 562 } 563 564 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 565 { 566 bio_put(&tio->clone); 567 } 568 569 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 570 gfp_t gfp_mask) 571 { 572 return mempool_alloc(md->io_pool, gfp_mask); 573 } 574 575 static void free_rq_tio(struct dm_rq_target_io *tio) 576 { 577 mempool_free(tio, tio->md->io_pool); 578 } 579 580 static int md_in_flight(struct mapped_device *md) 581 { 582 return atomic_read(&md->pending[READ]) + 583 atomic_read(&md->pending[WRITE]); 584 } 585 586 static void start_io_acct(struct dm_io *io) 587 { 588 struct mapped_device *md = io->md; 589 struct bio *bio = io->bio; 590 int cpu; 591 int rw = bio_data_dir(bio); 592 593 io->start_time = jiffies; 594 595 cpu = part_stat_lock(); 596 part_round_stats(cpu, &dm_disk(md)->part0); 597 part_stat_unlock(); 598 atomic_set(&dm_disk(md)->part0.in_flight[rw], 599 atomic_inc_return(&md->pending[rw])); 600 601 if (unlikely(dm_stats_used(&md->stats))) 602 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, 603 bio_sectors(bio), false, 0, &io->stats_aux); 604 } 605 606 static void end_io_acct(struct dm_io *io) 607 { 608 struct mapped_device *md = io->md; 609 struct bio *bio = io->bio; 610 unsigned long duration = jiffies - io->start_time; 611 int pending; 612 int rw = bio_data_dir(bio); 613 614 generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time); 615 616 if (unlikely(dm_stats_used(&md->stats))) 617 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, 618 bio_sectors(bio), true, duration, &io->stats_aux); 619 620 /* 621 * After this is decremented the bio must not be touched if it is 622 * a flush. 623 */ 624 pending = atomic_dec_return(&md->pending[rw]); 625 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); 626 pending += atomic_read(&md->pending[rw^0x1]); 627 628 /* nudge anyone waiting on suspend queue */ 629 if (!pending) 630 wake_up(&md->wait); 631 } 632 633 /* 634 * Add the bio to the list of deferred io. 635 */ 636 static void queue_io(struct mapped_device *md, struct bio *bio) 637 { 638 unsigned long flags; 639 640 spin_lock_irqsave(&md->deferred_lock, flags); 641 bio_list_add(&md->deferred, bio); 642 spin_unlock_irqrestore(&md->deferred_lock, flags); 643 queue_work(md->wq, &md->work); 644 } 645 646 /* 647 * Everyone (including functions in this file), should use this 648 * function to access the md->map field, and make sure they call 649 * dm_put_live_table() when finished. 650 */ 651 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) 652 { 653 *srcu_idx = srcu_read_lock(&md->io_barrier); 654 655 return srcu_dereference(md->map, &md->io_barrier); 656 } 657 658 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) 659 { 660 srcu_read_unlock(&md->io_barrier, srcu_idx); 661 } 662 663 void dm_sync_table(struct mapped_device *md) 664 { 665 synchronize_srcu(&md->io_barrier); 666 synchronize_rcu_expedited(); 667 } 668 669 /* 670 * A fast alternative to dm_get_live_table/dm_put_live_table. 671 * The caller must not block between these two functions. 
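 * (dm_get_live_table_fast() only takes rcu_read_lock(), so sleeping
 * between the two calls would break the RCU read-side critical section)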
672 */ 673 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) 674 { 675 rcu_read_lock(); 676 return rcu_dereference(md->map); 677 } 678 679 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) 680 { 681 rcu_read_unlock(); 682 } 683 684 /* 685 * Open a table device so we can use it as a map destination. 686 */ 687 static int open_table_device(struct table_device *td, dev_t dev, 688 struct mapped_device *md) 689 { 690 static char *_claim_ptr = "I belong to device-mapper"; 691 struct block_device *bdev; 692 693 int r; 694 695 BUG_ON(td->dm_dev.bdev); 696 697 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr); 698 if (IS_ERR(bdev)) 699 return PTR_ERR(bdev); 700 701 r = bd_link_disk_holder(bdev, dm_disk(md)); 702 if (r) { 703 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); 704 return r; 705 } 706 707 td->dm_dev.bdev = bdev; 708 return 0; 709 } 710 711 /* 712 * Close a table device that we've been using. 713 */ 714 static void close_table_device(struct table_device *td, struct mapped_device *md) 715 { 716 if (!td->dm_dev.bdev) 717 return; 718 719 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); 720 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); 721 td->dm_dev.bdev = NULL; 722 } 723 724 static struct table_device *find_table_device(struct list_head *l, dev_t dev, 725 fmode_t mode) { 726 struct table_device *td; 727 728 list_for_each_entry(td, l, list) 729 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) 730 return td; 731 732 return NULL; 733 } 734 735 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, 736 struct dm_dev **result) { 737 int r; 738 struct table_device *td; 739 740 mutex_lock(&md->table_devices_lock); 741 td = find_table_device(&md->table_devices, dev, mode); 742 if (!td) { 743 td = kmalloc(sizeof(*td), GFP_KERNEL); 744 if (!td) { 745 mutex_unlock(&md->table_devices_lock); 746 return -ENOMEM; 747 } 748 749 td->dm_dev.mode = mode; 750 td->dm_dev.bdev = NULL; 751 752 if ((r = open_table_device(td, dev, md))) { 753 mutex_unlock(&md->table_devices_lock); 754 kfree(td); 755 return r; 756 } 757 758 format_dev_t(td->dm_dev.name, dev); 759 760 atomic_set(&td->count, 0); 761 list_add(&td->list, &md->table_devices); 762 } 763 atomic_inc(&td->count); 764 mutex_unlock(&md->table_devices_lock); 765 766 *result = &td->dm_dev; 767 return 0; 768 } 769 EXPORT_SYMBOL_GPL(dm_get_table_device); 770 771 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) 772 { 773 struct table_device *td = container_of(d, struct table_device, dm_dev); 774 775 mutex_lock(&md->table_devices_lock); 776 if (atomic_dec_and_test(&td->count)) { 777 close_table_device(td, md); 778 list_del(&td->list); 779 kfree(td); 780 } 781 mutex_unlock(&md->table_devices_lock); 782 } 783 EXPORT_SYMBOL(dm_put_table_device); 784 785 static void free_table_devices(struct list_head *devices) 786 { 787 struct list_head *tmp, *next; 788 789 list_for_each_safe(tmp, next, devices) { 790 struct table_device *td = list_entry(tmp, struct table_device, list); 791 792 DMWARN("dm_destroy: %s still exists with %d references", 793 td->dm_dev.name, atomic_read(&td->count)); 794 kfree(td); 795 } 796 } 797 798 /* 799 * Get the geometry associated with a dm device 800 */ 801 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 802 { 803 *geo = md->geometry; 804 805 return 0; 806 } 807 808 /* 809 * Set the geometry of a device. 
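 * Rejects a geometry whose start sector lies beyond the capacity implied
 * by cylinders * heads * sectors.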
810 */ 811 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 812 { 813 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 814 815 if (geo->start > sz) { 816 DMWARN("Start sector is beyond the geometry limits."); 817 return -EINVAL; 818 } 819 820 md->geometry = *geo; 821 822 return 0; 823 } 824 825 /*----------------------------------------------------------------- 826 * CRUD START: 827 * A more elegant soln is in the works that uses the queue 828 * merge fn, unfortunately there are a couple of changes to 829 * the block layer that I want to make for this. So in the 830 * interests of getting something for people to use I give 831 * you this clearly demarcated crap. 832 *---------------------------------------------------------------*/ 833 834 static int __noflush_suspending(struct mapped_device *md) 835 { 836 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 837 } 838 839 /* 840 * Decrements the number of outstanding ios that a bio has been 841 * cloned into, completing the original io if necc. 842 */ 843 static void dec_pending(struct dm_io *io, int error) 844 { 845 unsigned long flags; 846 int io_error; 847 struct bio *bio; 848 struct mapped_device *md = io->md; 849 850 /* Push-back supersedes any I/O errors */ 851 if (unlikely(error)) { 852 spin_lock_irqsave(&io->endio_lock, flags); 853 if (!(io->error > 0 && __noflush_suspending(md))) 854 io->error = error; 855 spin_unlock_irqrestore(&io->endio_lock, flags); 856 } 857 858 if (atomic_dec_and_test(&io->io_count)) { 859 if (io->error == DM_ENDIO_REQUEUE) { 860 /* 861 * Target requested pushing back the I/O. 862 */ 863 spin_lock_irqsave(&md->deferred_lock, flags); 864 if (__noflush_suspending(md)) 865 bio_list_add_head(&md->deferred, io->bio); 866 else 867 /* noflush suspend was interrupted. */ 868 io->error = -EIO; 869 spin_unlock_irqrestore(&md->deferred_lock, flags); 870 } 871 872 io_error = io->error; 873 bio = io->bio; 874 end_io_acct(io); 875 free_io(md, io); 876 877 if (io_error == DM_ENDIO_REQUEUE) 878 return; 879 880 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) { 881 /* 882 * Preflush done for flush with data, reissue 883 * without REQ_FLUSH. 884 */ 885 bio->bi_rw &= ~REQ_FLUSH; 886 queue_io(md, bio); 887 } else { 888 /* done with normal IO or empty flush */ 889 trace_block_bio_complete(md->queue, bio, io_error); 890 bio_endio(bio, io_error); 891 } 892 } 893 } 894 895 static void disable_write_same(struct mapped_device *md) 896 { 897 struct queue_limits *limits = dm_get_queue_limits(md); 898 899 /* device doesn't really support WRITE SAME, disable it */ 900 limits->max_write_same_sectors = 0; 901 } 902 903 static void clone_endio(struct bio *bio, int error) 904 { 905 int r = error; 906 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); 907 struct dm_io *io = tio->io; 908 struct mapped_device *md = tio->io->md; 909 dm_endio_fn endio = tio->ti->type->end_io; 910 911 if (!bio_flagged(bio, BIO_UPTODATE) && !error) 912 error = -EIO; 913 914 if (endio) { 915 r = endio(tio->ti, bio, error); 916 if (r < 0 || r == DM_ENDIO_REQUEUE) 917 /* 918 * error and requeue request are handled 919 * in dec_pending(). 
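			 * (a DM_ENDIO_REQUEUE value is recorded in io->error
			 * and either pushed back onto md->deferred during a
			 * noflush suspend or turned into -EIO by dec_pending())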
			 */
			error = r;
		else if (r == DM_ENDIO_INCOMPLETE)
			/* The target will handle the io */
			return;
		else if (r) {
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) &&
		     !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
		disable_write_same(md);

	free_tio(md, tio);
	dec_pending(io, error);
}

/*
 * Partial completion handling for request-based dm
 */
static void end_clone_bio(struct bio *clone, int error)
{
	struct dm_rq_clone_bio_info *info =
		container_of(clone, struct dm_rq_clone_bio_info, clone);
	struct dm_rq_target_io *tio = info->tio;
	struct bio *bio = info->orig;
	unsigned int nr_bytes = info->orig->bi_iter.bi_size;

	bio_put(clone);

	if (tio->error)
		/*
		 * An error has already been detected on the request.
		 * Once an error has occurred, just let clone->end_io()
		 * handle the remainder.
		 */
		return;
	else if (error) {
		/*
		 * Don't notify the upper layer of the error yet.
		 * The error handling decision is made by the target driver
		 * when the request is completed.
		 */
		tio->error = error;
		return;
	}

	/*
	 * I/O for the bio successfully completed.
	 * Notify the upper layer of the data completion.
	 */

	/*
	 * bios are processed from the head of the list.
	 * So the completing bio should always be rq->bio.
	 * If it's not, something is wrong.
	 */
	if (tio->orig->bio != bio)
		DMERR("bio completion is going in the middle of the request");

	/*
	 * Update the original request.
	 * Do not use blk_end_request() here, because it may complete
	 * the original request before the clone, and break the ordering.
	 */
	blk_update_request(tio->orig, 0, nr_bytes);
}

/*
 * Don't touch any member of the md after calling this function because
 * the md may be freed in dm_put() at the end of this function.
 * Alternatively, do dm_get() before calling this function and dm_put() later.
 */
static void rq_completed(struct mapped_device *md, int rw, int run_queue)
{
	atomic_dec(&md->pending[rw]);

	/* nudge anyone waiting on suspend queue */
	if (!md_in_flight(md))
		wake_up(&md->wait);

	/*
	 * Run this off this callpath, as drivers could invoke end_io while
	 * inside their request_fn (and holding the queue lock). Calling
	 * back into ->request_fn() could deadlock attempting to grab the
	 * queue lock again.
	 */
	if (run_queue)
		blk_run_queue_async(md->queue);

	/*
	 * dm_put() must be at the end of this function. See the comment above.
	 */
	dm_put(md);
}

static void free_rq_clone(struct request *clone)
{
	struct dm_rq_target_io *tio = clone->end_io_data;

	blk_rq_unprep_clone(clone);
	free_rq_tio(tio);
}

/*
 * Complete the clone and the original request.
 * Must be called without queue lock.
 */
static void dm_end_request(struct request *clone, int error)
{
	int rw = rq_data_dir(clone);
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct mapped_device *md = tio->md;
	struct request *rq = tio->orig;

	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
		rq->errors = clone->errors;
		rq->resid_len = clone->resid_len;

		if (rq->sense)
			/*
			 * We are using the sense buffer of the original
			 * request.
1045 * So setting the length of the sense data is enough. 1046 */ 1047 rq->sense_len = clone->sense_len; 1048 } 1049 1050 free_rq_clone(clone); 1051 blk_end_request_all(rq, error); 1052 rq_completed(md, rw, true); 1053 } 1054 1055 static void dm_unprep_request(struct request *rq) 1056 { 1057 struct request *clone = rq->special; 1058 1059 rq->special = NULL; 1060 rq->cmd_flags &= ~REQ_DONTPREP; 1061 1062 free_rq_clone(clone); 1063 } 1064 1065 /* 1066 * Requeue the original request of a clone. 1067 */ 1068 void dm_requeue_unmapped_request(struct request *clone) 1069 { 1070 int rw = rq_data_dir(clone); 1071 struct dm_rq_target_io *tio = clone->end_io_data; 1072 struct mapped_device *md = tio->md; 1073 struct request *rq = tio->orig; 1074 struct request_queue *q = rq->q; 1075 unsigned long flags; 1076 1077 dm_unprep_request(rq); 1078 1079 spin_lock_irqsave(q->queue_lock, flags); 1080 blk_requeue_request(q, rq); 1081 spin_unlock_irqrestore(q->queue_lock, flags); 1082 1083 rq_completed(md, rw, 0); 1084 } 1085 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 1086 1087 static void __stop_queue(struct request_queue *q) 1088 { 1089 blk_stop_queue(q); 1090 } 1091 1092 static void stop_queue(struct request_queue *q) 1093 { 1094 unsigned long flags; 1095 1096 spin_lock_irqsave(q->queue_lock, flags); 1097 __stop_queue(q); 1098 spin_unlock_irqrestore(q->queue_lock, flags); 1099 } 1100 1101 static void __start_queue(struct request_queue *q) 1102 { 1103 if (blk_queue_stopped(q)) 1104 blk_start_queue(q); 1105 } 1106 1107 static void start_queue(struct request_queue *q) 1108 { 1109 unsigned long flags; 1110 1111 spin_lock_irqsave(q->queue_lock, flags); 1112 __start_queue(q); 1113 spin_unlock_irqrestore(q->queue_lock, flags); 1114 } 1115 1116 static void dm_done(struct request *clone, int error, bool mapped) 1117 { 1118 int r = error; 1119 struct dm_rq_target_io *tio = clone->end_io_data; 1120 dm_request_endio_fn rq_end_io = NULL; 1121 1122 if (tio->ti) { 1123 rq_end_io = tio->ti->type->rq_end_io; 1124 1125 if (mapped && rq_end_io) 1126 r = rq_end_io(tio->ti, clone, error, &tio->info); 1127 } 1128 1129 if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) && 1130 !clone->q->limits.max_write_same_sectors)) 1131 disable_write_same(tio->md); 1132 1133 if (r <= 0) 1134 /* The target wants to complete the I/O */ 1135 dm_end_request(clone, r); 1136 else if (r == DM_ENDIO_INCOMPLETE) 1137 /* The target will handle the I/O */ 1138 return; 1139 else if (r == DM_ENDIO_REQUEUE) 1140 /* The target wants to requeue the I/O */ 1141 dm_requeue_unmapped_request(clone); 1142 else { 1143 DMWARN("unimplemented target endio return value: %d", r); 1144 BUG(); 1145 } 1146 } 1147 1148 /* 1149 * Request completion handler for request-based dm 1150 */ 1151 static void dm_softirq_done(struct request *rq) 1152 { 1153 bool mapped = true; 1154 struct request *clone = rq->completion_data; 1155 struct dm_rq_target_io *tio = clone->end_io_data; 1156 1157 if (rq->cmd_flags & REQ_FAILED) 1158 mapped = false; 1159 1160 dm_done(clone, tio->error, mapped); 1161 } 1162 1163 /* 1164 * Complete the clone and the original request with the error status 1165 * through softirq context. 
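 * (dm_complete_request() stashes the clone in rq->completion_data and
 * blk_complete_request() later runs dm_softirq_done() to finish it)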
 */
static void dm_complete_request(struct request *clone, int error)
{
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct request *rq = tio->orig;

	tio->error = error;
	rq->completion_data = clone;
	blk_complete_request(rq);
}

/*
 * Complete the unmapped clone and the original request with the error status
 * through softirq context.
 * The target's rq_end_io() function isn't called.
 * This may be used when the target's map_rq() function fails.
 */
void dm_kill_unmapped_request(struct request *clone, int error)
{
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct request *rq = tio->orig;

	rq->cmd_flags |= REQ_FAILED;
	dm_complete_request(clone, error);
}
EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);

/*
 * Called with the queue lock held.
 */
static void end_clone_request(struct request *clone, int error)
{
	/*
	 * This just cleans up the bookkeeping of the queue in which the
	 * clone was dispatched.
	 * The clone is *NOT* actually freed here, because it was allocated
	 * from dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
	 */
	__blk_put_request(clone->q, clone);

	/*
	 * Actual request completion is done in a softirq context which doesn't
	 * hold the queue lock. Otherwise, deadlock could occur because:
	 * - another request may be submitted by the upper level driver
	 *   of the stacking during the completion
	 * - the submission which requires queue lock may be done
	 *   against this queue
	 */
	dm_complete_request(clone, error);
}

/*
 * Return the maximum size of I/O possible at the supplied sector, up to the
 * current target boundary.
 */
static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
{
	sector_t target_offset = dm_target_offset(ti, sector);

	return ti->len - target_offset;
}

static sector_t max_io_len(sector_t sector, struct dm_target *ti)
{
	sector_t len = max_io_len_target_boundary(sector, ti);
	sector_t offset, max_len;

	/*
	 * Does the target need to split even further?
	 */
	if (ti->max_io_len) {
		offset = dm_target_offset(ti, sector);
		if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
			max_len = sector_div(offset, ti->max_io_len);
		else
			max_len = offset & (ti->max_io_len - 1);
		max_len = ti->max_io_len - max_len;

		if (len > max_len)
			len = max_len;
	}

	return len;
}

int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
{
	if (len > UINT_MAX) {
		DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
		      (unsigned long long)len, UINT_MAX);
		ti->error = "Maximum size of target IO is too large";
		return -EINVAL;
	}

	ti->max_io_len = (uint32_t) len;

	return 0;
}
EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);

/*
 * A target may call dm_accept_partial_bio only from the map routine. It is
 * allowed for all bio types except REQ_FLUSH.
 *
 * dm_accept_partial_bio informs dm that the target only wants to process
 * additional n_sectors sectors of the bio and that the rest of the data should
 * be sent in the next bio.
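 *
 * For example, a hypothetical target's map function that only wants to
 * handle the first 8 sectors of a larger bio could call
 *
 *	dm_accept_partial_bio(bio, 8);
 *
 * before remapping the bio and returning DM_MAPIO_REMAPPED; dm then sends
 * the remaining sectors in a subsequent bio.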
1273 * 1274 * A diagram that explains the arithmetics: 1275 * +--------------------+---------------+-------+ 1276 * | 1 | 2 | 3 | 1277 * +--------------------+---------------+-------+ 1278 * 1279 * <-------------- *tio->len_ptr ---------------> 1280 * <------- bi_size -------> 1281 * <-- n_sectors --> 1282 * 1283 * Region 1 was already iterated over with bio_advance or similar function. 1284 * (it may be empty if the target doesn't use bio_advance) 1285 * Region 2 is the remaining bio size that the target wants to process. 1286 * (it may be empty if region 1 is non-empty, although there is no reason 1287 * to make it empty) 1288 * The target requires that region 3 is to be sent in the next bio. 1289 * 1290 * If the target wants to receive multiple copies of the bio (via num_*bios, etc), 1291 * the partially processed part (the sum of regions 1+2) must be the same for all 1292 * copies of the bio. 1293 */ 1294 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) 1295 { 1296 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); 1297 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; 1298 BUG_ON(bio->bi_rw & REQ_FLUSH); 1299 BUG_ON(bi_size > *tio->len_ptr); 1300 BUG_ON(n_sectors > bi_size); 1301 *tio->len_ptr -= bi_size - n_sectors; 1302 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; 1303 } 1304 EXPORT_SYMBOL_GPL(dm_accept_partial_bio); 1305 1306 static void __map_bio(struct dm_target_io *tio) 1307 { 1308 int r; 1309 sector_t sector; 1310 struct mapped_device *md; 1311 struct bio *clone = &tio->clone; 1312 struct dm_target *ti = tio->ti; 1313 1314 clone->bi_end_io = clone_endio; 1315 1316 /* 1317 * Map the clone. If r == 0 we don't need to do 1318 * anything, the target has assumed ownership of 1319 * this io. 1320 */ 1321 atomic_inc(&tio->io->io_count); 1322 sector = clone->bi_iter.bi_sector; 1323 r = ti->type->map(ti, clone); 1324 if (r == DM_MAPIO_REMAPPED) { 1325 /* the bio has been remapped so dispatch it */ 1326 1327 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, 1328 tio->io->bio->bi_bdev->bd_dev, sector); 1329 1330 generic_make_request(clone); 1331 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1332 /* error the io and bail out, or requeue it if needed */ 1333 md = tio->io->md; 1334 dec_pending(tio->io, r); 1335 free_tio(md, tio); 1336 } else if (r) { 1337 DMWARN("unimplemented target map return value: %d", r); 1338 BUG(); 1339 } 1340 } 1341 1342 struct clone_info { 1343 struct mapped_device *md; 1344 struct dm_table *map; 1345 struct bio *bio; 1346 struct dm_io *io; 1347 sector_t sector; 1348 unsigned sector_count; 1349 }; 1350 1351 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) 1352 { 1353 bio->bi_iter.bi_sector = sector; 1354 bio->bi_iter.bi_size = to_bytes(len); 1355 } 1356 1357 /* 1358 * Creates a bio that consists of range of complete bvecs. 
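 * (the clone shares the original's bvec array via __bio_clone_fast() and
 * is then advanced and trimmed to the requested range)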
1359 */ 1360 static void clone_bio(struct dm_target_io *tio, struct bio *bio, 1361 sector_t sector, unsigned len) 1362 { 1363 struct bio *clone = &tio->clone; 1364 1365 __bio_clone_fast(clone, bio); 1366 1367 if (bio_integrity(bio)) 1368 bio_integrity_clone(clone, bio, GFP_NOIO); 1369 1370 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); 1371 clone->bi_iter.bi_size = to_bytes(len); 1372 1373 if (bio_integrity(bio)) 1374 bio_integrity_trim(clone, 0, len); 1375 } 1376 1377 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1378 struct dm_target *ti, 1379 unsigned target_bio_nr) 1380 { 1381 struct dm_target_io *tio; 1382 struct bio *clone; 1383 1384 clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); 1385 tio = container_of(clone, struct dm_target_io, clone); 1386 1387 tio->io = ci->io; 1388 tio->ti = ti; 1389 tio->target_bio_nr = target_bio_nr; 1390 1391 return tio; 1392 } 1393 1394 static void __clone_and_map_simple_bio(struct clone_info *ci, 1395 struct dm_target *ti, 1396 unsigned target_bio_nr, unsigned *len) 1397 { 1398 struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr); 1399 struct bio *clone = &tio->clone; 1400 1401 tio->len_ptr = len; 1402 1403 __bio_clone_fast(clone, ci->bio); 1404 if (len) 1405 bio_setup_sector(clone, ci->sector, *len); 1406 1407 __map_bio(tio); 1408 } 1409 1410 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, 1411 unsigned num_bios, unsigned *len) 1412 { 1413 unsigned target_bio_nr; 1414 1415 for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++) 1416 __clone_and_map_simple_bio(ci, ti, target_bio_nr, len); 1417 } 1418 1419 static int __send_empty_flush(struct clone_info *ci) 1420 { 1421 unsigned target_nr = 0; 1422 struct dm_target *ti; 1423 1424 BUG_ON(bio_has_data(ci->bio)); 1425 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1426 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); 1427 1428 return 0; 1429 } 1430 1431 static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, 1432 sector_t sector, unsigned *len) 1433 { 1434 struct bio *bio = ci->bio; 1435 struct dm_target_io *tio; 1436 unsigned target_bio_nr; 1437 unsigned num_target_bios = 1; 1438 1439 /* 1440 * Does the target want to receive duplicate copies of the bio? 
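	 * (a target may provide num_write_bios() to request several clones
	 * of a WRITE bio; each clone gets its own dm_target_io)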
1441 */ 1442 if (bio_data_dir(bio) == WRITE && ti->num_write_bios) 1443 num_target_bios = ti->num_write_bios(ti, bio); 1444 1445 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { 1446 tio = alloc_tio(ci, ti, target_bio_nr); 1447 tio->len_ptr = len; 1448 clone_bio(tio, bio, sector, *len); 1449 __map_bio(tio); 1450 } 1451 } 1452 1453 typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); 1454 1455 static unsigned get_num_discard_bios(struct dm_target *ti) 1456 { 1457 return ti->num_discard_bios; 1458 } 1459 1460 static unsigned get_num_write_same_bios(struct dm_target *ti) 1461 { 1462 return ti->num_write_same_bios; 1463 } 1464 1465 typedef bool (*is_split_required_fn)(struct dm_target *ti); 1466 1467 static bool is_split_required_for_discard(struct dm_target *ti) 1468 { 1469 return ti->split_discard_bios; 1470 } 1471 1472 static int __send_changing_extent_only(struct clone_info *ci, 1473 get_num_bios_fn get_num_bios, 1474 is_split_required_fn is_split_required) 1475 { 1476 struct dm_target *ti; 1477 unsigned len; 1478 unsigned num_bios; 1479 1480 do { 1481 ti = dm_table_find_target(ci->map, ci->sector); 1482 if (!dm_target_is_valid(ti)) 1483 return -EIO; 1484 1485 /* 1486 * Even though the device advertised support for this type of 1487 * request, that does not mean every target supports it, and 1488 * reconfiguration might also have changed that since the 1489 * check was performed. 1490 */ 1491 num_bios = get_num_bios ? get_num_bios(ti) : 0; 1492 if (!num_bios) 1493 return -EOPNOTSUPP; 1494 1495 if (is_split_required && !is_split_required(ti)) 1496 len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1497 else 1498 len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); 1499 1500 __send_duplicate_bios(ci, ti, num_bios, &len); 1501 1502 ci->sector += len; 1503 } while (ci->sector_count -= len); 1504 1505 return 0; 1506 } 1507 1508 static int __send_discard(struct clone_info *ci) 1509 { 1510 return __send_changing_extent_only(ci, get_num_discard_bios, 1511 is_split_required_for_discard); 1512 } 1513 1514 static int __send_write_same(struct clone_info *ci) 1515 { 1516 return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); 1517 } 1518 1519 /* 1520 * Select the correct strategy for processing a non-flush bio. 1521 */ 1522 static int __split_and_process_non_flush(struct clone_info *ci) 1523 { 1524 struct bio *bio = ci->bio; 1525 struct dm_target *ti; 1526 unsigned len; 1527 1528 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1529 return __send_discard(ci); 1530 else if (unlikely(bio->bi_rw & REQ_WRITE_SAME)) 1531 return __send_write_same(ci); 1532 1533 ti = dm_table_find_target(ci->map, ci->sector); 1534 if (!dm_target_is_valid(ti)) 1535 return -EIO; 1536 1537 len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); 1538 1539 __clone_and_map_data_bio(ci, ti, ci->sector, &len); 1540 1541 ci->sector += len; 1542 ci->sector_count -= len; 1543 1544 return 0; 1545 } 1546 1547 /* 1548 * Entry point to split a bio into clones and submit them to the targets. 
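 * Flushes are expanded by __send_empty_flush(); all other bios are
 * consumed by repeated calls to __split_and_process_non_flush() until
 * sector_count is exhausted or an error occurs.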
1549 */ 1550 static void __split_and_process_bio(struct mapped_device *md, 1551 struct dm_table *map, struct bio *bio) 1552 { 1553 struct clone_info ci; 1554 int error = 0; 1555 1556 if (unlikely(!map)) { 1557 bio_io_error(bio); 1558 return; 1559 } 1560 1561 ci.map = map; 1562 ci.md = md; 1563 ci.io = alloc_io(md); 1564 ci.io->error = 0; 1565 atomic_set(&ci.io->io_count, 1); 1566 ci.io->bio = bio; 1567 ci.io->md = md; 1568 spin_lock_init(&ci.io->endio_lock); 1569 ci.sector = bio->bi_iter.bi_sector; 1570 1571 start_io_acct(ci.io); 1572 1573 if (bio->bi_rw & REQ_FLUSH) { 1574 ci.bio = &ci.md->flush_bio; 1575 ci.sector_count = 0; 1576 error = __send_empty_flush(&ci); 1577 /* dec_pending submits any data associated with flush */ 1578 } else { 1579 ci.bio = bio; 1580 ci.sector_count = bio_sectors(bio); 1581 while (ci.sector_count && !error) 1582 error = __split_and_process_non_flush(&ci); 1583 } 1584 1585 /* drop the extra reference count */ 1586 dec_pending(ci.io, error); 1587 } 1588 /*----------------------------------------------------------------- 1589 * CRUD END 1590 *---------------------------------------------------------------*/ 1591 1592 static int dm_merge_bvec(struct request_queue *q, 1593 struct bvec_merge_data *bvm, 1594 struct bio_vec *biovec) 1595 { 1596 struct mapped_device *md = q->queuedata; 1597 struct dm_table *map = dm_get_live_table_fast(md); 1598 struct dm_target *ti; 1599 sector_t max_sectors; 1600 int max_size = 0; 1601 1602 if (unlikely(!map)) 1603 goto out; 1604 1605 ti = dm_table_find_target(map, bvm->bi_sector); 1606 if (!dm_target_is_valid(ti)) 1607 goto out; 1608 1609 /* 1610 * Find maximum amount of I/O that won't need splitting 1611 */ 1612 max_sectors = min(max_io_len(bvm->bi_sector, ti), 1613 (sector_t) queue_max_sectors(q)); 1614 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1615 if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */ 1616 max_size = 0; 1617 1618 /* 1619 * merge_bvec_fn() returns number of bytes 1620 * it can accept at this offset 1621 * max is precomputed maximal io size 1622 */ 1623 if (max_size && ti->type->merge) 1624 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1625 /* 1626 * If the target doesn't support merge method and some of the devices 1627 * provided their merge_bvec method (we know this by looking for the 1628 * max_hw_sectors that dm_set_device_limits may set), then we can't 1629 * allow bios with multiple vector entries. So always set max_size 1630 * to 0, and the code below allows just one page. 1631 */ 1632 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1633 max_size = 0; 1634 1635 out: 1636 dm_put_live_table_fast(md); 1637 /* 1638 * Always allow an entire first page 1639 */ 1640 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1641 max_size = biovec->bv_len; 1642 1643 return max_size; 1644 } 1645 1646 /* 1647 * The request function that just remaps the bio built up by 1648 * dm_merge_bvec. 
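 * While the device is suspended, bios other than READA are added to
 * md->deferred via queue_io() and resubmitted later from the per-device
 * workqueue; READA bios are simply failed.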
1649 */ 1650 static void _dm_request(struct request_queue *q, struct bio *bio) 1651 { 1652 int rw = bio_data_dir(bio); 1653 struct mapped_device *md = q->queuedata; 1654 int srcu_idx; 1655 struct dm_table *map; 1656 1657 map = dm_get_live_table(md, &srcu_idx); 1658 1659 generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0); 1660 1661 /* if we're suspended, we have to queue this io for later */ 1662 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1663 dm_put_live_table(md, srcu_idx); 1664 1665 if (bio_rw(bio) != READA) 1666 queue_io(md, bio); 1667 else 1668 bio_io_error(bio); 1669 return; 1670 } 1671 1672 __split_and_process_bio(md, map, bio); 1673 dm_put_live_table(md, srcu_idx); 1674 return; 1675 } 1676 1677 int dm_request_based(struct mapped_device *md) 1678 { 1679 return blk_queue_stackable(md->queue); 1680 } 1681 1682 static void dm_request(struct request_queue *q, struct bio *bio) 1683 { 1684 struct mapped_device *md = q->queuedata; 1685 1686 if (dm_request_based(md)) 1687 blk_queue_bio(q, bio); 1688 else 1689 _dm_request(q, bio); 1690 } 1691 1692 void dm_dispatch_request(struct request *rq) 1693 { 1694 int r; 1695 1696 if (blk_queue_io_stat(rq->q)) 1697 rq->cmd_flags |= REQ_IO_STAT; 1698 1699 rq->start_time = jiffies; 1700 r = blk_insert_cloned_request(rq->q, rq); 1701 if (r) 1702 dm_complete_request(rq, r); 1703 } 1704 EXPORT_SYMBOL_GPL(dm_dispatch_request); 1705 1706 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1707 void *data) 1708 { 1709 struct dm_rq_target_io *tio = data; 1710 struct dm_rq_clone_bio_info *info = 1711 container_of(bio, struct dm_rq_clone_bio_info, clone); 1712 1713 info->orig = bio_orig; 1714 info->tio = tio; 1715 bio->bi_end_io = end_clone_bio; 1716 1717 return 0; 1718 } 1719 1720 static int setup_clone(struct request *clone, struct request *rq, 1721 struct dm_rq_target_io *tio) 1722 { 1723 int r; 1724 1725 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1726 dm_rq_bio_constructor, tio); 1727 if (r) 1728 return r; 1729 1730 clone->cmd = rq->cmd; 1731 clone->cmd_len = rq->cmd_len; 1732 clone->sense = rq->sense; 1733 clone->end_io = end_clone_request; 1734 clone->end_io_data = tio; 1735 1736 return 0; 1737 } 1738 1739 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1740 gfp_t gfp_mask) 1741 { 1742 struct request *clone; 1743 struct dm_rq_target_io *tio; 1744 1745 tio = alloc_rq_tio(md, gfp_mask); 1746 if (!tio) 1747 return NULL; 1748 1749 tio->md = md; 1750 tio->ti = NULL; 1751 tio->orig = rq; 1752 tio->error = 0; 1753 memset(&tio->info, 0, sizeof(tio->info)); 1754 1755 clone = &tio->clone; 1756 if (setup_clone(clone, rq, tio)) { 1757 /* -ENOMEM */ 1758 free_rq_tio(tio); 1759 return NULL; 1760 } 1761 1762 return clone; 1763 } 1764 1765 /* 1766 * Called with the queue lock held. 
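 * (clone_rq() uses GFP_ATOMIC here; if the allocation fails the request
 * stays on the queue and prep is retried later via BLKPREP_DEFER)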
1767 */ 1768 static int dm_prep_fn(struct request_queue *q, struct request *rq) 1769 { 1770 struct mapped_device *md = q->queuedata; 1771 struct request *clone; 1772 1773 if (unlikely(rq->special)) { 1774 DMWARN("Already has something in rq->special."); 1775 return BLKPREP_KILL; 1776 } 1777 1778 clone = clone_rq(rq, md, GFP_ATOMIC); 1779 if (!clone) 1780 return BLKPREP_DEFER; 1781 1782 rq->special = clone; 1783 rq->cmd_flags |= REQ_DONTPREP; 1784 1785 return BLKPREP_OK; 1786 } 1787 1788 /* 1789 * Returns: 1790 * 0 : the request has been processed (not requeued) 1791 * !0 : the request has been requeued 1792 */ 1793 static int map_request(struct dm_target *ti, struct request *clone, 1794 struct mapped_device *md) 1795 { 1796 int r, requeued = 0; 1797 struct dm_rq_target_io *tio = clone->end_io_data; 1798 1799 tio->ti = ti; 1800 r = ti->type->map_rq(ti, clone, &tio->info); 1801 switch (r) { 1802 case DM_MAPIO_SUBMITTED: 1803 /* The target has taken the I/O to submit by itself later */ 1804 break; 1805 case DM_MAPIO_REMAPPED: 1806 /* The target has remapped the I/O so dispatch it */ 1807 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1808 blk_rq_pos(tio->orig)); 1809 dm_dispatch_request(clone); 1810 break; 1811 case DM_MAPIO_REQUEUE: 1812 /* The target wants to requeue the I/O */ 1813 dm_requeue_unmapped_request(clone); 1814 requeued = 1; 1815 break; 1816 default: 1817 if (r > 0) { 1818 DMWARN("unimplemented target map return value: %d", r); 1819 BUG(); 1820 } 1821 1822 /* The target wants to complete the I/O */ 1823 dm_kill_unmapped_request(clone, r); 1824 break; 1825 } 1826 1827 return requeued; 1828 } 1829 1830 static struct request *dm_start_request(struct mapped_device *md, struct request *orig) 1831 { 1832 struct request *clone; 1833 1834 blk_start_request(orig); 1835 clone = orig->special; 1836 atomic_inc(&md->pending[rq_data_dir(clone)]); 1837 1838 /* 1839 * Hold the md reference here for the in-flight I/O. 1840 * We can't rely on the reference count by device opener, 1841 * because the device may be closed during the request completion 1842 * when all bios are completed. 1843 * See the comment in rq_completed() too. 1844 */ 1845 dm_get(md); 1846 1847 return clone; 1848 } 1849 1850 /* 1851 * q->request_fn for request-based dm. 1852 * Called with the queue lock held. 1853 */ 1854 static void dm_request_fn(struct request_queue *q) 1855 { 1856 struct mapped_device *md = q->queuedata; 1857 int srcu_idx; 1858 struct dm_table *map = dm_get_live_table(md, &srcu_idx); 1859 struct dm_target *ti; 1860 struct request *rq, *clone; 1861 sector_t pos; 1862 1863 /* 1864 * For suspend, check blk_queue_stopped() and increment 1865 * ->pending within a single queue_lock not to increment the 1866 * number of in-flight I/Os after the queue is stopped in 1867 * dm_suspend(). 
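	 * The queue lock is dropped only around map_request() and re-taken
	 * before the next loop iteration.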
1868 */ 1869 while (!blk_queue_stopped(q)) { 1870 rq = blk_peek_request(q); 1871 if (!rq) 1872 goto delay_and_out; 1873 1874 /* always use block 0 to find the target for flushes for now */ 1875 pos = 0; 1876 if (!(rq->cmd_flags & REQ_FLUSH)) 1877 pos = blk_rq_pos(rq); 1878 1879 ti = dm_table_find_target(map, pos); 1880 if (!dm_target_is_valid(ti)) { 1881 /* 1882 * Must perform setup, that dm_done() requires, 1883 * before calling dm_kill_unmapped_request 1884 */ 1885 DMERR_LIMIT("request attempted access beyond the end of device"); 1886 clone = dm_start_request(md, rq); 1887 dm_kill_unmapped_request(clone, -EIO); 1888 continue; 1889 } 1890 1891 if (ti->type->busy && ti->type->busy(ti)) 1892 goto delay_and_out; 1893 1894 clone = dm_start_request(md, rq); 1895 1896 spin_unlock(q->queue_lock); 1897 if (map_request(ti, clone, md)) 1898 goto requeued; 1899 1900 BUG_ON(!irqs_disabled()); 1901 spin_lock(q->queue_lock); 1902 } 1903 1904 goto out; 1905 1906 requeued: 1907 BUG_ON(!irqs_disabled()); 1908 spin_lock(q->queue_lock); 1909 1910 delay_and_out: 1911 blk_delay_queue(q, HZ / 10); 1912 out: 1913 dm_put_live_table(md, srcu_idx); 1914 } 1915 1916 int dm_underlying_device_busy(struct request_queue *q) 1917 { 1918 return blk_lld_busy(q); 1919 } 1920 EXPORT_SYMBOL_GPL(dm_underlying_device_busy); 1921 1922 static int dm_lld_busy(struct request_queue *q) 1923 { 1924 int r; 1925 struct mapped_device *md = q->queuedata; 1926 struct dm_table *map = dm_get_live_table_fast(md); 1927 1928 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1929 r = 1; 1930 else 1931 r = dm_table_any_busy_target(map); 1932 1933 dm_put_live_table_fast(md); 1934 1935 return r; 1936 } 1937 1938 static int dm_any_congested(void *congested_data, int bdi_bits) 1939 { 1940 int r = bdi_bits; 1941 struct mapped_device *md = congested_data; 1942 struct dm_table *map; 1943 1944 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1945 map = dm_get_live_table_fast(md); 1946 if (map) { 1947 /* 1948 * Request-based dm cares about only own queue for 1949 * the query about congestion status of request_queue 1950 */ 1951 if (dm_request_based(md)) 1952 r = md->queue->backing_dev_info.state & 1953 bdi_bits; 1954 else 1955 r = dm_table_any_congested(map, bdi_bits); 1956 } 1957 dm_put_live_table_fast(md); 1958 } 1959 1960 return r; 1961 } 1962 1963 /*----------------------------------------------------------------- 1964 * An IDR is used to keep track of allocated minor numbers. 1965 *---------------------------------------------------------------*/ 1966 static void free_minor(int minor) 1967 { 1968 spin_lock(&_minor_lock); 1969 idr_remove(&_minor_idr, minor); 1970 spin_unlock(&_minor_lock); 1971 } 1972 1973 /* 1974 * See if the device with a specific minor # is free. 1975 */ 1976 static int specific_minor(int minor) 1977 { 1978 int r; 1979 1980 if (minor >= (1 << MINORBITS)) 1981 return -EINVAL; 1982 1983 idr_preload(GFP_KERNEL); 1984 spin_lock(&_minor_lock); 1985 1986 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); 1987 1988 spin_unlock(&_minor_lock); 1989 idr_preload_end(); 1990 if (r < 0) 1991 return r == -ENOSPC ? 
-EBUSY : r; 1992 return 0; 1993 } 1994 1995 static int next_free_minor(int *minor) 1996 { 1997 int r; 1998 1999 idr_preload(GFP_KERNEL); 2000 spin_lock(&_minor_lock); 2001 2002 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); 2003 2004 spin_unlock(&_minor_lock); 2005 idr_preload_end(); 2006 if (r < 0) 2007 return r; 2008 *minor = r; 2009 return 0; 2010 } 2011 2012 static const struct block_device_operations dm_blk_dops; 2013 2014 static void dm_wq_work(struct work_struct *work); 2015 2016 static void dm_init_md_queue(struct mapped_device *md) 2017 { 2018 /* 2019 * Request-based dm devices cannot be stacked on top of bio-based dm 2020 * devices. The type of this dm device has not been decided yet. 2021 * The type is decided at the first table loading time. 2022 * To prevent problematic device stacking, clear the queue flag 2023 * for request stacking support until then. 2024 * 2025 * This queue is new, so no concurrency on the queue_flags. 2026 */ 2027 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 2028 2029 md->queue->queuedata = md; 2030 md->queue->backing_dev_info.congested_fn = dm_any_congested; 2031 md->queue->backing_dev_info.congested_data = md; 2032 blk_queue_make_request(md->queue, dm_request); 2033 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 2034 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 2035 } 2036 2037 /* 2038 * Allocate and initialise a blank device with a given minor. 2039 */ 2040 static struct mapped_device *alloc_dev(int minor) 2041 { 2042 int r; 2043 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 2044 void *old_md; 2045 2046 if (!md) { 2047 DMWARN("unable to allocate device, out of memory."); 2048 return NULL; 2049 } 2050 2051 if (!try_module_get(THIS_MODULE)) 2052 goto bad_module_get; 2053 2054 /* get a minor number for the dev */ 2055 if (minor == DM_ANY_MINOR) 2056 r = next_free_minor(&minor); 2057 else 2058 r = specific_minor(minor); 2059 if (r < 0) 2060 goto bad_minor; 2061 2062 r = init_srcu_struct(&md->io_barrier); 2063 if (r < 0) 2064 goto bad_io_barrier; 2065 2066 md->type = DM_TYPE_NONE; 2067 mutex_init(&md->suspend_lock); 2068 mutex_init(&md->type_lock); 2069 mutex_init(&md->table_devices_lock); 2070 spin_lock_init(&md->deferred_lock); 2071 atomic_set(&md->holders, 1); 2072 atomic_set(&md->open_count, 0); 2073 atomic_set(&md->event_nr, 0); 2074 atomic_set(&md->uevent_seq, 0); 2075 INIT_LIST_HEAD(&md->uevent_list); 2076 INIT_LIST_HEAD(&md->table_devices); 2077 spin_lock_init(&md->uevent_lock); 2078 2079 md->queue = blk_alloc_queue(GFP_KERNEL); 2080 if (!md->queue) 2081 goto bad_queue; 2082 2083 dm_init_md_queue(md); 2084 2085 md->disk = alloc_disk(1); 2086 if (!md->disk) 2087 goto bad_disk; 2088 2089 atomic_set(&md->pending[0], 0); 2090 atomic_set(&md->pending[1], 0); 2091 init_waitqueue_head(&md->wait); 2092 INIT_WORK(&md->work, dm_wq_work); 2093 init_waitqueue_head(&md->eventq); 2094 init_completion(&md->kobj_holder.completion); 2095 2096 md->disk->major = _major; 2097 md->disk->first_minor = minor; 2098 md->disk->fops = &dm_blk_dops; 2099 md->disk->queue = md->queue; 2100 md->disk->private_data = md; 2101 sprintf(md->disk->disk_name, "dm-%d", minor); 2102 add_disk(md->disk); 2103 format_dev_t(md->name, MKDEV(_major, minor)); 2104 2105 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); 2106 if (!md->wq) 2107 goto bad_thread; 2108 2109 md->bdev = bdget_disk(md->disk, 0); 2110 if (!md->bdev) 2111 goto bad_bdev; 2112 2113 bio_init(&md->flush_bio); 2114 md->flush_bio.bi_bdev = md->bdev; 2115 
md->flush_bio.bi_rw = WRITE_FLUSH; 2116 2117 dm_stats_init(&md->stats); 2118 2119 /* Populate the mapping, nobody knows we exist yet */ 2120 spin_lock(&_minor_lock); 2121 old_md = idr_replace(&_minor_idr, md, minor); 2122 spin_unlock(&_minor_lock); 2123 2124 BUG_ON(old_md != MINOR_ALLOCED); 2125 2126 return md; 2127 2128 bad_bdev: 2129 destroy_workqueue(md->wq); 2130 bad_thread: 2131 del_gendisk(md->disk); 2132 put_disk(md->disk); 2133 bad_disk: 2134 blk_cleanup_queue(md->queue); 2135 bad_queue: 2136 cleanup_srcu_struct(&md->io_barrier); 2137 bad_io_barrier: 2138 free_minor(minor); 2139 bad_minor: 2140 module_put(THIS_MODULE); 2141 bad_module_get: 2142 kfree(md); 2143 return NULL; 2144 } 2145 2146 static void unlock_fs(struct mapped_device *md); 2147 2148 static void free_dev(struct mapped_device *md) 2149 { 2150 int minor = MINOR(disk_devt(md->disk)); 2151 2152 unlock_fs(md); 2153 bdput(md->bdev); 2154 destroy_workqueue(md->wq); 2155 if (md->io_pool) 2156 mempool_destroy(md->io_pool); 2157 if (md->bs) 2158 bioset_free(md->bs); 2159 blk_integrity_unregister(md->disk); 2160 del_gendisk(md->disk); 2161 cleanup_srcu_struct(&md->io_barrier); 2162 free_table_devices(&md->table_devices); 2163 free_minor(minor); 2164 2165 spin_lock(&_minor_lock); 2166 md->disk->private_data = NULL; 2167 spin_unlock(&_minor_lock); 2168 2169 put_disk(md->disk); 2170 blk_cleanup_queue(md->queue); 2171 dm_stats_cleanup(&md->stats); 2172 module_put(THIS_MODULE); 2173 kfree(md); 2174 } 2175 2176 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 2177 { 2178 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 2179 2180 if (md->io_pool && md->bs) { 2181 /* The md already has necessary mempools. */ 2182 if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) { 2183 /* 2184 * Reload bioset because front_pad may have changed 2185 * because a different table was loaded. 2186 */ 2187 bioset_free(md->bs); 2188 md->bs = p->bs; 2189 p->bs = NULL; 2190 } else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) { 2191 /* 2192 * There's no need to reload with request-based dm 2193 * because the size of front_pad doesn't change. 2194 * Note for future: If you are to reload bioset, 2195 * prep-ed requests in the queue may refer 2196 * to bio from the old bioset, so you must walk 2197 * through the queue to unprep. 2198 */ 2199 } 2200 goto out; 2201 } 2202 2203 BUG_ON(!p || md->io_pool || md->bs); 2204 2205 md->io_pool = p->io_pool; 2206 p->io_pool = NULL; 2207 md->bs = p->bs; 2208 p->bs = NULL; 2209 2210 out: 2211 /* mempool bind completed, now no need any mempools in the table */ 2212 dm_table_free_md_mempools(t); 2213 } 2214 2215 /* 2216 * Bind a table to the device. 2217 */ 2218 static void event_callback(void *context) 2219 { 2220 unsigned long flags; 2221 LIST_HEAD(uevents); 2222 struct mapped_device *md = (struct mapped_device *) context; 2223 2224 spin_lock_irqsave(&md->uevent_lock, flags); 2225 list_splice_init(&md->uevent_list, &uevents); 2226 spin_unlock_irqrestore(&md->uevent_lock, flags); 2227 2228 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 2229 2230 atomic_inc(&md->event_nr); 2231 wake_up(&md->eventq); 2232 } 2233 2234 /* 2235 * Protected by md->suspend_lock obtained by dm_swap_table(). 2236 */ 2237 static void __set_size(struct mapped_device *md, sector_t size) 2238 { 2239 set_capacity(md->disk, size); 2240 2241 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 2242 } 2243 2244 /* 2245 * Return 1 if the queue has a compulsory merge_bvec_fn function. 
2246 * 2247 * If this function returns 0, then the device is either a non-dm 2248 * device without a merge_bvec_fn, or it is a dm device that is 2249 * able to split any bios it receives that are too big. 2250 */ 2251 int dm_queue_merge_is_compulsory(struct request_queue *q) 2252 { 2253 struct mapped_device *dev_md; 2254 2255 if (!q->merge_bvec_fn) 2256 return 0; 2257 2258 if (q->make_request_fn == dm_request) { 2259 dev_md = q->queuedata; 2260 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) 2261 return 0; 2262 } 2263 2264 return 1; 2265 } 2266 2267 static int dm_device_merge_is_compulsory(struct dm_target *ti, 2268 struct dm_dev *dev, sector_t start, 2269 sector_t len, void *data) 2270 { 2271 struct block_device *bdev = dev->bdev; 2272 struct request_queue *q = bdev_get_queue(bdev); 2273 2274 return dm_queue_merge_is_compulsory(q); 2275 } 2276 2277 /* 2278 * Return 1 if it is acceptable to ignore merge_bvec_fn based 2279 * on the properties of the underlying devices. 2280 */ 2281 static int dm_table_merge_is_optional(struct dm_table *table) 2282 { 2283 unsigned i = 0; 2284 struct dm_target *ti; 2285 2286 while (i < dm_table_get_num_targets(table)) { 2287 ti = dm_table_get_target(table, i++); 2288 2289 if (ti->type->iterate_devices && 2290 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) 2291 return 0; 2292 } 2293 2294 return 1; 2295 } 2296 2297 /* 2298 * Returns old map, which caller must destroy. 2299 */ 2300 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2301 struct queue_limits *limits) 2302 { 2303 struct dm_table *old_map; 2304 struct request_queue *q = md->queue; 2305 sector_t size; 2306 int merge_is_optional; 2307 2308 size = dm_table_get_size(t); 2309 2310 /* 2311 * Wipe any geometry if the size of the table changed. 2312 */ 2313 if (size != dm_get_size(md)) 2314 memset(&md->geometry, 0, sizeof(md->geometry)); 2315 2316 __set_size(md, size); 2317 2318 dm_table_event_callback(t, event_callback, md); 2319 2320 /* 2321 * The queue hasn't been stopped yet, if the old table type wasn't 2322 * for request-based during suspension. So stop it to prevent 2323 * I/O mapping before resume. 2324 * This must be done before setting the queue restrictions, 2325 * because request-based dm may be run just after the setting. 2326 */ 2327 if (dm_table_request_based(t) && !blk_queue_stopped(q)) 2328 stop_queue(q); 2329 2330 __bind_mempools(md, t); 2331 2332 merge_is_optional = dm_table_merge_is_optional(t); 2333 2334 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2335 rcu_assign_pointer(md->map, t); 2336 md->immutable_target_type = dm_table_get_immutable_target_type(t); 2337 2338 dm_table_set_restrictions(t, q, limits); 2339 if (merge_is_optional) 2340 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2341 else 2342 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2343 if (old_map) 2344 dm_sync_table(md); 2345 2346 return old_map; 2347 } 2348 2349 /* 2350 * Returns unbound table for the caller to free. 2351 */ 2352 static struct dm_table *__unbind(struct mapped_device *md) 2353 { 2354 struct dm_table *map = rcu_dereference_protected(md->map, 1); 2355 2356 if (!map) 2357 return NULL; 2358 2359 dm_table_event_callback(map, NULL, NULL); 2360 RCU_INIT_POINTER(md->map, NULL); 2361 dm_sync_table(md); 2362 2363 return map; 2364 } 2365 2366 /* 2367 * Constructor for a new device. 
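 * The caller gets the initial reference on the md (md->holders is set to 1
 * in alloc_dev()); see dm_get()/dm_put() and dm_destroy() below for how
 * references are managed and the device is finally torn down.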
2368 */ 2369 int dm_create(int minor, struct mapped_device **result) 2370 { 2371 struct mapped_device *md; 2372 2373 md = alloc_dev(minor); 2374 if (!md) 2375 return -ENXIO; 2376 2377 dm_sysfs_init(md); 2378 2379 *result = md; 2380 return 0; 2381 } 2382 2383 /* 2384 * Functions to manage md->type. 2385 * All are required to hold md->type_lock. 2386 */ 2387 void dm_lock_md_type(struct mapped_device *md) 2388 { 2389 mutex_lock(&md->type_lock); 2390 } 2391 2392 void dm_unlock_md_type(struct mapped_device *md) 2393 { 2394 mutex_unlock(&md->type_lock); 2395 } 2396 2397 void dm_set_md_type(struct mapped_device *md, unsigned type) 2398 { 2399 BUG_ON(!mutex_is_locked(&md->type_lock)); 2400 md->type = type; 2401 } 2402 2403 unsigned dm_get_md_type(struct mapped_device *md) 2404 { 2405 BUG_ON(!mutex_is_locked(&md->type_lock)); 2406 return md->type; 2407 } 2408 2409 struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2410 { 2411 return md->immutable_target_type; 2412 } 2413 2414 /* 2415 * The queue_limits are only valid as long as you have a reference 2416 * count on 'md'. 2417 */ 2418 struct queue_limits *dm_get_queue_limits(struct mapped_device *md) 2419 { 2420 BUG_ON(!atomic_read(&md->holders)); 2421 return &md->queue->limits; 2422 } 2423 EXPORT_SYMBOL_GPL(dm_get_queue_limits); 2424 2425 /* 2426 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 2427 */ 2428 static int dm_init_request_based_queue(struct mapped_device *md) 2429 { 2430 struct request_queue *q = NULL; 2431 2432 if (md->queue->elevator) 2433 return 1; 2434 2435 /* Fully initialize the queue */ 2436 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); 2437 if (!q) 2438 return 0; 2439 2440 md->queue = q; 2441 dm_init_md_queue(md); 2442 blk_queue_softirq_done(md->queue, dm_softirq_done); 2443 blk_queue_prep_rq(md->queue, dm_prep_fn); 2444 blk_queue_lld_busy(md->queue, dm_lld_busy); 2445 2446 elv_register_queue(md->queue); 2447 2448 return 1; 2449 } 2450 2451 /* 2452 * Setup the DM device's queue based on md's type 2453 */ 2454 int dm_setup_md_queue(struct mapped_device *md) 2455 { 2456 if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && 2457 !dm_init_request_based_queue(md)) { 2458 DMWARN("Cannot initialize queue for request-based mapped device"); 2459 return -EINVAL; 2460 } 2461 2462 return 0; 2463 } 2464 2465 static struct mapped_device *dm_find_md(dev_t dev) 2466 { 2467 struct mapped_device *md; 2468 unsigned minor = MINOR(dev); 2469 2470 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2471 return NULL; 2472 2473 spin_lock(&_minor_lock); 2474 2475 md = idr_find(&_minor_idr, minor); 2476 if (md && (md == MINOR_ALLOCED || 2477 (MINOR(disk_devt(dm_disk(md))) != minor) || 2478 dm_deleting_md(md) || 2479 test_bit(DMF_FREEING, &md->flags))) { 2480 md = NULL; 2481 goto out; 2482 } 2483 2484 out: 2485 spin_unlock(&_minor_lock); 2486 2487 return md; 2488 } 2489 2490 struct mapped_device *dm_get_md(dev_t dev) 2491 { 2492 struct mapped_device *md = dm_find_md(dev); 2493 2494 if (md) 2495 dm_get(md); 2496 2497 return md; 2498 } 2499 EXPORT_SYMBOL_GPL(dm_get_md); 2500 2501 void *dm_get_mdptr(struct mapped_device *md) 2502 { 2503 return md->interface_ptr; 2504 } 2505 2506 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2507 { 2508 md->interface_ptr = ptr; 2509 } 2510 2511 void dm_get(struct mapped_device *md) 2512 { 2513 atomic_inc(&md->holders); 2514 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2515 } 2516 2517 const char *dm_device_name(struct mapped_device *md) 2518 { 2519 
return md->name; 2520 } 2521 EXPORT_SYMBOL_GPL(dm_device_name); 2522 2523 static void __dm_destroy(struct mapped_device *md, bool wait) 2524 { 2525 struct dm_table *map; 2526 int srcu_idx; 2527 2528 might_sleep(); 2529 2530 spin_lock(&_minor_lock); 2531 map = dm_get_live_table(md, &srcu_idx); 2532 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2533 set_bit(DMF_FREEING, &md->flags); 2534 spin_unlock(&_minor_lock); 2535 2536 if (!dm_suspended_md(md)) { 2537 dm_table_presuspend_targets(map); 2538 dm_table_postsuspend_targets(map); 2539 } 2540 2541 /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ 2542 dm_put_live_table(md, srcu_idx); 2543 2544 /* 2545 * Rare, but there may be I/O requests still going to complete, 2546 * for example. Wait for all references to disappear. 2547 * No one should increment the reference count of the mapped_device, 2548 * after the mapped_device state becomes DMF_FREEING. 2549 */ 2550 if (wait) 2551 while (atomic_read(&md->holders)) 2552 msleep(1); 2553 else if (atomic_read(&md->holders)) 2554 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", 2555 dm_device_name(md), atomic_read(&md->holders)); 2556 2557 dm_sysfs_exit(md); 2558 dm_table_destroy(__unbind(md)); 2559 free_dev(md); 2560 } 2561 2562 void dm_destroy(struct mapped_device *md) 2563 { 2564 __dm_destroy(md, true); 2565 } 2566 2567 void dm_destroy_immediate(struct mapped_device *md) 2568 { 2569 __dm_destroy(md, false); 2570 } 2571 2572 void dm_put(struct mapped_device *md) 2573 { 2574 atomic_dec(&md->holders); 2575 } 2576 EXPORT_SYMBOL_GPL(dm_put); 2577 2578 static int dm_wait_for_completion(struct mapped_device *md, int interruptible) 2579 { 2580 int r = 0; 2581 DECLARE_WAITQUEUE(wait, current); 2582 2583 add_wait_queue(&md->wait, &wait); 2584 2585 while (1) { 2586 set_current_state(interruptible); 2587 2588 if (!md_in_flight(md)) 2589 break; 2590 2591 if (interruptible == TASK_INTERRUPTIBLE && 2592 signal_pending(current)) { 2593 r = -EINTR; 2594 break; 2595 } 2596 2597 io_schedule(); 2598 } 2599 set_current_state(TASK_RUNNING); 2600 2601 remove_wait_queue(&md->wait, &wait); 2602 2603 return r; 2604 } 2605 2606 /* 2607 * Process the deferred bios 2608 */ 2609 static void dm_wq_work(struct work_struct *work) 2610 { 2611 struct mapped_device *md = container_of(work, struct mapped_device, 2612 work); 2613 struct bio *c; 2614 int srcu_idx; 2615 struct dm_table *map; 2616 2617 map = dm_get_live_table(md, &srcu_idx); 2618 2619 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2620 spin_lock_irq(&md->deferred_lock); 2621 c = bio_list_pop(&md->deferred); 2622 spin_unlock_irq(&md->deferred_lock); 2623 2624 if (!c) 2625 break; 2626 2627 if (dm_request_based(md)) 2628 generic_make_request(c); 2629 else 2630 __split_and_process_bio(md, map, c); 2631 } 2632 2633 dm_put_live_table(md, srcu_idx); 2634 } 2635 2636 static void dm_queue_flush(struct mapped_device *md) 2637 { 2638 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2639 smp_mb__after_atomic(); 2640 queue_work(md->wq, &md->work); 2641 } 2642 2643 /* 2644 * Swap in a new table, returning the old one for the caller to destroy. 
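 * The device must already be suspended; if it is not, ERR_PTR(-EINVAL) is
 * returned, so callers should check the result with IS_ERR() before using
 * or destroying it.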
2645 */ 2646 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2647 { 2648 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); 2649 struct queue_limits limits; 2650 int r; 2651 2652 mutex_lock(&md->suspend_lock); 2653 2654 /* device must be suspended */ 2655 if (!dm_suspended_md(md)) 2656 goto out; 2657 2658 /* 2659 * If the new table has no data devices, retain the existing limits. 2660 * This helps multipath with queue_if_no_path if all paths disappear, 2661 * then new I/O is queued based on these limits, and then some paths 2662 * reappear. 2663 */ 2664 if (dm_table_has_no_data_devices(table)) { 2665 live_map = dm_get_live_table_fast(md); 2666 if (live_map) 2667 limits = md->queue->limits; 2668 dm_put_live_table_fast(md); 2669 } 2670 2671 if (!live_map) { 2672 r = dm_calculate_queue_limits(table, &limits); 2673 if (r) { 2674 map = ERR_PTR(r); 2675 goto out; 2676 } 2677 } 2678 2679 map = __bind(md, table, &limits); 2680 2681 out: 2682 mutex_unlock(&md->suspend_lock); 2683 return map; 2684 } 2685 2686 /* 2687 * Functions to lock and unlock any filesystem running on the 2688 * device. 2689 */ 2690 static int lock_fs(struct mapped_device *md) 2691 { 2692 int r; 2693 2694 WARN_ON(md->frozen_sb); 2695 2696 md->frozen_sb = freeze_bdev(md->bdev); 2697 if (IS_ERR(md->frozen_sb)) { 2698 r = PTR_ERR(md->frozen_sb); 2699 md->frozen_sb = NULL; 2700 return r; 2701 } 2702 2703 set_bit(DMF_FROZEN, &md->flags); 2704 2705 return 0; 2706 } 2707 2708 static void unlock_fs(struct mapped_device *md) 2709 { 2710 if (!test_bit(DMF_FROZEN, &md->flags)) 2711 return; 2712 2713 thaw_bdev(md->bdev, md->frozen_sb); 2714 md->frozen_sb = NULL; 2715 clear_bit(DMF_FROZEN, &md->flags); 2716 } 2717 2718 /* 2719 * If __dm_suspend returns 0, the device is completely quiescent 2720 * now. There is no request-processing activity. All new requests 2721 * are being added to md->deferred list. 2722 * 2723 * Caller must hold md->suspend_lock 2724 */ 2725 static int __dm_suspend(struct mapped_device *md, struct dm_table *map, 2726 unsigned suspend_flags, int interruptible) 2727 { 2728 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; 2729 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; 2730 int r; 2731 2732 /* 2733 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2734 * This flag is cleared before dm_suspend returns. 2735 */ 2736 if (noflush) 2737 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2738 2739 /* 2740 * This gets reverted if there's an error later and the targets 2741 * provide the .presuspend_undo hook. 2742 */ 2743 dm_table_presuspend_targets(map); 2744 2745 /* 2746 * Flush I/O to the device. 2747 * Any I/O submitted after lock_fs() may not be flushed. 2748 * noflush takes precedence over do_lockfs. 2749 * (lock_fs() flushes I/Os and waits for them to complete.) 2750 */ 2751 if (!noflush && do_lockfs) { 2752 r = lock_fs(md); 2753 if (r) { 2754 dm_table_presuspend_undo_targets(map); 2755 return r; 2756 } 2757 } 2758 2759 /* 2760 * Here we must make sure that no processes are submitting requests 2761 * to target drivers i.e. no one may be executing 2762 * __split_and_process_bio. This is called from dm_request and 2763 * dm_wq_work. 2764 * 2765 * To get all processes out of __split_and_process_bio in dm_request, 2766 * we take the write lock. To prevent any process from reentering 2767 * __split_and_process_bio from dm_request and quiesce the thread 2768 * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call 2769 * flush_workqueue(md->wq). 
2770 */ 2771 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2772 if (map) 2773 synchronize_srcu(&md->io_barrier); 2774 2775 /* 2776 * Stop md->queue before flushing md->wq in case request-based 2777 * dm defers requests to md->wq from md->queue. 2778 */ 2779 if (dm_request_based(md)) 2780 stop_queue(md->queue); 2781 2782 flush_workqueue(md->wq); 2783 2784 /* 2785 * At this point no more requests are entering target request routines. 2786 * We call dm_wait_for_completion to wait for all existing requests 2787 * to finish. 2788 */ 2789 r = dm_wait_for_completion(md, interruptible); 2790 2791 if (noflush) 2792 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2793 if (map) 2794 synchronize_srcu(&md->io_barrier); 2795 2796 /* were we interrupted ? */ 2797 if (r < 0) { 2798 dm_queue_flush(md); 2799 2800 if (dm_request_based(md)) 2801 start_queue(md->queue); 2802 2803 unlock_fs(md); 2804 dm_table_presuspend_undo_targets(map); 2805 /* pushback list is already flushed, so skip flush */ 2806 } 2807 2808 return r; 2809 } 2810 2811 /* 2812 * We need to be able to change a mapping table under a mounted 2813 * filesystem. For example we might want to move some data in 2814 * the background. Before the table can be swapped with 2815 * dm_bind_table, dm_suspend must be called to flush any in 2816 * flight bios and ensure that any further io gets deferred. 2817 */ 2818 /* 2819 * Suspend mechanism in request-based dm. 2820 * 2821 * 1. Flush all I/Os by lock_fs() if needed. 2822 * 2. Stop dispatching any I/O by stopping the request_queue. 2823 * 3. Wait for all in-flight I/Os to be completed or requeued. 2824 * 2825 * To abort suspend, start the request_queue. 2826 */ 2827 int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2828 { 2829 struct dm_table *map = NULL; 2830 int r = 0; 2831 2832 retry: 2833 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 2834 2835 if (dm_suspended_md(md)) { 2836 r = -EINVAL; 2837 goto out_unlock; 2838 } 2839 2840 if (dm_suspended_internally_md(md)) { 2841 /* already internally suspended, wait for internal resume */ 2842 mutex_unlock(&md->suspend_lock); 2843 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 2844 if (r) 2845 return r; 2846 goto retry; 2847 } 2848 2849 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2850 2851 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE); 2852 if (r) 2853 goto out_unlock; 2854 2855 set_bit(DMF_SUSPENDED, &md->flags); 2856 2857 dm_table_postsuspend_targets(map); 2858 2859 out_unlock: 2860 mutex_unlock(&md->suspend_lock); 2861 return r; 2862 } 2863 2864 static int __dm_resume(struct mapped_device *md, struct dm_table *map) 2865 { 2866 if (map) { 2867 int r = dm_table_resume_targets(map); 2868 if (r) 2869 return r; 2870 } 2871 2872 dm_queue_flush(md); 2873 2874 /* 2875 * Flushing deferred I/Os must be done after targets are resumed 2876 * so that mapping of targets can work correctly. 2877 * Request-based dm is queueing the deferred I/Os in its request_queue. 
2878 */ 2879 if (dm_request_based(md)) 2880 start_queue(md->queue); 2881 2882 unlock_fs(md); 2883 2884 return 0; 2885 } 2886 2887 int dm_resume(struct mapped_device *md) 2888 { 2889 int r = -EINVAL; 2890 struct dm_table *map = NULL; 2891 2892 retry: 2893 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 2894 2895 if (!dm_suspended_md(md)) 2896 goto out; 2897 2898 if (dm_suspended_internally_md(md)) { 2899 /* already internally suspended, wait for internal resume */ 2900 mutex_unlock(&md->suspend_lock); 2901 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 2902 if (r) 2903 return r; 2904 goto retry; 2905 } 2906 2907 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2908 if (!map || !dm_table_get_size(map)) 2909 goto out; 2910 2911 r = __dm_resume(md, map); 2912 if (r) 2913 goto out; 2914 2915 clear_bit(DMF_SUSPENDED, &md->flags); 2916 2917 r = 0; 2918 out: 2919 mutex_unlock(&md->suspend_lock); 2920 2921 return r; 2922 } 2923 2924 /* 2925 * Internal suspend/resume works like userspace-driven suspend. It waits 2926 * until all bios finish and prevents issuing new bios to the target drivers. 2927 * It may be used only from the kernel. 2928 */ 2929 2930 static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) 2931 { 2932 struct dm_table *map = NULL; 2933 2934 if (md->internal_suspend_count++) 2935 return; /* nested internal suspend */ 2936 2937 if (dm_suspended_md(md)) { 2938 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2939 return; /* nest suspend */ 2940 } 2941 2942 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2943 2944 /* 2945 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is 2946 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend 2947 * would require changing .presuspend to return an error -- avoid this 2948 * until there is a need for more elaborate variants of internal suspend. 2949 */ 2950 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE); 2951 2952 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2953 2954 dm_table_postsuspend_targets(map); 2955 } 2956 2957 static void __dm_internal_resume(struct mapped_device *md) 2958 { 2959 BUG_ON(!md->internal_suspend_count); 2960 2961 if (--md->internal_suspend_count) 2962 return; /* resume from nested internal suspend */ 2963 2964 if (dm_suspended_md(md)) 2965 goto done; /* resume from nested suspend */ 2966 2967 /* 2968 * NOTE: existing callers don't need to call dm_table_resume_targets 2969 * (which may fail -- so best to avoid it for now by passing NULL map) 2970 */ 2971 (void) __dm_resume(md, NULL); 2972 2973 done: 2974 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2975 smp_mb__after_atomic(); 2976 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); 2977 } 2978 2979 void dm_internal_suspend_noflush(struct mapped_device *md) 2980 { 2981 mutex_lock(&md->suspend_lock); 2982 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); 2983 mutex_unlock(&md->suspend_lock); 2984 } 2985 EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); 2986 2987 void dm_internal_resume(struct mapped_device *md) 2988 { 2989 mutex_lock(&md->suspend_lock); 2990 __dm_internal_resume(md); 2991 mutex_unlock(&md->suspend_lock); 2992 } 2993 EXPORT_SYMBOL_GPL(dm_internal_resume); 2994 2995 /* 2996 * Fast variants of internal suspend/resume hold md->suspend_lock, 2997 * which prevents interaction with userspace-driven suspend. 
2998 */ 2999 3000 void dm_internal_suspend_fast(struct mapped_device *md) 3001 { 3002 mutex_lock(&md->suspend_lock); 3003 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 3004 return; 3005 3006 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 3007 synchronize_srcu(&md->io_barrier); 3008 flush_workqueue(md->wq); 3009 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 3010 } 3011 3012 void dm_internal_resume_fast(struct mapped_device *md) 3013 { 3014 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 3015 goto done; 3016 3017 dm_queue_flush(md); 3018 3019 done: 3020 mutex_unlock(&md->suspend_lock); 3021 } 3022 3023 /*----------------------------------------------------------------- 3024 * Event notification. 3025 *---------------------------------------------------------------*/ 3026 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 3027 unsigned cookie) 3028 { 3029 char udev_cookie[DM_COOKIE_LENGTH]; 3030 char *envp[] = { udev_cookie, NULL }; 3031 3032 if (!cookie) 3033 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 3034 else { 3035 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 3036 DM_COOKIE_ENV_VAR_NAME, cookie); 3037 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, 3038 action, envp); 3039 } 3040 } 3041 3042 uint32_t dm_next_uevent_seq(struct mapped_device *md) 3043 { 3044 return atomic_add_return(1, &md->uevent_seq); 3045 } 3046 3047 uint32_t dm_get_event_nr(struct mapped_device *md) 3048 { 3049 return atomic_read(&md->event_nr); 3050 } 3051 3052 int dm_wait_event(struct mapped_device *md, int event_nr) 3053 { 3054 return wait_event_interruptible(md->eventq, 3055 (event_nr != atomic_read(&md->event_nr))); 3056 } 3057 3058 void dm_uevent_add(struct mapped_device *md, struct list_head *elist) 3059 { 3060 unsigned long flags; 3061 3062 spin_lock_irqsave(&md->uevent_lock, flags); 3063 list_add(elist, &md->uevent_list); 3064 spin_unlock_irqrestore(&md->uevent_lock, flags); 3065 } 3066 3067 /* 3068 * The gendisk is only valid as long as you have a reference 3069 * count on 'md'. 
3070 */ 3071 struct gendisk *dm_disk(struct mapped_device *md) 3072 { 3073 return md->disk; 3074 } 3075 3076 struct kobject *dm_kobject(struct mapped_device *md) 3077 { 3078 return &md->kobj_holder.kobj; 3079 } 3080 3081 struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 3082 { 3083 struct mapped_device *md; 3084 3085 md = container_of(kobj, struct mapped_device, kobj_holder.kobj); 3086 3087 if (test_bit(DMF_FREEING, &md->flags) || 3088 dm_deleting_md(md)) 3089 return NULL; 3090 3091 dm_get(md); 3092 return md; 3093 } 3094 3095 int dm_suspended_md(struct mapped_device *md) 3096 { 3097 return test_bit(DMF_SUSPENDED, &md->flags); 3098 } 3099 3100 int dm_suspended_internally_md(struct mapped_device *md) 3101 { 3102 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3103 } 3104 3105 int dm_test_deferred_remove_flag(struct mapped_device *md) 3106 { 3107 return test_bit(DMF_DEFERRED_REMOVE, &md->flags); 3108 } 3109 3110 int dm_suspended(struct dm_target *ti) 3111 { 3112 return dm_suspended_md(dm_table_get_md(ti->table)); 3113 } 3114 EXPORT_SYMBOL_GPL(dm_suspended); 3115 3116 int dm_noflush_suspending(struct dm_target *ti) 3117 { 3118 return __noflush_suspending(dm_table_get_md(ti->table)); 3119 } 3120 EXPORT_SYMBOL_GPL(dm_noflush_suspending); 3121 3122 struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size) 3123 { 3124 struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); 3125 struct kmem_cache *cachep; 3126 unsigned int pool_size; 3127 unsigned int front_pad; 3128 3129 if (!pools) 3130 return NULL; 3131 3132 if (type == DM_TYPE_BIO_BASED) { 3133 cachep = _io_cache; 3134 pool_size = dm_get_reserved_bio_based_ios(); 3135 front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); 3136 } else if (type == DM_TYPE_REQUEST_BASED) { 3137 cachep = _rq_tio_cache; 3138 pool_size = dm_get_reserved_rq_based_ios(); 3139 front_pad = offsetof(struct dm_rq_clone_bio_info, clone); 3140 /* per_bio_data_size is not used. See __bind_mempools(). 
*/ 3141 WARN_ON(per_bio_data_size != 0); 3142 } else 3143 goto out; 3144 3145 pools->io_pool = mempool_create_slab_pool(pool_size, cachep); 3146 if (!pools->io_pool) 3147 goto out; 3148 3149 pools->bs = bioset_create_nobvec(pool_size, front_pad); 3150 if (!pools->bs) 3151 goto out; 3152 3153 if (integrity && bioset_integrity_create(pools->bs, pool_size)) 3154 goto out; 3155 3156 return pools; 3157 3158 out: 3159 dm_free_md_mempools(pools); 3160 3161 return NULL; 3162 } 3163 3164 void dm_free_md_mempools(struct dm_md_mempools *pools) 3165 { 3166 if (!pools) 3167 return; 3168 3169 if (pools->io_pool) 3170 mempool_destroy(pools->io_pool); 3171 3172 if (pools->bs) 3173 bioset_free(pools->bs); 3174 3175 kfree(pools); 3176 } 3177 3178 static const struct block_device_operations dm_blk_dops = { 3179 .open = dm_blk_open, 3180 .release = dm_blk_close, 3181 .ioctl = dm_blk_ioctl, 3182 .getgeo = dm_blk_getgeo, 3183 .owner = THIS_MODULE 3184 }; 3185 3186 /* 3187 * module hooks 3188 */ 3189 module_init(dm_init); 3190 module_exit(dm_exit); 3191 3192 module_param(major, uint, 0); 3193 MODULE_PARM_DESC(major, "The major number of the device mapper"); 3194 3195 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR); 3196 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); 3197 3198 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR); 3199 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"); 3200 3201 MODULE_DESCRIPTION(DM_NAME " driver"); 3202 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 3203 MODULE_LICENSE("GPL"); 3204
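/*
 * Usage notes (illustrative, not part of the driver proper):
 *
 * The two reserved-IOs parameters above are writable at runtime through
 * sysfs, assuming the usual dm_mod module name, e.g.:
 *
 *	echo 64 > /sys/module/dm_mod/parameters/reserved_bio_based_ios
 *
 * whereas "major" has permission 0 and can only be given at load time,
 * e.g. "modprobe dm_mod major=240".
 */

/*
 * Illustrative sketch (not built): how an in-kernel caller holding a
 * reference on an md might bracket an operation with the internal
 * suspend/resume helpers defined above.  The function name and the
 * do_work() callback are hypothetical.
 */
#if 0
static int example_run_while_quiesced(struct mapped_device *md,
				      int (*do_work)(struct mapped_device *))
{
	int r;

	dm_internal_suspend_noflush(md);	/* waits for in-flight I/O */
	r = do_work(md);			/* device is quiescent here */
	dm_internal_resume(md);			/* restarts deferred I/O */

	return r;
}
#endif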