1 /* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm.h" 9 #include "dm-uevent.h" 10 11 #include <linux/init.h> 12 #include <linux/module.h> 13 #include <linux/mutex.h> 14 #include <linux/moduleparam.h> 15 #include <linux/blkpg.h> 16 #include <linux/bio.h> 17 #include <linux/mempool.h> 18 #include <linux/slab.h> 19 #include <linux/idr.h> 20 #include <linux/hdreg.h> 21 #include <linux/delay.h> 22 23 #include <trace/events/block.h> 24 25 #define DM_MSG_PREFIX "core" 26 27 #ifdef CONFIG_PRINTK 28 /* 29 * ratelimit state to be used in DMXXX_LIMIT(). 30 */ 31 DEFINE_RATELIMIT_STATE(dm_ratelimit_state, 32 DEFAULT_RATELIMIT_INTERVAL, 33 DEFAULT_RATELIMIT_BURST); 34 EXPORT_SYMBOL(dm_ratelimit_state); 35 #endif 36 37 /* 38 * Cookies are numeric values sent with CHANGE and REMOVE 39 * uevents while resuming, removing or renaming the device. 40 */ 41 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 42 #define DM_COOKIE_LENGTH 24 43 44 static const char *_name = DM_NAME; 45 46 static unsigned int major = 0; 47 static unsigned int _major = 0; 48 49 static DEFINE_IDR(_minor_idr); 50 51 static DEFINE_SPINLOCK(_minor_lock); 52 /* 53 * For bio-based dm. 54 * One of these is allocated per bio. 55 */ 56 struct dm_io { 57 struct mapped_device *md; 58 int error; 59 atomic_t io_count; 60 struct bio *bio; 61 unsigned long start_time; 62 spinlock_t endio_lock; 63 struct dm_stats_aux stats_aux; 64 }; 65 66 /* 67 * For request-based dm. 68 * One of these is allocated per request. 69 */ 70 struct dm_rq_target_io { 71 struct mapped_device *md; 72 struct dm_target *ti; 73 struct request *orig, clone; 74 int error; 75 union map_info info; 76 }; 77 78 /* 79 * For request-based dm - the bio clones we allocate are embedded in these 80 * structs. 81 * 82 * We allocate these with bio_alloc_bioset, using the front_pad parameter when 83 * the bioset is created - this means the bio has to come at the end of the 84 * struct. 85 */ 86 struct dm_rq_clone_bio_info { 87 struct bio *orig; 88 struct dm_rq_target_io *tio; 89 struct bio clone; 90 }; 91 92 union map_info *dm_get_mapinfo(struct bio *bio) 93 { 94 if (bio && bio->bi_private) 95 return &((struct dm_target_io *)bio->bi_private)->info; 96 return NULL; 97 } 98 99 union map_info *dm_get_rq_mapinfo(struct request *rq) 100 { 101 if (rq && rq->end_io_data) 102 return &((struct dm_rq_target_io *)rq->end_io_data)->info; 103 return NULL; 104 } 105 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); 106 107 #define MINOR_ALLOCED ((void *)-1) 108 109 /* 110 * Bits for the md->flags field. 111 */ 112 #define DMF_BLOCK_IO_FOR_SUSPEND 0 113 #define DMF_SUSPENDED 1 114 #define DMF_FROZEN 2 115 #define DMF_FREEING 3 116 #define DMF_DELETING 4 117 #define DMF_NOFLUSH_SUSPENDING 5 118 #define DMF_MERGE_IS_OPTIONAL 6 119 120 /* 121 * A dummy definition to make RCU happy. 122 * struct dm_table should never be dereferenced in this file. 123 */ 124 struct dm_table { 125 int undefined__; 126 }; 127 128 /* 129 * Work processed by per-device workqueue. 130 */ 131 struct mapped_device { 132 struct srcu_struct io_barrier; 133 struct mutex suspend_lock; 134 atomic_t holders; 135 atomic_t open_count; 136 137 /* 138 * The current mapping. 139 * Use dm_get_live_table{_fast} or take suspend_lock for 140 * dereference. 
141 */ 142 struct dm_table *map; 143 144 unsigned long flags; 145 146 struct request_queue *queue; 147 unsigned type; 148 /* Protect queue and type against concurrent access. */ 149 struct mutex type_lock; 150 151 struct target_type *immutable_target_type; 152 153 struct gendisk *disk; 154 char name[16]; 155 156 void *interface_ptr; 157 158 /* 159 * A list of ios that arrived while we were suspended. 160 */ 161 atomic_t pending[2]; 162 wait_queue_head_t wait; 163 struct work_struct work; 164 struct bio_list deferred; 165 spinlock_t deferred_lock; 166 167 /* 168 * Processing queue (flush) 169 */ 170 struct workqueue_struct *wq; 171 172 /* 173 * io objects are allocated from here. 174 */ 175 mempool_t *io_pool; 176 177 struct bio_set *bs; 178 179 /* 180 * Event handling. 181 */ 182 atomic_t event_nr; 183 wait_queue_head_t eventq; 184 atomic_t uevent_seq; 185 struct list_head uevent_list; 186 spinlock_t uevent_lock; /* Protect access to uevent_list */ 187 188 /* 189 * freeze/thaw support require holding onto a super block 190 */ 191 struct super_block *frozen_sb; 192 struct block_device *bdev; 193 194 /* forced geometry settings */ 195 struct hd_geometry geometry; 196 197 /* sysfs handle */ 198 struct kobject kobj; 199 200 /* zero-length flush that will be cloned and submitted to targets */ 201 struct bio flush_bio; 202 203 struct dm_stats stats; 204 }; 205 206 /* 207 * For mempools pre-allocation at the table loading time. 208 */ 209 struct dm_md_mempools { 210 mempool_t *io_pool; 211 struct bio_set *bs; 212 }; 213 214 #define RESERVED_BIO_BASED_IOS 16 215 #define RESERVED_REQUEST_BASED_IOS 256 216 #define RESERVED_MAX_IOS 1024 217 static struct kmem_cache *_io_cache; 218 static struct kmem_cache *_rq_tio_cache; 219 220 /* 221 * Bio-based DM's mempools' reserved IOs set by the user. 222 */ 223 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; 224 225 /* 226 * Request-based DM's mempools' reserved IOs set by the user. 
227 */ 228 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; 229 230 static unsigned __dm_get_reserved_ios(unsigned *reserved_ios, 231 unsigned def, unsigned max) 232 { 233 unsigned ios = ACCESS_ONCE(*reserved_ios); 234 unsigned modified_ios = 0; 235 236 if (!ios) 237 modified_ios = def; 238 else if (ios > max) 239 modified_ios = max; 240 241 if (modified_ios) { 242 (void)cmpxchg(reserved_ios, ios, modified_ios); 243 ios = modified_ios; 244 } 245 246 return ios; 247 } 248 249 unsigned dm_get_reserved_bio_based_ios(void) 250 { 251 return __dm_get_reserved_ios(&reserved_bio_based_ios, 252 RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS); 253 } 254 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); 255 256 unsigned dm_get_reserved_rq_based_ios(void) 257 { 258 return __dm_get_reserved_ios(&reserved_rq_based_ios, 259 RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS); 260 } 261 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); 262 263 static int __init local_init(void) 264 { 265 int r = -ENOMEM; 266 267 /* allocate a slab for the dm_ios */ 268 _io_cache = KMEM_CACHE(dm_io, 0); 269 if (!_io_cache) 270 return r; 271 272 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 273 if (!_rq_tio_cache) 274 goto out_free_io_cache; 275 276 r = dm_uevent_init(); 277 if (r) 278 goto out_free_rq_tio_cache; 279 280 _major = major; 281 r = register_blkdev(_major, _name); 282 if (r < 0) 283 goto out_uevent_exit; 284 285 if (!_major) 286 _major = r; 287 288 return 0; 289 290 out_uevent_exit: 291 dm_uevent_exit(); 292 out_free_rq_tio_cache: 293 kmem_cache_destroy(_rq_tio_cache); 294 out_free_io_cache: 295 kmem_cache_destroy(_io_cache); 296 297 return r; 298 } 299 300 static void local_exit(void) 301 { 302 kmem_cache_destroy(_rq_tio_cache); 303 kmem_cache_destroy(_io_cache); 304 unregister_blkdev(_major, _name); 305 dm_uevent_exit(); 306 307 _major = 0; 308 309 DMINFO("cleaned up"); 310 } 311 312 static int (*_inits[])(void) __initdata = { 313 local_init, 314 dm_target_init, 315 dm_linear_init, 316 dm_stripe_init, 317 dm_io_init, 318 dm_kcopyd_init, 319 dm_interface_init, 320 dm_statistics_init, 321 }; 322 323 static void (*_exits[])(void) = { 324 local_exit, 325 dm_target_exit, 326 dm_linear_exit, 327 dm_stripe_exit, 328 dm_io_exit, 329 dm_kcopyd_exit, 330 dm_interface_exit, 331 dm_statistics_exit, 332 }; 333 334 static int __init dm_init(void) 335 { 336 const int count = ARRAY_SIZE(_inits); 337 338 int r, i; 339 340 for (i = 0; i < count; i++) { 341 r = _inits[i](); 342 if (r) 343 goto bad; 344 } 345 346 return 0; 347 348 bad: 349 while (i--) 350 _exits[i](); 351 352 return r; 353 } 354 355 static void __exit dm_exit(void) 356 { 357 int i = ARRAY_SIZE(_exits); 358 359 while (i--) 360 _exits[i](); 361 362 /* 363 * Should be empty by this point. 364 */ 365 idr_destroy(&_minor_idr); 366 } 367 368 /* 369 * Block device functions 370 */ 371 int dm_deleting_md(struct mapped_device *md) 372 { 373 return test_bit(DMF_DELETING, &md->flags); 374 } 375 376 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 377 { 378 struct mapped_device *md; 379 380 spin_lock(&_minor_lock); 381 382 md = bdev->bd_disk->private_data; 383 if (!md) 384 goto out; 385 386 if (test_bit(DMF_FREEING, &md->flags) || 387 dm_deleting_md(md)) { 388 md = NULL; 389 goto out; 390 } 391 392 dm_get(md); 393 atomic_inc(&md->open_count); 394 395 out: 396 spin_unlock(&_minor_lock); 397 398 return md ? 
0 : -ENXIO; 399 } 400 401 static void dm_blk_close(struct gendisk *disk, fmode_t mode) 402 { 403 struct mapped_device *md = disk->private_data; 404 405 spin_lock(&_minor_lock); 406 407 atomic_dec(&md->open_count); 408 dm_put(md); 409 410 spin_unlock(&_minor_lock); 411 } 412 413 int dm_open_count(struct mapped_device *md) 414 { 415 return atomic_read(&md->open_count); 416 } 417 418 /* 419 * Guarantees nothing is using the device before it's deleted. 420 */ 421 int dm_lock_for_deletion(struct mapped_device *md) 422 { 423 int r = 0; 424 425 spin_lock(&_minor_lock); 426 427 if (dm_open_count(md)) 428 r = -EBUSY; 429 else 430 set_bit(DMF_DELETING, &md->flags); 431 432 spin_unlock(&_minor_lock); 433 434 return r; 435 } 436 437 sector_t dm_get_size(struct mapped_device *md) 438 { 439 return get_capacity(md->disk); 440 } 441 442 struct dm_stats *dm_get_stats(struct mapped_device *md) 443 { 444 return &md->stats; 445 } 446 447 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 448 { 449 struct mapped_device *md = bdev->bd_disk->private_data; 450 451 return dm_get_geometry(md, geo); 452 } 453 454 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 455 unsigned int cmd, unsigned long arg) 456 { 457 struct mapped_device *md = bdev->bd_disk->private_data; 458 int srcu_idx; 459 struct dm_table *map; 460 struct dm_target *tgt; 461 int r = -ENOTTY; 462 463 retry: 464 map = dm_get_live_table(md, &srcu_idx); 465 466 if (!map || !dm_table_get_size(map)) 467 goto out; 468 469 /* We only support devices that have a single target */ 470 if (dm_table_get_num_targets(map) != 1) 471 goto out; 472 473 tgt = dm_table_get_target(map, 0); 474 475 if (dm_suspended_md(md)) { 476 r = -EAGAIN; 477 goto out; 478 } 479 480 if (tgt->type->ioctl) 481 r = tgt->type->ioctl(tgt, cmd, arg); 482 483 out: 484 dm_put_live_table(md, srcu_idx); 485 486 if (r == -ENOTCONN) { 487 msleep(10); 488 goto retry; 489 } 490 491 return r; 492 } 493 494 static struct dm_io *alloc_io(struct mapped_device *md) 495 { 496 return mempool_alloc(md->io_pool, GFP_NOIO); 497 } 498 499 static void free_io(struct mapped_device *md, struct dm_io *io) 500 { 501 mempool_free(io, md->io_pool); 502 } 503 504 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 505 { 506 bio_put(&tio->clone); 507 } 508 509 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 510 gfp_t gfp_mask) 511 { 512 return mempool_alloc(md->io_pool, gfp_mask); 513 } 514 515 static void free_rq_tio(struct dm_rq_target_io *tio) 516 { 517 mempool_free(tio, tio->md->io_pool); 518 } 519 520 static int md_in_flight(struct mapped_device *md) 521 { 522 return atomic_read(&md->pending[READ]) + 523 atomic_read(&md->pending[WRITE]); 524 } 525 526 static void start_io_acct(struct dm_io *io) 527 { 528 struct mapped_device *md = io->md; 529 struct bio *bio = io->bio; 530 int cpu; 531 int rw = bio_data_dir(bio); 532 533 io->start_time = jiffies; 534 535 cpu = part_stat_lock(); 536 part_round_stats(cpu, &dm_disk(md)->part0); 537 part_stat_unlock(); 538 atomic_set(&dm_disk(md)->part0.in_flight[rw], 539 atomic_inc_return(&md->pending[rw])); 540 541 if (unlikely(dm_stats_used(&md->stats))) 542 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector, 543 bio_sectors(bio), false, 0, &io->stats_aux); 544 } 545 546 static void end_io_acct(struct dm_io *io) 547 { 548 struct mapped_device *md = io->md; 549 struct bio *bio = io->bio; 550 unsigned long duration = jiffies - io->start_time; 551 int pending, cpu; 552 int rw = 
bio_data_dir(bio); 553 554 cpu = part_stat_lock(); 555 part_round_stats(cpu, &dm_disk(md)->part0); 556 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); 557 part_stat_unlock(); 558 559 if (unlikely(dm_stats_used(&md->stats))) 560 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector, 561 bio_sectors(bio), true, duration, &io->stats_aux); 562 563 /* 564 * After this is decremented the bio must not be touched if it is 565 * a flush. 566 */ 567 pending = atomic_dec_return(&md->pending[rw]); 568 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); 569 pending += atomic_read(&md->pending[rw^0x1]); 570 571 /* nudge anyone waiting on suspend queue */ 572 if (!pending) 573 wake_up(&md->wait); 574 } 575 576 /* 577 * Add the bio to the list of deferred io. 578 */ 579 static void queue_io(struct mapped_device *md, struct bio *bio) 580 { 581 unsigned long flags; 582 583 spin_lock_irqsave(&md->deferred_lock, flags); 584 bio_list_add(&md->deferred, bio); 585 spin_unlock_irqrestore(&md->deferred_lock, flags); 586 queue_work(md->wq, &md->work); 587 } 588 589 /* 590 * Everyone (including functions in this file), should use this 591 * function to access the md->map field, and make sure they call 592 * dm_put_live_table() when finished. 593 */ 594 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) 595 { 596 *srcu_idx = srcu_read_lock(&md->io_barrier); 597 598 return srcu_dereference(md->map, &md->io_barrier); 599 } 600 601 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) 602 { 603 srcu_read_unlock(&md->io_barrier, srcu_idx); 604 } 605 606 void dm_sync_table(struct mapped_device *md) 607 { 608 synchronize_srcu(&md->io_barrier); 609 synchronize_rcu_expedited(); 610 } 611 612 /* 613 * A fast alternative to dm_get_live_table/dm_put_live_table. 614 * The caller must not block between these two functions. 615 */ 616 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) 617 { 618 rcu_read_lock(); 619 return rcu_dereference(md->map); 620 } 621 622 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) 623 { 624 rcu_read_unlock(); 625 } 626 627 /* 628 * Get the geometry associated with a dm device 629 */ 630 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 631 { 632 *geo = md->geometry; 633 634 return 0; 635 } 636 637 /* 638 * Set the geometry of a device. 639 */ 640 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 641 { 642 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 643 644 if (geo->start > sz) { 645 DMWARN("Start sector is beyond the geometry limits."); 646 return -EINVAL; 647 } 648 649 md->geometry = *geo; 650 651 return 0; 652 } 653 654 /*----------------------------------------------------------------- 655 * CRUD START: 656 * A more elegant soln is in the works that uses the queue 657 * merge fn, unfortunately there are a couple of changes to 658 * the block layer that I want to make for this. So in the 659 * interests of getting something for people to use I give 660 * you this clearly demarcated crap. 661 *---------------------------------------------------------------*/ 662 663 static int __noflush_suspending(struct mapped_device *md) 664 { 665 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 666 } 667 668 /* 669 * Decrements the number of outstanding ios that a bio has been 670 * cloned into, completing the original io if necc. 
671 */ 672 static void dec_pending(struct dm_io *io, int error) 673 { 674 unsigned long flags; 675 int io_error; 676 struct bio *bio; 677 struct mapped_device *md = io->md; 678 679 /* Push-back supersedes any I/O errors */ 680 if (unlikely(error)) { 681 spin_lock_irqsave(&io->endio_lock, flags); 682 if (!(io->error > 0 && __noflush_suspending(md))) 683 io->error = error; 684 spin_unlock_irqrestore(&io->endio_lock, flags); 685 } 686 687 if (atomic_dec_and_test(&io->io_count)) { 688 if (io->error == DM_ENDIO_REQUEUE) { 689 /* 690 * Target requested pushing back the I/O. 691 */ 692 spin_lock_irqsave(&md->deferred_lock, flags); 693 if (__noflush_suspending(md)) 694 bio_list_add_head(&md->deferred, io->bio); 695 else 696 /* noflush suspend was interrupted. */ 697 io->error = -EIO; 698 spin_unlock_irqrestore(&md->deferred_lock, flags); 699 } 700 701 io_error = io->error; 702 bio = io->bio; 703 end_io_acct(io); 704 free_io(md, io); 705 706 if (io_error == DM_ENDIO_REQUEUE) 707 return; 708 709 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { 710 /* 711 * Preflush done for flush with data, reissue 712 * without REQ_FLUSH. 713 */ 714 bio->bi_rw &= ~REQ_FLUSH; 715 queue_io(md, bio); 716 } else { 717 /* done with normal IO or empty flush */ 718 trace_block_bio_complete(md->queue, bio, io_error); 719 bio_endio(bio, io_error); 720 } 721 } 722 } 723 724 static void clone_endio(struct bio *bio, int error) 725 { 726 int r = 0; 727 struct dm_target_io *tio = bio->bi_private; 728 struct dm_io *io = tio->io; 729 struct mapped_device *md = tio->io->md; 730 dm_endio_fn endio = tio->ti->type->end_io; 731 732 if (!bio_flagged(bio, BIO_UPTODATE) && !error) 733 error = -EIO; 734 735 if (endio) { 736 r = endio(tio->ti, bio, error); 737 if (r < 0 || r == DM_ENDIO_REQUEUE) 738 /* 739 * error and requeue request are handled 740 * in dec_pending(). 741 */ 742 error = r; 743 else if (r == DM_ENDIO_INCOMPLETE) 744 /* The target will handle the io */ 745 return; 746 else if (r) { 747 DMWARN("unimplemented target endio return value: %d", r); 748 BUG(); 749 } 750 } 751 752 free_tio(md, tio); 753 dec_pending(io, error); 754 } 755 756 /* 757 * Partial completion handling for request-based dm 758 */ 759 static void end_clone_bio(struct bio *clone, int error) 760 { 761 struct dm_rq_clone_bio_info *info = clone->bi_private; 762 struct dm_rq_target_io *tio = info->tio; 763 struct bio *bio = info->orig; 764 unsigned int nr_bytes = info->orig->bi_size; 765 766 bio_put(clone); 767 768 if (tio->error) 769 /* 770 * An error has already been detected on the request. 771 * Once error occurred, just let clone->end_io() handle 772 * the remainder. 773 */ 774 return; 775 else if (error) { 776 /* 777 * Don't notice the error to the upper layer yet. 778 * The error handling decision is made by the target driver, 779 * when the request is completed. 780 */ 781 tio->error = error; 782 return; 783 } 784 785 /* 786 * I/O for the bio successfully completed. 787 * Notice the data completion to the upper layer. 788 */ 789 790 /* 791 * bios are processed from the head of the list. 792 * So the completing bio should always be rq->bio. 793 * If it's not, something wrong is happening. 794 */ 795 if (tio->orig->bio != bio) 796 DMERR("bio completion is going in the middle of the request"); 797 798 /* 799 * Update the original request. 800 * Do not use blk_end_request() here, because it may complete 801 * the original request before the clone, and break the ordering. 
802 */ 803 blk_update_request(tio->orig, 0, nr_bytes); 804 } 805 806 /* 807 * Don't touch any member of the md after calling this function because 808 * the md may be freed in dm_put() at the end of this function. 809 * Or do dm_get() before calling this function and dm_put() later. 810 */ 811 static void rq_completed(struct mapped_device *md, int rw, int run_queue) 812 { 813 atomic_dec(&md->pending[rw]); 814 815 /* nudge anyone waiting on suspend queue */ 816 if (!md_in_flight(md)) 817 wake_up(&md->wait); 818 819 /* 820 * Run this off this callpath, as drivers could invoke end_io while 821 * inside their request_fn (and holding the queue lock). Calling 822 * back into ->request_fn() could deadlock attempting to grab the 823 * queue lock again. 824 */ 825 if (run_queue) 826 blk_run_queue_async(md->queue); 827 828 /* 829 * dm_put() must be at the end of this function. See the comment above 830 */ 831 dm_put(md); 832 } 833 834 static void free_rq_clone(struct request *clone) 835 { 836 struct dm_rq_target_io *tio = clone->end_io_data; 837 838 blk_rq_unprep_clone(clone); 839 free_rq_tio(tio); 840 } 841 842 /* 843 * Complete the clone and the original request. 844 * Must be called without queue lock. 845 */ 846 static void dm_end_request(struct request *clone, int error) 847 { 848 int rw = rq_data_dir(clone); 849 struct dm_rq_target_io *tio = clone->end_io_data; 850 struct mapped_device *md = tio->md; 851 struct request *rq = tio->orig; 852 853 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 854 rq->errors = clone->errors; 855 rq->resid_len = clone->resid_len; 856 857 if (rq->sense) 858 /* 859 * We are using the sense buffer of the original 860 * request. 861 * So setting the length of the sense data is enough. 862 */ 863 rq->sense_len = clone->sense_len; 864 } 865 866 free_rq_clone(clone); 867 blk_end_request_all(rq, error); 868 rq_completed(md, rw, true); 869 } 870 871 static void dm_unprep_request(struct request *rq) 872 { 873 struct request *clone = rq->special; 874 875 rq->special = NULL; 876 rq->cmd_flags &= ~REQ_DONTPREP; 877 878 free_rq_clone(clone); 879 } 880 881 /* 882 * Requeue the original request of a clone. 
883 */ 884 void dm_requeue_unmapped_request(struct request *clone) 885 { 886 int rw = rq_data_dir(clone); 887 struct dm_rq_target_io *tio = clone->end_io_data; 888 struct mapped_device *md = tio->md; 889 struct request *rq = tio->orig; 890 struct request_queue *q = rq->q; 891 unsigned long flags; 892 893 dm_unprep_request(rq); 894 895 spin_lock_irqsave(q->queue_lock, flags); 896 blk_requeue_request(q, rq); 897 spin_unlock_irqrestore(q->queue_lock, flags); 898 899 rq_completed(md, rw, 0); 900 } 901 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 902 903 static void __stop_queue(struct request_queue *q) 904 { 905 blk_stop_queue(q); 906 } 907 908 static void stop_queue(struct request_queue *q) 909 { 910 unsigned long flags; 911 912 spin_lock_irqsave(q->queue_lock, flags); 913 __stop_queue(q); 914 spin_unlock_irqrestore(q->queue_lock, flags); 915 } 916 917 static void __start_queue(struct request_queue *q) 918 { 919 if (blk_queue_stopped(q)) 920 blk_start_queue(q); 921 } 922 923 static void start_queue(struct request_queue *q) 924 { 925 unsigned long flags; 926 927 spin_lock_irqsave(q->queue_lock, flags); 928 __start_queue(q); 929 spin_unlock_irqrestore(q->queue_lock, flags); 930 } 931 932 static void dm_done(struct request *clone, int error, bool mapped) 933 { 934 int r = error; 935 struct dm_rq_target_io *tio = clone->end_io_data; 936 dm_request_endio_fn rq_end_io = NULL; 937 938 if (tio->ti) { 939 rq_end_io = tio->ti->type->rq_end_io; 940 941 if (mapped && rq_end_io) 942 r = rq_end_io(tio->ti, clone, error, &tio->info); 943 } 944 945 if (r <= 0) 946 /* The target wants to complete the I/O */ 947 dm_end_request(clone, r); 948 else if (r == DM_ENDIO_INCOMPLETE) 949 /* The target will handle the I/O */ 950 return; 951 else if (r == DM_ENDIO_REQUEUE) 952 /* The target wants to requeue the I/O */ 953 dm_requeue_unmapped_request(clone); 954 else { 955 DMWARN("unimplemented target endio return value: %d", r); 956 BUG(); 957 } 958 } 959 960 /* 961 * Request completion handler for request-based dm 962 */ 963 static void dm_softirq_done(struct request *rq) 964 { 965 bool mapped = true; 966 struct request *clone = rq->completion_data; 967 struct dm_rq_target_io *tio = clone->end_io_data; 968 969 if (rq->cmd_flags & REQ_FAILED) 970 mapped = false; 971 972 dm_done(clone, tio->error, mapped); 973 } 974 975 /* 976 * Complete the clone and the original request with the error status 977 * through softirq context. 978 */ 979 static void dm_complete_request(struct request *clone, int error) 980 { 981 struct dm_rq_target_io *tio = clone->end_io_data; 982 struct request *rq = tio->orig; 983 984 tio->error = error; 985 rq->completion_data = clone; 986 blk_complete_request(rq); 987 } 988 989 /* 990 * Complete the not-mapped clone and the original request with the error status 991 * through softirq context. 992 * Target's rq_end_io() function isn't called. 993 * This may be used when the target's map_rq() function fails. 994 */ 995 void dm_kill_unmapped_request(struct request *clone, int error) 996 { 997 struct dm_rq_target_io *tio = clone->end_io_data; 998 struct request *rq = tio->orig; 999 1000 rq->cmd_flags |= REQ_FAILED; 1001 dm_complete_request(clone, error); 1002 } 1003 EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); 1004 1005 /* 1006 * Called with the queue lock held 1007 */ 1008 static void end_clone_request(struct request *clone, int error) 1009 { 1010 /* 1011 * For just cleaning up the information of the queue in which 1012 * the clone was dispatched. 
1013 * The clone is *NOT* freed actually here because it is alloced from 1014 * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. 1015 */ 1016 __blk_put_request(clone->q, clone); 1017 1018 /* 1019 * Actual request completion is done in a softirq context which doesn't 1020 * hold the queue lock. Otherwise, deadlock could occur because: 1021 * - another request may be submitted by the upper level driver 1022 * of the stacking during the completion 1023 * - the submission which requires queue lock may be done 1024 * against this queue 1025 */ 1026 dm_complete_request(clone, error); 1027 } 1028 1029 /* 1030 * Return maximum size of I/O possible at the supplied sector up to the current 1031 * target boundary. 1032 */ 1033 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) 1034 { 1035 sector_t target_offset = dm_target_offset(ti, sector); 1036 1037 return ti->len - target_offset; 1038 } 1039 1040 static sector_t max_io_len(sector_t sector, struct dm_target *ti) 1041 { 1042 sector_t len = max_io_len_target_boundary(sector, ti); 1043 sector_t offset, max_len; 1044 1045 /* 1046 * Does the target need to split even further? 1047 */ 1048 if (ti->max_io_len) { 1049 offset = dm_target_offset(ti, sector); 1050 if (unlikely(ti->max_io_len & (ti->max_io_len - 1))) 1051 max_len = sector_div(offset, ti->max_io_len); 1052 else 1053 max_len = offset & (ti->max_io_len - 1); 1054 max_len = ti->max_io_len - max_len; 1055 1056 if (len > max_len) 1057 len = max_len; 1058 } 1059 1060 return len; 1061 } 1062 1063 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) 1064 { 1065 if (len > UINT_MAX) { 1066 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", 1067 (unsigned long long)len, UINT_MAX); 1068 ti->error = "Maximum size of target IO is too large"; 1069 return -EINVAL; 1070 } 1071 1072 ti->max_io_len = (uint32_t) len; 1073 1074 return 0; 1075 } 1076 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); 1077 1078 static void __map_bio(struct dm_target_io *tio) 1079 { 1080 int r; 1081 sector_t sector; 1082 struct mapped_device *md; 1083 struct bio *clone = &tio->clone; 1084 struct dm_target *ti = tio->ti; 1085 1086 clone->bi_end_io = clone_endio; 1087 clone->bi_private = tio; 1088 1089 /* 1090 * Map the clone. If r == 0 we don't need to do 1091 * anything, the target has assumed ownership of 1092 * this io. 
1093 */ 1094 atomic_inc(&tio->io->io_count); 1095 sector = clone->bi_sector; 1096 r = ti->type->map(ti, clone); 1097 if (r == DM_MAPIO_REMAPPED) { 1098 /* the bio has been remapped so dispatch it */ 1099 1100 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, 1101 tio->io->bio->bi_bdev->bd_dev, sector); 1102 1103 generic_make_request(clone); 1104 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1105 /* error the io and bail out, or requeue it if needed */ 1106 md = tio->io->md; 1107 dec_pending(tio->io, r); 1108 free_tio(md, tio); 1109 } else if (r) { 1110 DMWARN("unimplemented target map return value: %d", r); 1111 BUG(); 1112 } 1113 } 1114 1115 struct clone_info { 1116 struct mapped_device *md; 1117 struct dm_table *map; 1118 struct bio *bio; 1119 struct dm_io *io; 1120 sector_t sector; 1121 sector_t sector_count; 1122 unsigned short idx; 1123 }; 1124 1125 static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len) 1126 { 1127 bio->bi_sector = sector; 1128 bio->bi_size = to_bytes(len); 1129 } 1130 1131 static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count) 1132 { 1133 bio->bi_idx = idx; 1134 bio->bi_vcnt = idx + bv_count; 1135 bio->bi_flags &= ~(1 << BIO_SEG_VALID); 1136 } 1137 1138 static void clone_bio_integrity(struct bio *bio, struct bio *clone, 1139 unsigned short idx, unsigned len, unsigned offset, 1140 unsigned trim) 1141 { 1142 if (!bio_integrity(bio)) 1143 return; 1144 1145 bio_integrity_clone(clone, bio, GFP_NOIO); 1146 1147 if (trim) 1148 bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len); 1149 } 1150 1151 /* 1152 * Creates a little bio that just does part of a bvec. 1153 */ 1154 static void clone_split_bio(struct dm_target_io *tio, struct bio *bio, 1155 sector_t sector, unsigned short idx, 1156 unsigned offset, unsigned len) 1157 { 1158 struct bio *clone = &tio->clone; 1159 struct bio_vec *bv = bio->bi_io_vec + idx; 1160 1161 *clone->bi_io_vec = *bv; 1162 1163 bio_setup_sector(clone, sector, len); 1164 1165 clone->bi_bdev = bio->bi_bdev; 1166 clone->bi_rw = bio->bi_rw; 1167 clone->bi_vcnt = 1; 1168 clone->bi_io_vec->bv_offset = offset; 1169 clone->bi_io_vec->bv_len = clone->bi_size; 1170 clone->bi_flags |= 1 << BIO_CLONED; 1171 1172 clone_bio_integrity(bio, clone, idx, len, offset, 1); 1173 } 1174 1175 /* 1176 * Creates a bio that consists of range of complete bvecs. 
1177 */ 1178 static void clone_bio(struct dm_target_io *tio, struct bio *bio, 1179 sector_t sector, unsigned short idx, 1180 unsigned short bv_count, unsigned len) 1181 { 1182 struct bio *clone = &tio->clone; 1183 unsigned trim = 0; 1184 1185 __bio_clone(clone, bio); 1186 bio_setup_sector(clone, sector, len); 1187 bio_setup_bv(clone, idx, bv_count); 1188 1189 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1190 trim = 1; 1191 clone_bio_integrity(bio, clone, idx, len, 0, trim); 1192 } 1193 1194 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1195 struct dm_target *ti, int nr_iovecs, 1196 unsigned target_bio_nr) 1197 { 1198 struct dm_target_io *tio; 1199 struct bio *clone; 1200 1201 clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs); 1202 tio = container_of(clone, struct dm_target_io, clone); 1203 1204 tio->io = ci->io; 1205 tio->ti = ti; 1206 memset(&tio->info, 0, sizeof(tio->info)); 1207 tio->target_bio_nr = target_bio_nr; 1208 1209 return tio; 1210 } 1211 1212 static void __clone_and_map_simple_bio(struct clone_info *ci, 1213 struct dm_target *ti, 1214 unsigned target_bio_nr, sector_t len) 1215 { 1216 struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr); 1217 struct bio *clone = &tio->clone; 1218 1219 /* 1220 * Discard requests require the bio's inline iovecs be initialized. 1221 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush 1222 * and discard, so no need for concern about wasted bvec allocations. 1223 */ 1224 __bio_clone(clone, ci->bio); 1225 if (len) 1226 bio_setup_sector(clone, ci->sector, len); 1227 1228 __map_bio(tio); 1229 } 1230 1231 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, 1232 unsigned num_bios, sector_t len) 1233 { 1234 unsigned target_bio_nr; 1235 1236 for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++) 1237 __clone_and_map_simple_bio(ci, ti, target_bio_nr, len); 1238 } 1239 1240 static int __send_empty_flush(struct clone_info *ci) 1241 { 1242 unsigned target_nr = 0; 1243 struct dm_target *ti; 1244 1245 BUG_ON(bio_has_data(ci->bio)); 1246 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1247 __send_duplicate_bios(ci, ti, ti->num_flush_bios, 0); 1248 1249 return 0; 1250 } 1251 1252 static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, 1253 sector_t sector, int nr_iovecs, 1254 unsigned short idx, unsigned short bv_count, 1255 unsigned offset, unsigned len, 1256 unsigned split_bvec) 1257 { 1258 struct bio *bio = ci->bio; 1259 struct dm_target_io *tio; 1260 unsigned target_bio_nr; 1261 unsigned num_target_bios = 1; 1262 1263 /* 1264 * Does the target want to receive duplicate copies of the bio? 
1265 */ 1266 if (bio_data_dir(bio) == WRITE && ti->num_write_bios) 1267 num_target_bios = ti->num_write_bios(ti, bio); 1268 1269 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { 1270 tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr); 1271 if (split_bvec) 1272 clone_split_bio(tio, bio, sector, idx, offset, len); 1273 else 1274 clone_bio(tio, bio, sector, idx, bv_count, len); 1275 __map_bio(tio); 1276 } 1277 } 1278 1279 typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); 1280 1281 static unsigned get_num_discard_bios(struct dm_target *ti) 1282 { 1283 return ti->num_discard_bios; 1284 } 1285 1286 static unsigned get_num_write_same_bios(struct dm_target *ti) 1287 { 1288 return ti->num_write_same_bios; 1289 } 1290 1291 typedef bool (*is_split_required_fn)(struct dm_target *ti); 1292 1293 static bool is_split_required_for_discard(struct dm_target *ti) 1294 { 1295 return ti->split_discard_bios; 1296 } 1297 1298 static int __send_changing_extent_only(struct clone_info *ci, 1299 get_num_bios_fn get_num_bios, 1300 is_split_required_fn is_split_required) 1301 { 1302 struct dm_target *ti; 1303 sector_t len; 1304 unsigned num_bios; 1305 1306 do { 1307 ti = dm_table_find_target(ci->map, ci->sector); 1308 if (!dm_target_is_valid(ti)) 1309 return -EIO; 1310 1311 /* 1312 * Even though the device advertised support for this type of 1313 * request, that does not mean every target supports it, and 1314 * reconfiguration might also have changed that since the 1315 * check was performed. 1316 */ 1317 num_bios = get_num_bios ? get_num_bios(ti) : 0; 1318 if (!num_bios) 1319 return -EOPNOTSUPP; 1320 1321 if (is_split_required && !is_split_required(ti)) 1322 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1323 else 1324 len = min(ci->sector_count, max_io_len(ci->sector, ti)); 1325 1326 __send_duplicate_bios(ci, ti, num_bios, len); 1327 1328 ci->sector += len; 1329 } while (ci->sector_count -= len); 1330 1331 return 0; 1332 } 1333 1334 static int __send_discard(struct clone_info *ci) 1335 { 1336 return __send_changing_extent_only(ci, get_num_discard_bios, 1337 is_split_required_for_discard); 1338 } 1339 1340 static int __send_write_same(struct clone_info *ci) 1341 { 1342 return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); 1343 } 1344 1345 /* 1346 * Find maximum number of sectors / bvecs we can process with a single bio. 
1347 */ 1348 static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx) 1349 { 1350 struct bio *bio = ci->bio; 1351 sector_t bv_len, total_len = 0; 1352 1353 for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) { 1354 bv_len = to_sector(bio->bi_io_vec[*idx].bv_len); 1355 1356 if (bv_len > max) 1357 break; 1358 1359 max -= bv_len; 1360 total_len += bv_len; 1361 } 1362 1363 return total_len; 1364 } 1365 1366 static int __split_bvec_across_targets(struct clone_info *ci, 1367 struct dm_target *ti, sector_t max) 1368 { 1369 struct bio *bio = ci->bio; 1370 struct bio_vec *bv = bio->bi_io_vec + ci->idx; 1371 sector_t remaining = to_sector(bv->bv_len); 1372 unsigned offset = 0; 1373 sector_t len; 1374 1375 do { 1376 if (offset) { 1377 ti = dm_table_find_target(ci->map, ci->sector); 1378 if (!dm_target_is_valid(ti)) 1379 return -EIO; 1380 1381 max = max_io_len(ci->sector, ti); 1382 } 1383 1384 len = min(remaining, max); 1385 1386 __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0, 1387 bv->bv_offset + offset, len, 1); 1388 1389 ci->sector += len; 1390 ci->sector_count -= len; 1391 offset += to_bytes(len); 1392 } while (remaining -= len); 1393 1394 ci->idx++; 1395 1396 return 0; 1397 } 1398 1399 /* 1400 * Select the correct strategy for processing a non-flush bio. 1401 */ 1402 static int __split_and_process_non_flush(struct clone_info *ci) 1403 { 1404 struct bio *bio = ci->bio; 1405 struct dm_target *ti; 1406 sector_t len, max; 1407 int idx; 1408 1409 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1410 return __send_discard(ci); 1411 else if (unlikely(bio->bi_rw & REQ_WRITE_SAME)) 1412 return __send_write_same(ci); 1413 1414 ti = dm_table_find_target(ci->map, ci->sector); 1415 if (!dm_target_is_valid(ti)) 1416 return -EIO; 1417 1418 max = max_io_len(ci->sector, ti); 1419 1420 /* 1421 * Optimise for the simple case where we can do all of 1422 * the remaining io with a single clone. 1423 */ 1424 if (ci->sector_count <= max) { 1425 __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs, 1426 ci->idx, bio->bi_vcnt - ci->idx, 0, 1427 ci->sector_count, 0); 1428 ci->sector_count = 0; 1429 return 0; 1430 } 1431 1432 /* 1433 * There are some bvecs that don't span targets. 1434 * Do as many of these as possible. 1435 */ 1436 if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { 1437 len = __len_within_target(ci, max, &idx); 1438 1439 __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs, 1440 ci->idx, idx - ci->idx, 0, len, 0); 1441 1442 ci->sector += len; 1443 ci->sector_count -= len; 1444 ci->idx = idx; 1445 1446 return 0; 1447 } 1448 1449 /* 1450 * Handle a bvec that must be split between two or more targets. 1451 */ 1452 return __split_bvec_across_targets(ci, ti, max); 1453 } 1454 1455 /* 1456 * Entry point to split a bio into clones and submit them to the targets. 
1457 */ 1458 static void __split_and_process_bio(struct mapped_device *md, 1459 struct dm_table *map, struct bio *bio) 1460 { 1461 struct clone_info ci; 1462 int error = 0; 1463 1464 if (unlikely(!map)) { 1465 bio_io_error(bio); 1466 return; 1467 } 1468 1469 ci.map = map; 1470 ci.md = md; 1471 ci.io = alloc_io(md); 1472 ci.io->error = 0; 1473 atomic_set(&ci.io->io_count, 1); 1474 ci.io->bio = bio; 1475 ci.io->md = md; 1476 spin_lock_init(&ci.io->endio_lock); 1477 ci.sector = bio->bi_sector; 1478 ci.idx = bio->bi_idx; 1479 1480 start_io_acct(ci.io); 1481 1482 if (bio->bi_rw & REQ_FLUSH) { 1483 ci.bio = &ci.md->flush_bio; 1484 ci.sector_count = 0; 1485 error = __send_empty_flush(&ci); 1486 /* dec_pending submits any data associated with flush */ 1487 } else { 1488 ci.bio = bio; 1489 ci.sector_count = bio_sectors(bio); 1490 while (ci.sector_count && !error) 1491 error = __split_and_process_non_flush(&ci); 1492 } 1493 1494 /* drop the extra reference count */ 1495 dec_pending(ci.io, error); 1496 } 1497 /*----------------------------------------------------------------- 1498 * CRUD END 1499 *---------------------------------------------------------------*/ 1500 1501 static int dm_merge_bvec(struct request_queue *q, 1502 struct bvec_merge_data *bvm, 1503 struct bio_vec *biovec) 1504 { 1505 struct mapped_device *md = q->queuedata; 1506 struct dm_table *map = dm_get_live_table_fast(md); 1507 struct dm_target *ti; 1508 sector_t max_sectors; 1509 int max_size = 0; 1510 1511 if (unlikely(!map)) 1512 goto out; 1513 1514 ti = dm_table_find_target(map, bvm->bi_sector); 1515 if (!dm_target_is_valid(ti)) 1516 goto out; 1517 1518 /* 1519 * Find maximum amount of I/O that won't need splitting 1520 */ 1521 max_sectors = min(max_io_len(bvm->bi_sector, ti), 1522 (sector_t) BIO_MAX_SECTORS); 1523 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1524 if (max_size < 0) 1525 max_size = 0; 1526 1527 /* 1528 * merge_bvec_fn() returns number of bytes 1529 * it can accept at this offset 1530 * max is precomputed maximal io size 1531 */ 1532 if (max_size && ti->type->merge) 1533 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1534 /* 1535 * If the target doesn't support merge method and some of the devices 1536 * provided their merge_bvec method (we know this by looking at 1537 * queue_max_hw_sectors), then we can't allow bios with multiple vector 1538 * entries. So always set max_size to 0, and the code below allows 1539 * just one page. 1540 */ 1541 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1542 1543 max_size = 0; 1544 1545 out: 1546 dm_put_live_table_fast(md); 1547 /* 1548 * Always allow an entire first page 1549 */ 1550 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1551 max_size = biovec->bv_len; 1552 1553 return max_size; 1554 } 1555 1556 /* 1557 * The request function that just remaps the bio built up by 1558 * dm_merge_bvec. 
1559 */ 1560 static void _dm_request(struct request_queue *q, struct bio *bio) 1561 { 1562 int rw = bio_data_dir(bio); 1563 struct mapped_device *md = q->queuedata; 1564 int cpu; 1565 int srcu_idx; 1566 struct dm_table *map; 1567 1568 map = dm_get_live_table(md, &srcu_idx); 1569 1570 cpu = part_stat_lock(); 1571 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); 1572 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); 1573 part_stat_unlock(); 1574 1575 /* if we're suspended, we have to queue this io for later */ 1576 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1577 dm_put_live_table(md, srcu_idx); 1578 1579 if (bio_rw(bio) != READA) 1580 queue_io(md, bio); 1581 else 1582 bio_io_error(bio); 1583 return; 1584 } 1585 1586 __split_and_process_bio(md, map, bio); 1587 dm_put_live_table(md, srcu_idx); 1588 return; 1589 } 1590 1591 int dm_request_based(struct mapped_device *md) 1592 { 1593 return blk_queue_stackable(md->queue); 1594 } 1595 1596 static void dm_request(struct request_queue *q, struct bio *bio) 1597 { 1598 struct mapped_device *md = q->queuedata; 1599 1600 if (dm_request_based(md)) 1601 blk_queue_bio(q, bio); 1602 else 1603 _dm_request(q, bio); 1604 } 1605 1606 void dm_dispatch_request(struct request *rq) 1607 { 1608 int r; 1609 1610 if (blk_queue_io_stat(rq->q)) 1611 rq->cmd_flags |= REQ_IO_STAT; 1612 1613 rq->start_time = jiffies; 1614 r = blk_insert_cloned_request(rq->q, rq); 1615 if (r) 1616 dm_complete_request(rq, r); 1617 } 1618 EXPORT_SYMBOL_GPL(dm_dispatch_request); 1619 1620 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1621 void *data) 1622 { 1623 struct dm_rq_target_io *tio = data; 1624 struct dm_rq_clone_bio_info *info = 1625 container_of(bio, struct dm_rq_clone_bio_info, clone); 1626 1627 info->orig = bio_orig; 1628 info->tio = tio; 1629 bio->bi_end_io = end_clone_bio; 1630 bio->bi_private = info; 1631 1632 return 0; 1633 } 1634 1635 static int setup_clone(struct request *clone, struct request *rq, 1636 struct dm_rq_target_io *tio) 1637 { 1638 int r; 1639 1640 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1641 dm_rq_bio_constructor, tio); 1642 if (r) 1643 return r; 1644 1645 clone->cmd = rq->cmd; 1646 clone->cmd_len = rq->cmd_len; 1647 clone->sense = rq->sense; 1648 clone->buffer = rq->buffer; 1649 clone->end_io = end_clone_request; 1650 clone->end_io_data = tio; 1651 1652 return 0; 1653 } 1654 1655 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1656 gfp_t gfp_mask) 1657 { 1658 struct request *clone; 1659 struct dm_rq_target_io *tio; 1660 1661 tio = alloc_rq_tio(md, gfp_mask); 1662 if (!tio) 1663 return NULL; 1664 1665 tio->md = md; 1666 tio->ti = NULL; 1667 tio->orig = rq; 1668 tio->error = 0; 1669 memset(&tio->info, 0, sizeof(tio->info)); 1670 1671 clone = &tio->clone; 1672 if (setup_clone(clone, rq, tio)) { 1673 /* -ENOMEM */ 1674 free_rq_tio(tio); 1675 return NULL; 1676 } 1677 1678 return clone; 1679 } 1680 1681 /* 1682 * Called with the queue lock held. 
1683 */ 1684 static int dm_prep_fn(struct request_queue *q, struct request *rq) 1685 { 1686 struct mapped_device *md = q->queuedata; 1687 struct request *clone; 1688 1689 if (unlikely(rq->special)) { 1690 DMWARN("Already has something in rq->special."); 1691 return BLKPREP_KILL; 1692 } 1693 1694 clone = clone_rq(rq, md, GFP_ATOMIC); 1695 if (!clone) 1696 return BLKPREP_DEFER; 1697 1698 rq->special = clone; 1699 rq->cmd_flags |= REQ_DONTPREP; 1700 1701 return BLKPREP_OK; 1702 } 1703 1704 /* 1705 * Returns: 1706 * 0 : the request has been processed (not requeued) 1707 * !0 : the request has been requeued 1708 */ 1709 static int map_request(struct dm_target *ti, struct request *clone, 1710 struct mapped_device *md) 1711 { 1712 int r, requeued = 0; 1713 struct dm_rq_target_io *tio = clone->end_io_data; 1714 1715 tio->ti = ti; 1716 r = ti->type->map_rq(ti, clone, &tio->info); 1717 switch (r) { 1718 case DM_MAPIO_SUBMITTED: 1719 /* The target has taken the I/O to submit by itself later */ 1720 break; 1721 case DM_MAPIO_REMAPPED: 1722 /* The target has remapped the I/O so dispatch it */ 1723 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1724 blk_rq_pos(tio->orig)); 1725 dm_dispatch_request(clone); 1726 break; 1727 case DM_MAPIO_REQUEUE: 1728 /* The target wants to requeue the I/O */ 1729 dm_requeue_unmapped_request(clone); 1730 requeued = 1; 1731 break; 1732 default: 1733 if (r > 0) { 1734 DMWARN("unimplemented target map return value: %d", r); 1735 BUG(); 1736 } 1737 1738 /* The target wants to complete the I/O */ 1739 dm_kill_unmapped_request(clone, r); 1740 break; 1741 } 1742 1743 return requeued; 1744 } 1745 1746 static struct request *dm_start_request(struct mapped_device *md, struct request *orig) 1747 { 1748 struct request *clone; 1749 1750 blk_start_request(orig); 1751 clone = orig->special; 1752 atomic_inc(&md->pending[rq_data_dir(clone)]); 1753 1754 /* 1755 * Hold the md reference here for the in-flight I/O. 1756 * We can't rely on the reference count by device opener, 1757 * because the device may be closed during the request completion 1758 * when all bios are completed. 1759 * See the comment in rq_completed() too. 1760 */ 1761 dm_get(md); 1762 1763 return clone; 1764 } 1765 1766 /* 1767 * q->request_fn for request-based dm. 1768 * Called with the queue lock held. 1769 */ 1770 static void dm_request_fn(struct request_queue *q) 1771 { 1772 struct mapped_device *md = q->queuedata; 1773 int srcu_idx; 1774 struct dm_table *map = dm_get_live_table(md, &srcu_idx); 1775 struct dm_target *ti; 1776 struct request *rq, *clone; 1777 sector_t pos; 1778 1779 /* 1780 * For suspend, check blk_queue_stopped() and increment 1781 * ->pending within a single queue_lock not to increment the 1782 * number of in-flight I/Os after the queue is stopped in 1783 * dm_suspend(). 
1784 */ 1785 while (!blk_queue_stopped(q)) { 1786 rq = blk_peek_request(q); 1787 if (!rq) 1788 goto delay_and_out; 1789 1790 /* always use block 0 to find the target for flushes for now */ 1791 pos = 0; 1792 if (!(rq->cmd_flags & REQ_FLUSH)) 1793 pos = blk_rq_pos(rq); 1794 1795 ti = dm_table_find_target(map, pos); 1796 if (!dm_target_is_valid(ti)) { 1797 /* 1798 * Must perform setup, that dm_done() requires, 1799 * before calling dm_kill_unmapped_request 1800 */ 1801 DMERR_LIMIT("request attempted access beyond the end of device"); 1802 clone = dm_start_request(md, rq); 1803 dm_kill_unmapped_request(clone, -EIO); 1804 continue; 1805 } 1806 1807 if (ti->type->busy && ti->type->busy(ti)) 1808 goto delay_and_out; 1809 1810 clone = dm_start_request(md, rq); 1811 1812 spin_unlock(q->queue_lock); 1813 if (map_request(ti, clone, md)) 1814 goto requeued; 1815 1816 BUG_ON(!irqs_disabled()); 1817 spin_lock(q->queue_lock); 1818 } 1819 1820 goto out; 1821 1822 requeued: 1823 BUG_ON(!irqs_disabled()); 1824 spin_lock(q->queue_lock); 1825 1826 delay_and_out: 1827 blk_delay_queue(q, HZ / 10); 1828 out: 1829 dm_put_live_table(md, srcu_idx); 1830 } 1831 1832 int dm_underlying_device_busy(struct request_queue *q) 1833 { 1834 return blk_lld_busy(q); 1835 } 1836 EXPORT_SYMBOL_GPL(dm_underlying_device_busy); 1837 1838 static int dm_lld_busy(struct request_queue *q) 1839 { 1840 int r; 1841 struct mapped_device *md = q->queuedata; 1842 struct dm_table *map = dm_get_live_table_fast(md); 1843 1844 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1845 r = 1; 1846 else 1847 r = dm_table_any_busy_target(map); 1848 1849 dm_put_live_table_fast(md); 1850 1851 return r; 1852 } 1853 1854 static int dm_any_congested(void *congested_data, int bdi_bits) 1855 { 1856 int r = bdi_bits; 1857 struct mapped_device *md = congested_data; 1858 struct dm_table *map; 1859 1860 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1861 map = dm_get_live_table_fast(md); 1862 if (map) { 1863 /* 1864 * Request-based dm cares about only own queue for 1865 * the query about congestion status of request_queue 1866 */ 1867 if (dm_request_based(md)) 1868 r = md->queue->backing_dev_info.state & 1869 bdi_bits; 1870 else 1871 r = dm_table_any_congested(map, bdi_bits); 1872 } 1873 dm_put_live_table_fast(md); 1874 } 1875 1876 return r; 1877 } 1878 1879 /*----------------------------------------------------------------- 1880 * An IDR is used to keep track of allocated minor numbers. 1881 *---------------------------------------------------------------*/ 1882 static void free_minor(int minor) 1883 { 1884 spin_lock(&_minor_lock); 1885 idr_remove(&_minor_idr, minor); 1886 spin_unlock(&_minor_lock); 1887 } 1888 1889 /* 1890 * See if the device with a specific minor # is free. 1891 */ 1892 static int specific_minor(int minor) 1893 { 1894 int r; 1895 1896 if (minor >= (1 << MINORBITS)) 1897 return -EINVAL; 1898 1899 idr_preload(GFP_KERNEL); 1900 spin_lock(&_minor_lock); 1901 1902 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); 1903 1904 spin_unlock(&_minor_lock); 1905 idr_preload_end(); 1906 if (r < 0) 1907 return r == -ENOSPC ? 
-EBUSY : r; 1908 return 0; 1909 } 1910 1911 static int next_free_minor(int *minor) 1912 { 1913 int r; 1914 1915 idr_preload(GFP_KERNEL); 1916 spin_lock(&_minor_lock); 1917 1918 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); 1919 1920 spin_unlock(&_minor_lock); 1921 idr_preload_end(); 1922 if (r < 0) 1923 return r; 1924 *minor = r; 1925 return 0; 1926 } 1927 1928 static const struct block_device_operations dm_blk_dops; 1929 1930 static void dm_wq_work(struct work_struct *work); 1931 1932 static void dm_init_md_queue(struct mapped_device *md) 1933 { 1934 /* 1935 * Request-based dm devices cannot be stacked on top of bio-based dm 1936 * devices. The type of this dm device has not been decided yet. 1937 * The type is decided at the first table loading time. 1938 * To prevent problematic device stacking, clear the queue flag 1939 * for request stacking support until then. 1940 * 1941 * This queue is new, so no concurrency on the queue_flags. 1942 */ 1943 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 1944 1945 md->queue->queuedata = md; 1946 md->queue->backing_dev_info.congested_fn = dm_any_congested; 1947 md->queue->backing_dev_info.congested_data = md; 1948 blk_queue_make_request(md->queue, dm_request); 1949 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1950 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1951 } 1952 1953 /* 1954 * Allocate and initialise a blank device with a given minor. 1955 */ 1956 static struct mapped_device *alloc_dev(int minor) 1957 { 1958 int r; 1959 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 1960 void *old_md; 1961 1962 if (!md) { 1963 DMWARN("unable to allocate device, out of memory."); 1964 return NULL; 1965 } 1966 1967 if (!try_module_get(THIS_MODULE)) 1968 goto bad_module_get; 1969 1970 /* get a minor number for the dev */ 1971 if (minor == DM_ANY_MINOR) 1972 r = next_free_minor(&minor); 1973 else 1974 r = specific_minor(minor); 1975 if (r < 0) 1976 goto bad_minor; 1977 1978 r = init_srcu_struct(&md->io_barrier); 1979 if (r < 0) 1980 goto bad_io_barrier; 1981 1982 md->type = DM_TYPE_NONE; 1983 mutex_init(&md->suspend_lock); 1984 mutex_init(&md->type_lock); 1985 spin_lock_init(&md->deferred_lock); 1986 atomic_set(&md->holders, 1); 1987 atomic_set(&md->open_count, 0); 1988 atomic_set(&md->event_nr, 0); 1989 atomic_set(&md->uevent_seq, 0); 1990 INIT_LIST_HEAD(&md->uevent_list); 1991 spin_lock_init(&md->uevent_lock); 1992 1993 md->queue = blk_alloc_queue(GFP_KERNEL); 1994 if (!md->queue) 1995 goto bad_queue; 1996 1997 dm_init_md_queue(md); 1998 1999 md->disk = alloc_disk(1); 2000 if (!md->disk) 2001 goto bad_disk; 2002 2003 atomic_set(&md->pending[0], 0); 2004 atomic_set(&md->pending[1], 0); 2005 init_waitqueue_head(&md->wait); 2006 INIT_WORK(&md->work, dm_wq_work); 2007 init_waitqueue_head(&md->eventq); 2008 2009 md->disk->major = _major; 2010 md->disk->first_minor = minor; 2011 md->disk->fops = &dm_blk_dops; 2012 md->disk->queue = md->queue; 2013 md->disk->private_data = md; 2014 sprintf(md->disk->disk_name, "dm-%d", minor); 2015 add_disk(md->disk); 2016 format_dev_t(md->name, MKDEV(_major, minor)); 2017 2018 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); 2019 if (!md->wq) 2020 goto bad_thread; 2021 2022 md->bdev = bdget_disk(md->disk, 0); 2023 if (!md->bdev) 2024 goto bad_bdev; 2025 2026 bio_init(&md->flush_bio); 2027 md->flush_bio.bi_bdev = md->bdev; 2028 md->flush_bio.bi_rw = WRITE_FLUSH; 2029 2030 dm_stats_init(&md->stats); 2031 2032 /* Populate the mapping, nobody knows we exist yet */ 
2033 spin_lock(&_minor_lock); 2034 old_md = idr_replace(&_minor_idr, md, minor); 2035 spin_unlock(&_minor_lock); 2036 2037 BUG_ON(old_md != MINOR_ALLOCED); 2038 2039 return md; 2040 2041 bad_bdev: 2042 destroy_workqueue(md->wq); 2043 bad_thread: 2044 del_gendisk(md->disk); 2045 put_disk(md->disk); 2046 bad_disk: 2047 blk_cleanup_queue(md->queue); 2048 bad_queue: 2049 cleanup_srcu_struct(&md->io_barrier); 2050 bad_io_barrier: 2051 free_minor(minor); 2052 bad_minor: 2053 module_put(THIS_MODULE); 2054 bad_module_get: 2055 kfree(md); 2056 return NULL; 2057 } 2058 2059 static void unlock_fs(struct mapped_device *md); 2060 2061 static void free_dev(struct mapped_device *md) 2062 { 2063 int minor = MINOR(disk_devt(md->disk)); 2064 2065 unlock_fs(md); 2066 bdput(md->bdev); 2067 destroy_workqueue(md->wq); 2068 if (md->io_pool) 2069 mempool_destroy(md->io_pool); 2070 if (md->bs) 2071 bioset_free(md->bs); 2072 blk_integrity_unregister(md->disk); 2073 del_gendisk(md->disk); 2074 cleanup_srcu_struct(&md->io_barrier); 2075 free_minor(minor); 2076 2077 spin_lock(&_minor_lock); 2078 md->disk->private_data = NULL; 2079 spin_unlock(&_minor_lock); 2080 2081 put_disk(md->disk); 2082 blk_cleanup_queue(md->queue); 2083 dm_stats_cleanup(&md->stats); 2084 module_put(THIS_MODULE); 2085 kfree(md); 2086 } 2087 2088 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 2089 { 2090 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 2091 2092 if (md->io_pool && md->bs) { 2093 /* The md already has necessary mempools. */ 2094 if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) { 2095 /* 2096 * Reload bioset because front_pad may have changed 2097 * because a different table was loaded. 2098 */ 2099 bioset_free(md->bs); 2100 md->bs = p->bs; 2101 p->bs = NULL; 2102 } else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) { 2103 /* 2104 * There's no need to reload with request-based dm 2105 * because the size of front_pad doesn't change. 2106 * Note for future: If you are to reload bioset, 2107 * prep-ed requests in the queue may refer 2108 * to bio from the old bioset, so you must walk 2109 * through the queue to unprep. 2110 */ 2111 } 2112 goto out; 2113 } 2114 2115 BUG_ON(!p || md->io_pool || md->bs); 2116 2117 md->io_pool = p->io_pool; 2118 p->io_pool = NULL; 2119 md->bs = p->bs; 2120 p->bs = NULL; 2121 2122 out: 2123 /* mempool bind completed, now no need any mempools in the table */ 2124 dm_table_free_md_mempools(t); 2125 } 2126 2127 /* 2128 * Bind a table to the device. 2129 */ 2130 static void event_callback(void *context) 2131 { 2132 unsigned long flags; 2133 LIST_HEAD(uevents); 2134 struct mapped_device *md = (struct mapped_device *) context; 2135 2136 spin_lock_irqsave(&md->uevent_lock, flags); 2137 list_splice_init(&md->uevent_list, &uevents); 2138 spin_unlock_irqrestore(&md->uevent_lock, flags); 2139 2140 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 2141 2142 atomic_inc(&md->event_nr); 2143 wake_up(&md->eventq); 2144 } 2145 2146 /* 2147 * Protected by md->suspend_lock obtained by dm_swap_table(). 2148 */ 2149 static void __set_size(struct mapped_device *md, sector_t size) 2150 { 2151 set_capacity(md->disk, size); 2152 2153 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 2154 } 2155 2156 /* 2157 * Return 1 if the queue has a compulsory merge_bvec_fn function. 2158 * 2159 * If this function returns 0, then the device is either a non-dm 2160 * device without a merge_bvec_fn, or it is a dm device that is 2161 * able to split any bios it receives that are too big. 
2162 */ 2163 int dm_queue_merge_is_compulsory(struct request_queue *q) 2164 { 2165 struct mapped_device *dev_md; 2166 2167 if (!q->merge_bvec_fn) 2168 return 0; 2169 2170 if (q->make_request_fn == dm_request) { 2171 dev_md = q->queuedata; 2172 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) 2173 return 0; 2174 } 2175 2176 return 1; 2177 } 2178 2179 static int dm_device_merge_is_compulsory(struct dm_target *ti, 2180 struct dm_dev *dev, sector_t start, 2181 sector_t len, void *data) 2182 { 2183 struct block_device *bdev = dev->bdev; 2184 struct request_queue *q = bdev_get_queue(bdev); 2185 2186 return dm_queue_merge_is_compulsory(q); 2187 } 2188 2189 /* 2190 * Return 1 if it is acceptable to ignore merge_bvec_fn based 2191 * on the properties of the underlying devices. 2192 */ 2193 static int dm_table_merge_is_optional(struct dm_table *table) 2194 { 2195 unsigned i = 0; 2196 struct dm_target *ti; 2197 2198 while (i < dm_table_get_num_targets(table)) { 2199 ti = dm_table_get_target(table, i++); 2200 2201 if (ti->type->iterate_devices && 2202 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) 2203 return 0; 2204 } 2205 2206 return 1; 2207 } 2208 2209 /* 2210 * Returns old map, which caller must destroy. 2211 */ 2212 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2213 struct queue_limits *limits) 2214 { 2215 struct dm_table *old_map; 2216 struct request_queue *q = md->queue; 2217 sector_t size; 2218 int merge_is_optional; 2219 2220 size = dm_table_get_size(t); 2221 2222 /* 2223 * Wipe any geometry if the size of the table changed. 2224 */ 2225 if (size != dm_get_size(md)) 2226 memset(&md->geometry, 0, sizeof(md->geometry)); 2227 2228 __set_size(md, size); 2229 2230 dm_table_event_callback(t, event_callback, md); 2231 2232 /* 2233 * The queue hasn't been stopped yet, if the old table type wasn't 2234 * for request-based during suspension. So stop it to prevent 2235 * I/O mapping before resume. 2236 * This must be done before setting the queue restrictions, 2237 * because request-based dm may be run just after the setting. 2238 */ 2239 if (dm_table_request_based(t) && !blk_queue_stopped(q)) 2240 stop_queue(q); 2241 2242 __bind_mempools(md, t); 2243 2244 merge_is_optional = dm_table_merge_is_optional(t); 2245 2246 old_map = md->map; 2247 rcu_assign_pointer(md->map, t); 2248 md->immutable_target_type = dm_table_get_immutable_target_type(t); 2249 2250 dm_table_set_restrictions(t, q, limits); 2251 if (merge_is_optional) 2252 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2253 else 2254 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2255 dm_sync_table(md); 2256 2257 return old_map; 2258 } 2259 2260 /* 2261 * Returns unbound table for the caller to free. 2262 */ 2263 static struct dm_table *__unbind(struct mapped_device *md) 2264 { 2265 struct dm_table *map = md->map; 2266 2267 if (!map) 2268 return NULL; 2269 2270 dm_table_event_callback(map, NULL, NULL); 2271 rcu_assign_pointer(md->map, NULL); 2272 dm_sync_table(md); 2273 2274 return map; 2275 } 2276 2277 /* 2278 * Constructor for a new device. 2279 */ 2280 int dm_create(int minor, struct mapped_device **result) 2281 { 2282 struct mapped_device *md; 2283 2284 md = alloc_dev(minor); 2285 if (!md) 2286 return -ENXIO; 2287 2288 dm_sysfs_init(md); 2289 2290 *result = md; 2291 return 0; 2292 } 2293 2294 /* 2295 * Functions to manage md->type. 2296 * All are required to hold md->type_lock. 
2297 */ 2298 void dm_lock_md_type(struct mapped_device *md) 2299 { 2300 mutex_lock(&md->type_lock); 2301 } 2302 2303 void dm_unlock_md_type(struct mapped_device *md) 2304 { 2305 mutex_unlock(&md->type_lock); 2306 } 2307 2308 void dm_set_md_type(struct mapped_device *md, unsigned type) 2309 { 2310 BUG_ON(!mutex_is_locked(&md->type_lock)); 2311 md->type = type; 2312 } 2313 2314 unsigned dm_get_md_type(struct mapped_device *md) 2315 { 2316 BUG_ON(!mutex_is_locked(&md->type_lock)); 2317 return md->type; 2318 } 2319 2320 struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2321 { 2322 return md->immutable_target_type; 2323 } 2324 2325 /* 2326 * The queue_limits are only valid as long as you have a reference 2327 * count on 'md'. 2328 */ 2329 struct queue_limits *dm_get_queue_limits(struct mapped_device *md) 2330 { 2331 BUG_ON(!atomic_read(&md->holders)); 2332 return &md->queue->limits; 2333 } 2334 EXPORT_SYMBOL_GPL(dm_get_queue_limits); 2335 2336 /* 2337 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 2338 */ 2339 static int dm_init_request_based_queue(struct mapped_device *md) 2340 { 2341 struct request_queue *q = NULL; 2342 2343 if (md->queue->elevator) 2344 return 1; 2345 2346 /* Fully initialize the queue */ 2347 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); 2348 if (!q) 2349 return 0; 2350 2351 md->queue = q; 2352 dm_init_md_queue(md); 2353 blk_queue_softirq_done(md->queue, dm_softirq_done); 2354 blk_queue_prep_rq(md->queue, dm_prep_fn); 2355 blk_queue_lld_busy(md->queue, dm_lld_busy); 2356 2357 elv_register_queue(md->queue); 2358 2359 return 1; 2360 } 2361 2362 /* 2363 * Setup the DM device's queue based on md's type 2364 */ 2365 int dm_setup_md_queue(struct mapped_device *md) 2366 { 2367 if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && 2368 !dm_init_request_based_queue(md)) { 2369 DMWARN("Cannot initialize queue for request-based mapped device"); 2370 return -EINVAL; 2371 } 2372 2373 return 0; 2374 } 2375 2376 static struct mapped_device *dm_find_md(dev_t dev) 2377 { 2378 struct mapped_device *md; 2379 unsigned minor = MINOR(dev); 2380 2381 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2382 return NULL; 2383 2384 spin_lock(&_minor_lock); 2385 2386 md = idr_find(&_minor_idr, minor); 2387 if (md && (md == MINOR_ALLOCED || 2388 (MINOR(disk_devt(dm_disk(md))) != minor) || 2389 dm_deleting_md(md) || 2390 test_bit(DMF_FREEING, &md->flags))) { 2391 md = NULL; 2392 goto out; 2393 } 2394 2395 out: 2396 spin_unlock(&_minor_lock); 2397 2398 return md; 2399 } 2400 2401 struct mapped_device *dm_get_md(dev_t dev) 2402 { 2403 struct mapped_device *md = dm_find_md(dev); 2404 2405 if (md) 2406 dm_get(md); 2407 2408 return md; 2409 } 2410 EXPORT_SYMBOL_GPL(dm_get_md); 2411 2412 void *dm_get_mdptr(struct mapped_device *md) 2413 { 2414 return md->interface_ptr; 2415 } 2416 2417 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2418 { 2419 md->interface_ptr = ptr; 2420 } 2421 2422 void dm_get(struct mapped_device *md) 2423 { 2424 atomic_inc(&md->holders); 2425 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2426 } 2427 2428 const char *dm_device_name(struct mapped_device *md) 2429 { 2430 return md->name; 2431 } 2432 EXPORT_SYMBOL_GPL(dm_device_name); 2433 2434 static void __dm_destroy(struct mapped_device *md, bool wait) 2435 { 2436 struct dm_table *map; 2437 int srcu_idx; 2438 2439 might_sleep(); 2440 2441 spin_lock(&_minor_lock); 2442 map = dm_get_live_table(md, &srcu_idx); 2443 idr_replace(&_minor_idr, MINOR_ALLOCED, 
MINOR(disk_devt(dm_disk(md))));
2444 set_bit(DMF_FREEING, &md->flags);
2445 spin_unlock(&_minor_lock);
2446
2447 if (!dm_suspended_md(md)) {
2448 dm_table_presuspend_targets(map);
2449 dm_table_postsuspend_targets(map);
2450 }
2451
2452 /* dm_put_live_table must be called before msleep, otherwise a deadlock is possible */
2453 dm_put_live_table(md, srcu_idx);
2454
2455 /*
2456 * Rare, but there may still be I/O requests waiting to complete;
2457 * wait for all references to disappear.
2458 * No one may increment the reference count of the mapped_device
2459 * once its state becomes DMF_FREEING.
2460 */
2461 if (wait)
2462 while (atomic_read(&md->holders))
2463 msleep(1);
2464 else if (atomic_read(&md->holders))
2465 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2466 dm_device_name(md), atomic_read(&md->holders));
2467
2468 dm_sysfs_exit(md);
2469 dm_table_destroy(__unbind(md));
2470 free_dev(md);
2471 }
2472
2473 void dm_destroy(struct mapped_device *md)
2474 {
2475 __dm_destroy(md, true);
2476 }
2477
2478 void dm_destroy_immediate(struct mapped_device *md)
2479 {
2480 __dm_destroy(md, false);
2481 }
2482
2483 void dm_put(struct mapped_device *md)
2484 {
2485 atomic_dec(&md->holders);
2486 }
2487 EXPORT_SYMBOL_GPL(dm_put);
2488
2489 static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2490 {
2491 int r = 0;
2492 DECLARE_WAITQUEUE(wait, current);
2493
2494 add_wait_queue(&md->wait, &wait);
2495
2496 while (1) {
2497 set_current_state(interruptible);
2498
2499 if (!md_in_flight(md))
2500 break;
2501
2502 if (interruptible == TASK_INTERRUPTIBLE &&
2503 signal_pending(current)) {
2504 r = -EINTR;
2505 break;
2506 }
2507
2508 io_schedule();
2509 }
2510 set_current_state(TASK_RUNNING);
2511
2512 remove_wait_queue(&md->wait, &wait);
2513
2514 return r;
2515 }
2516
2517 /*
2518 * Process the deferred bios.
2519 */
2520 static void dm_wq_work(struct work_struct *work)
2521 {
2522 struct mapped_device *md = container_of(work, struct mapped_device,
2523 work);
2524 struct bio *c;
2525 int srcu_idx;
2526 struct dm_table *map;
2527
2528 map = dm_get_live_table(md, &srcu_idx);
2529
2530 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2531 spin_lock_irq(&md->deferred_lock);
2532 c = bio_list_pop(&md->deferred);
2533 spin_unlock_irq(&md->deferred_lock);
2534
2535 if (!c)
2536 break;
2537
2538 if (dm_request_based(md))
2539 generic_make_request(c);
2540 else
2541 __split_and_process_bio(md, map, c);
2542 }
2543
2544 dm_put_live_table(md, srcu_idx);
2545 }
2546
2547 static void dm_queue_flush(struct mapped_device *md)
2548 {
2549 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2550 smp_mb__after_clear_bit();
2551 queue_work(md->wq, &md->work);
2552 }
2553
2554 /*
2555 * Swap in a new table, returning the old one for the caller to destroy.
2556 */
2557 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2558 {
2559 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2560 struct queue_limits limits;
2561 int r;
2562
2563 mutex_lock(&md->suspend_lock);
2564
2565 /* device must be suspended */
2566 if (!dm_suspended_md(md))
2567 goto out;
2568
2569 /*
2570 * If the new table has no data devices, retain the existing limits.
2571 * This helps multipath with queue_if_no_path: if all paths disappear,
2572 * new I/O is still queued based on these limits, and some paths may
2573 * then reappear.
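 * (For instance, a multipath table reloaded with zero paths: the live
 * table's queue limits are reused, so I/O queued while no paths are
 * present is still sized against sensible limits when paths return.)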
2574 */
2575 if (dm_table_has_no_data_devices(table)) {
2576 live_map = dm_get_live_table_fast(md);
2577 if (live_map)
2578 limits = md->queue->limits;
2579 dm_put_live_table_fast(md);
2580 }
2581
2582 if (!live_map) {
2583 r = dm_calculate_queue_limits(table, &limits);
2584 if (r) {
2585 map = ERR_PTR(r);
2586 goto out;
2587 }
2588 }
2589
2590 map = __bind(md, table, &limits);
2591
2592 out:
2593 mutex_unlock(&md->suspend_lock);
2594 return map;
2595 }
2596
2597 /*
2598 * Functions to lock and unlock any filesystem running on the
2599 * device.
2600 */
2601 static int lock_fs(struct mapped_device *md)
2602 {
2603 int r;
2604
2605 WARN_ON(md->frozen_sb);
2606
2607 md->frozen_sb = freeze_bdev(md->bdev);
2608 if (IS_ERR(md->frozen_sb)) {
2609 r = PTR_ERR(md->frozen_sb);
2610 md->frozen_sb = NULL;
2611 return r;
2612 }
2613
2614 set_bit(DMF_FROZEN, &md->flags);
2615
2616 return 0;
2617 }
2618
2619 static void unlock_fs(struct mapped_device *md)
2620 {
2621 if (!test_bit(DMF_FROZEN, &md->flags))
2622 return;
2623
2624 thaw_bdev(md->bdev, md->frozen_sb);
2625 md->frozen_sb = NULL;
2626 clear_bit(DMF_FROZEN, &md->flags);
2627 }
2628
2629 /*
2630 * We need to be able to change a mapping table under a mounted
2631 * filesystem. For example we might want to move some data in
2632 * the background. Before the table can be swapped with
2633 * dm_swap_table, dm_suspend must be called to flush any in-flight
2634 * bios and ensure that any further I/O gets deferred.
2635 */
2636 /*
2637 * Suspend mechanism in request-based dm.
2638 *
2639 * 1. Flush all I/Os by lock_fs() if needed.
2640 * 2. Stop dispatching any I/O by stopping the request_queue.
2641 * 3. Wait for all in-flight I/Os to be completed or requeued.
2642 *
2643 * To abort suspend, start the request_queue.
2644 */
2645 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2646 {
2647 struct dm_table *map = NULL;
2648 int r = 0;
2649 int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
2650 int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
2651
2652 mutex_lock(&md->suspend_lock);
2653
2654 if (dm_suspended_md(md)) {
2655 r = -EINVAL;
2656 goto out_unlock;
2657 }
2658
2659 map = md->map;
2660
2661 /*
2662 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2663 * This flag is cleared before dm_suspend returns.
2664 */
2665 if (noflush)
2666 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2667
2668 /* This does not get reverted if there's an error later. */
2669 dm_table_presuspend_targets(map);
2670
2671 /*
2672 * Flush I/O to the device.
2673 * Any I/O submitted after lock_fs() may not be flushed.
2674 * noflush takes precedence over do_lockfs.
2675 * (lock_fs() flushes I/Os and waits for them to complete.)
2676 */
2677 if (!noflush && do_lockfs) {
2678 r = lock_fs(md);
2679 if (r)
2680 goto out_unlock;
2681 }
2682
2683 /*
2684 * Here we must make sure that no processes are submitting requests
2685 * to target drivers, i.e. no one may be executing
2686 * __split_and_process_bio, which is called from dm_request and
2687 * dm_wq_work.
2688 *
2689 * To prevent any process from reentering __split_and_process_bio
2690 * from dm_request and to quiesce the thread (dm_wq_work), we set
2691 * DMF_BLOCK_IO_FOR_SUSPEND and call flush_workqueue(md->wq);
2692 * callers already inside __split_and_process_bio are then waited
2693 * for via synchronize_srcu(&md->io_barrier).
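 *
 * (While DMF_BLOCK_IO_FOR_SUSPEND is set, dm_request adds incoming bios
 * to md->deferred instead of mapping them, and dm_wq_work stops draining
 * that list; dm_queue_flush() reverses this on resume.)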
2694 */ 2695 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2696 synchronize_srcu(&md->io_barrier); 2697 2698 /* 2699 * Stop md->queue before flushing md->wq in case request-based 2700 * dm defers requests to md->wq from md->queue. 2701 */ 2702 if (dm_request_based(md)) 2703 stop_queue(md->queue); 2704 2705 flush_workqueue(md->wq); 2706 2707 /* 2708 * At this point no more requests are entering target request routines. 2709 * We call dm_wait_for_completion to wait for all existing requests 2710 * to finish. 2711 */ 2712 r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); 2713 2714 if (noflush) 2715 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2716 synchronize_srcu(&md->io_barrier); 2717 2718 /* were we interrupted ? */ 2719 if (r < 0) { 2720 dm_queue_flush(md); 2721 2722 if (dm_request_based(md)) 2723 start_queue(md->queue); 2724 2725 unlock_fs(md); 2726 goto out_unlock; /* pushback list is already flushed, so skip flush */ 2727 } 2728 2729 /* 2730 * If dm_wait_for_completion returned 0, the device is completely 2731 * quiescent now. There is no request-processing activity. All new 2732 * requests are being added to md->deferred list. 2733 */ 2734 2735 set_bit(DMF_SUSPENDED, &md->flags); 2736 2737 dm_table_postsuspend_targets(map); 2738 2739 out_unlock: 2740 mutex_unlock(&md->suspend_lock); 2741 return r; 2742 } 2743 2744 int dm_resume(struct mapped_device *md) 2745 { 2746 int r = -EINVAL; 2747 struct dm_table *map = NULL; 2748 2749 mutex_lock(&md->suspend_lock); 2750 if (!dm_suspended_md(md)) 2751 goto out; 2752 2753 map = md->map; 2754 if (!map || !dm_table_get_size(map)) 2755 goto out; 2756 2757 r = dm_table_resume_targets(map); 2758 if (r) 2759 goto out; 2760 2761 dm_queue_flush(md); 2762 2763 /* 2764 * Flushing deferred I/Os must be done after targets are resumed 2765 * so that mapping of targets can work correctly. 2766 * Request-based dm is queueing the deferred I/Os in its request_queue. 2767 */ 2768 if (dm_request_based(md)) 2769 start_queue(md->queue); 2770 2771 unlock_fs(md); 2772 2773 clear_bit(DMF_SUSPENDED, &md->flags); 2774 2775 r = 0; 2776 out: 2777 mutex_unlock(&md->suspend_lock); 2778 2779 return r; 2780 } 2781 2782 /* 2783 * Internal suspend/resume works like userspace-driven suspend. It waits 2784 * until all bios finish and prevents issuing new bios to the target drivers. 2785 * It may be used only from the kernel. 2786 * 2787 * Internal suspend holds md->suspend_lock, which prevents interaction with 2788 * userspace-driven suspend. 2789 */ 2790 2791 void dm_internal_suspend(struct mapped_device *md) 2792 { 2793 mutex_lock(&md->suspend_lock); 2794 if (dm_suspended_md(md)) 2795 return; 2796 2797 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2798 synchronize_srcu(&md->io_barrier); 2799 flush_workqueue(md->wq); 2800 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2801 } 2802 2803 void dm_internal_resume(struct mapped_device *md) 2804 { 2805 if (dm_suspended_md(md)) 2806 goto done; 2807 2808 dm_queue_flush(md); 2809 2810 done: 2811 mutex_unlock(&md->suspend_lock); 2812 } 2813 2814 /*----------------------------------------------------------------- 2815 * Event notification. 
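 *
 * dm_kobject_uevent() emits the uevent on md's gendisk; when a non-zero
 * cookie is supplied it is exported to userspace in the environment
 * variable named by DM_COOKIE_ENV_VAR_NAME.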
2816 *---------------------------------------------------------------*/ 2817 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 2818 unsigned cookie) 2819 { 2820 char udev_cookie[DM_COOKIE_LENGTH]; 2821 char *envp[] = { udev_cookie, NULL }; 2822 2823 if (!cookie) 2824 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 2825 else { 2826 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 2827 DM_COOKIE_ENV_VAR_NAME, cookie); 2828 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, 2829 action, envp); 2830 } 2831 } 2832 2833 uint32_t dm_next_uevent_seq(struct mapped_device *md) 2834 { 2835 return atomic_add_return(1, &md->uevent_seq); 2836 } 2837 2838 uint32_t dm_get_event_nr(struct mapped_device *md) 2839 { 2840 return atomic_read(&md->event_nr); 2841 } 2842 2843 int dm_wait_event(struct mapped_device *md, int event_nr) 2844 { 2845 return wait_event_interruptible(md->eventq, 2846 (event_nr != atomic_read(&md->event_nr))); 2847 } 2848 2849 void dm_uevent_add(struct mapped_device *md, struct list_head *elist) 2850 { 2851 unsigned long flags; 2852 2853 spin_lock_irqsave(&md->uevent_lock, flags); 2854 list_add(elist, &md->uevent_list); 2855 spin_unlock_irqrestore(&md->uevent_lock, flags); 2856 } 2857 2858 /* 2859 * The gendisk is only valid as long as you have a reference 2860 * count on 'md'. 2861 */ 2862 struct gendisk *dm_disk(struct mapped_device *md) 2863 { 2864 return md->disk; 2865 } 2866 2867 struct kobject *dm_kobject(struct mapped_device *md) 2868 { 2869 return &md->kobj; 2870 } 2871 2872 /* 2873 * struct mapped_device should not be exported outside of dm.c 2874 * so use this check to verify that kobj is part of md structure 2875 */ 2876 struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 2877 { 2878 struct mapped_device *md; 2879 2880 md = container_of(kobj, struct mapped_device, kobj); 2881 if (&md->kobj != kobj) 2882 return NULL; 2883 2884 if (test_bit(DMF_FREEING, &md->flags) || 2885 dm_deleting_md(md)) 2886 return NULL; 2887 2888 dm_get(md); 2889 return md; 2890 } 2891 2892 int dm_suspended_md(struct mapped_device *md) 2893 { 2894 return test_bit(DMF_SUSPENDED, &md->flags); 2895 } 2896 2897 int dm_suspended(struct dm_target *ti) 2898 { 2899 return dm_suspended_md(dm_table_get_md(ti->table)); 2900 } 2901 EXPORT_SYMBOL_GPL(dm_suspended); 2902 2903 int dm_noflush_suspending(struct dm_target *ti) 2904 { 2905 return __noflush_suspending(dm_table_get_md(ti->table)); 2906 } 2907 EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2908 2909 struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size) 2910 { 2911 struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); 2912 struct kmem_cache *cachep; 2913 unsigned int pool_size; 2914 unsigned int front_pad; 2915 2916 if (!pools) 2917 return NULL; 2918 2919 if (type == DM_TYPE_BIO_BASED) { 2920 cachep = _io_cache; 2921 pool_size = dm_get_reserved_bio_based_ios(); 2922 front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); 2923 } else if (type == DM_TYPE_REQUEST_BASED) { 2924 cachep = _rq_tio_cache; 2925 pool_size = dm_get_reserved_rq_based_ios(); 2926 front_pad = offsetof(struct dm_rq_clone_bio_info, clone); 2927 /* per_bio_data_size is not used. See __bind_mempools(). 
*/ 2928 WARN_ON(per_bio_data_size != 0); 2929 } else 2930 goto out; 2931 2932 pools->io_pool = mempool_create_slab_pool(pool_size, cachep); 2933 if (!pools->io_pool) 2934 goto out; 2935 2936 pools->bs = bioset_create(pool_size, front_pad); 2937 if (!pools->bs) 2938 goto out; 2939 2940 if (integrity && bioset_integrity_create(pools->bs, pool_size)) 2941 goto out; 2942 2943 return pools; 2944 2945 out: 2946 dm_free_md_mempools(pools); 2947 2948 return NULL; 2949 } 2950 2951 void dm_free_md_mempools(struct dm_md_mempools *pools) 2952 { 2953 if (!pools) 2954 return; 2955 2956 if (pools->io_pool) 2957 mempool_destroy(pools->io_pool); 2958 2959 if (pools->bs) 2960 bioset_free(pools->bs); 2961 2962 kfree(pools); 2963 } 2964 2965 static const struct block_device_operations dm_blk_dops = { 2966 .open = dm_blk_open, 2967 .release = dm_blk_close, 2968 .ioctl = dm_blk_ioctl, 2969 .getgeo = dm_blk_getgeo, 2970 .owner = THIS_MODULE 2971 }; 2972 2973 EXPORT_SYMBOL(dm_get_mapinfo); 2974 2975 /* 2976 * module hooks 2977 */ 2978 module_init(dm_init); 2979 module_exit(dm_exit); 2980 2981 module_param(major, uint, 0); 2982 MODULE_PARM_DESC(major, "The major number of the device mapper"); 2983 2984 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR); 2985 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); 2986 2987 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR); 2988 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"); 2989 2990 MODULE_DESCRIPTION(DM_NAME " driver"); 2991 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 2992 MODULE_LICENSE("GPL"); 2993
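/*
 * Illustrative sketch only, not part of the driver: roughly how a mapped
 * device is driven through the interfaces in this file. The real call
 * sites (and the exact error handling and ordering) live in dm-ioctl.c;
 * "new_table" stands for a table built elsewhere with dm_table_create()
 * and friends.
 *
 *	struct mapped_device *md;
 *	struct dm_table *old;
 *
 *	dm_create(DM_ANY_MINOR, &md);
 *
 *	dm_lock_md_type(md);
 *	dm_set_md_type(md, dm_table_get_type(new_table));
 *	dm_unlock_md_type(md);
 *	dm_setup_md_queue(md);
 *
 *	dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 *	old = dm_swap_table(md, new_table);
 *	if (!IS_ERR_OR_NULL(old))
 *		dm_table_destroy(old);
 *	dm_resume(md);
 *
 *	dm_destroy(md);
 */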