/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/delay.h>

#include <trace/events/block.h>

#define DM_MSG_PREFIX "core"

#ifdef CONFIG_PRINTK
/*
 * ratelimit state to be used in DMXXX_LIMIT().
 */
DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
		       DEFAULT_RATELIMIT_INTERVAL,
		       DEFAULT_RATELIMIT_BURST);
EXPORT_SYMBOL(dm_ratelimit_state);
#endif

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_IDR(_minor_idr);

static DEFINE_SPINLOCK(_minor_lock);

static void do_deferred_remove(struct work_struct *w);

static DECLARE_WORK(deferred_remove_work, do_deferred_remove);

/*
 * For bio-based dm.
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;
	int error;
	atomic_t io_count;
	struct bio *bio;
	unsigned long start_time;
	spinlock_t endio_lock;
	struct dm_stats_aux stats_aux;
};

/*
 * For request-based dm.
 * One of these is allocated per request.
 */
struct dm_rq_target_io {
	struct mapped_device *md;
	struct dm_target *ti;
	struct request *orig, clone;
	int error;
	union map_info info;
};

/*
 * For request-based dm - the bio clones we allocate are embedded in these
 * structs.
 *
 * We allocate these with bio_alloc_bioset, using the front_pad parameter when
 * the bioset is created - this means the bio has to come at the end of the
 * struct.
 */
struct dm_rq_clone_bio_info {
	struct bio *orig;
	struct dm_rq_target_io *tio;
	struct bio clone;
};

union map_info *dm_get_mapinfo(struct bio *bio)
{
	if (bio && bio->bi_private)
		return &((struct dm_target_io *)bio->bi_private)->info;
	return NULL;
}

union map_info *dm_get_rq_mapinfo(struct request *rq)
{
	if (rq && rq->end_io_data)
		return &((struct dm_rq_target_io *)rq->end_io_data)->info;
	return NULL;
}
EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);

#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_MERGE_IS_OPTIONAL 6
#define DMF_DEFERRED_REMOVE 7

/*
 * A dummy definition to make RCU happy.
 * struct dm_table should never be dereferenced in this file.
 */
struct dm_table {
	int undefined__;
};

/*
 * Work processed by per-device workqueue.
 */
struct mapped_device {
	struct srcu_struct io_barrier;
	struct mutex suspend_lock;
	atomic_t holders;
	atomic_t open_count;

	/*
	 * The current mapping.
	 * Use dm_get_live_table{_fast} or take suspend_lock for
	 * dereference.
	 */
	struct dm_table *map;

	unsigned long flags;

	struct request_queue *queue;
	unsigned type;
	/* Protect queue and type against concurrent access. */
	struct mutex type_lock;

	struct target_type *immutable_target_type;

	struct gendisk *disk;
	char name[16];

	void *interface_ptr;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending[2];
	wait_queue_head_t wait;
	struct work_struct work;
	struct bio_list deferred;
	spinlock_t deferred_lock;

	/*
	 * Processing queue (flush)
	 */
	struct workqueue_struct *wq;

	/*
	 * io objects are allocated from here.
	 */
	mempool_t *io_pool;

	struct bio_set *bs;

	/*
	 * Event handling.
	 */
	atomic_t event_nr;
	wait_queue_head_t eventq;
	atomic_t uevent_seq;
	struct list_head uevent_list;
	spinlock_t uevent_lock; /* Protect access to uevent_list */

	/*
	 * freeze/thaw support require holding onto a super block
	 */
	struct super_block *frozen_sb;
	struct block_device *bdev;

	/* forced geometry settings */
	struct hd_geometry geometry;

	/* sysfs handle */
	struct kobject kobj;

	/* zero-length flush that will be cloned and submitted to targets */
	struct bio flush_bio;

	struct dm_stats stats;
};

/*
 * For mempools pre-allocation at the table loading time.
 */
struct dm_md_mempools {
	mempool_t *io_pool;
	struct bio_set *bs;
};

#define RESERVED_BIO_BASED_IOS		16
#define RESERVED_REQUEST_BASED_IOS	256
#define RESERVED_MAX_IOS		1024
static struct kmem_cache *_io_cache;
static struct kmem_cache *_rq_tio_cache;

/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 */
static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;

/*
 * Request-based DM's mempools' reserved IOs set by the user.
 */
static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;

static unsigned __dm_get_reserved_ios(unsigned *reserved_ios,
				      unsigned def, unsigned max)
{
	unsigned ios = ACCESS_ONCE(*reserved_ios);
	unsigned modified_ios = 0;

	if (!ios)
		modified_ios = def;
	else if (ios > max)
		modified_ios = max;

	if (modified_ios) {
		(void)cmpxchg(reserved_ios, ios, modified_ios);
		ios = modified_ios;
	}

	return ios;
}

unsigned dm_get_reserved_bio_based_ios(void)
{
	return __dm_get_reserved_ios(&reserved_bio_based_ios,
				     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
}
EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);

unsigned dm_get_reserved_rq_based_ios(void)
{
	return __dm_get_reserved_ios(&reserved_rq_based_ios,
				     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
}
EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
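
/*
 * Note on __dm_get_reserved_ios() above (worked example, descriptive only):
 * it reads the user-settable value once, substitutes the default when the
 * value is zero and caps it at RESERVED_MAX_IOS, then uses cmpxchg() to
 * write the clamped value back so later readers observe the effective
 * setting without any locking.  For instance, a stored value of 0 yields
 * RESERVED_BIO_BASED_IOS (16) for bio-based devices, and a stored value of
 * 5000 is clamped to RESERVED_MAX_IOS (1024).
 */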
static int __init local_init(void)
{
	int r = -ENOMEM;

	/* allocate a slab for the dm_ios */
	_io_cache = KMEM_CACHE(dm_io, 0);
	if (!_io_cache)
		return r;

	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
	if (!_rq_tio_cache)
		goto out_free_io_cache;

	r = dm_uevent_init();
	if (r)
		goto out_free_rq_tio_cache;

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0)
		goto out_uevent_exit;

	if (!_major)
		_major = r;

	return 0;

out_uevent_exit:
	dm_uevent_exit();
out_free_rq_tio_cache:
	kmem_cache_destroy(_rq_tio_cache);
out_free_io_cache:
	kmem_cache_destroy(_io_cache);

	return r;
}

static void local_exit(void)
{
	flush_scheduled_work();

	kmem_cache_destroy(_rq_tio_cache);
	kmem_cache_destroy(_io_cache);
	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}

static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_io_init,
	dm_kcopyd_init,
	dm_interface_init,
	dm_statistics_init,
};

static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_io_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
	dm_statistics_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);

	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;

bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();

	/*
	 * Should be empty by this point.
	 */
	idr_destroy(&_minor_idr);
}

/*
 * Block device functions
 */
int dm_deleting_md(struct mapped_device *md)
{
	return test_bit(DMF_DELETING, &md->flags);
}

static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = bdev->bd_disk->private_data;
	if (!md)
		goto out;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}

	dm_get(md);
	atomic_inc(&md->open_count);

out:
	spin_unlock(&_minor_lock);

	return md ? 0 : -ENXIO;
}

static void dm_blk_close(struct gendisk *disk, fmode_t mode)
{
	struct mapped_device *md = disk->private_data;

	spin_lock(&_minor_lock);

	if (atomic_dec_and_test(&md->open_count) &&
	    (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
		schedule_work(&deferred_remove_work);

	dm_put(md);

	spin_unlock(&_minor_lock);
}

int dm_open_count(struct mapped_device *md)
{
	return atomic_read(&md->open_count);
}

/*
 * Guarantees nothing is using the device before it's deleted.
 */
int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md)) {
		r = -EBUSY;
		if (mark_deferred)
			set_bit(DMF_DEFERRED_REMOVE, &md->flags);
	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
		r = -EEXIST;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

int dm_cancel_deferred_remove(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (test_bit(DMF_DELETING, &md->flags))
		r = -EBUSY;
	else
		clear_bit(DMF_DEFERRED_REMOVE, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

static void do_deferred_remove(struct work_struct *w)
{
	dm_deferred_remove();
}

sector_t dm_get_size(struct mapped_device *md)
{
	return get_capacity(md->disk);
}

struct dm_stats *dm_get_stats(struct mapped_device *md)
{
	return &md->stats;
}

static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mapped_device *md = bdev->bd_disk->private_data;

	return dm_get_geometry(md, geo);
}

static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	int srcu_idx;
	struct dm_table *map;
	struct dm_target *tgt;
	int r = -ENOTTY;

retry:
	map = dm_get_live_table(md, &srcu_idx);

	if (!map || !dm_table_get_size(map))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(map) != 1)
		goto out;

	tgt = dm_table_get_target(map, 0);

	if (dm_suspended_md(md)) {
		r = -EAGAIN;
		goto out;
	}

	if (tgt->type->ioctl)
		r = tgt->type->ioctl(tgt, cmd, arg);

out:
	dm_put_live_table(md, srcu_idx);

	if (r == -ENOTCONN) {
		msleep(10);
		goto retry;
	}

	return r;
}

static struct dm_io *alloc_io(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_NOIO);
}

static void free_io(struct mapped_device *md, struct dm_io *io)
{
	mempool_free(io, md->io_pool);
}

static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
{
	bio_put(&tio->clone);
}

static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
					    gfp_t gfp_mask)
{
	return mempool_alloc(md->io_pool, gfp_mask);
}

static void free_rq_tio(struct dm_rq_target_io *tio)
{
	mempool_free(tio, tio->md->io_pool);
}

static int md_in_flight(struct mapped_device *md)
{
	return atomic_read(&md->pending[READ]) +
	       atomic_read(&md->pending[WRITE]);
}

static void start_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	int cpu;
	int rw = bio_data_dir(bio);

	io->start_time = jiffies;

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_unlock();
	atomic_set(&dm_disk(md)->part0.in_flight[rw],
		   atomic_inc_return(&md->pending[rw]));

	if (unlikely(dm_stats_used(&md->stats)))
		dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
				    bio_sectors(bio), false, 0, &io->stats_aux);
}

static void end_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	unsigned long duration = jiffies - io->start_time;
	int pending, cpu;
	int rw = bio_data_dir(bio);

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
	part_stat_unlock();

	if (unlikely(dm_stats_used(&md->stats)))
		dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
				    bio_sectors(bio), true, duration, &io->stats_aux);

	/*
	 * After this is decremented the bio must not be touched if it is
	 * a flush.
	 */
	pending = atomic_dec_return(&md->pending[rw]);
	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
	pending += atomic_read(&md->pending[rw^0x1]);

	/* nudge anyone waiting on suspend queue */
	if (!pending)
		wake_up(&md->wait);
}

/*
 * Add the bio to the list of deferred io.
 */
static void queue_io(struct mapped_device *md, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&md->deferred_lock, flags);
	bio_list_add(&md->deferred, bio);
	spin_unlock_irqrestore(&md->deferred_lock, flags);
	queue_work(md->wq, &md->work);
}

/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished.
 */
struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
{
	*srcu_idx = srcu_read_lock(&md->io_barrier);

	return srcu_dereference(md->map, &md->io_barrier);
}

void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
{
	srcu_read_unlock(&md->io_barrier, srcu_idx);
}

void dm_sync_table(struct mapped_device *md)
{
	synchronize_srcu(&md->io_barrier);
	synchronize_rcu_expedited();
}

/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */
static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
{
	rcu_read_lock();
	return rcu_dereference(md->map);
}

static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
{
	rcu_read_unlock();
}
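
/*
 * The usual access pattern for the live table, as used by dm_blk_ioctl()
 * above, is (sketch only):
 *
 *	int srcu_idx;
 *	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 *
 *	if (map) {
 *		... use map, sleeping is allowed ...
 *	}
 *	dm_put_live_table(md, srcu_idx);
 *
 * dm_sync_table() is what table swappers call after publishing a new
 * md->map pointer, so that both the SRCU readers above and the plain RCU
 * readers of dm_get_live_table_fast() have drained before the old table
 * is destroyed.
 */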
/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMWARN("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}

/*-----------------------------------------------------------------
 * CRUD START:
 * A more elegant solution is in the works that uses the queue
 * merge fn, unfortunately there are a couple of changes to
 * the block layer that I want to make for this.  So in the
 * interests of getting something for people to use I give
 * you this clearly demarcated crap.
 *---------------------------------------------------------------*/

static int __noflush_suspending(struct mapped_device *md)
{
	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
static void dec_pending(struct dm_io *io, int error)
{
	unsigned long flags;
	int io_error;
	struct bio *bio;
	struct mapped_device *md = io->md;

	/* Push-back supersedes any I/O errors */
	if (unlikely(error)) {
		spin_lock_irqsave(&io->endio_lock, flags);
		if (!(io->error > 0 && __noflush_suspending(md)))
			io->error = error;
		spin_unlock_irqrestore(&io->endio_lock, flags);
	}

	if (atomic_dec_and_test(&io->io_count)) {
		if (io->error == DM_ENDIO_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 */
			spin_lock_irqsave(&md->deferred_lock, flags);
			if (__noflush_suspending(md))
				bio_list_add_head(&md->deferred, io->bio);
			else
				/* noflush suspend was interrupted. */
				io->error = -EIO;
			spin_unlock_irqrestore(&md->deferred_lock, flags);
		}

		io_error = io->error;
		bio = io->bio;
		end_io_acct(io);
		free_io(md, io);

		if (io_error == DM_ENDIO_REQUEUE)
			return;

		if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
			/*
			 * Preflush done for flush with data, reissue
			 * without REQ_FLUSH.
			 */
			bio->bi_rw &= ~REQ_FLUSH;
			queue_io(md, bio);
		} else {
			/* done with normal IO or empty flush */
			trace_block_bio_complete(md->queue, bio, io_error);
			bio_endio(bio, io_error);
		}
	}
}

static void clone_endio(struct bio *bio, int error)
{
	int r = 0;
	struct dm_target_io *tio = bio->bi_private;
	struct dm_io *io = tio->io;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;

	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
		error = -EIO;

	if (endio) {
		r = endio(tio->ti, bio, error);
		if (r < 0 || r == DM_ENDIO_REQUEUE)
			/*
			 * error and requeue request are handled
			 * in dec_pending().
			 */
			error = r;
		else if (r == DM_ENDIO_INCOMPLETE)
			/* The target will handle the io */
			return;
		else if (r) {
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	free_tio(md, tio);
	dec_pending(io, error);
}

/*
 * Partial completion handling for request-based dm
 */
static void end_clone_bio(struct bio *clone, int error)
{
	struct dm_rq_clone_bio_info *info = clone->bi_private;
	struct dm_rq_target_io *tio = info->tio;
	struct bio *bio = info->orig;
	unsigned int nr_bytes = info->orig->bi_size;

	bio_put(clone);

	if (tio->error)
		/*
		 * An error has already been detected on the request.
		 * Once an error has occurred, just let clone->end_io()
		 * handle the remainder.
		 */
		return;
	else if (error) {
		/*
		 * Don't report the error to the upper layer yet.
		 * The error handling decision is made by the target driver,
		 * when the request is completed.
		 */
		tio->error = error;
		return;
	}

	/*
	 * I/O for the bio successfully completed.
	 * Report the data completion to the upper layer.
	 */

	/*
	 * bios are processed from the head of the list.
	 * So the completing bio should always be rq->bio.
	 * If it's not, something wrong is happening.
	 */
	if (tio->orig->bio != bio)
		DMERR("bio completion is going in the middle of the request");

	/*
	 * Update the original request.
	 * Do not use blk_end_request() here, because it may complete
	 * the original request before the clone, and break the ordering.
	 */
	blk_update_request(tio->orig, 0, nr_bytes);
}

/*
 * Don't touch any member of the md after calling this function because
 * the md may be freed in dm_put() at the end of this function.
 * Or do dm_get() before calling this function and dm_put() later.
 */
static void rq_completed(struct mapped_device *md, int rw, int run_queue)
{
	atomic_dec(&md->pending[rw]);

	/* nudge anyone waiting on suspend queue */
	if (!md_in_flight(md))
		wake_up(&md->wait);

	/*
	 * Run this off this callpath, as drivers could invoke end_io while
	 * inside their request_fn (and holding the queue lock). Calling
	 * back into ->request_fn() could deadlock attempting to grab the
	 * queue lock again.
	 */
	if (run_queue)
		blk_run_queue_async(md->queue);

	/*
	 * dm_put() must be at the end of this function. See the comment above
	 */
	dm_put(md);
}

static void free_rq_clone(struct request *clone)
{
	struct dm_rq_target_io *tio = clone->end_io_data;

	blk_rq_unprep_clone(clone);
	free_rq_tio(tio);
}

/*
 * Complete the clone and the original request.
 * Must be called without queue lock.
 */
static void dm_end_request(struct request *clone, int error)
{
	int rw = rq_data_dir(clone);
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct mapped_device *md = tio->md;
	struct request *rq = tio->orig;

	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
		rq->errors = clone->errors;
		rq->resid_len = clone->resid_len;

		if (rq->sense)
			/*
			 * We are using the sense buffer of the original
			 * request.
			 * So setting the length of the sense data is enough.
			 */
			rq->sense_len = clone->sense_len;
	}

	free_rq_clone(clone);
	blk_end_request_all(rq, error);
	rq_completed(md, rw, true);
}

static void dm_unprep_request(struct request *rq)
{
	struct request *clone = rq->special;

	rq->special = NULL;
	rq->cmd_flags &= ~REQ_DONTPREP;

	free_rq_clone(clone);
}

/*
 * Requeue the original request of a clone.
 */
void dm_requeue_unmapped_request(struct request *clone)
{
	int rw = rq_data_dir(clone);
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct mapped_device *md = tio->md;
	struct request *rq = tio->orig;
	struct request_queue *q = rq->q;
	unsigned long flags;

	dm_unprep_request(rq);

	spin_lock_irqsave(q->queue_lock, flags);
	blk_requeue_request(q, rq);
	spin_unlock_irqrestore(q->queue_lock, flags);

	rq_completed(md, rw, 0);
}
EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);

static void __stop_queue(struct request_queue *q)
{
	blk_stop_queue(q);
}

static void stop_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__stop_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void __start_queue(struct request_queue *q)
{
	if (blk_queue_stopped(q))
		blk_start_queue(q);
}

static void start_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__start_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void dm_done(struct request *clone, int error, bool mapped)
{
	int r = error;
	struct dm_rq_target_io *tio = clone->end_io_data;
	dm_request_endio_fn rq_end_io = NULL;

	if (tio->ti) {
		rq_end_io = tio->ti->type->rq_end_io;

		if (mapped && rq_end_io)
			r = rq_end_io(tio->ti, clone, error, &tio->info);
	}

	if (r <= 0)
		/* The target wants to complete the I/O */
		dm_end_request(clone, r);
	else if (r == DM_ENDIO_INCOMPLETE)
		/* The target will handle the I/O */
		return;
	else if (r == DM_ENDIO_REQUEUE)
		/* The target wants to requeue the I/O */
		dm_requeue_unmapped_request(clone);
	else {
		DMWARN("unimplemented target endio return value: %d", r);
		BUG();
	}
}

/*
 * Request completion handler for request-based dm
 */
static void dm_softirq_done(struct request *rq)
{
	bool mapped = true;
	struct request *clone = rq->completion_data;
	struct dm_rq_target_io *tio = clone->end_io_data;

	if (rq->cmd_flags & REQ_FAILED)
		mapped = false;

	dm_done(clone, tio->error, mapped);
}

/*
 * Complete the clone and the original request with the error status
 * through softirq context.
 */
static void dm_complete_request(struct request *clone, int error)
{
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct request *rq = tio->orig;

	tio->error = error;
	rq->completion_data = clone;
	blk_complete_request(rq);
}

/*
 * Complete the not-mapped clone and the original request with the error status
 * through softirq context.
 * Target's rq_end_io() function isn't called.
 * This may be used when the target's map_rq() function fails.
 */
void dm_kill_unmapped_request(struct request *clone, int error)
{
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct request *rq = tio->orig;

	rq->cmd_flags |= REQ_FAILED;
	dm_complete_request(clone, error);
}
EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);

/*
 * Called with the queue lock held
 */
static void end_clone_request(struct request *clone, int error)
{
	/*
	 * For just cleaning up the information of the queue in which
	 * the clone was dispatched.
	 * The clone is *NOT* actually freed here because it is allocated
	 * from dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
	 */
	__blk_put_request(clone->q, clone);

	/*
	 * Actual request completion is done in a softirq context which doesn't
	 * hold the queue lock.  Otherwise, deadlock could occur because:
	 *   - another request may be submitted by the upper level driver
	 *     of the stacking during the completion
	 *   - the submission which requires queue lock may be done
	 *     against this queue
	 */
	dm_complete_request(clone, error);
}

/*
 * Return maximum size of I/O possible at the supplied sector up to the current
 * target boundary.
 */
static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
{
	sector_t target_offset = dm_target_offset(ti, sector);

	return ti->len - target_offset;
}

static sector_t max_io_len(sector_t sector, struct dm_target *ti)
{
	sector_t len = max_io_len_target_boundary(sector, ti);
	sector_t offset, max_len;

	/*
	 * Does the target need to split even further?
	 */
	if (ti->max_io_len) {
		offset = dm_target_offset(ti, sector);
		if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
			max_len = sector_div(offset, ti->max_io_len);
		else
			max_len = offset & (ti->max_io_len - 1);
		max_len = ti->max_io_len - max_len;

		if (len > max_len)
			len = max_len;
	}

	return len;
}

int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
{
	if (len > UINT_MAX) {
		DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
		      (unsigned long long)len, UINT_MAX);
		ti->error = "Maximum size of target IO is too large";
		return -EINVAL;
	}

	ti->max_io_len = (uint32_t) len;

	return 0;
}
EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
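
/*
 * Notes on max_io_len() and dm_set_target_max_io_len() above (illustrative
 * only): max_io_len() computes how much of an I/O starting at 'sector' can
 * be issued in one piece.  It is limited first by the remaining length of
 * the target, then, if the target set ti->max_io_len, by the distance to
 * the next max_io_len boundary.  The power-of-two case uses a cheap mask:
 * with max_io_len = 64 and an offset of 70 into the target, 70 & 63 = 6,
 * so at most 64 - 6 = 58 sectors are allowed; the non-power-of-two case
 * gets the same remainder from sector_div().
 *
 * dm_set_target_max_io_len() is what a target constructor calls to
 * establish that boundary, e.g. (hypothetical chunk_size variable):
 *
 *	r = dm_set_target_max_io_len(ti, chunk_size);
 *	if (r)
 *		return r;
 */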
static void __map_bio(struct dm_target_io *tio)
{
	int r;
	sector_t sector;
	struct mapped_device *md;
	struct bio *clone = &tio->clone;
	struct dm_target *ti = tio->ti;

	clone->bi_end_io = clone_endio;
	clone->bi_private = tio;

	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&tio->io->io_count);
	sector = clone->bi_sector;
	r = ti->type->map(ti, clone);
	if (r == DM_MAPIO_REMAPPED) {
		/* the bio has been remapped so dispatch it */

		trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
				      tio->io->bio->bi_bdev->bd_dev, sector);

		generic_make_request(clone);
	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
		/* error the io and bail out, or requeue it if needed */
		md = tio->io->md;
		dec_pending(tio->io, r);
		free_tio(md, tio);
	} else if (r) {
		DMWARN("unimplemented target map return value: %d", r);
		BUG();
	}
}

struct clone_info {
	struct mapped_device *md;
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	sector_t sector_count;
	unsigned short idx;
};

static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len)
{
	bio->bi_sector = sector;
	bio->bi_size = to_bytes(len);
}

static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count)
{
	bio->bi_idx = idx;
	bio->bi_vcnt = idx + bv_count;
	bio->bi_flags &= ~(1 << BIO_SEG_VALID);
}

static void clone_bio_integrity(struct bio *bio, struct bio *clone,
				unsigned short idx, unsigned len, unsigned offset,
				unsigned trim)
{
	if (!bio_integrity(bio))
		return;

	bio_integrity_clone(clone, bio, GFP_NOIO);

	if (trim)
		bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len);
}

/*
 * Creates a little bio that just does part of a bvec.
 */
static void clone_split_bio(struct dm_target_io *tio, struct bio *bio,
			    sector_t sector, unsigned short idx,
			    unsigned offset, unsigned len)
{
	struct bio *clone = &tio->clone;
	struct bio_vec *bv = bio->bi_io_vec + idx;

	*clone->bi_io_vec = *bv;

	bio_setup_sector(clone, sector, len);

	clone->bi_bdev = bio->bi_bdev;
	clone->bi_rw = bio->bi_rw;
	clone->bi_vcnt = 1;
	clone->bi_io_vec->bv_offset = offset;
	clone->bi_io_vec->bv_len = clone->bi_size;
	clone->bi_flags |= 1 << BIO_CLONED;

	clone_bio_integrity(bio, clone, idx, len, offset, 1);
}

/*
 * Creates a bio that consists of range of complete bvecs.
 */
static void clone_bio(struct dm_target_io *tio, struct bio *bio,
		      sector_t sector, unsigned short idx,
		      unsigned short bv_count, unsigned len)
{
	struct bio *clone = &tio->clone;
	unsigned trim = 0;

	__bio_clone(clone, bio);
	bio_setup_sector(clone, sector, len);
	bio_setup_bv(clone, idx, bv_count);

	if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
		trim = 1;
	clone_bio_integrity(bio, clone, idx, len, 0, trim);
}

static struct dm_target_io *alloc_tio(struct clone_info *ci,
				      struct dm_target *ti, int nr_iovecs,
				      unsigned target_bio_nr)
{
	struct dm_target_io *tio;
	struct bio *clone;

	clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs);
	tio = container_of(clone, struct dm_target_io, clone);

	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));
	tio->target_bio_nr = target_bio_nr;

	return tio;
}

static void __clone_and_map_simple_bio(struct clone_info *ci,
				       struct dm_target *ti,
				       unsigned target_bio_nr, sector_t len)
{
	struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr);
	struct bio *clone = &tio->clone;

	/*
	 * Discard requests require the bio's inline iovecs be initialized.
	 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
	 * and discard, so no need for concern about wasted bvec allocations.
	 */
	__bio_clone(clone, ci->bio);
	if (len)
		bio_setup_sector(clone, ci->sector, len);

	__map_bio(tio);
}

static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
				  unsigned num_bios, sector_t len)
{
	unsigned target_bio_nr;

	for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
		__clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
}

static int __send_empty_flush(struct clone_info *ci)
{
	unsigned target_nr = 0;
	struct dm_target *ti;

	BUG_ON(bio_has_data(ci->bio));
	while ((ti = dm_table_get_target(ci->map, target_nr++)))
		__send_duplicate_bios(ci, ti, ti->num_flush_bios, 0);

	return 0;
}

static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
				     sector_t sector, int nr_iovecs,
				     unsigned short idx, unsigned short bv_count,
				     unsigned offset, unsigned len,
				     unsigned split_bvec)
{
	struct bio *bio = ci->bio;
	struct dm_target_io *tio;
	unsigned target_bio_nr;
	unsigned num_target_bios = 1;

	/*
	 * Does the target want to receive duplicate copies of the bio?
	 */
	if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
		num_target_bios = ti->num_write_bios(ti, bio);

	for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
		tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr);
		if (split_bvec)
			clone_split_bio(tio, bio, sector, idx, offset, len);
		else
			clone_bio(tio, bio, sector, idx, bv_count, len);
		__map_bio(tio);
	}
}

typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);

static unsigned get_num_discard_bios(struct dm_target *ti)
{
	return ti->num_discard_bios;
}

static unsigned get_num_write_same_bios(struct dm_target *ti)
{
	return ti->num_write_same_bios;
}

typedef bool (*is_split_required_fn)(struct dm_target *ti);

static bool is_split_required_for_discard(struct dm_target *ti)
{
	return ti->split_discard_bios;
}

static int __send_changing_extent_only(struct clone_info *ci,
				       get_num_bios_fn get_num_bios,
				       is_split_required_fn is_split_required)
{
	struct dm_target *ti;
	sector_t len;
	unsigned num_bios;

	do {
		ti = dm_table_find_target(ci->map, ci->sector);
		if (!dm_target_is_valid(ti))
			return -EIO;

		/*
		 * Even though the device advertised support for this type of
		 * request, that does not mean every target supports it, and
		 * reconfiguration might also have changed that since the
		 * check was performed.
		 */
		num_bios = get_num_bios ? get_num_bios(ti) : 0;
		if (!num_bios)
			return -EOPNOTSUPP;

		if (is_split_required && !is_split_required(ti))
			len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
		else
			len = min(ci->sector_count, max_io_len(ci->sector, ti));

		__send_duplicate_bios(ci, ti, num_bios, len);

		ci->sector += len;
	} while (ci->sector_count -= len);

	return 0;
}

static int __send_discard(struct clone_info *ci)
{
	return __send_changing_extent_only(ci, get_num_discard_bios,
					   is_split_required_for_discard);
}

static int __send_write_same(struct clone_info *ci)
{
	return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
}

/*
 * Find maximum number of sectors / bvecs we can process with a single bio.
 */
static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx)
{
	struct bio *bio = ci->bio;
	sector_t bv_len, total_len = 0;

	for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) {
		bv_len = to_sector(bio->bi_io_vec[*idx].bv_len);

		if (bv_len > max)
			break;

		max -= bv_len;
		total_len += bv_len;
	}

	return total_len;
}

static int __split_bvec_across_targets(struct clone_info *ci,
				       struct dm_target *ti, sector_t max)
{
	struct bio *bio = ci->bio;
	struct bio_vec *bv = bio->bi_io_vec + ci->idx;
	sector_t remaining = to_sector(bv->bv_len);
	unsigned offset = 0;
	sector_t len;

	do {
		if (offset) {
			ti = dm_table_find_target(ci->map, ci->sector);
			if (!dm_target_is_valid(ti))
				return -EIO;

			max = max_io_len(ci->sector, ti);
		}

		len = min(remaining, max);

		__clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0,
					 bv->bv_offset + offset, len, 1);

		ci->sector += len;
		ci->sector_count -= len;
		offset += to_bytes(len);
	} while (remaining -= len);

	ci->idx++;

	return 0;
}

/*
 * Select the correct strategy for processing a non-flush bio.
 */
static int __split_and_process_non_flush(struct clone_info *ci)
{
	struct bio *bio = ci->bio;
	struct dm_target *ti;
	sector_t len, max;
	int idx;

	if (unlikely(bio->bi_rw & REQ_DISCARD))
		return __send_discard(ci);
	else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
		return __send_write_same(ci);

	ti = dm_table_find_target(ci->map, ci->sector);
	if (!dm_target_is_valid(ti))
		return -EIO;

	max = max_io_len(ci->sector, ti);

	/*
	 * Optimise for the simple case where we can do all of
	 * the remaining io with a single clone.
	 */
	if (ci->sector_count <= max) {
		__clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
					 ci->idx, bio->bi_vcnt - ci->idx, 0,
					 ci->sector_count, 0);
		ci->sector_count = 0;
		return 0;
	}

	/*
	 * There are some bvecs that don't span targets.
	 * Do as many of these as possible.
	 */
	if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		len = __len_within_target(ci, max, &idx);

		__clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
					 ci->idx, idx - ci->idx, 0, len, 0);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = idx;

		return 0;
	}

	/*
	 * Handle a bvec that must be split between two or more targets.
	 */
	return __split_bvec_across_targets(ci, ti, max);
}
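
/*
 * To summarise the non-flush path above (descriptive only): a bio is
 * consumed in one of three ways, tried in order on each iteration of the
 * caller's loop:
 *
 *   1. everything left fits inside the current target: one clone of the
 *      remaining bvecs and the bio is done;
 *   2. at least the next whole bvec fits: clone the run of complete bvecs
 *      that fit (__len_within_target) and advance;
 *   3. the next bvec itself crosses a target boundary: carve it up with
 *      clone_split_bio() via __split_bvec_across_targets().
 *
 * REQ_DISCARD and REQ_WRITE_SAME bios bypass all of this and are sent as
 * duplicate bios per target through __send_changing_extent_only().
 */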
/*
 * Entry point to split a bio into clones and submit them to the targets.
 */
static void __split_and_process_bio(struct mapped_device *md,
				    struct dm_table *map, struct bio *bio)
{
	struct clone_info ci;
	int error = 0;

	if (unlikely(!map)) {
		bio_io_error(bio);
		return;
	}

	ci.map = map;
	ci.md = md;
	ci.io = alloc_io(md);
	ci.io->error = 0;
	atomic_set(&ci.io->io_count, 1);
	ci.io->bio = bio;
	ci.io->md = md;
	spin_lock_init(&ci.io->endio_lock);
	ci.sector = bio->bi_sector;
	ci.idx = bio->bi_idx;

	start_io_acct(ci.io);

	if (bio->bi_rw & REQ_FLUSH) {
		ci.bio = &ci.md->flush_bio;
		ci.sector_count = 0;
		error = __send_empty_flush(&ci);
		/* dec_pending submits any data associated with flush */
	} else {
		ci.bio = bio;
		ci.sector_count = bio_sectors(bio);
		while (ci.sector_count && !error)
			error = __split_and_process_non_flush(&ci);
	}

	/* drop the extra reference count */
	dec_pending(ci.io, error);
}
/*-----------------------------------------------------------------
 * CRUD END
 *---------------------------------------------------------------*/

static int dm_merge_bvec(struct request_queue *q,
			 struct bvec_merge_data *bvm,
			 struct bio_vec *biovec)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_live_table_fast(md);
	struct dm_target *ti;
	sector_t max_sectors;
	int max_size = 0;

	if (unlikely(!map))
		goto out;

	ti = dm_table_find_target(map, bvm->bi_sector);
	if (!dm_target_is_valid(ti))
		goto out;

	/*
	 * Find maximum amount of I/O that won't need splitting
	 */
	max_sectors = min(max_io_len(bvm->bi_sector, ti),
			  (sector_t) BIO_MAX_SECTORS);
	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
	if (max_size < 0)
		max_size = 0;

	/*
	 * merge_bvec_fn() returns number of bytes
	 * it can accept at this offset
	 * max is precomputed maximal io size
	 */
	if (max_size && ti->type->merge)
		max_size = ti->type->merge(ti, bvm, biovec, max_size);
	/*
	 * If the target doesn't support merge method and some of the devices
	 * provided their merge_bvec method (we know this by looking at
	 * queue_max_hw_sectors), then we can't allow bios with multiple vector
	 * entries.  So always set max_size to 0, and the code below allows
	 * just one page.
	 */
	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
		max_size = 0;

out:
	dm_put_live_table_fast(md);
	/*
	 * Always allow an entire first page
	 */
	if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
		max_size = biovec->bv_len;

	return max_size;
}

/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 */
static void _dm_request(struct request_queue *q, struct bio *bio)
{
	int rw = bio_data_dir(bio);
	struct mapped_device *md = q->queuedata;
	int cpu;
	int srcu_idx;
	struct dm_table *map;

	map = dm_get_live_table(md, &srcu_idx);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
	part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
	part_stat_unlock();

	/* if we're suspended, we have to queue this io for later */
	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
		dm_put_live_table(md, srcu_idx);

		if (bio_rw(bio) != READA)
			queue_io(md, bio);
		else
			bio_io_error(bio);
		return;
	}

	__split_and_process_bio(md, map, bio);
	dm_put_live_table(md, srcu_idx);
	return;
}

int dm_request_based(struct mapped_device *md)
{
	return blk_queue_stackable(md->queue);
}

static void dm_request(struct request_queue *q, struct bio *bio)
{
	struct mapped_device *md = q->queuedata;

	if (dm_request_based(md))
		blk_queue_bio(q, bio);
	else
		_dm_request(q, bio);
}

void dm_dispatch_request(struct request *rq)
{
	int r;

	if (blk_queue_io_stat(rq->q))
		rq->cmd_flags |= REQ_IO_STAT;

	rq->start_time = jiffies;
	r = blk_insert_cloned_request(rq->q, rq);
	if (r)
		dm_complete_request(rq, r);
}
EXPORT_SYMBOL_GPL(dm_dispatch_request);

static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
				 void *data)
{
	struct dm_rq_target_io *tio = data;
	struct dm_rq_clone_bio_info *info =
		container_of(bio, struct dm_rq_clone_bio_info, clone);

	info->orig = bio_orig;
	info->tio = tio;
	bio->bi_end_io = end_clone_bio;
	bio->bi_private = info;

	return 0;
}

static int setup_clone(struct request *clone, struct request *rq,
		       struct dm_rq_target_io *tio)
{
	int r;

	r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
			      dm_rq_bio_constructor, tio);
	if (r)
		return r;

	clone->cmd = rq->cmd;
	clone->cmd_len = rq->cmd_len;
	clone->sense = rq->sense;
	clone->buffer = rq->buffer;
	clone->end_io = end_clone_request;
	clone->end_io_data = tio;

	return 0;
}

static struct request *clone_rq(struct request *rq, struct mapped_device *md,
				gfp_t gfp_mask)
{
	struct request *clone;
	struct dm_rq_target_io *tio;

	tio = alloc_rq_tio(md, gfp_mask);
	if (!tio)
		return NULL;

	tio->md = md;
	tio->ti = NULL;
	tio->orig = rq;
	tio->error = 0;
	memset(&tio->info, 0, sizeof(tio->info));

	clone = &tio->clone;
	if (setup_clone(clone, rq, tio)) {
		/* -ENOMEM */
		free_rq_tio(tio);
		return NULL;
	}

	return clone;
}
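
/*
 * Rough life cycle of a request-based clone (descriptive only):
 *
 *	dm_prep_fn()        - clone_rq() builds the clone, rq->special = clone
 *	dm_request_fn()     - map_request() asks the target to map the clone,
 *	                      then dm_dispatch_request() sends it down
 *	end_clone_request() - defers to softirq via dm_complete_request()
 *	dm_softirq_done()   - dm_done() completes, requeues or kills both
 *	                      the clone and the original request
 */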
/*
 * Called with the queue lock held.
 */
static int dm_prep_fn(struct request_queue *q, struct request *rq)
{
	struct mapped_device *md = q->queuedata;
	struct request *clone;

	if (unlikely(rq->special)) {
		DMWARN("Already has something in rq->special.");
		return BLKPREP_KILL;
	}

	clone = clone_rq(rq, md, GFP_ATOMIC);
	if (!clone)
		return BLKPREP_DEFER;

	rq->special = clone;
	rq->cmd_flags |= REQ_DONTPREP;

	return BLKPREP_OK;
}

/*
 * Returns:
 * 0  : the request has been processed (not requeued)
 * !0 : the request has been requeued
 */
static int map_request(struct dm_target *ti, struct request *clone,
		       struct mapped_device *md)
{
	int r, requeued = 0;
	struct dm_rq_target_io *tio = clone->end_io_data;

	tio->ti = ti;
	r = ti->type->map_rq(ti, clone, &tio->info);
	switch (r) {
	case DM_MAPIO_SUBMITTED:
		/* The target has taken the I/O to submit by itself later */
		break;
	case DM_MAPIO_REMAPPED:
		/* The target has remapped the I/O so dispatch it */
		trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
				     blk_rq_pos(tio->orig));
		dm_dispatch_request(clone);
		break;
	case DM_MAPIO_REQUEUE:
		/* The target wants to requeue the I/O */
		dm_requeue_unmapped_request(clone);
		requeued = 1;
		break;
	default:
		if (r > 0) {
			DMWARN("unimplemented target map return value: %d", r);
			BUG();
		}

		/* The target wants to complete the I/O */
		dm_kill_unmapped_request(clone, r);
		break;
	}

	return requeued;
}

static struct request *dm_start_request(struct mapped_device *md, struct request *orig)
{
	struct request *clone;

	blk_start_request(orig);
	clone = orig->special;
	atomic_inc(&md->pending[rq_data_dir(clone)]);

	/*
	 * Hold the md reference here for the in-flight I/O.
	 * We can't rely on the reference count by device opener,
	 * because the device may be closed during the request completion
	 * when all bios are completed.
	 * See the comment in rq_completed() too.
	 */
	dm_get(md);

	return clone;
}

/*
 * q->request_fn for request-based dm.
 * Called with the queue lock held.
 */
static void dm_request_fn(struct request_queue *q)
{
	struct mapped_device *md = q->queuedata;
	int srcu_idx;
	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
	struct dm_target *ti;
	struct request *rq, *clone;
	sector_t pos;

	/*
	 * For suspend, check blk_queue_stopped() and increment
	 * ->pending within a single queue_lock not to increment the
	 * number of in-flight I/Os after the queue is stopped in
	 * dm_suspend().
	 */
	while (!blk_queue_stopped(q)) {
		rq = blk_peek_request(q);
		if (!rq)
			goto delay_and_out;

		/* always use block 0 to find the target for flushes for now */
		pos = 0;
		if (!(rq->cmd_flags & REQ_FLUSH))
			pos = blk_rq_pos(rq);

		ti = dm_table_find_target(map, pos);
		if (!dm_target_is_valid(ti)) {
			/*
			 * Must perform setup, that dm_done() requires,
			 * before calling dm_kill_unmapped_request
			 */
			DMERR_LIMIT("request attempted access beyond the end of device");
			clone = dm_start_request(md, rq);
			dm_kill_unmapped_request(clone, -EIO);
			continue;
		}

		if (ti->type->busy && ti->type->busy(ti))
			goto delay_and_out;

		clone = dm_start_request(md, rq);

		spin_unlock(q->queue_lock);
		if (map_request(ti, clone, md))
			goto requeued;

		BUG_ON(!irqs_disabled());
		spin_lock(q->queue_lock);
	}

	goto out;

requeued:
	BUG_ON(!irqs_disabled());
	spin_lock(q->queue_lock);

delay_and_out:
	blk_delay_queue(q, HZ / 10);
out:
	dm_put_live_table(md, srcu_idx);
}

int dm_underlying_device_busy(struct request_queue *q)
{
	return blk_lld_busy(q);
}
EXPORT_SYMBOL_GPL(dm_underlying_device_busy);

static int dm_lld_busy(struct request_queue *q)
{
	int r;
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_live_table_fast(md);

	if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
		r = 1;
	else
		r = dm_table_any_busy_target(map);

	dm_put_live_table_fast(md);

	return r;
}

static int dm_any_congested(void *congested_data, int bdi_bits)
{
	int r = bdi_bits;
	struct mapped_device *md = congested_data;
	struct dm_table *map;

	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		map = dm_get_live_table_fast(md);
		if (map) {
			/*
			 * Request-based dm cares about only own queue for
			 * the query about congestion status of request_queue
			 */
			if (dm_request_based(md))
				r = md->queue->backing_dev_info.state &
				    bdi_bits;
			else
				r = dm_table_any_congested(map, bdi_bits);
		}
		dm_put_live_table_fast(md);
	}

	return r;
}

/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	spin_unlock(&_minor_lock);
}

/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(int minor)
{
	int r;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();
	if (r < 0)
		return r == -ENOSPC ? -EBUSY : r;
	return 0;
}

static int next_free_minor(int *minor)
{
	int r;

	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();
	if (r < 0)
		return r;
	*minor = r;
	return 0;
}

static const struct block_device_operations dm_blk_dops;

static void dm_wq_work(struct work_struct *work);

static void dm_init_md_queue(struct mapped_device *md)
{
	/*
	 * Request-based dm devices cannot be stacked on top of bio-based dm
	 * devices.  The type of this dm device has not been decided yet.
	 * The type is decided at the first table loading time.
	 * To prevent problematic device stacking, clear the queue flag
	 * for request stacking support until then.
	 *
	 * This queue is new, so no concurrency on the queue_flags.
	 */
	queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);

	md->queue->queuedata = md;
	md->queue->backing_dev_info.congested_fn = dm_any_congested;
	md->queue->backing_dev_info.congested_data = md;
	blk_queue_make_request(md->queue, dm_request);
	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
}

/*
 * Allocate and initialise a blank device with a given minor.
 */
static struct mapped_device *alloc_dev(int minor)
{
	int r;
	struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
	void *old_md;

	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

	if (!try_module_get(THIS_MODULE))
		goto bad_module_get;

	/* get a minor number for the dev */
	if (minor == DM_ANY_MINOR)
		r = next_free_minor(&minor);
	else
		r = specific_minor(minor);
	if (r < 0)
		goto bad_minor;

	r = init_srcu_struct(&md->io_barrier);
	if (r < 0)
		goto bad_io_barrier;

	md->type = DM_TYPE_NONE;
	mutex_init(&md->suspend_lock);
	mutex_init(&md->type_lock);
	spin_lock_init(&md->deferred_lock);
	atomic_set(&md->holders, 1);
	atomic_set(&md->open_count, 0);
	atomic_set(&md->event_nr, 0);
	atomic_set(&md->uevent_seq, 0);
	INIT_LIST_HEAD(&md->uevent_list);
	spin_lock_init(&md->uevent_lock);

	md->queue = blk_alloc_queue(GFP_KERNEL);
	if (!md->queue)
		goto bad_queue;

	dm_init_md_queue(md);

	md->disk = alloc_disk(1);
	if (!md->disk)
		goto bad_disk;

	atomic_set(&md->pending[0], 0);
	atomic_set(&md->pending[1], 0);
	init_waitqueue_head(&md->wait);
	INIT_WORK(&md->work, dm_wq_work);
	init_waitqueue_head(&md->eventq);

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);
	add_disk(md->disk);
	format_dev_t(md->name, MKDEV(_major, minor));

	md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
	if (!md->wq)
		goto bad_thread;

	md->bdev = bdget_disk(md->disk, 0);
	if (!md->bdev)
		goto bad_bdev;

	bio_init(&md->flush_bio);
	md->flush_bio.bi_bdev = md->bdev;
	md->flush_bio.bi_rw = WRITE_FLUSH;

	dm_stats_init(&md->stats);

	/* Populate the mapping, nobody knows we exist yet */
	spin_lock(&_minor_lock);
	old_md = idr_replace(&_minor_idr, md, minor);
	spin_unlock(&_minor_lock);

	BUG_ON(old_md != MINOR_ALLOCED);

	return md;

bad_bdev:
	destroy_workqueue(md->wq);
bad_thread:
	del_gendisk(md->disk);
	put_disk(md->disk);
bad_disk:
	blk_cleanup_queue(md->queue);
bad_queue:
	cleanup_srcu_struct(&md->io_barrier);
bad_io_barrier:
	free_minor(minor);
bad_minor:
	module_put(THIS_MODULE);
bad_module_get:
	kfree(md);
	return NULL;
}

static void unlock_fs(struct mapped_device *md);

static void free_dev(struct mapped_device *md)
{
	int minor = MINOR(disk_devt(md->disk));

	unlock_fs(md);
	bdput(md->bdev);
	destroy_workqueue(md->wq);
	if (md->io_pool)
		mempool_destroy(md->io_pool);
	if (md->bs)
		bioset_free(md->bs);
	blk_integrity_unregister(md->disk);
	del_gendisk(md->disk);
	cleanup_srcu_struct(&md->io_barrier);
	free_minor(minor);

	spin_lock(&_minor_lock);
	md->disk->private_data = NULL;
	spin_unlock(&_minor_lock);

	put_disk(md->disk);
	blk_cleanup_queue(md->queue);
	dm_stats_cleanup(&md->stats);
	module_put(THIS_MODULE);
	kfree(md);
}

static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
	struct dm_md_mempools *p = dm_table_get_md_mempools(t);

	if (md->io_pool && md->bs) {
		/* The md already has necessary mempools. */
		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
			/*
			 * Reload bioset because front_pad may have changed
			 * because a different table was loaded.
			 */
			bioset_free(md->bs);
			md->bs = p->bs;
			p->bs = NULL;
		} else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) {
			/*
			 * There's no need to reload with request-based dm
			 * because the size of front_pad doesn't change.
			 * Note for future: If you are to reload bioset,
			 * prep-ed requests in the queue may refer
			 * to bio from the old bioset, so you must walk
			 * through the queue to unprep.
			 */
		}
		goto out;
	}

	BUG_ON(!p || md->io_pool || md->bs);

	md->io_pool = p->io_pool;
	p->io_pool = NULL;
	md->bs = p->bs;
	p->bs = NULL;

out:
	/* mempool bind completed, now no need any mempools in the table */
	dm_table_free_md_mempools(t);
}

/*
 * Bind a table to the device.
 */
static void event_callback(void *context)
{
	unsigned long flags;
	LIST_HEAD(uevents);
	struct mapped_device *md = (struct mapped_device *) context;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_splice_init(&md->uevent_list, &uevents);
	spin_unlock_irqrestore(&md->uevent_lock, flags);

	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
}

/*
 * Protected by md->suspend_lock obtained by dm_swap_table().
 */
static void __set_size(struct mapped_device *md, sector_t size)
{
	set_capacity(md->disk, size);

	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
}

/*
 * Return 1 if the queue has a compulsory merge_bvec_fn function.
 *
 * If this function returns 0, then the device is either a non-dm
 * device without a merge_bvec_fn, or it is a dm device that is
 * able to split any bios it receives that are too big.
 */
int dm_queue_merge_is_compulsory(struct request_queue *q)
{
	struct mapped_device *dev_md;

	if (!q->merge_bvec_fn)
		return 0;

	if (q->make_request_fn == dm_request) {
		dev_md = q->queuedata;
		if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
			return 0;
	}

	return 1;
}

static int dm_device_merge_is_compulsory(struct dm_target *ti,
					 struct dm_dev *dev, sector_t start,
					 sector_t len, void *data)
{
	struct block_device *bdev = dev->bdev;
	struct request_queue *q = bdev_get_queue(bdev);

	return dm_queue_merge_is_compulsory(q);
}

/*
 * Return 1 if it is acceptable to ignore merge_bvec_fn based
 * on the properties of the underlying devices.
 */
static int dm_table_merge_is_optional(struct dm_table *table)
{
	unsigned i = 0;
	struct dm_target *ti;

	while (i < dm_table_get_num_targets(table)) {
		ti = dm_table_get_target(table, i++);

		if (ti->type->iterate_devices &&
		    ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
			return 0;
	}

	return 1;
}

/*
 * Returns old map, which caller must destroy.
 */
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
			       struct queue_limits *limits)
{
	struct dm_table *old_map;
	struct request_queue *q = md->queue;
	sector_t size;
	int merge_is_optional;

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != dm_get_size(md))
		memset(&md->geometry, 0, sizeof(md->geometry));

	__set_size(md, size);

	dm_table_event_callback(t, event_callback, md);

	/*
	 * If the old table type wasn't request-based, the queue hasn't been
	 * stopped during suspension, so stop it now to prevent I/O from
	 * being mapped before resume.
	 * This must be done before setting the queue restrictions,
	 * because request-based dm may start running right after they are set.
	 */
	if (dm_table_request_based(t) && !blk_queue_stopped(q))
		stop_queue(q);

	__bind_mempools(md, t);

	merge_is_optional = dm_table_merge_is_optional(t);

	old_map = md->map;
	rcu_assign_pointer(md->map, t);
	md->immutable_target_type = dm_table_get_immutable_target_type(t);

	dm_table_set_restrictions(t, q, limits);
	if (merge_is_optional)
		set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
	else
		clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
	dm_sync_table(md);

	return old_map;
}

/*
 * Returns unbound table for the caller to free.
 */
static struct dm_table *__unbind(struct mapped_device *md)
{
	struct dm_table *map = md->map;

	if (!map)
		return NULL;

	dm_table_event_callback(map, NULL, NULL);
	rcu_assign_pointer(md->map, NULL);
	dm_sync_table(md);

	return map;
}

/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct mapped_device **result)
{
	struct mapped_device *md;

	md = alloc_dev(minor);
	if (!md)
		return -ENXIO;

	dm_sysfs_init(md);

	*result = md;
	return 0;
}

/*
 * Functions to manage md->type.
 * All are required to hold md->type_lock.
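 *
 * A typical caller (a sketch, not the exact ioctl code) brackets the
 * check-and-set of the type like this:
 *
 *	dm_lock_md_type(md);
 *	if (dm_get_md_type(md) == DM_TYPE_NONE)
 *		dm_set_md_type(md, dm_table_get_type(t));
 *	dm_unlock_md_type(md);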
 */
void dm_lock_md_type(struct mapped_device *md)
{
	mutex_lock(&md->type_lock);
}

void dm_unlock_md_type(struct mapped_device *md)
{
	mutex_unlock(&md->type_lock);
}

void dm_set_md_type(struct mapped_device *md, unsigned type)
{
	BUG_ON(!mutex_is_locked(&md->type_lock));
	md->type = type;
}

unsigned dm_get_md_type(struct mapped_device *md)
{
	BUG_ON(!mutex_is_locked(&md->type_lock));
	return md->type;
}

struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
{
	return md->immutable_target_type;
}

/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.
 */
struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
{
	BUG_ON(!atomic_read(&md->holders));
	return &md->queue->limits;
}
EXPORT_SYMBOL_GPL(dm_get_queue_limits);

/*
 * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
 */
static int dm_init_request_based_queue(struct mapped_device *md)
{
	struct request_queue *q = NULL;

	if (md->queue->elevator)
		return 1;

	/* Fully initialize the queue */
	q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
	if (!q)
		return 0;

	md->queue = q;
	dm_init_md_queue(md);
	blk_queue_softirq_done(md->queue, dm_softirq_done);
	blk_queue_prep_rq(md->queue, dm_prep_fn);
	blk_queue_lld_busy(md->queue, dm_lld_busy);

	elv_register_queue(md->queue);

	return 1;
}

/*
 * Set up the DM device's queue based on md's type.
 */
int dm_setup_md_queue(struct mapped_device *md)
{
	if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
	    !dm_init_request_based_queue(md)) {
		DMWARN("Cannot initialize queue for request-based mapped device");
		return -EINVAL;
	}

	return 0;
}

static struct mapped_device *dm_find_md(dev_t dev)
{
	struct mapped_device *md;
	unsigned minor = MINOR(dev);

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

	spin_lock(&_minor_lock);

	md = idr_find(&_minor_idr, minor);
	if (md && (md == MINOR_ALLOCED ||
		   (MINOR(disk_devt(dm_disk(md))) != minor) ||
		   dm_deleting_md(md) ||
		   test_bit(DMF_FREEING, &md->flags))) {
		md = NULL;
		goto out;
	}

out:
	spin_unlock(&_minor_lock);

	return md;
}

struct mapped_device *dm_get_md(dev_t dev)
{
	struct mapped_device *md = dm_find_md(dev);

	if (md)
		dm_get(md);

	return md;
}
EXPORT_SYMBOL_GPL(dm_get_md);

void *dm_get_mdptr(struct mapped_device *md)
{
	return md->interface_ptr;
}

void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}

void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
	BUG_ON(test_bit(DMF_FREEING, &md->flags));
}

const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);

static void __dm_destroy(struct mapped_device *md, bool wait)
{
	struct dm_table *map;
	int srcu_idx;

	might_sleep();

	spin_lock(&_minor_lock);
	map = dm_get_live_table(md, &srcu_idx);
	idr_replace(&_minor_idr, MINOR_ALLOCED,
		    MINOR(disk_devt(dm_disk(md))));
	set_bit(DMF_FREEING, &md->flags);
	spin_unlock(&_minor_lock);

	if (!dm_suspended_md(md)) {
		dm_table_presuspend_targets(map);
		dm_table_postsuspend_targets(map);
	}

	/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
	dm_put_live_table(md, srcu_idx);

	/*
	 * Rarely there may still be I/O requests that need to complete.
	 * Wait for all references to disappear.
	 * No one should increment the reference count of the mapped_device
	 * after the mapped_device state becomes DMF_FREEING.
	 */
	if (wait)
		while (atomic_read(&md->holders))
			msleep(1);
	else if (atomic_read(&md->holders))
		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
		       dm_device_name(md), atomic_read(&md->holders));

	dm_sysfs_exit(md);
	dm_table_destroy(__unbind(md));
	free_dev(md);
}

void dm_destroy(struct mapped_device *md)
{
	__dm_destroy(md, true);
}

void dm_destroy_immediate(struct mapped_device *md)
{
	__dm_destroy(md, false);
}

void dm_put(struct mapped_device *md)
{
	atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);

static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
{
	int r = 0;
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&md->wait, &wait);

	while (1) {
		set_current_state(interruptible);

		if (!md_in_flight(md))
			break;

		if (interruptible == TASK_INTERRUPTIBLE &&
		    signal_pending(current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	remove_wait_queue(&md->wait, &wait);

	return r;
}

/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;
	int srcu_idx;
	struct dm_table *map;

	map = dm_get_live_table(md, &srcu_idx);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c)
			break;

		if (dm_request_based(md))
			generic_make_request(c);
		else
			__split_and_process_bio(md, map, c);
	}

	dm_put_live_table(md, srcu_idx);
}

static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_clear_bit();
	queue_work(md->wq, &md->work);
}

/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
	struct queue_limits limits;
	int r;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended_md(md))
		goto out;

	/*
	 * If the new table has no data devices, retain the existing limits.
	 * This helps multipath with queue_if_no_path if all paths disappear,
	 * then new I/O is queued based on these limits, and then some paths
	 * reappear.
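	 * (dm_table_has_no_data_devices() detects that case below, and the
	 * live table's current queue limits are reused instead of being
	 * recalculated from the new, empty table.)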
	 */
	if (dm_table_has_no_data_devices(table)) {
		live_map = dm_get_live_table_fast(md);
		if (live_map)
			limits = md->queue->limits;
		dm_put_live_table_fast(md);
	}

	if (!live_map) {
		r = dm_calculate_queue_limits(table, &limits);
		if (r) {
			map = ERR_PTR(r);
			goto out;
		}
	}

	map = __bind(md, table, &limits);

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	mutex_lock(&md->suspend_lock);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	map = md->map;

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out_unlock;
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_request and quiesce the thread
	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
	 * flush_workqueue(md->wq).
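	 *
	 * (The synchronize_srcu() below additionally waits for dm_request
	 * callers already inside the SRCU read-side section to finish.)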
	 */
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	synchronize_srcu(&md->io_barrier);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md))
		stop_queue(md->queue);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	synchronize_srcu(&md->io_barrier);

	/* were we interrupted ? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			start_queue(md->queue);

		unlock_fs(md);
		goto out_unlock; /* pushback list is already flushed, so skip flush */
	}

	/*
	 * If dm_wait_for_completion returned 0, the device is completely
	 * quiescent now. There is no request-processing activity. All new
	 * requests are being added to md->deferred list.
	 */

	set_bit(DMF_SUSPENDED, &md->flags);

	dm_table_postsuspend_targets(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}

int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

	mutex_lock(&md->suspend_lock);
	if (!dm_suspended_md(md))
		goto out;

	map = md->map;
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		start_queue(md->queue);

	unlock_fs(md);

	clear_bit(DMF_SUSPENDED, &md->flags);

	r = 0;
out:
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*
 * Internal suspend/resume works like userspace-driven suspend. It waits
 * until all bios finish and prevents issuing new bios to the target drivers.
 * It may be used only from the kernel.
 *
 * Internal suspend holds md->suspend_lock, which prevents interaction with
 * userspace-driven suspend.
 */

void dm_internal_suspend(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	if (dm_suspended_md(md))
		return;

	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	synchronize_srcu(&md->io_barrier);
	flush_workqueue(md->wq);
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}

void dm_internal_resume(struct mapped_device *md)
{
	if (dm_suspended_md(md))
		goto done;

	dm_queue_flush(md);

done:
	mutex_unlock(&md->suspend_lock);
}

/*-----------------------------------------------------------------
 * Event notification.
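 *
 * event_callback() above queues any pending uevents and bumps
 * md->event_nr; dm_wait_event() below lets a caller sleep until the
 * counter moves past the value it last saw.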
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj;
}

/*
 * struct mapped_device should not be exported outside of dm.c
 * so use this check to verify that kobj is part of md structure
 */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj);
	if (&md->kobj != kobj)
		return NULL;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_test_deferred_remove_flag(struct mapped_device *md)
{
	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
{
	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
	struct kmem_cache *cachep;
	unsigned int pool_size;
	unsigned int front_pad;

	if (!pools)
		return NULL;

	if (type == DM_TYPE_BIO_BASED) {
		cachep = _io_cache;
		pool_size = dm_get_reserved_bio_based_ios();
		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
	} else if (type == DM_TYPE_REQUEST_BASED) {
		cachep = _rq_tio_cache;
		pool_size = dm_get_reserved_rq_based_ios();
		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
		/* per_bio_data_size is not used. See __bind_mempools(). */
		WARN_ON(per_bio_data_size != 0);
	} else
		goto out;

	pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
	if (!pools->io_pool)
		goto out;

	pools->bs = bioset_create(pool_size, front_pad);
	if (!pools->bs)
		goto out;

	if (integrity && bioset_integrity_create(pools->bs, pool_size))
		goto out;

	return pools;

out:
	dm_free_md_mempools(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	if (pools->io_pool)
		mempool_destroy(pools->io_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
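
/*
 * Example (a sketch, assuming dm is built as the dm_mod module): the
 * reserved mempool sizes declared above can be set at load time,
 *
 *	modprobe dm_mod reserved_rq_based_ios=512
 *
 * or adjusted later through
 * /sys/module/dm_mod/parameters/reserved_rq_based_ios, since the
 * parameters are registered with S_IRUGO | S_IWUSR.
 */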