1 /* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm.h" 9 #include "dm-uevent.h" 10 11 #include <linux/init.h> 12 #include <linux/module.h> 13 #include <linux/mutex.h> 14 #include <linux/moduleparam.h> 15 #include <linux/blkpg.h> 16 #include <linux/bio.h> 17 #include <linux/mempool.h> 18 #include <linux/slab.h> 19 #include <linux/idr.h> 20 #include <linux/hdreg.h> 21 #include <linux/delay.h> 22 23 #include <trace/events/block.h> 24 25 #define DM_MSG_PREFIX "core" 26 27 #ifdef CONFIG_PRINTK 28 /* 29 * ratelimit state to be used in DMXXX_LIMIT(). 30 */ 31 DEFINE_RATELIMIT_STATE(dm_ratelimit_state, 32 DEFAULT_RATELIMIT_INTERVAL, 33 DEFAULT_RATELIMIT_BURST); 34 EXPORT_SYMBOL(dm_ratelimit_state); 35 #endif 36 37 /* 38 * Cookies are numeric values sent with CHANGE and REMOVE 39 * uevents while resuming, removing or renaming the device. 40 */ 41 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 42 #define DM_COOKIE_LENGTH 24 43 44 static const char *_name = DM_NAME; 45 46 static unsigned int major = 0; 47 static unsigned int _major = 0; 48 49 static DEFINE_IDR(_minor_idr); 50 51 static DEFINE_SPINLOCK(_minor_lock); 52 /* 53 * For bio-based dm. 54 * One of these is allocated per bio. 55 */ 56 struct dm_io { 57 struct mapped_device *md; 58 int error; 59 atomic_t io_count; 60 struct bio *bio; 61 unsigned long start_time; 62 spinlock_t endio_lock; 63 }; 64 65 /* 66 * For bio-based dm. 67 * One of these is allocated per target within a bio. Hopefully 68 * this will be simplified out one day. 69 */ 70 struct dm_target_io { 71 struct dm_io *io; 72 struct dm_target *ti; 73 union map_info info; 74 }; 75 76 /* 77 * For request-based dm. 78 * One of these is allocated per request. 79 */ 80 struct dm_rq_target_io { 81 struct mapped_device *md; 82 struct dm_target *ti; 83 struct request *orig, clone; 84 int error; 85 union map_info info; 86 }; 87 88 /* 89 * For request-based dm - the bio clones we allocate are embedded in these 90 * structs. 91 * 92 * We allocate these with bio_alloc_bioset, using the front_pad parameter when 93 * the bioset is created - this means the bio has to come at the end of the 94 * struct. 95 */ 96 struct dm_rq_clone_bio_info { 97 struct bio *orig; 98 struct dm_rq_target_io *tio; 99 struct bio clone; 100 }; 101 102 union map_info *dm_get_mapinfo(struct bio *bio) 103 { 104 if (bio && bio->bi_private) 105 return &((struct dm_target_io *)bio->bi_private)->info; 106 return NULL; 107 } 108 109 union map_info *dm_get_rq_mapinfo(struct request *rq) 110 { 111 if (rq && rq->end_io_data) 112 return &((struct dm_rq_target_io *)rq->end_io_data)->info; 113 return NULL; 114 } 115 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); 116 117 #define MINOR_ALLOCED ((void *)-1) 118 119 /* 120 * Bits for the md->flags field. 121 */ 122 #define DMF_BLOCK_IO_FOR_SUSPEND 0 123 #define DMF_SUSPENDED 1 124 #define DMF_FROZEN 2 125 #define DMF_FREEING 3 126 #define DMF_DELETING 4 127 #define DMF_NOFLUSH_SUSPENDING 5 128 #define DMF_MERGE_IS_OPTIONAL 6 129 130 /* 131 * Work processed by per-device workqueue. 132 */ 133 struct mapped_device { 134 struct rw_semaphore io_lock; 135 struct mutex suspend_lock; 136 rwlock_t map_lock; 137 atomic_t holders; 138 atomic_t open_count; 139 140 unsigned long flags; 141 142 struct request_queue *queue; 143 unsigned type; 144 /* Protect queue and type against concurrent access. 
*/ 145 struct mutex type_lock; 146 147 struct target_type *immutable_target_type; 148 149 struct gendisk *disk; 150 char name[16]; 151 152 void *interface_ptr; 153 154 /* 155 * A list of ios that arrived while we were suspended. 156 */ 157 atomic_t pending[2]; 158 wait_queue_head_t wait; 159 struct work_struct work; 160 struct bio_list deferred; 161 spinlock_t deferred_lock; 162 163 /* 164 * Processing queue (flush) 165 */ 166 struct workqueue_struct *wq; 167 168 /* 169 * The current mapping. 170 */ 171 struct dm_table *map; 172 173 /* 174 * io objects are allocated from here. 175 */ 176 mempool_t *io_pool; 177 mempool_t *tio_pool; 178 179 struct bio_set *bs; 180 181 /* 182 * Event handling. 183 */ 184 atomic_t event_nr; 185 wait_queue_head_t eventq; 186 atomic_t uevent_seq; 187 struct list_head uevent_list; 188 spinlock_t uevent_lock; /* Protect access to uevent_list */ 189 190 /* 191 * freeze/thaw support require holding onto a super block 192 */ 193 struct super_block *frozen_sb; 194 struct block_device *bdev; 195 196 /* forced geometry settings */ 197 struct hd_geometry geometry; 198 199 /* sysfs handle */ 200 struct kobject kobj; 201 202 /* zero-length flush that will be cloned and submitted to targets */ 203 struct bio flush_bio; 204 }; 205 206 /* 207 * For mempools pre-allocation at the table loading time. 208 */ 209 struct dm_md_mempools { 210 mempool_t *io_pool; 211 mempool_t *tio_pool; 212 struct bio_set *bs; 213 }; 214 215 #define MIN_IOS 256 216 static struct kmem_cache *_io_cache; 217 static struct kmem_cache *_tio_cache; 218 static struct kmem_cache *_rq_tio_cache; 219 220 /* 221 * Unused now, and needs to be deleted. But since io_pool is overloaded and it's 222 * still used for _io_cache, I'm leaving this for a later cleanup 223 */ 224 static struct kmem_cache *_rq_bio_info_cache; 225 226 static int __init local_init(void) 227 { 228 int r = -ENOMEM; 229 230 /* allocate a slab for the dm_ios */ 231 _io_cache = KMEM_CACHE(dm_io, 0); 232 if (!_io_cache) 233 return r; 234 235 /* allocate a slab for the target ios */ 236 _tio_cache = KMEM_CACHE(dm_target_io, 0); 237 if (!_tio_cache) 238 goto out_free_io_cache; 239 240 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 241 if (!_rq_tio_cache) 242 goto out_free_tio_cache; 243 244 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); 245 if (!_rq_bio_info_cache) 246 goto out_free_rq_tio_cache; 247 248 r = dm_uevent_init(); 249 if (r) 250 goto out_free_rq_bio_info_cache; 251 252 _major = major; 253 r = register_blkdev(_major, _name); 254 if (r < 0) 255 goto out_uevent_exit; 256 257 if (!_major) 258 _major = r; 259 260 return 0; 261 262 out_uevent_exit: 263 dm_uevent_exit(); 264 out_free_rq_bio_info_cache: 265 kmem_cache_destroy(_rq_bio_info_cache); 266 out_free_rq_tio_cache: 267 kmem_cache_destroy(_rq_tio_cache); 268 out_free_tio_cache: 269 kmem_cache_destroy(_tio_cache); 270 out_free_io_cache: 271 kmem_cache_destroy(_io_cache); 272 273 return r; 274 } 275 276 static void local_exit(void) 277 { 278 kmem_cache_destroy(_rq_bio_info_cache); 279 kmem_cache_destroy(_rq_tio_cache); 280 kmem_cache_destroy(_tio_cache); 281 kmem_cache_destroy(_io_cache); 282 unregister_blkdev(_major, _name); 283 dm_uevent_exit(); 284 285 _major = 0; 286 287 DMINFO("cleaned up"); 288 } 289 290 static int (*_inits[])(void) __initdata = { 291 local_init, 292 dm_target_init, 293 dm_linear_init, 294 dm_stripe_init, 295 dm_io_init, 296 dm_kcopyd_init, 297 dm_interface_init, 298 }; 299 300 static void (*_exits[])(void) = { 301 local_exit, 302 dm_target_exit, 303 
dm_linear_exit, 304 dm_stripe_exit, 305 dm_io_exit, 306 dm_kcopyd_exit, 307 dm_interface_exit, 308 }; 309 310 static int __init dm_init(void) 311 { 312 const int count = ARRAY_SIZE(_inits); 313 314 int r, i; 315 316 for (i = 0; i < count; i++) { 317 r = _inits[i](); 318 if (r) 319 goto bad; 320 } 321 322 return 0; 323 324 bad: 325 while (i--) 326 _exits[i](); 327 328 return r; 329 } 330 331 static void __exit dm_exit(void) 332 { 333 int i = ARRAY_SIZE(_exits); 334 335 while (i--) 336 _exits[i](); 337 338 /* 339 * Should be empty by this point. 340 */ 341 idr_remove_all(&_minor_idr); 342 idr_destroy(&_minor_idr); 343 } 344 345 /* 346 * Block device functions 347 */ 348 int dm_deleting_md(struct mapped_device *md) 349 { 350 return test_bit(DMF_DELETING, &md->flags); 351 } 352 353 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 354 { 355 struct mapped_device *md; 356 357 spin_lock(&_minor_lock); 358 359 md = bdev->bd_disk->private_data; 360 if (!md) 361 goto out; 362 363 if (test_bit(DMF_FREEING, &md->flags) || 364 dm_deleting_md(md)) { 365 md = NULL; 366 goto out; 367 } 368 369 dm_get(md); 370 atomic_inc(&md->open_count); 371 372 out: 373 spin_unlock(&_minor_lock); 374 375 return md ? 0 : -ENXIO; 376 } 377 378 static int dm_blk_close(struct gendisk *disk, fmode_t mode) 379 { 380 struct mapped_device *md = disk->private_data; 381 382 spin_lock(&_minor_lock); 383 384 atomic_dec(&md->open_count); 385 dm_put(md); 386 387 spin_unlock(&_minor_lock); 388 389 return 0; 390 } 391 392 int dm_open_count(struct mapped_device *md) 393 { 394 return atomic_read(&md->open_count); 395 } 396 397 /* 398 * Guarantees nothing is using the device before it's deleted. 399 */ 400 int dm_lock_for_deletion(struct mapped_device *md) 401 { 402 int r = 0; 403 404 spin_lock(&_minor_lock); 405 406 if (dm_open_count(md)) 407 r = -EBUSY; 408 else 409 set_bit(DMF_DELETING, &md->flags); 410 411 spin_unlock(&_minor_lock); 412 413 return r; 414 } 415 416 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 417 { 418 struct mapped_device *md = bdev->bd_disk->private_data; 419 420 return dm_get_geometry(md, geo); 421 } 422 423 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 424 unsigned int cmd, unsigned long arg) 425 { 426 struct mapped_device *md = bdev->bd_disk->private_data; 427 struct dm_table *map = dm_get_live_table(md); 428 struct dm_target *tgt; 429 int r = -ENOTTY; 430 431 if (!map || !dm_table_get_size(map)) 432 goto out; 433 434 /* We only support devices that have a single target */ 435 if (dm_table_get_num_targets(map) != 1) 436 goto out; 437 438 tgt = dm_table_get_target(map, 0); 439 440 if (dm_suspended_md(md)) { 441 r = -EAGAIN; 442 goto out; 443 } 444 445 if (tgt->type->ioctl) 446 r = tgt->type->ioctl(tgt, cmd, arg); 447 448 out: 449 dm_table_put(map); 450 451 return r; 452 } 453 454 static struct dm_io *alloc_io(struct mapped_device *md) 455 { 456 return mempool_alloc(md->io_pool, GFP_NOIO); 457 } 458 459 static void free_io(struct mapped_device *md, struct dm_io *io) 460 { 461 mempool_free(io, md->io_pool); 462 } 463 464 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 465 { 466 mempool_free(tio, md->tio_pool); 467 } 468 469 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 470 gfp_t gfp_mask) 471 { 472 return mempool_alloc(md->tio_pool, gfp_mask); 473 } 474 475 static void free_rq_tio(struct dm_rq_target_io *tio) 476 { 477 mempool_free(tio, tio->md->tio_pool); 478 } 479 480 static int md_in_flight(struct 
mapped_device *md) 481 { 482 return atomic_read(&md->pending[READ]) + 483 atomic_read(&md->pending[WRITE]); 484 } 485 486 static void start_io_acct(struct dm_io *io) 487 { 488 struct mapped_device *md = io->md; 489 int cpu; 490 int rw = bio_data_dir(io->bio); 491 492 io->start_time = jiffies; 493 494 cpu = part_stat_lock(); 495 part_round_stats(cpu, &dm_disk(md)->part0); 496 part_stat_unlock(); 497 atomic_set(&dm_disk(md)->part0.in_flight[rw], 498 atomic_inc_return(&md->pending[rw])); 499 } 500 501 static void end_io_acct(struct dm_io *io) 502 { 503 struct mapped_device *md = io->md; 504 struct bio *bio = io->bio; 505 unsigned long duration = jiffies - io->start_time; 506 int pending, cpu; 507 int rw = bio_data_dir(bio); 508 509 cpu = part_stat_lock(); 510 part_round_stats(cpu, &dm_disk(md)->part0); 511 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); 512 part_stat_unlock(); 513 514 /* 515 * After this is decremented the bio must not be touched if it is 516 * a flush. 517 */ 518 pending = atomic_dec_return(&md->pending[rw]); 519 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); 520 pending += atomic_read(&md->pending[rw^0x1]); 521 522 /* nudge anyone waiting on suspend queue */ 523 if (!pending) 524 wake_up(&md->wait); 525 } 526 527 /* 528 * Add the bio to the list of deferred io. 529 */ 530 static void queue_io(struct mapped_device *md, struct bio *bio) 531 { 532 unsigned long flags; 533 534 spin_lock_irqsave(&md->deferred_lock, flags); 535 bio_list_add(&md->deferred, bio); 536 spin_unlock_irqrestore(&md->deferred_lock, flags); 537 queue_work(md->wq, &md->work); 538 } 539 540 /* 541 * Everyone (including functions in this file), should use this 542 * function to access the md->map field, and make sure they call 543 * dm_table_put() when finished. 544 */ 545 struct dm_table *dm_get_live_table(struct mapped_device *md) 546 { 547 struct dm_table *t; 548 unsigned long flags; 549 550 read_lock_irqsave(&md->map_lock, flags); 551 t = md->map; 552 if (t) 553 dm_table_get(t); 554 read_unlock_irqrestore(&md->map_lock, flags); 555 556 return t; 557 } 558 559 /* 560 * Get the geometry associated with a dm device 561 */ 562 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 563 { 564 *geo = md->geometry; 565 566 return 0; 567 } 568 569 /* 570 * Set the geometry of a device. 571 */ 572 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 573 { 574 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 575 576 if (geo->start > sz) { 577 DMWARN("Start sector is beyond the geometry limits."); 578 return -EINVAL; 579 } 580 581 md->geometry = *geo; 582 583 return 0; 584 } 585 586 /*----------------------------------------------------------------- 587 * CRUD START: 588 * A more elegant soln is in the works that uses the queue 589 * merge fn, unfortunately there are a couple of changes to 590 * the block layer that I want to make for this. So in the 591 * interests of getting something for people to use I give 592 * you this clearly demarcated crap. 593 *---------------------------------------------------------------*/ 594 595 static int __noflush_suspending(struct mapped_device *md) 596 { 597 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 598 } 599 600 /* 601 * Decrements the number of outstanding ios that a bio has been 602 * cloned into, completing the original io if necc. 
603 */ 604 static void dec_pending(struct dm_io *io, int error) 605 { 606 unsigned long flags; 607 int io_error; 608 struct bio *bio; 609 struct mapped_device *md = io->md; 610 611 /* Push-back supersedes any I/O errors */ 612 if (unlikely(error)) { 613 spin_lock_irqsave(&io->endio_lock, flags); 614 if (!(io->error > 0 && __noflush_suspending(md))) 615 io->error = error; 616 spin_unlock_irqrestore(&io->endio_lock, flags); 617 } 618 619 if (atomic_dec_and_test(&io->io_count)) { 620 if (io->error == DM_ENDIO_REQUEUE) { 621 /* 622 * Target requested pushing back the I/O. 623 */ 624 spin_lock_irqsave(&md->deferred_lock, flags); 625 if (__noflush_suspending(md)) 626 bio_list_add_head(&md->deferred, io->bio); 627 else 628 /* noflush suspend was interrupted. */ 629 io->error = -EIO; 630 spin_unlock_irqrestore(&md->deferred_lock, flags); 631 } 632 633 io_error = io->error; 634 bio = io->bio; 635 end_io_acct(io); 636 free_io(md, io); 637 638 if (io_error == DM_ENDIO_REQUEUE) 639 return; 640 641 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { 642 /* 643 * Preflush done for flush with data, reissue 644 * without REQ_FLUSH. 645 */ 646 bio->bi_rw &= ~REQ_FLUSH; 647 queue_io(md, bio); 648 } else { 649 /* done with normal IO or empty flush */ 650 trace_block_bio_complete(md->queue, bio, io_error); 651 bio_endio(bio, io_error); 652 } 653 } 654 } 655 656 static void clone_endio(struct bio *bio, int error) 657 { 658 int r = 0; 659 struct dm_target_io *tio = bio->bi_private; 660 struct dm_io *io = tio->io; 661 struct mapped_device *md = tio->io->md; 662 dm_endio_fn endio = tio->ti->type->end_io; 663 664 if (!bio_flagged(bio, BIO_UPTODATE) && !error) 665 error = -EIO; 666 667 if (endio) { 668 r = endio(tio->ti, bio, error, &tio->info); 669 if (r < 0 || r == DM_ENDIO_REQUEUE) 670 /* 671 * error and requeue request are handled 672 * in dec_pending(). 673 */ 674 error = r; 675 else if (r == DM_ENDIO_INCOMPLETE) 676 /* The target will handle the io */ 677 return; 678 else if (r) { 679 DMWARN("unimplemented target endio return value: %d", r); 680 BUG(); 681 } 682 } 683 684 free_tio(md, tio); 685 bio_put(bio); 686 dec_pending(io, error); 687 } 688 689 /* 690 * Partial completion handling for request-based dm 691 */ 692 static void end_clone_bio(struct bio *clone, int error) 693 { 694 struct dm_rq_clone_bio_info *info = clone->bi_private; 695 struct dm_rq_target_io *tio = info->tio; 696 struct bio *bio = info->orig; 697 unsigned int nr_bytes = info->orig->bi_size; 698 699 bio_put(clone); 700 701 if (tio->error) 702 /* 703 * An error has already been detected on the request. 704 * Once error occurred, just let clone->end_io() handle 705 * the remainder. 706 */ 707 return; 708 else if (error) { 709 /* 710 * Don't notice the error to the upper layer yet. 711 * The error handling decision is made by the target driver, 712 * when the request is completed. 713 */ 714 tio->error = error; 715 return; 716 } 717 718 /* 719 * I/O for the bio successfully completed. 720 * Notice the data completion to the upper layer. 721 */ 722 723 /* 724 * bios are processed from the head of the list. 725 * So the completing bio should always be rq->bio. 726 * If it's not, something wrong is happening. 727 */ 728 if (tio->orig->bio != bio) 729 DMERR("bio completion is going in the middle of the request"); 730 731 /* 732 * Update the original request. 733 * Do not use blk_end_request() here, because it may complete 734 * the original request before the clone, and break the ordering. 
735 */ 736 blk_update_request(tio->orig, 0, nr_bytes); 737 } 738 739 /* 740 * Don't touch any member of the md after calling this function because 741 * the md may be freed in dm_put() at the end of this function. 742 * Or do dm_get() before calling this function and dm_put() later. 743 */ 744 static void rq_completed(struct mapped_device *md, int rw, int run_queue) 745 { 746 atomic_dec(&md->pending[rw]); 747 748 /* nudge anyone waiting on suspend queue */ 749 if (!md_in_flight(md)) 750 wake_up(&md->wait); 751 752 if (run_queue) 753 blk_run_queue(md->queue); 754 755 /* 756 * dm_put() must be at the end of this function. See the comment above 757 */ 758 dm_put(md); 759 } 760 761 static void free_rq_clone(struct request *clone) 762 { 763 struct dm_rq_target_io *tio = clone->end_io_data; 764 765 blk_rq_unprep_clone(clone); 766 free_rq_tio(tio); 767 } 768 769 /* 770 * Complete the clone and the original request. 771 * Must be called without queue lock. 772 */ 773 static void dm_end_request(struct request *clone, int error) 774 { 775 int rw = rq_data_dir(clone); 776 struct dm_rq_target_io *tio = clone->end_io_data; 777 struct mapped_device *md = tio->md; 778 struct request *rq = tio->orig; 779 780 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 781 rq->errors = clone->errors; 782 rq->resid_len = clone->resid_len; 783 784 if (rq->sense) 785 /* 786 * We are using the sense buffer of the original 787 * request. 788 * So setting the length of the sense data is enough. 789 */ 790 rq->sense_len = clone->sense_len; 791 } 792 793 free_rq_clone(clone); 794 blk_end_request_all(rq, error); 795 rq_completed(md, rw, true); 796 } 797 798 static void dm_unprep_request(struct request *rq) 799 { 800 struct request *clone = rq->special; 801 802 rq->special = NULL; 803 rq->cmd_flags &= ~REQ_DONTPREP; 804 805 free_rq_clone(clone); 806 } 807 808 /* 809 * Requeue the original request of a clone. 
810 */ 811 void dm_requeue_unmapped_request(struct request *clone) 812 { 813 int rw = rq_data_dir(clone); 814 struct dm_rq_target_io *tio = clone->end_io_data; 815 struct mapped_device *md = tio->md; 816 struct request *rq = tio->orig; 817 struct request_queue *q = rq->q; 818 unsigned long flags; 819 820 dm_unprep_request(rq); 821 822 spin_lock_irqsave(q->queue_lock, flags); 823 blk_requeue_request(q, rq); 824 spin_unlock_irqrestore(q->queue_lock, flags); 825 826 rq_completed(md, rw, 0); 827 } 828 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 829 830 static void __stop_queue(struct request_queue *q) 831 { 832 blk_stop_queue(q); 833 } 834 835 static void stop_queue(struct request_queue *q) 836 { 837 unsigned long flags; 838 839 spin_lock_irqsave(q->queue_lock, flags); 840 __stop_queue(q); 841 spin_unlock_irqrestore(q->queue_lock, flags); 842 } 843 844 static void __start_queue(struct request_queue *q) 845 { 846 if (blk_queue_stopped(q)) 847 blk_start_queue(q); 848 } 849 850 static void start_queue(struct request_queue *q) 851 { 852 unsigned long flags; 853 854 spin_lock_irqsave(q->queue_lock, flags); 855 __start_queue(q); 856 spin_unlock_irqrestore(q->queue_lock, flags); 857 } 858 859 static void dm_done(struct request *clone, int error, bool mapped) 860 { 861 int r = error; 862 struct dm_rq_target_io *tio = clone->end_io_data; 863 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; 864 865 if (mapped && rq_end_io) 866 r = rq_end_io(tio->ti, clone, error, &tio->info); 867 868 if (r <= 0) 869 /* The target wants to complete the I/O */ 870 dm_end_request(clone, r); 871 else if (r == DM_ENDIO_INCOMPLETE) 872 /* The target will handle the I/O */ 873 return; 874 else if (r == DM_ENDIO_REQUEUE) 875 /* The target wants to requeue the I/O */ 876 dm_requeue_unmapped_request(clone); 877 else { 878 DMWARN("unimplemented target endio return value: %d", r); 879 BUG(); 880 } 881 } 882 883 /* 884 * Request completion handler for request-based dm 885 */ 886 static void dm_softirq_done(struct request *rq) 887 { 888 bool mapped = true; 889 struct request *clone = rq->completion_data; 890 struct dm_rq_target_io *tio = clone->end_io_data; 891 892 if (rq->cmd_flags & REQ_FAILED) 893 mapped = false; 894 895 dm_done(clone, tio->error, mapped); 896 } 897 898 /* 899 * Complete the clone and the original request with the error status 900 * through softirq context. 901 */ 902 static void dm_complete_request(struct request *clone, int error) 903 { 904 struct dm_rq_target_io *tio = clone->end_io_data; 905 struct request *rq = tio->orig; 906 907 tio->error = error; 908 rq->completion_data = clone; 909 blk_complete_request(rq); 910 } 911 912 /* 913 * Complete the not-mapped clone and the original request with the error status 914 * through softirq context. 915 * Target's rq_end_io() function isn't called. 916 * This may be used when the target's map_rq() function fails. 917 */ 918 void dm_kill_unmapped_request(struct request *clone, int error) 919 { 920 struct dm_rq_target_io *tio = clone->end_io_data; 921 struct request *rq = tio->orig; 922 923 rq->cmd_flags |= REQ_FAILED; 924 dm_complete_request(clone, error); 925 } 926 EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); 927 928 /* 929 * Called with the queue lock held 930 */ 931 static void end_clone_request(struct request *clone, int error) 932 { 933 /* 934 * For just cleaning up the information of the queue in which 935 * the clone was dispatched. 
936 * The clone is *NOT* freed actually here because it is alloced from 937 * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. 938 */ 939 __blk_put_request(clone->q, clone); 940 941 /* 942 * Actual request completion is done in a softirq context which doesn't 943 * hold the queue lock. Otherwise, deadlock could occur because: 944 * - another request may be submitted by the upper level driver 945 * of the stacking during the completion 946 * - the submission which requires queue lock may be done 947 * against this queue 948 */ 949 dm_complete_request(clone, error); 950 } 951 952 /* 953 * Return maximum size of I/O possible at the supplied sector up to the current 954 * target boundary. 955 */ 956 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) 957 { 958 sector_t target_offset = dm_target_offset(ti, sector); 959 960 return ti->len - target_offset; 961 } 962 963 static sector_t max_io_len(sector_t sector, struct dm_target *ti) 964 { 965 sector_t len = max_io_len_target_boundary(sector, ti); 966 sector_t offset, max_len; 967 968 /* 969 * Does the target need to split even further? 970 */ 971 if (ti->max_io_len) { 972 offset = dm_target_offset(ti, sector); 973 if (unlikely(ti->max_io_len & (ti->max_io_len - 1))) 974 max_len = sector_div(offset, ti->max_io_len); 975 else 976 max_len = offset & (ti->max_io_len - 1); 977 max_len = ti->max_io_len - max_len; 978 979 if (len > max_len) 980 len = max_len; 981 } 982 983 return len; 984 } 985 986 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) 987 { 988 if (len > UINT_MAX) { 989 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", 990 (unsigned long long)len, UINT_MAX); 991 ti->error = "Maximum size of target IO is too large"; 992 return -EINVAL; 993 } 994 995 ti->max_io_len = (uint32_t) len; 996 997 return 0; 998 } 999 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); 1000 1001 static void __map_bio(struct dm_target *ti, struct bio *clone, 1002 struct dm_target_io *tio) 1003 { 1004 int r; 1005 sector_t sector; 1006 struct mapped_device *md; 1007 1008 clone->bi_end_io = clone_endio; 1009 clone->bi_private = tio; 1010 1011 /* 1012 * Map the clone. If r == 0 we don't need to do 1013 * anything, the target has assumed ownership of 1014 * this io. 1015 */ 1016 atomic_inc(&tio->io->io_count); 1017 sector = clone->bi_sector; 1018 r = ti->type->map(ti, clone, &tio->info); 1019 if (r == DM_MAPIO_REMAPPED) { 1020 /* the bio has been remapped so dispatch it */ 1021 1022 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, 1023 tio->io->bio->bi_bdev->bd_dev, sector); 1024 1025 generic_make_request(clone); 1026 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1027 /* error the io and bail out, or requeue it if needed */ 1028 md = tio->io->md; 1029 dec_pending(tio->io, r); 1030 bio_put(clone); 1031 free_tio(md, tio); 1032 } else if (r) { 1033 DMWARN("unimplemented target map return value: %d", r); 1034 BUG(); 1035 } 1036 } 1037 1038 struct clone_info { 1039 struct mapped_device *md; 1040 struct dm_table *map; 1041 struct bio *bio; 1042 struct dm_io *io; 1043 sector_t sector; 1044 sector_t sector_count; 1045 unsigned short idx; 1046 }; 1047 1048 /* 1049 * Creates a little bio that just does part of a bvec. 
1050 */ 1051 static struct bio *split_bvec(struct bio *bio, sector_t sector, 1052 unsigned short idx, unsigned int offset, 1053 unsigned int len, struct bio_set *bs) 1054 { 1055 struct bio *clone; 1056 struct bio_vec *bv = bio->bi_io_vec + idx; 1057 1058 clone = bio_alloc_bioset(GFP_NOIO, 1, bs); 1059 *clone->bi_io_vec = *bv; 1060 1061 clone->bi_sector = sector; 1062 clone->bi_bdev = bio->bi_bdev; 1063 clone->bi_rw = bio->bi_rw; 1064 clone->bi_vcnt = 1; 1065 clone->bi_size = to_bytes(len); 1066 clone->bi_io_vec->bv_offset = offset; 1067 clone->bi_io_vec->bv_len = clone->bi_size; 1068 clone->bi_flags |= 1 << BIO_CLONED; 1069 1070 if (bio_integrity(bio)) { 1071 bio_integrity_clone(clone, bio, GFP_NOIO); 1072 bio_integrity_trim(clone, 1073 bio_sector_offset(bio, idx, offset), len); 1074 } 1075 1076 return clone; 1077 } 1078 1079 /* 1080 * Creates a bio that consists of range of complete bvecs. 1081 */ 1082 static struct bio *clone_bio(struct bio *bio, sector_t sector, 1083 unsigned short idx, unsigned short bv_count, 1084 unsigned int len, struct bio_set *bs) 1085 { 1086 struct bio *clone; 1087 1088 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); 1089 __bio_clone(clone, bio); 1090 clone->bi_sector = sector; 1091 clone->bi_idx = idx; 1092 clone->bi_vcnt = idx + bv_count; 1093 clone->bi_size = to_bytes(len); 1094 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 1095 1096 if (bio_integrity(bio)) { 1097 bio_integrity_clone(clone, bio, GFP_NOIO); 1098 1099 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1100 bio_integrity_trim(clone, 1101 bio_sector_offset(bio, idx, 0), len); 1102 } 1103 1104 return clone; 1105 } 1106 1107 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1108 struct dm_target *ti) 1109 { 1110 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); 1111 1112 tio->io = ci->io; 1113 tio->ti = ti; 1114 memset(&tio->info, 0, sizeof(tio->info)); 1115 1116 return tio; 1117 } 1118 1119 static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, 1120 unsigned request_nr, sector_t len) 1121 { 1122 struct dm_target_io *tio = alloc_tio(ci, ti); 1123 struct bio *clone; 1124 1125 tio->info.target_request_nr = request_nr; 1126 1127 /* 1128 * Discard requests require the bio's inline iovecs be initialized. 1129 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush 1130 * and discard, so no need for concern about wasted bvec allocations. 1131 */ 1132 clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); 1133 __bio_clone(clone, ci->bio); 1134 if (len) { 1135 clone->bi_sector = ci->sector; 1136 clone->bi_size = to_bytes(len); 1137 } 1138 1139 __map_bio(ti, clone, tio); 1140 } 1141 1142 static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, 1143 unsigned num_requests, sector_t len) 1144 { 1145 unsigned request_nr; 1146 1147 for (request_nr = 0; request_nr < num_requests; request_nr++) 1148 __issue_target_request(ci, ti, request_nr, len); 1149 } 1150 1151 static int __clone_and_map_empty_flush(struct clone_info *ci) 1152 { 1153 unsigned target_nr = 0; 1154 struct dm_target *ti; 1155 1156 BUG_ON(bio_has_data(ci->bio)); 1157 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1158 __issue_target_requests(ci, ti, ti->num_flush_requests, 0); 1159 1160 return 0; 1161 } 1162 1163 /* 1164 * Perform all io with a single clone. 
1165 */ 1166 static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) 1167 { 1168 struct bio *clone, *bio = ci->bio; 1169 struct dm_target_io *tio; 1170 1171 tio = alloc_tio(ci, ti); 1172 clone = clone_bio(bio, ci->sector, ci->idx, 1173 bio->bi_vcnt - ci->idx, ci->sector_count, 1174 ci->md->bs); 1175 __map_bio(ti, clone, tio); 1176 ci->sector_count = 0; 1177 } 1178 1179 static int __clone_and_map_discard(struct clone_info *ci) 1180 { 1181 struct dm_target *ti; 1182 sector_t len; 1183 1184 do { 1185 ti = dm_table_find_target(ci->map, ci->sector); 1186 if (!dm_target_is_valid(ti)) 1187 return -EIO; 1188 1189 /* 1190 * Even though the device advertised discard support, 1191 * that does not mean every target supports it, and 1192 * reconfiguration might also have changed that since the 1193 * check was performed. 1194 */ 1195 if (!ti->num_discard_requests) 1196 return -EOPNOTSUPP; 1197 1198 if (!ti->split_discard_requests) 1199 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1200 else 1201 len = min(ci->sector_count, max_io_len(ci->sector, ti)); 1202 1203 __issue_target_requests(ci, ti, ti->num_discard_requests, len); 1204 1205 ci->sector += len; 1206 } while (ci->sector_count -= len); 1207 1208 return 0; 1209 } 1210 1211 static int __clone_and_map(struct clone_info *ci) 1212 { 1213 struct bio *clone, *bio = ci->bio; 1214 struct dm_target *ti; 1215 sector_t len = 0, max; 1216 struct dm_target_io *tio; 1217 1218 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1219 return __clone_and_map_discard(ci); 1220 1221 ti = dm_table_find_target(ci->map, ci->sector); 1222 if (!dm_target_is_valid(ti)) 1223 return -EIO; 1224 1225 max = max_io_len(ci->sector, ti); 1226 1227 if (ci->sector_count <= max) { 1228 /* 1229 * Optimise for the simple case where we can do all of 1230 * the remaining io with a single clone. 1231 */ 1232 __clone_and_map_simple(ci, ti); 1233 1234 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { 1235 /* 1236 * There are some bvecs that don't span targets. 1237 * Do as many of these as possible. 1238 */ 1239 int i; 1240 sector_t remaining = max; 1241 sector_t bv_len; 1242 1243 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { 1244 bv_len = to_sector(bio->bi_io_vec[i].bv_len); 1245 1246 if (bv_len > remaining) 1247 break; 1248 1249 remaining -= bv_len; 1250 len += bv_len; 1251 } 1252 1253 tio = alloc_tio(ci, ti); 1254 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, 1255 ci->md->bs); 1256 __map_bio(ti, clone, tio); 1257 1258 ci->sector += len; 1259 ci->sector_count -= len; 1260 ci->idx = i; 1261 1262 } else { 1263 /* 1264 * Handle a bvec that must be split between two or more targets. 1265 */ 1266 struct bio_vec *bv = bio->bi_io_vec + ci->idx; 1267 sector_t remaining = to_sector(bv->bv_len); 1268 unsigned int offset = 0; 1269 1270 do { 1271 if (offset) { 1272 ti = dm_table_find_target(ci->map, ci->sector); 1273 if (!dm_target_is_valid(ti)) 1274 return -EIO; 1275 1276 max = max_io_len(ci->sector, ti); 1277 } 1278 1279 len = min(remaining, max); 1280 1281 tio = alloc_tio(ci, ti); 1282 clone = split_bvec(bio, ci->sector, ci->idx, 1283 bv->bv_offset + offset, len, 1284 ci->md->bs); 1285 1286 __map_bio(ti, clone, tio); 1287 1288 ci->sector += len; 1289 ci->sector_count -= len; 1290 offset += to_bytes(len); 1291 } while (remaining -= len); 1292 1293 ci->idx++; 1294 } 1295 1296 return 0; 1297 } 1298 1299 /* 1300 * Split the bio into several clones and submit it to targets. 
1301 */ 1302 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) 1303 { 1304 struct clone_info ci; 1305 int error = 0; 1306 1307 ci.map = dm_get_live_table(md); 1308 if (unlikely(!ci.map)) { 1309 bio_io_error(bio); 1310 return; 1311 } 1312 1313 ci.md = md; 1314 ci.io = alloc_io(md); 1315 ci.io->error = 0; 1316 atomic_set(&ci.io->io_count, 1); 1317 ci.io->bio = bio; 1318 ci.io->md = md; 1319 spin_lock_init(&ci.io->endio_lock); 1320 ci.sector = bio->bi_sector; 1321 ci.idx = bio->bi_idx; 1322 1323 start_io_acct(ci.io); 1324 if (bio->bi_rw & REQ_FLUSH) { 1325 ci.bio = &ci.md->flush_bio; 1326 ci.sector_count = 0; 1327 error = __clone_and_map_empty_flush(&ci); 1328 /* dec_pending submits any data associated with flush */ 1329 } else { 1330 ci.bio = bio; 1331 ci.sector_count = bio_sectors(bio); 1332 while (ci.sector_count && !error) 1333 error = __clone_and_map(&ci); 1334 } 1335 1336 /* drop the extra reference count */ 1337 dec_pending(ci.io, error); 1338 dm_table_put(ci.map); 1339 } 1340 /*----------------------------------------------------------------- 1341 * CRUD END 1342 *---------------------------------------------------------------*/ 1343 1344 static int dm_merge_bvec(struct request_queue *q, 1345 struct bvec_merge_data *bvm, 1346 struct bio_vec *biovec) 1347 { 1348 struct mapped_device *md = q->queuedata; 1349 struct dm_table *map = dm_get_live_table(md); 1350 struct dm_target *ti; 1351 sector_t max_sectors; 1352 int max_size = 0; 1353 1354 if (unlikely(!map)) 1355 goto out; 1356 1357 ti = dm_table_find_target(map, bvm->bi_sector); 1358 if (!dm_target_is_valid(ti)) 1359 goto out_table; 1360 1361 /* 1362 * Find maximum amount of I/O that won't need splitting 1363 */ 1364 max_sectors = min(max_io_len(bvm->bi_sector, ti), 1365 (sector_t) BIO_MAX_SECTORS); 1366 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1367 if (max_size < 0) 1368 max_size = 0; 1369 1370 /* 1371 * merge_bvec_fn() returns number of bytes 1372 * it can accept at this offset 1373 * max is precomputed maximal io size 1374 */ 1375 if (max_size && ti->type->merge) 1376 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1377 /* 1378 * If the target doesn't support merge method and some of the devices 1379 * provided their merge_bvec method (we know this by looking at 1380 * queue_max_hw_sectors), then we can't allow bios with multiple vector 1381 * entries. So always set max_size to 0, and the code below allows 1382 * just one page. 1383 */ 1384 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1385 1386 max_size = 0; 1387 1388 out_table: 1389 dm_table_put(map); 1390 1391 out: 1392 /* 1393 * Always allow an entire first page 1394 */ 1395 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1396 max_size = biovec->bv_len; 1397 1398 return max_size; 1399 } 1400 1401 /* 1402 * The request function that just remaps the bio built up by 1403 * dm_merge_bvec. 
1404 */ 1405 static void _dm_request(struct request_queue *q, struct bio *bio) 1406 { 1407 int rw = bio_data_dir(bio); 1408 struct mapped_device *md = q->queuedata; 1409 int cpu; 1410 1411 down_read(&md->io_lock); 1412 1413 cpu = part_stat_lock(); 1414 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); 1415 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); 1416 part_stat_unlock(); 1417 1418 /* if we're suspended, we have to queue this io for later */ 1419 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1420 up_read(&md->io_lock); 1421 1422 if (bio_rw(bio) != READA) 1423 queue_io(md, bio); 1424 else 1425 bio_io_error(bio); 1426 return; 1427 } 1428 1429 __split_and_process_bio(md, bio); 1430 up_read(&md->io_lock); 1431 return; 1432 } 1433 1434 static int dm_request_based(struct mapped_device *md) 1435 { 1436 return blk_queue_stackable(md->queue); 1437 } 1438 1439 static void dm_request(struct request_queue *q, struct bio *bio) 1440 { 1441 struct mapped_device *md = q->queuedata; 1442 1443 if (dm_request_based(md)) 1444 blk_queue_bio(q, bio); 1445 else 1446 _dm_request(q, bio); 1447 } 1448 1449 void dm_dispatch_request(struct request *rq) 1450 { 1451 int r; 1452 1453 if (blk_queue_io_stat(rq->q)) 1454 rq->cmd_flags |= REQ_IO_STAT; 1455 1456 rq->start_time = jiffies; 1457 r = blk_insert_cloned_request(rq->q, rq); 1458 if (r) 1459 dm_complete_request(rq, r); 1460 } 1461 EXPORT_SYMBOL_GPL(dm_dispatch_request); 1462 1463 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1464 void *data) 1465 { 1466 struct dm_rq_target_io *tio = data; 1467 struct dm_rq_clone_bio_info *info = 1468 container_of(bio, struct dm_rq_clone_bio_info, clone); 1469 1470 info->orig = bio_orig; 1471 info->tio = tio; 1472 bio->bi_end_io = end_clone_bio; 1473 bio->bi_private = info; 1474 1475 return 0; 1476 } 1477 1478 static int setup_clone(struct request *clone, struct request *rq, 1479 struct dm_rq_target_io *tio) 1480 { 1481 int r; 1482 1483 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1484 dm_rq_bio_constructor, tio); 1485 if (r) 1486 return r; 1487 1488 clone->cmd = rq->cmd; 1489 clone->cmd_len = rq->cmd_len; 1490 clone->sense = rq->sense; 1491 clone->buffer = rq->buffer; 1492 clone->end_io = end_clone_request; 1493 clone->end_io_data = tio; 1494 1495 return 0; 1496 } 1497 1498 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1499 gfp_t gfp_mask) 1500 { 1501 struct request *clone; 1502 struct dm_rq_target_io *tio; 1503 1504 tio = alloc_rq_tio(md, gfp_mask); 1505 if (!tio) 1506 return NULL; 1507 1508 tio->md = md; 1509 tio->ti = NULL; 1510 tio->orig = rq; 1511 tio->error = 0; 1512 memset(&tio->info, 0, sizeof(tio->info)); 1513 1514 clone = &tio->clone; 1515 if (setup_clone(clone, rq, tio)) { 1516 /* -ENOMEM */ 1517 free_rq_tio(tio); 1518 return NULL; 1519 } 1520 1521 return clone; 1522 } 1523 1524 /* 1525 * Called with the queue lock held. 
1526 */ 1527 static int dm_prep_fn(struct request_queue *q, struct request *rq) 1528 { 1529 struct mapped_device *md = q->queuedata; 1530 struct request *clone; 1531 1532 if (unlikely(rq->special)) { 1533 DMWARN("Already has something in rq->special."); 1534 return BLKPREP_KILL; 1535 } 1536 1537 clone = clone_rq(rq, md, GFP_ATOMIC); 1538 if (!clone) 1539 return BLKPREP_DEFER; 1540 1541 rq->special = clone; 1542 rq->cmd_flags |= REQ_DONTPREP; 1543 1544 return BLKPREP_OK; 1545 } 1546 1547 /* 1548 * Returns: 1549 * 0 : the request has been processed (not requeued) 1550 * !0 : the request has been requeued 1551 */ 1552 static int map_request(struct dm_target *ti, struct request *clone, 1553 struct mapped_device *md) 1554 { 1555 int r, requeued = 0; 1556 struct dm_rq_target_io *tio = clone->end_io_data; 1557 1558 /* 1559 * Hold the md reference here for the in-flight I/O. 1560 * We can't rely on the reference count by device opener, 1561 * because the device may be closed during the request completion 1562 * when all bios are completed. 1563 * See the comment in rq_completed() too. 1564 */ 1565 dm_get(md); 1566 1567 tio->ti = ti; 1568 r = ti->type->map_rq(ti, clone, &tio->info); 1569 switch (r) { 1570 case DM_MAPIO_SUBMITTED: 1571 /* The target has taken the I/O to submit by itself later */ 1572 break; 1573 case DM_MAPIO_REMAPPED: 1574 /* The target has remapped the I/O so dispatch it */ 1575 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1576 blk_rq_pos(tio->orig)); 1577 dm_dispatch_request(clone); 1578 break; 1579 case DM_MAPIO_REQUEUE: 1580 /* The target wants to requeue the I/O */ 1581 dm_requeue_unmapped_request(clone); 1582 requeued = 1; 1583 break; 1584 default: 1585 if (r > 0) { 1586 DMWARN("unimplemented target map return value: %d", r); 1587 BUG(); 1588 } 1589 1590 /* The target wants to complete the I/O */ 1591 dm_kill_unmapped_request(clone, r); 1592 break; 1593 } 1594 1595 return requeued; 1596 } 1597 1598 /* 1599 * q->request_fn for request-based dm. 1600 * Called with the queue lock held. 1601 */ 1602 static void dm_request_fn(struct request_queue *q) 1603 { 1604 struct mapped_device *md = q->queuedata; 1605 struct dm_table *map = dm_get_live_table(md); 1606 struct dm_target *ti; 1607 struct request *rq, *clone; 1608 sector_t pos; 1609 1610 /* 1611 * For suspend, check blk_queue_stopped() and increment 1612 * ->pending within a single queue_lock not to increment the 1613 * number of in-flight I/Os after the queue is stopped in 1614 * dm_suspend(). 
1615 */ 1616 while (!blk_queue_stopped(q)) { 1617 rq = blk_peek_request(q); 1618 if (!rq) 1619 goto delay_and_out; 1620 1621 /* always use block 0 to find the target for flushes for now */ 1622 pos = 0; 1623 if (!(rq->cmd_flags & REQ_FLUSH)) 1624 pos = blk_rq_pos(rq); 1625 1626 ti = dm_table_find_target(map, pos); 1627 BUG_ON(!dm_target_is_valid(ti)); 1628 1629 if (ti->type->busy && ti->type->busy(ti)) 1630 goto delay_and_out; 1631 1632 blk_start_request(rq); 1633 clone = rq->special; 1634 atomic_inc(&md->pending[rq_data_dir(clone)]); 1635 1636 spin_unlock(q->queue_lock); 1637 if (map_request(ti, clone, md)) 1638 goto requeued; 1639 1640 BUG_ON(!irqs_disabled()); 1641 spin_lock(q->queue_lock); 1642 } 1643 1644 goto out; 1645 1646 requeued: 1647 BUG_ON(!irqs_disabled()); 1648 spin_lock(q->queue_lock); 1649 1650 delay_and_out: 1651 blk_delay_queue(q, HZ / 10); 1652 out: 1653 dm_table_put(map); 1654 1655 return; 1656 } 1657 1658 int dm_underlying_device_busy(struct request_queue *q) 1659 { 1660 return blk_lld_busy(q); 1661 } 1662 EXPORT_SYMBOL_GPL(dm_underlying_device_busy); 1663 1664 static int dm_lld_busy(struct request_queue *q) 1665 { 1666 int r; 1667 struct mapped_device *md = q->queuedata; 1668 struct dm_table *map = dm_get_live_table(md); 1669 1670 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1671 r = 1; 1672 else 1673 r = dm_table_any_busy_target(map); 1674 1675 dm_table_put(map); 1676 1677 return r; 1678 } 1679 1680 static int dm_any_congested(void *congested_data, int bdi_bits) 1681 { 1682 int r = bdi_bits; 1683 struct mapped_device *md = congested_data; 1684 struct dm_table *map; 1685 1686 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1687 map = dm_get_live_table(md); 1688 if (map) { 1689 /* 1690 * Request-based dm cares about only own queue for 1691 * the query about congestion status of request_queue 1692 */ 1693 if (dm_request_based(md)) 1694 r = md->queue->backing_dev_info.state & 1695 bdi_bits; 1696 else 1697 r = dm_table_any_congested(map, bdi_bits); 1698 1699 dm_table_put(map); 1700 } 1701 } 1702 1703 return r; 1704 } 1705 1706 /*----------------------------------------------------------------- 1707 * An IDR is used to keep track of allocated minor numbers. 1708 *---------------------------------------------------------------*/ 1709 static void free_minor(int minor) 1710 { 1711 spin_lock(&_minor_lock); 1712 idr_remove(&_minor_idr, minor); 1713 spin_unlock(&_minor_lock); 1714 } 1715 1716 /* 1717 * See if the device with a specific minor # is free. 
1718 */ 1719 static int specific_minor(int minor) 1720 { 1721 int r, m; 1722 1723 if (minor >= (1 << MINORBITS)) 1724 return -EINVAL; 1725 1726 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1727 if (!r) 1728 return -ENOMEM; 1729 1730 spin_lock(&_minor_lock); 1731 1732 if (idr_find(&_minor_idr, minor)) { 1733 r = -EBUSY; 1734 goto out; 1735 } 1736 1737 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); 1738 if (r) 1739 goto out; 1740 1741 if (m != minor) { 1742 idr_remove(&_minor_idr, m); 1743 r = -EBUSY; 1744 goto out; 1745 } 1746 1747 out: 1748 spin_unlock(&_minor_lock); 1749 return r; 1750 } 1751 1752 static int next_free_minor(int *minor) 1753 { 1754 int r, m; 1755 1756 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1757 if (!r) 1758 return -ENOMEM; 1759 1760 spin_lock(&_minor_lock); 1761 1762 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); 1763 if (r) 1764 goto out; 1765 1766 if (m >= (1 << MINORBITS)) { 1767 idr_remove(&_minor_idr, m); 1768 r = -ENOSPC; 1769 goto out; 1770 } 1771 1772 *minor = m; 1773 1774 out: 1775 spin_unlock(&_minor_lock); 1776 return r; 1777 } 1778 1779 static const struct block_device_operations dm_blk_dops; 1780 1781 static void dm_wq_work(struct work_struct *work); 1782 1783 static void dm_init_md_queue(struct mapped_device *md) 1784 { 1785 /* 1786 * Request-based dm devices cannot be stacked on top of bio-based dm 1787 * devices. The type of this dm device has not been decided yet. 1788 * The type is decided at the first table loading time. 1789 * To prevent problematic device stacking, clear the queue flag 1790 * for request stacking support until then. 1791 * 1792 * This queue is new, so no concurrency on the queue_flags. 1793 */ 1794 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 1795 1796 md->queue->queuedata = md; 1797 md->queue->backing_dev_info.congested_fn = dm_any_congested; 1798 md->queue->backing_dev_info.congested_data = md; 1799 blk_queue_make_request(md->queue, dm_request); 1800 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1801 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1802 } 1803 1804 /* 1805 * Allocate and initialise a blank device with a given minor. 
1806 */ 1807 static struct mapped_device *alloc_dev(int minor) 1808 { 1809 int r; 1810 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 1811 void *old_md; 1812 1813 if (!md) { 1814 DMWARN("unable to allocate device, out of memory."); 1815 return NULL; 1816 } 1817 1818 if (!try_module_get(THIS_MODULE)) 1819 goto bad_module_get; 1820 1821 /* get a minor number for the dev */ 1822 if (minor == DM_ANY_MINOR) 1823 r = next_free_minor(&minor); 1824 else 1825 r = specific_minor(minor); 1826 if (r < 0) 1827 goto bad_minor; 1828 1829 md->type = DM_TYPE_NONE; 1830 init_rwsem(&md->io_lock); 1831 mutex_init(&md->suspend_lock); 1832 mutex_init(&md->type_lock); 1833 spin_lock_init(&md->deferred_lock); 1834 rwlock_init(&md->map_lock); 1835 atomic_set(&md->holders, 1); 1836 atomic_set(&md->open_count, 0); 1837 atomic_set(&md->event_nr, 0); 1838 atomic_set(&md->uevent_seq, 0); 1839 INIT_LIST_HEAD(&md->uevent_list); 1840 spin_lock_init(&md->uevent_lock); 1841 1842 md->queue = blk_alloc_queue(GFP_KERNEL); 1843 if (!md->queue) 1844 goto bad_queue; 1845 1846 dm_init_md_queue(md); 1847 1848 md->disk = alloc_disk(1); 1849 if (!md->disk) 1850 goto bad_disk; 1851 1852 atomic_set(&md->pending[0], 0); 1853 atomic_set(&md->pending[1], 0); 1854 init_waitqueue_head(&md->wait); 1855 INIT_WORK(&md->work, dm_wq_work); 1856 init_waitqueue_head(&md->eventq); 1857 1858 md->disk->major = _major; 1859 md->disk->first_minor = minor; 1860 md->disk->fops = &dm_blk_dops; 1861 md->disk->queue = md->queue; 1862 md->disk->private_data = md; 1863 sprintf(md->disk->disk_name, "dm-%d", minor); 1864 add_disk(md->disk); 1865 format_dev_t(md->name, MKDEV(_major, minor)); 1866 1867 md->wq = alloc_workqueue("kdmflush", 1868 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); 1869 if (!md->wq) 1870 goto bad_thread; 1871 1872 md->bdev = bdget_disk(md->disk, 0); 1873 if (!md->bdev) 1874 goto bad_bdev; 1875 1876 bio_init(&md->flush_bio); 1877 md->flush_bio.bi_bdev = md->bdev; 1878 md->flush_bio.bi_rw = WRITE_FLUSH; 1879 1880 /* Populate the mapping, nobody knows we exist yet */ 1881 spin_lock(&_minor_lock); 1882 old_md = idr_replace(&_minor_idr, md, minor); 1883 spin_unlock(&_minor_lock); 1884 1885 BUG_ON(old_md != MINOR_ALLOCED); 1886 1887 return md; 1888 1889 bad_bdev: 1890 destroy_workqueue(md->wq); 1891 bad_thread: 1892 del_gendisk(md->disk); 1893 put_disk(md->disk); 1894 bad_disk: 1895 blk_cleanup_queue(md->queue); 1896 bad_queue: 1897 free_minor(minor); 1898 bad_minor: 1899 module_put(THIS_MODULE); 1900 bad_module_get: 1901 kfree(md); 1902 return NULL; 1903 } 1904 1905 static void unlock_fs(struct mapped_device *md); 1906 1907 static void free_dev(struct mapped_device *md) 1908 { 1909 int minor = MINOR(disk_devt(md->disk)); 1910 1911 unlock_fs(md); 1912 bdput(md->bdev); 1913 destroy_workqueue(md->wq); 1914 if (md->tio_pool) 1915 mempool_destroy(md->tio_pool); 1916 if (md->io_pool) 1917 mempool_destroy(md->io_pool); 1918 if (md->bs) 1919 bioset_free(md->bs); 1920 blk_integrity_unregister(md->disk); 1921 del_gendisk(md->disk); 1922 free_minor(minor); 1923 1924 spin_lock(&_minor_lock); 1925 md->disk->private_data = NULL; 1926 spin_unlock(&_minor_lock); 1927 1928 put_disk(md->disk); 1929 blk_cleanup_queue(md->queue); 1930 module_put(THIS_MODULE); 1931 kfree(md); 1932 } 1933 1934 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 1935 { 1936 struct dm_md_mempools *p; 1937 1938 if (md->io_pool && md->tio_pool && md->bs) 1939 /* the md already has necessary mempools */ 1940 goto out; 1941 1942 p = 
dm_table_get_md_mempools(t); 1943 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); 1944 1945 md->io_pool = p->io_pool; 1946 p->io_pool = NULL; 1947 md->tio_pool = p->tio_pool; 1948 p->tio_pool = NULL; 1949 md->bs = p->bs; 1950 p->bs = NULL; 1951 1952 out: 1953 /* mempool bind completed, now no need any mempools in the table */ 1954 dm_table_free_md_mempools(t); 1955 } 1956 1957 /* 1958 * Bind a table to the device. 1959 */ 1960 static void event_callback(void *context) 1961 { 1962 unsigned long flags; 1963 LIST_HEAD(uevents); 1964 struct mapped_device *md = (struct mapped_device *) context; 1965 1966 spin_lock_irqsave(&md->uevent_lock, flags); 1967 list_splice_init(&md->uevent_list, &uevents); 1968 spin_unlock_irqrestore(&md->uevent_lock, flags); 1969 1970 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 1971 1972 atomic_inc(&md->event_nr); 1973 wake_up(&md->eventq); 1974 } 1975 1976 /* 1977 * Protected by md->suspend_lock obtained by dm_swap_table(). 1978 */ 1979 static void __set_size(struct mapped_device *md, sector_t size) 1980 { 1981 set_capacity(md->disk, size); 1982 1983 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 1984 } 1985 1986 /* 1987 * Return 1 if the queue has a compulsory merge_bvec_fn function. 1988 * 1989 * If this function returns 0, then the device is either a non-dm 1990 * device without a merge_bvec_fn, or it is a dm device that is 1991 * able to split any bios it receives that are too big. 1992 */ 1993 int dm_queue_merge_is_compulsory(struct request_queue *q) 1994 { 1995 struct mapped_device *dev_md; 1996 1997 if (!q->merge_bvec_fn) 1998 return 0; 1999 2000 if (q->make_request_fn == dm_request) { 2001 dev_md = q->queuedata; 2002 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) 2003 return 0; 2004 } 2005 2006 return 1; 2007 } 2008 2009 static int dm_device_merge_is_compulsory(struct dm_target *ti, 2010 struct dm_dev *dev, sector_t start, 2011 sector_t len, void *data) 2012 { 2013 struct block_device *bdev = dev->bdev; 2014 struct request_queue *q = bdev_get_queue(bdev); 2015 2016 return dm_queue_merge_is_compulsory(q); 2017 } 2018 2019 /* 2020 * Return 1 if it is acceptable to ignore merge_bvec_fn based 2021 * on the properties of the underlying devices. 2022 */ 2023 static int dm_table_merge_is_optional(struct dm_table *table) 2024 { 2025 unsigned i = 0; 2026 struct dm_target *ti; 2027 2028 while (i < dm_table_get_num_targets(table)) { 2029 ti = dm_table_get_target(table, i++); 2030 2031 if (ti->type->iterate_devices && 2032 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) 2033 return 0; 2034 } 2035 2036 return 1; 2037 } 2038 2039 /* 2040 * Returns old map, which caller must destroy. 2041 */ 2042 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2043 struct queue_limits *limits) 2044 { 2045 struct dm_table *old_map; 2046 struct request_queue *q = md->queue; 2047 sector_t size; 2048 unsigned long flags; 2049 int merge_is_optional; 2050 2051 size = dm_table_get_size(t); 2052 2053 /* 2054 * Wipe any geometry if the size of the table changed. 2055 */ 2056 if (size != get_capacity(md->disk)) 2057 memset(&md->geometry, 0, sizeof(md->geometry)); 2058 2059 __set_size(md, size); 2060 2061 dm_table_event_callback(t, event_callback, md); 2062 2063 /* 2064 * The queue hasn't been stopped yet, if the old table type wasn't 2065 * for request-based during suspension. So stop it to prevent 2066 * I/O mapping before resume. 
2067 * This must be done before setting the queue restrictions, 2068 * because request-based dm may be run just after the setting. 2069 */ 2070 if (dm_table_request_based(t) && !blk_queue_stopped(q)) 2071 stop_queue(q); 2072 2073 __bind_mempools(md, t); 2074 2075 merge_is_optional = dm_table_merge_is_optional(t); 2076 2077 write_lock_irqsave(&md->map_lock, flags); 2078 old_map = md->map; 2079 md->map = t; 2080 md->immutable_target_type = dm_table_get_immutable_target_type(t); 2081 2082 dm_table_set_restrictions(t, q, limits); 2083 if (merge_is_optional) 2084 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2085 else 2086 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2087 write_unlock_irqrestore(&md->map_lock, flags); 2088 2089 return old_map; 2090 } 2091 2092 /* 2093 * Returns unbound table for the caller to free. 2094 */ 2095 static struct dm_table *__unbind(struct mapped_device *md) 2096 { 2097 struct dm_table *map = md->map; 2098 unsigned long flags; 2099 2100 if (!map) 2101 return NULL; 2102 2103 dm_table_event_callback(map, NULL, NULL); 2104 write_lock_irqsave(&md->map_lock, flags); 2105 md->map = NULL; 2106 write_unlock_irqrestore(&md->map_lock, flags); 2107 2108 return map; 2109 } 2110 2111 /* 2112 * Constructor for a new device. 2113 */ 2114 int dm_create(int minor, struct mapped_device **result) 2115 { 2116 struct mapped_device *md; 2117 2118 md = alloc_dev(minor); 2119 if (!md) 2120 return -ENXIO; 2121 2122 dm_sysfs_init(md); 2123 2124 *result = md; 2125 return 0; 2126 } 2127 2128 /* 2129 * Functions to manage md->type. 2130 * All are required to hold md->type_lock. 2131 */ 2132 void dm_lock_md_type(struct mapped_device *md) 2133 { 2134 mutex_lock(&md->type_lock); 2135 } 2136 2137 void dm_unlock_md_type(struct mapped_device *md) 2138 { 2139 mutex_unlock(&md->type_lock); 2140 } 2141 2142 void dm_set_md_type(struct mapped_device *md, unsigned type) 2143 { 2144 md->type = type; 2145 } 2146 2147 unsigned dm_get_md_type(struct mapped_device *md) 2148 { 2149 return md->type; 2150 } 2151 2152 struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2153 { 2154 return md->immutable_target_type; 2155 } 2156 2157 /* 2158 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 
2159 */ 2160 static int dm_init_request_based_queue(struct mapped_device *md) 2161 { 2162 struct request_queue *q = NULL; 2163 2164 if (md->queue->elevator) 2165 return 1; 2166 2167 /* Fully initialize the queue */ 2168 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); 2169 if (!q) 2170 return 0; 2171 2172 md->queue = q; 2173 dm_init_md_queue(md); 2174 blk_queue_softirq_done(md->queue, dm_softirq_done); 2175 blk_queue_prep_rq(md->queue, dm_prep_fn); 2176 blk_queue_lld_busy(md->queue, dm_lld_busy); 2177 2178 elv_register_queue(md->queue); 2179 2180 return 1; 2181 } 2182 2183 /* 2184 * Setup the DM device's queue based on md's type 2185 */ 2186 int dm_setup_md_queue(struct mapped_device *md) 2187 { 2188 if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && 2189 !dm_init_request_based_queue(md)) { 2190 DMWARN("Cannot initialize queue for request-based mapped device"); 2191 return -EINVAL; 2192 } 2193 2194 return 0; 2195 } 2196 2197 static struct mapped_device *dm_find_md(dev_t dev) 2198 { 2199 struct mapped_device *md; 2200 unsigned minor = MINOR(dev); 2201 2202 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2203 return NULL; 2204 2205 spin_lock(&_minor_lock); 2206 2207 md = idr_find(&_minor_idr, minor); 2208 if (md && (md == MINOR_ALLOCED || 2209 (MINOR(disk_devt(dm_disk(md))) != minor) || 2210 dm_deleting_md(md) || 2211 test_bit(DMF_FREEING, &md->flags))) { 2212 md = NULL; 2213 goto out; 2214 } 2215 2216 out: 2217 spin_unlock(&_minor_lock); 2218 2219 return md; 2220 } 2221 2222 struct mapped_device *dm_get_md(dev_t dev) 2223 { 2224 struct mapped_device *md = dm_find_md(dev); 2225 2226 if (md) 2227 dm_get(md); 2228 2229 return md; 2230 } 2231 EXPORT_SYMBOL_GPL(dm_get_md); 2232 2233 void *dm_get_mdptr(struct mapped_device *md) 2234 { 2235 return md->interface_ptr; 2236 } 2237 2238 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2239 { 2240 md->interface_ptr = ptr; 2241 } 2242 2243 void dm_get(struct mapped_device *md) 2244 { 2245 atomic_inc(&md->holders); 2246 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2247 } 2248 2249 const char *dm_device_name(struct mapped_device *md) 2250 { 2251 return md->name; 2252 } 2253 EXPORT_SYMBOL_GPL(dm_device_name); 2254 2255 static void __dm_destroy(struct mapped_device *md, bool wait) 2256 { 2257 struct dm_table *map; 2258 2259 might_sleep(); 2260 2261 spin_lock(&_minor_lock); 2262 map = dm_get_live_table(md); 2263 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2264 set_bit(DMF_FREEING, &md->flags); 2265 spin_unlock(&_minor_lock); 2266 2267 if (!dm_suspended_md(md)) { 2268 dm_table_presuspend_targets(map); 2269 dm_table_postsuspend_targets(map); 2270 } 2271 2272 /* 2273 * Rare, but there may be I/O requests still going to complete, 2274 * for example. Wait for all references to disappear. 2275 * No one should increment the reference count of the mapped_device, 2276 * after the mapped_device state becomes DMF_FREEING. 2277 */ 2278 if (wait) 2279 while (atomic_read(&md->holders)) 2280 msleep(1); 2281 else if (atomic_read(&md->holders)) 2282 DMWARN("%s: Forcibly removing mapped_device still in use! 
(%d users)", 2283 dm_device_name(md), atomic_read(&md->holders)); 2284 2285 dm_sysfs_exit(md); 2286 dm_table_put(map); 2287 dm_table_destroy(__unbind(md)); 2288 free_dev(md); 2289 } 2290 2291 void dm_destroy(struct mapped_device *md) 2292 { 2293 __dm_destroy(md, true); 2294 } 2295 2296 void dm_destroy_immediate(struct mapped_device *md) 2297 { 2298 __dm_destroy(md, false); 2299 } 2300 2301 void dm_put(struct mapped_device *md) 2302 { 2303 atomic_dec(&md->holders); 2304 } 2305 EXPORT_SYMBOL_GPL(dm_put); 2306 2307 static int dm_wait_for_completion(struct mapped_device *md, int interruptible) 2308 { 2309 int r = 0; 2310 DECLARE_WAITQUEUE(wait, current); 2311 2312 add_wait_queue(&md->wait, &wait); 2313 2314 while (1) { 2315 set_current_state(interruptible); 2316 2317 if (!md_in_flight(md)) 2318 break; 2319 2320 if (interruptible == TASK_INTERRUPTIBLE && 2321 signal_pending(current)) { 2322 r = -EINTR; 2323 break; 2324 } 2325 2326 io_schedule(); 2327 } 2328 set_current_state(TASK_RUNNING); 2329 2330 remove_wait_queue(&md->wait, &wait); 2331 2332 return r; 2333 } 2334 2335 /* 2336 * Process the deferred bios 2337 */ 2338 static void dm_wq_work(struct work_struct *work) 2339 { 2340 struct mapped_device *md = container_of(work, struct mapped_device, 2341 work); 2342 struct bio *c; 2343 2344 down_read(&md->io_lock); 2345 2346 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2347 spin_lock_irq(&md->deferred_lock); 2348 c = bio_list_pop(&md->deferred); 2349 spin_unlock_irq(&md->deferred_lock); 2350 2351 if (!c) 2352 break; 2353 2354 up_read(&md->io_lock); 2355 2356 if (dm_request_based(md)) 2357 generic_make_request(c); 2358 else 2359 __split_and_process_bio(md, c); 2360 2361 down_read(&md->io_lock); 2362 } 2363 2364 up_read(&md->io_lock); 2365 } 2366 2367 static void dm_queue_flush(struct mapped_device *md) 2368 { 2369 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2370 smp_mb__after_clear_bit(); 2371 queue_work(md->wq, &md->work); 2372 } 2373 2374 /* 2375 * Swap in a new table, returning the old one for the caller to destroy. 2376 */ 2377 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2378 { 2379 struct dm_table *map = ERR_PTR(-EINVAL); 2380 struct queue_limits limits; 2381 int r; 2382 2383 mutex_lock(&md->suspend_lock); 2384 2385 /* device must be suspended */ 2386 if (!dm_suspended_md(md)) 2387 goto out; 2388 2389 r = dm_calculate_queue_limits(table, &limits); 2390 if (r) { 2391 map = ERR_PTR(r); 2392 goto out; 2393 } 2394 2395 map = __bind(md, table, &limits); 2396 2397 out: 2398 mutex_unlock(&md->suspend_lock); 2399 return map; 2400 } 2401 2402 /* 2403 * Functions to lock and unlock any filesystem running on the 2404 * device. 2405 */ 2406 static int lock_fs(struct mapped_device *md) 2407 { 2408 int r; 2409 2410 WARN_ON(md->frozen_sb); 2411 2412 md->frozen_sb = freeze_bdev(md->bdev); 2413 if (IS_ERR(md->frozen_sb)) { 2414 r = PTR_ERR(md->frozen_sb); 2415 md->frozen_sb = NULL; 2416 return r; 2417 } 2418 2419 set_bit(DMF_FROZEN, &md->flags); 2420 2421 return 0; 2422 } 2423 2424 static void unlock_fs(struct mapped_device *md) 2425 { 2426 if (!test_bit(DMF_FROZEN, &md->flags)) 2427 return; 2428 2429 thaw_bdev(md->bdev, md->frozen_sb); 2430 md->frozen_sb = NULL; 2431 clear_bit(DMF_FROZEN, &md->flags); 2432 } 2433 2434 /* 2435 * We need to be able to change a mapping table under a mounted 2436 * filesystem. For example we might want to move some data in 2437 * the background. 

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example, we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in-flight
 * bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	mutex_lock(&md->suspend_lock);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	map = dm_get_live_table(md);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers, i.e. no one may be executing
	 * __split_and_process_bio.  This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock.  To prevent any process from reentering
	 * __split_and_process_bio from dm_request and quiesce the thread
	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
	 * flush_workqueue(md->wq).
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	up_write(&md->io_lock);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md))
		stop_queue(md->queue);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

	down_write(&md->io_lock);
	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	up_write(&md->io_lock);

	/* were we interrupted ? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			start_queue(md->queue);

		unlock_fs(md);
		goto out; /* pushback list is already flushed, so skip flush */
	}

	/*
	 * If dm_wait_for_completion returned 0, the device is completely
	 * quiescent now.  There is no request-processing activity.  All new
	 * requests are being added to md->deferred list.
	 */

	set_bit(DMF_SUSPENDED, &md->flags);

	dm_table_postsuspend_targets(map);

out:
	dm_table_put(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}

int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

	mutex_lock(&md->suspend_lock);
	if (!dm_suspended_md(md))
		goto out;

	map = dm_get_live_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		start_queue(md->queue);

	unlock_fs(md);

	clear_bit(DMF_SUSPENDED, &md->flags);

	r = 0;
out:
	dm_table_put(map);
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}
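
/*
 * Illustrative sketch (editor's addition, not part of this driver): how the
 * event counter helpers above fit together.  A waiter samples the current
 * event number with dm_get_event_nr() and then sleeps in dm_wait_event()
 * until the counter moves past that sample (or a signal arrives).  The
 * function name example_wait_for_next_event() is hypothetical.
 */
#if 0	/* example only, not built */
static int example_wait_for_next_event(struct mapped_device *md)
{
	uint32_t event_nr;

	/* Snapshot the counter before waiting so no event is missed. */
	event_nr = dm_get_event_nr(md);

	/* Returns 0 once the counter differs, or -ERESTARTSYS on a signal. */
	return dm_wait_event(md, event_nr);
}
#endif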

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj;
}

/*
 * struct mapped_device should not be exported outside of dm.c
 * so use this check to verify that kobj is part of md structure
 */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj);
	if (&md->kobj != kobj)
		return NULL;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
{
	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
	unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;

	if (!pools)
		return NULL;

	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
	if (!pools->io_pool)
		goto free_pools_and_out;

	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
	if (!pools->tio_pool)
		goto free_io_pool_and_out;

	pools->bs = (type == DM_TYPE_BIO_BASED) ?
		    bioset_create(pool_size, 0) :
		    bioset_create(pool_size,
				  offsetof(struct dm_rq_clone_bio_info, clone));
	if (!pools->bs)
		goto free_tio_pool_and_out;

	if (integrity && bioset_integrity_create(pools->bs, pool_size))
		goto free_bioset_and_out;

	return pools;

free_bioset_and_out:
	bioset_free(pools->bs);

free_tio_pool_and_out:
	mempool_destroy(pools->tio_pool);

free_io_pool_and_out:
	mempool_destroy(pools->io_pool);

free_pools_and_out:
	kfree(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	if (pools->io_pool)
		mempool_destroy(pools->io_pool);

	if (pools->tio_pool)
		mempool_destroy(pools->tio_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
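
/*
 * Illustrative sketch (editor's addition, not part of this driver): how a
 * target might use the exported dm_noflush_suspending() helper above.  A
 * target that cannot complete an I/O while a noflush suspend is in progress
 * typically asks for the I/O to be requeued rather than failed.  The
 * function name example_target_error_disposition() and its return-code
 * convention are hypothetical; real targets make this decision in their
 * map/end_io paths.
 */
#if 0	/* example only, not built */
static int example_target_error_disposition(struct dm_target *ti, int error)
{
	/*
	 * During a noflush suspend, push the I/O back instead of
	 * surfacing the error; it will be retried after resume.
	 */
	if (error && dm_noflush_suspending(ti))
		return DM_ENDIO_REQUEUE;

	return error;
}
#endif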