1 /* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm.h" 9 #include "dm-uevent.h" 10 11 #include <linux/init.h> 12 #include <linux/module.h> 13 #include <linux/mutex.h> 14 #include <linux/moduleparam.h> 15 #include <linux/blkpg.h> 16 #include <linux/bio.h> 17 #include <linux/buffer_head.h> 18 #include <linux/mempool.h> 19 #include <linux/slab.h> 20 #include <linux/idr.h> 21 #include <linux/hdreg.h> 22 23 #include <trace/events/block.h> 24 25 #define DM_MSG_PREFIX "core" 26 27 /* 28 * Cookies are numeric values sent with CHANGE and REMOVE 29 * uevents while resuming, removing or renaming the device. 30 */ 31 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 32 #define DM_COOKIE_LENGTH 24 33 34 static const char *_name = DM_NAME; 35 36 static unsigned int major = 0; 37 static unsigned int _major = 0; 38 39 static DEFINE_SPINLOCK(_minor_lock); 40 /* 41 * For bio-based dm. 42 * One of these is allocated per bio. 43 */ 44 struct dm_io { 45 struct mapped_device *md; 46 int error; 47 atomic_t io_count; 48 struct bio *bio; 49 unsigned long start_time; 50 spinlock_t endio_lock; 51 }; 52 53 /* 54 * For bio-based dm. 55 * One of these is allocated per target within a bio. Hopefully 56 * this will be simplified out one day. 57 */ 58 struct dm_target_io { 59 struct dm_io *io; 60 struct dm_target *ti; 61 union map_info info; 62 }; 63 64 /* 65 * For request-based dm. 66 * One of these is allocated per request. 67 */ 68 struct dm_rq_target_io { 69 struct mapped_device *md; 70 struct dm_target *ti; 71 struct request *orig, clone; 72 int error; 73 union map_info info; 74 }; 75 76 /* 77 * For request-based dm. 78 * One of these is allocated per bio. 79 */ 80 struct dm_rq_clone_bio_info { 81 struct bio *orig; 82 struct dm_rq_target_io *tio; 83 }; 84 85 union map_info *dm_get_mapinfo(struct bio *bio) 86 { 87 if (bio && bio->bi_private) 88 return &((struct dm_target_io *)bio->bi_private)->info; 89 return NULL; 90 } 91 92 union map_info *dm_get_rq_mapinfo(struct request *rq) 93 { 94 if (rq && rq->end_io_data) 95 return &((struct dm_rq_target_io *)rq->end_io_data)->info; 96 return NULL; 97 } 98 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); 99 100 #define MINOR_ALLOCED ((void *)-1) 101 102 /* 103 * Bits for the md->flags field. 104 */ 105 #define DMF_BLOCK_IO_FOR_SUSPEND 0 106 #define DMF_SUSPENDED 1 107 #define DMF_FROZEN 2 108 #define DMF_FREEING 3 109 #define DMF_DELETING 4 110 #define DMF_NOFLUSH_SUSPENDING 5 111 #define DMF_QUEUE_IO_TO_THREAD 6 112 113 /* 114 * Work processed by per-device workqueue. 115 */ 116 struct mapped_device { 117 struct rw_semaphore io_lock; 118 struct mutex suspend_lock; 119 rwlock_t map_lock; 120 atomic_t holders; 121 atomic_t open_count; 122 123 unsigned long flags; 124 125 struct request_queue *queue; 126 struct gendisk *disk; 127 char name[16]; 128 129 void *interface_ptr; 130 131 /* 132 * A list of ios that arrived while we were suspended. 133 */ 134 atomic_t pending[2]; 135 wait_queue_head_t wait; 136 struct work_struct work; 137 struct bio_list deferred; 138 spinlock_t deferred_lock; 139 140 /* 141 * An error from the barrier request currently being processed. 142 */ 143 int barrier_error; 144 145 /* 146 * Protect barrier_error from concurrent endio processing 147 * in request-based dm. 
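	 *
	 * Illustrative use (a simplified sketch of store_barrier_error()
	 * further below; the real helper also applies precedence rules
	 * for -EOPNOTSUPP and DM_ENDIO_REQUEUE):
	 *
	 *	spin_lock_irqsave(&md->barrier_error_lock, flags);
	 *	if (!md->barrier_error)
	 *		md->barrier_error = error;
	 *	spin_unlock_irqrestore(&md->barrier_error_lock, flags);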
148 */ 149 spinlock_t barrier_error_lock; 150 151 /* 152 * Processing queue (flush/barriers) 153 */ 154 struct workqueue_struct *wq; 155 struct work_struct barrier_work; 156 157 /* A pointer to the currently processing pre/post flush request */ 158 struct request *flush_request; 159 160 /* 161 * The current mapping. 162 */ 163 struct dm_table *map; 164 165 /* 166 * io objects are allocated from here. 167 */ 168 mempool_t *io_pool; 169 mempool_t *tio_pool; 170 171 struct bio_set *bs; 172 173 /* 174 * Event handling. 175 */ 176 atomic_t event_nr; 177 wait_queue_head_t eventq; 178 atomic_t uevent_seq; 179 struct list_head uevent_list; 180 spinlock_t uevent_lock; /* Protect access to uevent_list */ 181 182 /* 183 * freeze/thaw support require holding onto a super block 184 */ 185 struct super_block *frozen_sb; 186 struct block_device *bdev; 187 188 /* forced geometry settings */ 189 struct hd_geometry geometry; 190 191 /* For saving the address of __make_request for request based dm */ 192 make_request_fn *saved_make_request_fn; 193 194 /* sysfs handle */ 195 struct kobject kobj; 196 197 /* zero-length barrier that will be cloned and submitted to targets */ 198 struct bio barrier_bio; 199 }; 200 201 /* 202 * For mempools pre-allocation at the table loading time. 203 */ 204 struct dm_md_mempools { 205 mempool_t *io_pool; 206 mempool_t *tio_pool; 207 struct bio_set *bs; 208 }; 209 210 #define MIN_IOS 256 211 static struct kmem_cache *_io_cache; 212 static struct kmem_cache *_tio_cache; 213 static struct kmem_cache *_rq_tio_cache; 214 static struct kmem_cache *_rq_bio_info_cache; 215 216 static int __init local_init(void) 217 { 218 int r = -ENOMEM; 219 220 /* allocate a slab for the dm_ios */ 221 _io_cache = KMEM_CACHE(dm_io, 0); 222 if (!_io_cache) 223 return r; 224 225 /* allocate a slab for the target ios */ 226 _tio_cache = KMEM_CACHE(dm_target_io, 0); 227 if (!_tio_cache) 228 goto out_free_io_cache; 229 230 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 231 if (!_rq_tio_cache) 232 goto out_free_tio_cache; 233 234 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); 235 if (!_rq_bio_info_cache) 236 goto out_free_rq_tio_cache; 237 238 r = dm_uevent_init(); 239 if (r) 240 goto out_free_rq_bio_info_cache; 241 242 _major = major; 243 r = register_blkdev(_major, _name); 244 if (r < 0) 245 goto out_uevent_exit; 246 247 if (!_major) 248 _major = r; 249 250 return 0; 251 252 out_uevent_exit: 253 dm_uevent_exit(); 254 out_free_rq_bio_info_cache: 255 kmem_cache_destroy(_rq_bio_info_cache); 256 out_free_rq_tio_cache: 257 kmem_cache_destroy(_rq_tio_cache); 258 out_free_tio_cache: 259 kmem_cache_destroy(_tio_cache); 260 out_free_io_cache: 261 kmem_cache_destroy(_io_cache); 262 263 return r; 264 } 265 266 static void local_exit(void) 267 { 268 kmem_cache_destroy(_rq_bio_info_cache); 269 kmem_cache_destroy(_rq_tio_cache); 270 kmem_cache_destroy(_tio_cache); 271 kmem_cache_destroy(_io_cache); 272 unregister_blkdev(_major, _name); 273 dm_uevent_exit(); 274 275 _major = 0; 276 277 DMINFO("cleaned up"); 278 } 279 280 static int (*_inits[])(void) __initdata = { 281 local_init, 282 dm_target_init, 283 dm_linear_init, 284 dm_stripe_init, 285 dm_io_init, 286 dm_kcopyd_init, 287 dm_interface_init, 288 }; 289 290 static void (*_exits[])(void) = { 291 local_exit, 292 dm_target_exit, 293 dm_linear_exit, 294 dm_stripe_exit, 295 dm_io_exit, 296 dm_kcopyd_exit, 297 dm_interface_exit, 298 }; 299 300 static int __init dm_init(void) 301 { 302 const int count = ARRAY_SIZE(_inits); 303 304 int r, i; 305 306 for (i = 0; i 
< count; i++) { 307 r = _inits[i](); 308 if (r) 309 goto bad; 310 } 311 312 return 0; 313 314 bad: 315 while (i--) 316 _exits[i](); 317 318 return r; 319 } 320 321 static void __exit dm_exit(void) 322 { 323 int i = ARRAY_SIZE(_exits); 324 325 while (i--) 326 _exits[i](); 327 } 328 329 /* 330 * Block device functions 331 */ 332 int dm_deleting_md(struct mapped_device *md) 333 { 334 return test_bit(DMF_DELETING, &md->flags); 335 } 336 337 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 338 { 339 struct mapped_device *md; 340 341 spin_lock(&_minor_lock); 342 343 md = bdev->bd_disk->private_data; 344 if (!md) 345 goto out; 346 347 if (test_bit(DMF_FREEING, &md->flags) || 348 dm_deleting_md(md)) { 349 md = NULL; 350 goto out; 351 } 352 353 dm_get(md); 354 atomic_inc(&md->open_count); 355 356 out: 357 spin_unlock(&_minor_lock); 358 359 return md ? 0 : -ENXIO; 360 } 361 362 static int dm_blk_close(struct gendisk *disk, fmode_t mode) 363 { 364 struct mapped_device *md = disk->private_data; 365 atomic_dec(&md->open_count); 366 dm_put(md); 367 return 0; 368 } 369 370 int dm_open_count(struct mapped_device *md) 371 { 372 return atomic_read(&md->open_count); 373 } 374 375 /* 376 * Guarantees nothing is using the device before it's deleted. 377 */ 378 int dm_lock_for_deletion(struct mapped_device *md) 379 { 380 int r = 0; 381 382 spin_lock(&_minor_lock); 383 384 if (dm_open_count(md)) 385 r = -EBUSY; 386 else 387 set_bit(DMF_DELETING, &md->flags); 388 389 spin_unlock(&_minor_lock); 390 391 return r; 392 } 393 394 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 395 { 396 struct mapped_device *md = bdev->bd_disk->private_data; 397 398 return dm_get_geometry(md, geo); 399 } 400 401 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 402 unsigned int cmd, unsigned long arg) 403 { 404 struct mapped_device *md = bdev->bd_disk->private_data; 405 struct dm_table *map = dm_get_live_table(md); 406 struct dm_target *tgt; 407 int r = -ENOTTY; 408 409 if (!map || !dm_table_get_size(map)) 410 goto out; 411 412 /* We only support devices that have a single target */ 413 if (dm_table_get_num_targets(map) != 1) 414 goto out; 415 416 tgt = dm_table_get_target(map, 0); 417 418 if (dm_suspended(md)) { 419 r = -EAGAIN; 420 goto out; 421 } 422 423 if (tgt->type->ioctl) 424 r = tgt->type->ioctl(tgt, cmd, arg); 425 426 out: 427 dm_table_put(map); 428 429 return r; 430 } 431 432 static struct dm_io *alloc_io(struct mapped_device *md) 433 { 434 return mempool_alloc(md->io_pool, GFP_NOIO); 435 } 436 437 static void free_io(struct mapped_device *md, struct dm_io *io) 438 { 439 mempool_free(io, md->io_pool); 440 } 441 442 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 443 { 444 mempool_free(tio, md->tio_pool); 445 } 446 447 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 448 gfp_t gfp_mask) 449 { 450 return mempool_alloc(md->tio_pool, gfp_mask); 451 } 452 453 static void free_rq_tio(struct dm_rq_target_io *tio) 454 { 455 mempool_free(tio, tio->md->tio_pool); 456 } 457 458 static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) 459 { 460 return mempool_alloc(md->io_pool, GFP_ATOMIC); 461 } 462 463 static void free_bio_info(struct dm_rq_clone_bio_info *info) 464 { 465 mempool_free(info, info->tio->md->io_pool); 466 } 467 468 static int md_in_flight(struct mapped_device *md) 469 { 470 return atomic_read(&md->pending[READ]) + 471 atomic_read(&md->pending[WRITE]); 472 } 473 474 static void start_io_acct(struct 
dm_io *io) 475 { 476 struct mapped_device *md = io->md; 477 int cpu; 478 int rw = bio_data_dir(io->bio); 479 480 io->start_time = jiffies; 481 482 cpu = part_stat_lock(); 483 part_round_stats(cpu, &dm_disk(md)->part0); 484 part_stat_unlock(); 485 dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]); 486 } 487 488 static void end_io_acct(struct dm_io *io) 489 { 490 struct mapped_device *md = io->md; 491 struct bio *bio = io->bio; 492 unsigned long duration = jiffies - io->start_time; 493 int pending, cpu; 494 int rw = bio_data_dir(bio); 495 496 cpu = part_stat_lock(); 497 part_round_stats(cpu, &dm_disk(md)->part0); 498 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); 499 part_stat_unlock(); 500 501 /* 502 * After this is decremented the bio must not be touched if it is 503 * a barrier. 504 */ 505 dm_disk(md)->part0.in_flight[rw] = pending = 506 atomic_dec_return(&md->pending[rw]); 507 pending += atomic_read(&md->pending[rw^0x1]); 508 509 /* nudge anyone waiting on suspend queue */ 510 if (!pending) 511 wake_up(&md->wait); 512 } 513 514 /* 515 * Add the bio to the list of deferred io. 516 */ 517 static void queue_io(struct mapped_device *md, struct bio *bio) 518 { 519 down_write(&md->io_lock); 520 521 spin_lock_irq(&md->deferred_lock); 522 bio_list_add(&md->deferred, bio); 523 spin_unlock_irq(&md->deferred_lock); 524 525 if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) 526 queue_work(md->wq, &md->work); 527 528 up_write(&md->io_lock); 529 } 530 531 /* 532 * Everyone (including functions in this file), should use this 533 * function to access the md->map field, and make sure they call 534 * dm_table_put() when finished. 535 */ 536 struct dm_table *dm_get_live_table(struct mapped_device *md) 537 { 538 struct dm_table *t; 539 unsigned long flags; 540 541 read_lock_irqsave(&md->map_lock, flags); 542 t = md->map; 543 if (t) 544 dm_table_get(t); 545 read_unlock_irqrestore(&md->map_lock, flags); 546 547 return t; 548 } 549 550 /* 551 * Get the geometry associated with a dm device 552 */ 553 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 554 { 555 *geo = md->geometry; 556 557 return 0; 558 } 559 560 /* 561 * Set the geometry of a device. 562 */ 563 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 564 { 565 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 566 567 if (geo->start > sz) { 568 DMWARN("Start sector is beyond the geometry limits."); 569 return -EINVAL; 570 } 571 572 md->geometry = *geo; 573 574 return 0; 575 } 576 577 /*----------------------------------------------------------------- 578 * CRUD START: 579 * A more elegant soln is in the works that uses the queue 580 * merge fn, unfortunately there are a couple of changes to 581 * the block layer that I want to make for this. So in the 582 * interests of getting something for people to use I give 583 * you this clearly demarcated crap. 584 *---------------------------------------------------------------*/ 585 586 static int __noflush_suspending(struct mapped_device *md) 587 { 588 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 589 } 590 591 /* 592 * Decrements the number of outstanding ios that a bio has been 593 * cloned into, completing the original io if necc. 
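 *
 * Illustrative flow (a condensed restatement of the bio-based path in
 * this file, not a separate API):
 *
 *	ci.io = alloc_io(md);
 *	atomic_set(&ci.io->io_count, 1);   <- submitter's own reference
 *	__map_bio(ti, clone, tio);         <- atomic_inc(io_count) per clone
 *	clone_endio(clone, error);         <- dec_pending(io, error) per clone
 *	dec_pending(ci.io, error);         <- submitter drops its extra reference
 *
 * The original bio is completed only once io_count drops to zero.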
594 */ 595 static void dec_pending(struct dm_io *io, int error) 596 { 597 unsigned long flags; 598 int io_error; 599 struct bio *bio; 600 struct mapped_device *md = io->md; 601 602 /* Push-back supersedes any I/O errors */ 603 if (unlikely(error)) { 604 spin_lock_irqsave(&io->endio_lock, flags); 605 if (!(io->error > 0 && __noflush_suspending(md))) 606 io->error = error; 607 spin_unlock_irqrestore(&io->endio_lock, flags); 608 } 609 610 if (atomic_dec_and_test(&io->io_count)) { 611 if (io->error == DM_ENDIO_REQUEUE) { 612 /* 613 * Target requested pushing back the I/O. 614 */ 615 spin_lock_irqsave(&md->deferred_lock, flags); 616 if (__noflush_suspending(md)) { 617 if (!bio_rw_flagged(io->bio, BIO_RW_BARRIER)) 618 bio_list_add_head(&md->deferred, 619 io->bio); 620 } else 621 /* noflush suspend was interrupted. */ 622 io->error = -EIO; 623 spin_unlock_irqrestore(&md->deferred_lock, flags); 624 } 625 626 io_error = io->error; 627 bio = io->bio; 628 629 if (bio_rw_flagged(bio, BIO_RW_BARRIER)) { 630 /* 631 * There can be just one barrier request so we use 632 * a per-device variable for error reporting. 633 * Note that you can't touch the bio after end_io_acct 634 */ 635 if (!md->barrier_error && io_error != -EOPNOTSUPP) 636 md->barrier_error = io_error; 637 end_io_acct(io); 638 } else { 639 end_io_acct(io); 640 641 if (io_error != DM_ENDIO_REQUEUE) { 642 trace_block_bio_complete(md->queue, bio); 643 644 bio_endio(bio, io_error); 645 } 646 } 647 648 free_io(md, io); 649 } 650 } 651 652 static void clone_endio(struct bio *bio, int error) 653 { 654 int r = 0; 655 struct dm_target_io *tio = bio->bi_private; 656 struct dm_io *io = tio->io; 657 struct mapped_device *md = tio->io->md; 658 dm_endio_fn endio = tio->ti->type->end_io; 659 660 if (!bio_flagged(bio, BIO_UPTODATE) && !error) 661 error = -EIO; 662 663 if (endio) { 664 r = endio(tio->ti, bio, error, &tio->info); 665 if (r < 0 || r == DM_ENDIO_REQUEUE) 666 /* 667 * error and requeue request are handled 668 * in dec_pending(). 669 */ 670 error = r; 671 else if (r == DM_ENDIO_INCOMPLETE) 672 /* The target will handle the io */ 673 return; 674 else if (r) { 675 DMWARN("unimplemented target endio return value: %d", r); 676 BUG(); 677 } 678 } 679 680 /* 681 * Store md for cleanup instead of tio which is about to get freed. 682 */ 683 bio->bi_private = md->bs; 684 685 free_tio(md, tio); 686 bio_put(bio); 687 dec_pending(io, error); 688 } 689 690 /* 691 * Partial completion handling for request-based dm 692 */ 693 static void end_clone_bio(struct bio *clone, int error) 694 { 695 struct dm_rq_clone_bio_info *info = clone->bi_private; 696 struct dm_rq_target_io *tio = info->tio; 697 struct bio *bio = info->orig; 698 unsigned int nr_bytes = info->orig->bi_size; 699 700 bio_put(clone); 701 702 if (tio->error) 703 /* 704 * An error has already been detected on the request. 705 * Once error occurred, just let clone->end_io() handle 706 * the remainder. 707 */ 708 return; 709 else if (error) { 710 /* 711 * Don't notice the error to the upper layer yet. 712 * The error handling decision is made by the target driver, 713 * when the request is completed. 714 */ 715 tio->error = error; 716 return; 717 } 718 719 /* 720 * I/O for the bio successfully completed. 721 * Notice the data completion to the upper layer. 722 */ 723 724 /* 725 * bios are processed from the head of the list. 726 * So the completing bio should always be rq->bio. 727 * If it's not, something wrong is happening. 
728 */ 729 if (tio->orig->bio != bio) 730 DMERR("bio completion is going in the middle of the request"); 731 732 /* 733 * Update the original request. 734 * Do not use blk_end_request() here, because it may complete 735 * the original request before the clone, and break the ordering. 736 */ 737 blk_update_request(tio->orig, 0, nr_bytes); 738 } 739 740 static void store_barrier_error(struct mapped_device *md, int error) 741 { 742 unsigned long flags; 743 744 spin_lock_irqsave(&md->barrier_error_lock, flags); 745 /* 746 * Basically, the first error is taken, but: 747 * -EOPNOTSUPP supersedes any I/O error. 748 * Requeue request supersedes any I/O error but -EOPNOTSUPP. 749 */ 750 if (!md->barrier_error || error == -EOPNOTSUPP || 751 (md->barrier_error != -EOPNOTSUPP && 752 error == DM_ENDIO_REQUEUE)) 753 md->barrier_error = error; 754 spin_unlock_irqrestore(&md->barrier_error_lock, flags); 755 } 756 757 /* 758 * Don't touch any member of the md after calling this function because 759 * the md may be freed in dm_put() at the end of this function. 760 * Or do dm_get() before calling this function and dm_put() later. 761 */ 762 static void rq_completed(struct mapped_device *md, int rw, int run_queue) 763 { 764 atomic_dec(&md->pending[rw]); 765 766 /* nudge anyone waiting on suspend queue */ 767 if (!md_in_flight(md)) 768 wake_up(&md->wait); 769 770 if (run_queue) 771 blk_run_queue(md->queue); 772 773 /* 774 * dm_put() must be at the end of this function. See the comment above 775 */ 776 dm_put(md); 777 } 778 779 static void free_rq_clone(struct request *clone) 780 { 781 struct dm_rq_target_io *tio = clone->end_io_data; 782 783 blk_rq_unprep_clone(clone); 784 free_rq_tio(tio); 785 } 786 787 /* 788 * Complete the clone and the original request. 789 * Must be called without queue lock. 790 */ 791 static void dm_end_request(struct request *clone, int error) 792 { 793 int rw = rq_data_dir(clone); 794 int run_queue = 1; 795 bool is_barrier = blk_barrier_rq(clone); 796 struct dm_rq_target_io *tio = clone->end_io_data; 797 struct mapped_device *md = tio->md; 798 struct request *rq = tio->orig; 799 800 if (blk_pc_request(rq) && !is_barrier) { 801 rq->errors = clone->errors; 802 rq->resid_len = clone->resid_len; 803 804 if (rq->sense) 805 /* 806 * We are using the sense buffer of the original 807 * request. 808 * So setting the length of the sense data is enough. 809 */ 810 rq->sense_len = clone->sense_len; 811 } 812 813 free_rq_clone(clone); 814 815 if (unlikely(is_barrier)) { 816 if (unlikely(error)) 817 store_barrier_error(md, error); 818 run_queue = 0; 819 } else 820 blk_end_request_all(rq, error); 821 822 rq_completed(md, rw, run_queue); 823 } 824 825 static void dm_unprep_request(struct request *rq) 826 { 827 struct request *clone = rq->special; 828 829 rq->special = NULL; 830 rq->cmd_flags &= ~REQ_DONTPREP; 831 832 free_rq_clone(clone); 833 } 834 835 /* 836 * Requeue the original request of a clone. 837 */ 838 void dm_requeue_unmapped_request(struct request *clone) 839 { 840 int rw = rq_data_dir(clone); 841 struct dm_rq_target_io *tio = clone->end_io_data; 842 struct mapped_device *md = tio->md; 843 struct request *rq = tio->orig; 844 struct request_queue *q = rq->q; 845 unsigned long flags; 846 847 if (unlikely(blk_barrier_rq(clone))) { 848 /* 849 * Barrier clones share an original request. 850 * Leave it to dm_end_request(), which handles this special 851 * case. 
852 */ 853 dm_end_request(clone, DM_ENDIO_REQUEUE); 854 return; 855 } 856 857 dm_unprep_request(rq); 858 859 spin_lock_irqsave(q->queue_lock, flags); 860 if (elv_queue_empty(q)) 861 blk_plug_device(q); 862 blk_requeue_request(q, rq); 863 spin_unlock_irqrestore(q->queue_lock, flags); 864 865 rq_completed(md, rw, 0); 866 } 867 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 868 869 static void __stop_queue(struct request_queue *q) 870 { 871 blk_stop_queue(q); 872 } 873 874 static void stop_queue(struct request_queue *q) 875 { 876 unsigned long flags; 877 878 spin_lock_irqsave(q->queue_lock, flags); 879 __stop_queue(q); 880 spin_unlock_irqrestore(q->queue_lock, flags); 881 } 882 883 static void __start_queue(struct request_queue *q) 884 { 885 if (blk_queue_stopped(q)) 886 blk_start_queue(q); 887 } 888 889 static void start_queue(struct request_queue *q) 890 { 891 unsigned long flags; 892 893 spin_lock_irqsave(q->queue_lock, flags); 894 __start_queue(q); 895 spin_unlock_irqrestore(q->queue_lock, flags); 896 } 897 898 static void dm_done(struct request *clone, int error, bool mapped) 899 { 900 int r = error; 901 struct dm_rq_target_io *tio = clone->end_io_data; 902 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; 903 904 if (mapped && rq_end_io) 905 r = rq_end_io(tio->ti, clone, error, &tio->info); 906 907 if (r <= 0) 908 /* The target wants to complete the I/O */ 909 dm_end_request(clone, r); 910 else if (r == DM_ENDIO_INCOMPLETE) 911 /* The target will handle the I/O */ 912 return; 913 else if (r == DM_ENDIO_REQUEUE) 914 /* The target wants to requeue the I/O */ 915 dm_requeue_unmapped_request(clone); 916 else { 917 DMWARN("unimplemented target endio return value: %d", r); 918 BUG(); 919 } 920 } 921 922 /* 923 * Request completion handler for request-based dm 924 */ 925 static void dm_softirq_done(struct request *rq) 926 { 927 bool mapped = true; 928 struct request *clone = rq->completion_data; 929 struct dm_rq_target_io *tio = clone->end_io_data; 930 931 if (rq->cmd_flags & REQ_FAILED) 932 mapped = false; 933 934 dm_done(clone, tio->error, mapped); 935 } 936 937 /* 938 * Complete the clone and the original request with the error status 939 * through softirq context. 940 */ 941 static void dm_complete_request(struct request *clone, int error) 942 { 943 struct dm_rq_target_io *tio = clone->end_io_data; 944 struct request *rq = tio->orig; 945 946 if (unlikely(blk_barrier_rq(clone))) { 947 /* 948 * Barrier clones share an original request. So can't use 949 * softirq_done with the original. 950 * Pass the clone to dm_done() directly in this special case. 951 * It is safe (even if clone->q->queue_lock is held here) 952 * because there is no I/O dispatching during the completion 953 * of barrier clone. 954 */ 955 dm_done(clone, error, true); 956 return; 957 } 958 959 tio->error = error; 960 rq->completion_data = clone; 961 blk_complete_request(rq); 962 } 963 964 /* 965 * Complete the not-mapped clone and the original request with the error status 966 * through softirq context. 967 * Target's rq_end_io() function isn't called. 968 * This may be used when the target's map_rq() function fails. 969 */ 970 void dm_kill_unmapped_request(struct request *clone, int error) 971 { 972 struct dm_rq_target_io *tio = clone->end_io_data; 973 struct request *rq = tio->orig; 974 975 if (unlikely(blk_barrier_rq(clone))) { 976 /* 977 * Barrier clones share an original request. 978 * Leave it to dm_end_request(), which handles this special 979 * case. 
		 */
		BUG_ON(error > 0);
		dm_end_request(clone, error);
		return;
	}

	rq->cmd_flags |= REQ_FAILED;
	dm_complete_request(clone, error);
}
EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);

/*
 * Called with the queue lock held.
 */
static void end_clone_request(struct request *clone, int error)
{
	/*
	 * This only cleans up the information that the queue on which the
	 * clone was dispatched keeps about it.
	 * The clone is *NOT* actually freed here, because it was allocated
	 * from dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
	 */
	__blk_put_request(clone->q, clone);

	/*
	 * Actual request completion is done in a softirq context which doesn't
	 * hold the queue lock.  Otherwise, deadlock could occur because:
	 *   - another request may be submitted by the upper level driver
	 *     of the stacking during the completion
	 *   - the submission which requires queue lock may be done
	 *     against this queue
	 */
	dm_complete_request(clone, error);
}

static sector_t max_io_len(struct mapped_device *md,
			   sector_t sector, struct dm_target *ti)
{
	sector_t offset = sector - ti->begin;
	sector_t len = ti->len - offset;

	/*
	 * Does the target need to split even further?
	 */
	if (ti->split_io) {
		sector_t boundary;
		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
			   - offset;
		if (len > boundary)
			len = boundary;
	}

	return len;
}

static void __map_bio(struct dm_target *ti, struct bio *clone,
		      struct dm_target_io *tio)
{
	int r;
	sector_t sector;
	struct mapped_device *md;

	clone->bi_end_io = clone_endio;
	clone->bi_private = tio;

	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&tio->io->io_count);
	sector = clone->bi_sector;
	r = ti->type->map(ti, clone, &tio->info);
	if (r == DM_MAPIO_REMAPPED) {
		/* the bio has been remapped so dispatch it */

		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
				  tio->io->bio->bi_bdev->bd_dev, sector);

		generic_make_request(clone);
	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
		/* error the io and bail out, or requeue it if needed */
		md = tio->io->md;
		dec_pending(tio->io, r);
		/*
		 * Store bio_set for cleanup.
		 */
		clone->bi_private = md->bs;
		bio_put(clone);
		free_tio(md, tio);
	} else if (r) {
		DMWARN("unimplemented target map return value: %d", r);
		BUG();
	}
}

struct clone_info {
	struct mapped_device *md;
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	sector_t sector_count;
	unsigned short idx;
};

static void dm_bio_destructor(struct bio *bio)
{
	struct bio_set *bs = bio->bi_private;

	bio_free(bio, bs);
}

/*
 * Creates a little bio that just does part of a bvec.
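 *
 * Illustrative call, roughly as issued from __clone_and_map() below when a
 * single bvec has to be split across targets (sketch only):
 *
 *	clone = split_bvec(bio, ci->sector, ci->idx,
 *			   bv->bv_offset + offset, len, ci->md->bs);
 *	__map_bio(ti, clone, tio);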
1095 */ 1096 static struct bio *split_bvec(struct bio *bio, sector_t sector, 1097 unsigned short idx, unsigned int offset, 1098 unsigned int len, struct bio_set *bs) 1099 { 1100 struct bio *clone; 1101 struct bio_vec *bv = bio->bi_io_vec + idx; 1102 1103 clone = bio_alloc_bioset(GFP_NOIO, 1, bs); 1104 clone->bi_destructor = dm_bio_destructor; 1105 *clone->bi_io_vec = *bv; 1106 1107 clone->bi_sector = sector; 1108 clone->bi_bdev = bio->bi_bdev; 1109 clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER); 1110 clone->bi_vcnt = 1; 1111 clone->bi_size = to_bytes(len); 1112 clone->bi_io_vec->bv_offset = offset; 1113 clone->bi_io_vec->bv_len = clone->bi_size; 1114 clone->bi_flags |= 1 << BIO_CLONED; 1115 1116 if (bio_integrity(bio)) { 1117 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1118 bio_integrity_trim(clone, 1119 bio_sector_offset(bio, idx, offset), len); 1120 } 1121 1122 return clone; 1123 } 1124 1125 /* 1126 * Creates a bio that consists of range of complete bvecs. 1127 */ 1128 static struct bio *clone_bio(struct bio *bio, sector_t sector, 1129 unsigned short idx, unsigned short bv_count, 1130 unsigned int len, struct bio_set *bs) 1131 { 1132 struct bio *clone; 1133 1134 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); 1135 __bio_clone(clone, bio); 1136 clone->bi_rw &= ~(1 << BIO_RW_BARRIER); 1137 clone->bi_destructor = dm_bio_destructor; 1138 clone->bi_sector = sector; 1139 clone->bi_idx = idx; 1140 clone->bi_vcnt = idx + bv_count; 1141 clone->bi_size = to_bytes(len); 1142 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 1143 1144 if (bio_integrity(bio)) { 1145 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1146 1147 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1148 bio_integrity_trim(clone, 1149 bio_sector_offset(bio, idx, 0), len); 1150 } 1151 1152 return clone; 1153 } 1154 1155 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1156 struct dm_target *ti) 1157 { 1158 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); 1159 1160 tio->io = ci->io; 1161 tio->ti = ti; 1162 memset(&tio->info, 0, sizeof(tio->info)); 1163 1164 return tio; 1165 } 1166 1167 static void __flush_target(struct clone_info *ci, struct dm_target *ti, 1168 unsigned flush_nr) 1169 { 1170 struct dm_target_io *tio = alloc_tio(ci, ti); 1171 struct bio *clone; 1172 1173 tio->info.flush_request = flush_nr; 1174 1175 clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); 1176 __bio_clone(clone, ci->bio); 1177 clone->bi_destructor = dm_bio_destructor; 1178 1179 __map_bio(ti, clone, tio); 1180 } 1181 1182 static int __clone_and_map_empty_barrier(struct clone_info *ci) 1183 { 1184 unsigned target_nr = 0, flush_nr; 1185 struct dm_target *ti; 1186 1187 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1188 for (flush_nr = 0; flush_nr < ti->num_flush_requests; 1189 flush_nr++) 1190 __flush_target(ci, ti, flush_nr); 1191 1192 ci->sector_count = 0; 1193 1194 return 0; 1195 } 1196 1197 static int __clone_and_map(struct clone_info *ci) 1198 { 1199 struct bio *clone, *bio = ci->bio; 1200 struct dm_target *ti; 1201 sector_t len = 0, max; 1202 struct dm_target_io *tio; 1203 1204 if (unlikely(bio_empty_barrier(bio))) 1205 return __clone_and_map_empty_barrier(ci); 1206 1207 ti = dm_table_find_target(ci->map, ci->sector); 1208 if (!dm_target_is_valid(ti)) 1209 return -EIO; 1210 1211 max = max_io_len(ci->md, ci->sector, ti); 1212 1213 /* 1214 * Allocate a target io object. 
1215 */ 1216 tio = alloc_tio(ci, ti); 1217 1218 if (ci->sector_count <= max) { 1219 /* 1220 * Optimise for the simple case where we can do all of 1221 * the remaining io with a single clone. 1222 */ 1223 clone = clone_bio(bio, ci->sector, ci->idx, 1224 bio->bi_vcnt - ci->idx, ci->sector_count, 1225 ci->md->bs); 1226 __map_bio(ti, clone, tio); 1227 ci->sector_count = 0; 1228 1229 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { 1230 /* 1231 * There are some bvecs that don't span targets. 1232 * Do as many of these as possible. 1233 */ 1234 int i; 1235 sector_t remaining = max; 1236 sector_t bv_len; 1237 1238 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { 1239 bv_len = to_sector(bio->bi_io_vec[i].bv_len); 1240 1241 if (bv_len > remaining) 1242 break; 1243 1244 remaining -= bv_len; 1245 len += bv_len; 1246 } 1247 1248 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, 1249 ci->md->bs); 1250 __map_bio(ti, clone, tio); 1251 1252 ci->sector += len; 1253 ci->sector_count -= len; 1254 ci->idx = i; 1255 1256 } else { 1257 /* 1258 * Handle a bvec that must be split between two or more targets. 1259 */ 1260 struct bio_vec *bv = bio->bi_io_vec + ci->idx; 1261 sector_t remaining = to_sector(bv->bv_len); 1262 unsigned int offset = 0; 1263 1264 do { 1265 if (offset) { 1266 ti = dm_table_find_target(ci->map, ci->sector); 1267 if (!dm_target_is_valid(ti)) 1268 return -EIO; 1269 1270 max = max_io_len(ci->md, ci->sector, ti); 1271 1272 tio = alloc_tio(ci, ti); 1273 } 1274 1275 len = min(remaining, max); 1276 1277 clone = split_bvec(bio, ci->sector, ci->idx, 1278 bv->bv_offset + offset, len, 1279 ci->md->bs); 1280 1281 __map_bio(ti, clone, tio); 1282 1283 ci->sector += len; 1284 ci->sector_count -= len; 1285 offset += to_bytes(len); 1286 } while (remaining -= len); 1287 1288 ci->idx++; 1289 } 1290 1291 return 0; 1292 } 1293 1294 /* 1295 * Split the bio into several clones and submit it to targets. 
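 *
 * Worked example (illustrative numbers only): a 24-sector bio whose first
 * 16 sectors map to target A and last 8 sectors to target B is handled by
 * __clone_and_map() in two iterations:
 *
 *	1st pass: clone_bio()/split_bvec() for 16 sectors -> __map_bio(A, ...)
 *	2nd pass: remaining 8 sectors                     -> __map_bio(B, ...)
 *
 * Each clone bumps io_count; dec_pending() completes the original bio once
 * every clone (and the submitter's extra reference) has finished.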
1296 */ 1297 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) 1298 { 1299 struct clone_info ci; 1300 int error = 0; 1301 1302 ci.map = dm_get_live_table(md); 1303 if (unlikely(!ci.map)) { 1304 if (!bio_rw_flagged(bio, BIO_RW_BARRIER)) 1305 bio_io_error(bio); 1306 else 1307 if (!md->barrier_error) 1308 md->barrier_error = -EIO; 1309 return; 1310 } 1311 1312 ci.md = md; 1313 ci.bio = bio; 1314 ci.io = alloc_io(md); 1315 ci.io->error = 0; 1316 atomic_set(&ci.io->io_count, 1); 1317 ci.io->bio = bio; 1318 ci.io->md = md; 1319 spin_lock_init(&ci.io->endio_lock); 1320 ci.sector = bio->bi_sector; 1321 ci.sector_count = bio_sectors(bio); 1322 if (unlikely(bio_empty_barrier(bio))) 1323 ci.sector_count = 1; 1324 ci.idx = bio->bi_idx; 1325 1326 start_io_acct(ci.io); 1327 while (ci.sector_count && !error) 1328 error = __clone_and_map(&ci); 1329 1330 /* drop the extra reference count */ 1331 dec_pending(ci.io, error); 1332 dm_table_put(ci.map); 1333 } 1334 /*----------------------------------------------------------------- 1335 * CRUD END 1336 *---------------------------------------------------------------*/ 1337 1338 static int dm_merge_bvec(struct request_queue *q, 1339 struct bvec_merge_data *bvm, 1340 struct bio_vec *biovec) 1341 { 1342 struct mapped_device *md = q->queuedata; 1343 struct dm_table *map = dm_get_live_table(md); 1344 struct dm_target *ti; 1345 sector_t max_sectors; 1346 int max_size = 0; 1347 1348 if (unlikely(!map)) 1349 goto out; 1350 1351 ti = dm_table_find_target(map, bvm->bi_sector); 1352 if (!dm_target_is_valid(ti)) 1353 goto out_table; 1354 1355 /* 1356 * Find maximum amount of I/O that won't need splitting 1357 */ 1358 max_sectors = min(max_io_len(md, bvm->bi_sector, ti), 1359 (sector_t) BIO_MAX_SECTORS); 1360 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1361 if (max_size < 0) 1362 max_size = 0; 1363 1364 /* 1365 * merge_bvec_fn() returns number of bytes 1366 * it can accept at this offset 1367 * max is precomputed maximal io size 1368 */ 1369 if (max_size && ti->type->merge) 1370 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1371 /* 1372 * If the target doesn't support merge method and some of the devices 1373 * provided their merge_bvec method (we know this by looking at 1374 * queue_max_hw_sectors), then we can't allow bios with multiple vector 1375 * entries. So always set max_size to 0, and the code below allows 1376 * just one page. 1377 */ 1378 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1379 1380 max_size = 0; 1381 1382 out_table: 1383 dm_table_put(map); 1384 1385 out: 1386 /* 1387 * Always allow an entire first page 1388 */ 1389 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1390 max_size = biovec->bv_len; 1391 1392 return max_size; 1393 } 1394 1395 /* 1396 * The request function that just remaps the bio built up by 1397 * dm_merge_bvec. 1398 */ 1399 static int _dm_request(struct request_queue *q, struct bio *bio) 1400 { 1401 int rw = bio_data_dir(bio); 1402 struct mapped_device *md = q->queuedata; 1403 int cpu; 1404 1405 down_read(&md->io_lock); 1406 1407 cpu = part_stat_lock(); 1408 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); 1409 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); 1410 part_stat_unlock(); 1411 1412 /* 1413 * If we're suspended or the thread is processing barriers 1414 * we have to queue this io for later. 
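	 *
	 * In short (a summary of the branch below, not new behaviour):
	 *
	 *	readahead bio while blocked for suspend -> bio_io_error(bio)
	 *	anything else                           -> queue_io(md, bio),
	 *	                                           handled by dm_wq_work()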
1415 */ 1416 if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || 1417 unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 1418 up_read(&md->io_lock); 1419 1420 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && 1421 bio_rw(bio) == READA) { 1422 bio_io_error(bio); 1423 return 0; 1424 } 1425 1426 queue_io(md, bio); 1427 1428 return 0; 1429 } 1430 1431 __split_and_process_bio(md, bio); 1432 up_read(&md->io_lock); 1433 return 0; 1434 } 1435 1436 static int dm_make_request(struct request_queue *q, struct bio *bio) 1437 { 1438 struct mapped_device *md = q->queuedata; 1439 1440 return md->saved_make_request_fn(q, bio); /* call __make_request() */ 1441 } 1442 1443 static int dm_request_based(struct mapped_device *md) 1444 { 1445 return blk_queue_stackable(md->queue); 1446 } 1447 1448 static int dm_request(struct request_queue *q, struct bio *bio) 1449 { 1450 struct mapped_device *md = q->queuedata; 1451 1452 if (dm_request_based(md)) 1453 return dm_make_request(q, bio); 1454 1455 return _dm_request(q, bio); 1456 } 1457 1458 /* 1459 * Mark this request as flush request, so that dm_request_fn() can 1460 * recognize. 1461 */ 1462 static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq) 1463 { 1464 rq->cmd_type = REQ_TYPE_LINUX_BLOCK; 1465 rq->cmd[0] = REQ_LB_OP_FLUSH; 1466 } 1467 1468 static bool dm_rq_is_flush_request(struct request *rq) 1469 { 1470 if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK && 1471 rq->cmd[0] == REQ_LB_OP_FLUSH) 1472 return true; 1473 else 1474 return false; 1475 } 1476 1477 void dm_dispatch_request(struct request *rq) 1478 { 1479 int r; 1480 1481 if (blk_queue_io_stat(rq->q)) 1482 rq->cmd_flags |= REQ_IO_STAT; 1483 1484 rq->start_time = jiffies; 1485 r = blk_insert_cloned_request(rq->q, rq); 1486 if (r) 1487 dm_complete_request(rq, r); 1488 } 1489 EXPORT_SYMBOL_GPL(dm_dispatch_request); 1490 1491 static void dm_rq_bio_destructor(struct bio *bio) 1492 { 1493 struct dm_rq_clone_bio_info *info = bio->bi_private; 1494 struct mapped_device *md = info->tio->md; 1495 1496 free_bio_info(info); 1497 bio_free(bio, md->bs); 1498 } 1499 1500 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1501 void *data) 1502 { 1503 struct dm_rq_target_io *tio = data; 1504 struct mapped_device *md = tio->md; 1505 struct dm_rq_clone_bio_info *info = alloc_bio_info(md); 1506 1507 if (!info) 1508 return -ENOMEM; 1509 1510 info->orig = bio_orig; 1511 info->tio = tio; 1512 bio->bi_end_io = end_clone_bio; 1513 bio->bi_private = info; 1514 bio->bi_destructor = dm_rq_bio_destructor; 1515 1516 return 0; 1517 } 1518 1519 static int setup_clone(struct request *clone, struct request *rq, 1520 struct dm_rq_target_io *tio) 1521 { 1522 int r; 1523 1524 if (dm_rq_is_flush_request(rq)) { 1525 blk_rq_init(NULL, clone); 1526 clone->cmd_type = REQ_TYPE_FS; 1527 clone->cmd_flags |= (REQ_HARDBARRIER | WRITE); 1528 } else { 1529 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1530 dm_rq_bio_constructor, tio); 1531 if (r) 1532 return r; 1533 1534 clone->cmd = rq->cmd; 1535 clone->cmd_len = rq->cmd_len; 1536 clone->sense = rq->sense; 1537 clone->buffer = rq->buffer; 1538 } 1539 1540 clone->end_io = end_clone_request; 1541 clone->end_io_data = tio; 1542 1543 return 0; 1544 } 1545 1546 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1547 gfp_t gfp_mask) 1548 { 1549 struct request *clone; 1550 struct dm_rq_target_io *tio; 1551 1552 tio = alloc_rq_tio(md, gfp_mask); 1553 if (!tio) 1554 return NULL; 1555 1556 tio->md = md; 1557 tio->ti = 
NULL; 1558 tio->orig = rq; 1559 tio->error = 0; 1560 memset(&tio->info, 0, sizeof(tio->info)); 1561 1562 clone = &tio->clone; 1563 if (setup_clone(clone, rq, tio)) { 1564 /* -ENOMEM */ 1565 free_rq_tio(tio); 1566 return NULL; 1567 } 1568 1569 return clone; 1570 } 1571 1572 /* 1573 * Called with the queue lock held. 1574 */ 1575 static int dm_prep_fn(struct request_queue *q, struct request *rq) 1576 { 1577 struct mapped_device *md = q->queuedata; 1578 struct request *clone; 1579 1580 if (unlikely(dm_rq_is_flush_request(rq))) 1581 return BLKPREP_OK; 1582 1583 if (unlikely(rq->special)) { 1584 DMWARN("Already has something in rq->special."); 1585 return BLKPREP_KILL; 1586 } 1587 1588 clone = clone_rq(rq, md, GFP_ATOMIC); 1589 if (!clone) 1590 return BLKPREP_DEFER; 1591 1592 rq->special = clone; 1593 rq->cmd_flags |= REQ_DONTPREP; 1594 1595 return BLKPREP_OK; 1596 } 1597 1598 static void map_request(struct dm_target *ti, struct request *clone, 1599 struct mapped_device *md) 1600 { 1601 int r; 1602 struct dm_rq_target_io *tio = clone->end_io_data; 1603 1604 /* 1605 * Hold the md reference here for the in-flight I/O. 1606 * We can't rely on the reference count by device opener, 1607 * because the device may be closed during the request completion 1608 * when all bios are completed. 1609 * See the comment in rq_completed() too. 1610 */ 1611 dm_get(md); 1612 1613 tio->ti = ti; 1614 r = ti->type->map_rq(ti, clone, &tio->info); 1615 switch (r) { 1616 case DM_MAPIO_SUBMITTED: 1617 /* The target has taken the I/O to submit by itself later */ 1618 break; 1619 case DM_MAPIO_REMAPPED: 1620 /* The target has remapped the I/O so dispatch it */ 1621 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1622 blk_rq_pos(tio->orig)); 1623 dm_dispatch_request(clone); 1624 break; 1625 case DM_MAPIO_REQUEUE: 1626 /* The target wants to requeue the I/O */ 1627 dm_requeue_unmapped_request(clone); 1628 break; 1629 default: 1630 if (r > 0) { 1631 DMWARN("unimplemented target map return value: %d", r); 1632 BUG(); 1633 } 1634 1635 /* The target wants to complete the I/O */ 1636 dm_kill_unmapped_request(clone, r); 1637 break; 1638 } 1639 } 1640 1641 /* 1642 * q->request_fn for request-based dm. 1643 * Called with the queue lock held. 1644 */ 1645 static void dm_request_fn(struct request_queue *q) 1646 { 1647 struct mapped_device *md = q->queuedata; 1648 struct dm_table *map = dm_get_live_table(md); 1649 struct dm_target *ti; 1650 struct request *rq, *clone; 1651 1652 /* 1653 * For suspend, check blk_queue_stopped() and increment 1654 * ->pending within a single queue_lock not to increment the 1655 * number of in-flight I/Os after the queue is stopped in 1656 * dm_suspend(). 
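	 *
	 * I.e. the pairing below must appear atomic to dm_suspend():
	 *
	 *	rq = blk_peek_request(q); ... blk_start_request(rq);
	 *	atomic_inc(&md->pending[rq_data_dir(clone)]);
	 *
	 * Both happen under the same hold of q->queue_lock, so once
	 * stop_queue() has run no new in-flight I/O gets accounted.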
1657 */ 1658 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { 1659 rq = blk_peek_request(q); 1660 if (!rq) 1661 goto plug_and_out; 1662 1663 if (unlikely(dm_rq_is_flush_request(rq))) { 1664 BUG_ON(md->flush_request); 1665 md->flush_request = rq; 1666 blk_start_request(rq); 1667 queue_work(md->wq, &md->barrier_work); 1668 goto out; 1669 } 1670 1671 ti = dm_table_find_target(map, blk_rq_pos(rq)); 1672 if (ti->type->busy && ti->type->busy(ti)) 1673 goto plug_and_out; 1674 1675 blk_start_request(rq); 1676 clone = rq->special; 1677 atomic_inc(&md->pending[rq_data_dir(clone)]); 1678 1679 spin_unlock(q->queue_lock); 1680 map_request(ti, clone, md); 1681 spin_lock_irq(q->queue_lock); 1682 } 1683 1684 goto out; 1685 1686 plug_and_out: 1687 if (!elv_queue_empty(q)) 1688 /* Some requests still remain, retry later */ 1689 blk_plug_device(q); 1690 1691 out: 1692 dm_table_put(map); 1693 1694 return; 1695 } 1696 1697 int dm_underlying_device_busy(struct request_queue *q) 1698 { 1699 return blk_lld_busy(q); 1700 } 1701 EXPORT_SYMBOL_GPL(dm_underlying_device_busy); 1702 1703 static int dm_lld_busy(struct request_queue *q) 1704 { 1705 int r; 1706 struct mapped_device *md = q->queuedata; 1707 struct dm_table *map = dm_get_live_table(md); 1708 1709 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1710 r = 1; 1711 else 1712 r = dm_table_any_busy_target(map); 1713 1714 dm_table_put(map); 1715 1716 return r; 1717 } 1718 1719 static void dm_unplug_all(struct request_queue *q) 1720 { 1721 struct mapped_device *md = q->queuedata; 1722 struct dm_table *map = dm_get_live_table(md); 1723 1724 if (map) { 1725 if (dm_request_based(md)) 1726 generic_unplug_device(q); 1727 1728 dm_table_unplug_all(map); 1729 dm_table_put(map); 1730 } 1731 } 1732 1733 static int dm_any_congested(void *congested_data, int bdi_bits) 1734 { 1735 int r = bdi_bits; 1736 struct mapped_device *md = congested_data; 1737 struct dm_table *map; 1738 1739 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1740 map = dm_get_live_table(md); 1741 if (map) { 1742 /* 1743 * Request-based dm cares about only own queue for 1744 * the query about congestion status of request_queue 1745 */ 1746 if (dm_request_based(md)) 1747 r = md->queue->backing_dev_info.state & 1748 bdi_bits; 1749 else 1750 r = dm_table_any_congested(map, bdi_bits); 1751 1752 dm_table_put(map); 1753 } 1754 } 1755 1756 return r; 1757 } 1758 1759 /*----------------------------------------------------------------- 1760 * An IDR is used to keep track of allocated minor numbers. 1761 *---------------------------------------------------------------*/ 1762 static DEFINE_IDR(_minor_idr); 1763 1764 static void free_minor(int minor) 1765 { 1766 spin_lock(&_minor_lock); 1767 idr_remove(&_minor_idr, minor); 1768 spin_unlock(&_minor_lock); 1769 } 1770 1771 /* 1772 * See if the device with a specific minor # is free. 
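 *
 * Illustrative caller (this is how alloc_dev() below picks a minor;
 * sketch only):
 *
 *	if (minor == DM_ANY_MINOR)
 *		r = next_free_minor(&minor);
 *	else
 *		r = specific_minor(minor);
 *	if (r < 0)
 *		goto bad_minor;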
1773 */ 1774 static int specific_minor(int minor) 1775 { 1776 int r, m; 1777 1778 if (minor >= (1 << MINORBITS)) 1779 return -EINVAL; 1780 1781 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1782 if (!r) 1783 return -ENOMEM; 1784 1785 spin_lock(&_minor_lock); 1786 1787 if (idr_find(&_minor_idr, minor)) { 1788 r = -EBUSY; 1789 goto out; 1790 } 1791 1792 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); 1793 if (r) 1794 goto out; 1795 1796 if (m != minor) { 1797 idr_remove(&_minor_idr, m); 1798 r = -EBUSY; 1799 goto out; 1800 } 1801 1802 out: 1803 spin_unlock(&_minor_lock); 1804 return r; 1805 } 1806 1807 static int next_free_minor(int *minor) 1808 { 1809 int r, m; 1810 1811 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1812 if (!r) 1813 return -ENOMEM; 1814 1815 spin_lock(&_minor_lock); 1816 1817 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); 1818 if (r) 1819 goto out; 1820 1821 if (m >= (1 << MINORBITS)) { 1822 idr_remove(&_minor_idr, m); 1823 r = -ENOSPC; 1824 goto out; 1825 } 1826 1827 *minor = m; 1828 1829 out: 1830 spin_unlock(&_minor_lock); 1831 return r; 1832 } 1833 1834 static const struct block_device_operations dm_blk_dops; 1835 1836 static void dm_wq_work(struct work_struct *work); 1837 static void dm_rq_barrier_work(struct work_struct *work); 1838 1839 /* 1840 * Allocate and initialise a blank device with a given minor. 1841 */ 1842 static struct mapped_device *alloc_dev(int minor) 1843 { 1844 int r; 1845 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 1846 void *old_md; 1847 1848 if (!md) { 1849 DMWARN("unable to allocate device, out of memory."); 1850 return NULL; 1851 } 1852 1853 if (!try_module_get(THIS_MODULE)) 1854 goto bad_module_get; 1855 1856 /* get a minor number for the dev */ 1857 if (minor == DM_ANY_MINOR) 1858 r = next_free_minor(&minor); 1859 else 1860 r = specific_minor(minor); 1861 if (r < 0) 1862 goto bad_minor; 1863 1864 init_rwsem(&md->io_lock); 1865 mutex_init(&md->suspend_lock); 1866 spin_lock_init(&md->deferred_lock); 1867 spin_lock_init(&md->barrier_error_lock); 1868 rwlock_init(&md->map_lock); 1869 atomic_set(&md->holders, 1); 1870 atomic_set(&md->open_count, 0); 1871 atomic_set(&md->event_nr, 0); 1872 atomic_set(&md->uevent_seq, 0); 1873 INIT_LIST_HEAD(&md->uevent_list); 1874 spin_lock_init(&md->uevent_lock); 1875 1876 md->queue = blk_init_queue(dm_request_fn, NULL); 1877 if (!md->queue) 1878 goto bad_queue; 1879 1880 /* 1881 * Request-based dm devices cannot be stacked on top of bio-based dm 1882 * devices. The type of this dm device has not been decided yet, 1883 * although we initialized the queue using blk_init_queue(). 1884 * The type is decided at the first table loading time. 1885 * To prevent problematic device stacking, clear the queue flag 1886 * for request stacking support until then. 1887 * 1888 * This queue is new, so no concurrency on the queue_flags. 
1889 */ 1890 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 1891 md->saved_make_request_fn = md->queue->make_request_fn; 1892 md->queue->queuedata = md; 1893 md->queue->backing_dev_info.congested_fn = dm_any_congested; 1894 md->queue->backing_dev_info.congested_data = md; 1895 blk_queue_make_request(md->queue, dm_request); 1896 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1897 md->queue->unplug_fn = dm_unplug_all; 1898 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1899 blk_queue_softirq_done(md->queue, dm_softirq_done); 1900 blk_queue_prep_rq(md->queue, dm_prep_fn); 1901 blk_queue_lld_busy(md->queue, dm_lld_busy); 1902 blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH, 1903 dm_rq_prepare_flush); 1904 1905 md->disk = alloc_disk(1); 1906 if (!md->disk) 1907 goto bad_disk; 1908 1909 atomic_set(&md->pending[0], 0); 1910 atomic_set(&md->pending[1], 0); 1911 init_waitqueue_head(&md->wait); 1912 INIT_WORK(&md->work, dm_wq_work); 1913 INIT_WORK(&md->barrier_work, dm_rq_barrier_work); 1914 init_waitqueue_head(&md->eventq); 1915 1916 md->disk->major = _major; 1917 md->disk->first_minor = minor; 1918 md->disk->fops = &dm_blk_dops; 1919 md->disk->queue = md->queue; 1920 md->disk->private_data = md; 1921 sprintf(md->disk->disk_name, "dm-%d", minor); 1922 add_disk(md->disk); 1923 format_dev_t(md->name, MKDEV(_major, minor)); 1924 1925 md->wq = create_singlethread_workqueue("kdmflush"); 1926 if (!md->wq) 1927 goto bad_thread; 1928 1929 md->bdev = bdget_disk(md->disk, 0); 1930 if (!md->bdev) 1931 goto bad_bdev; 1932 1933 /* Populate the mapping, nobody knows we exist yet */ 1934 spin_lock(&_minor_lock); 1935 old_md = idr_replace(&_minor_idr, md, minor); 1936 spin_unlock(&_minor_lock); 1937 1938 BUG_ON(old_md != MINOR_ALLOCED); 1939 1940 return md; 1941 1942 bad_bdev: 1943 destroy_workqueue(md->wq); 1944 bad_thread: 1945 del_gendisk(md->disk); 1946 put_disk(md->disk); 1947 bad_disk: 1948 blk_cleanup_queue(md->queue); 1949 bad_queue: 1950 free_minor(minor); 1951 bad_minor: 1952 module_put(THIS_MODULE); 1953 bad_module_get: 1954 kfree(md); 1955 return NULL; 1956 } 1957 1958 static void unlock_fs(struct mapped_device *md); 1959 1960 static void free_dev(struct mapped_device *md) 1961 { 1962 int minor = MINOR(disk_devt(md->disk)); 1963 1964 unlock_fs(md); 1965 bdput(md->bdev); 1966 destroy_workqueue(md->wq); 1967 if (md->tio_pool) 1968 mempool_destroy(md->tio_pool); 1969 if (md->io_pool) 1970 mempool_destroy(md->io_pool); 1971 if (md->bs) 1972 bioset_free(md->bs); 1973 blk_integrity_unregister(md->disk); 1974 del_gendisk(md->disk); 1975 free_minor(minor); 1976 1977 spin_lock(&_minor_lock); 1978 md->disk->private_data = NULL; 1979 spin_unlock(&_minor_lock); 1980 1981 put_disk(md->disk); 1982 blk_cleanup_queue(md->queue); 1983 module_put(THIS_MODULE); 1984 kfree(md); 1985 } 1986 1987 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 1988 { 1989 struct dm_md_mempools *p; 1990 1991 if (md->io_pool && md->tio_pool && md->bs) 1992 /* the md already has necessary mempools */ 1993 goto out; 1994 1995 p = dm_table_get_md_mempools(t); 1996 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); 1997 1998 md->io_pool = p->io_pool; 1999 p->io_pool = NULL; 2000 md->tio_pool = p->tio_pool; 2001 p->tio_pool = NULL; 2002 md->bs = p->bs; 2003 p->bs = NULL; 2004 2005 out: 2006 /* mempool bind completed, now no need any mempools in the table */ 2007 dm_table_free_md_mempools(t); 2008 } 2009 2010 /* 2011 * Bind a table to the device. 
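 *
 * The binding itself happens in __bind() below; an illustrative outline
 * (not a separate interface):
 *
 *	dm_table_event_callback(t, event_callback, md);
 *	__bind_mempools(md, t);
 *	write_lock_irqsave(&md->map_lock, flags);
 *	old_map = md->map;
 *	md->map = t;
 *	dm_table_set_restrictions(t, q, limits);
 *	write_unlock_irqrestore(&md->map_lock, flags);
 *
 * The old map is handed back to the caller, which must destroy it.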
2012 */ 2013 static void event_callback(void *context) 2014 { 2015 unsigned long flags; 2016 LIST_HEAD(uevents); 2017 struct mapped_device *md = (struct mapped_device *) context; 2018 2019 spin_lock_irqsave(&md->uevent_lock, flags); 2020 list_splice_init(&md->uevent_list, &uevents); 2021 spin_unlock_irqrestore(&md->uevent_lock, flags); 2022 2023 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 2024 2025 atomic_inc(&md->event_nr); 2026 wake_up(&md->eventq); 2027 } 2028 2029 static void __set_size(struct mapped_device *md, sector_t size) 2030 { 2031 set_capacity(md->disk, size); 2032 2033 mutex_lock(&md->bdev->bd_inode->i_mutex); 2034 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 2035 mutex_unlock(&md->bdev->bd_inode->i_mutex); 2036 } 2037 2038 /* 2039 * Returns old map, which caller must destroy. 2040 */ 2041 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2042 struct queue_limits *limits) 2043 { 2044 struct dm_table *old_map; 2045 struct request_queue *q = md->queue; 2046 sector_t size; 2047 unsigned long flags; 2048 2049 size = dm_table_get_size(t); 2050 2051 /* 2052 * Wipe any geometry if the size of the table changed. 2053 */ 2054 if (size != get_capacity(md->disk)) 2055 memset(&md->geometry, 0, sizeof(md->geometry)); 2056 2057 __set_size(md, size); 2058 2059 dm_table_event_callback(t, event_callback, md); 2060 2061 /* 2062 * The queue hasn't been stopped yet, if the old table type wasn't 2063 * for request-based during suspension. So stop it to prevent 2064 * I/O mapping before resume. 2065 * This must be done before setting the queue restrictions, 2066 * because request-based dm may be run just after the setting. 2067 */ 2068 if (dm_table_request_based(t) && !blk_queue_stopped(q)) 2069 stop_queue(q); 2070 2071 __bind_mempools(md, t); 2072 2073 write_lock_irqsave(&md->map_lock, flags); 2074 old_map = md->map; 2075 md->map = t; 2076 dm_table_set_restrictions(t, q, limits); 2077 write_unlock_irqrestore(&md->map_lock, flags); 2078 2079 return old_map; 2080 } 2081 2082 /* 2083 * Returns unbound table for the caller to free. 2084 */ 2085 static struct dm_table *__unbind(struct mapped_device *md) 2086 { 2087 struct dm_table *map = md->map; 2088 unsigned long flags; 2089 2090 if (!map) 2091 return NULL; 2092 2093 dm_table_event_callback(map, NULL, NULL); 2094 write_lock_irqsave(&md->map_lock, flags); 2095 md->map = NULL; 2096 write_unlock_irqrestore(&md->map_lock, flags); 2097 2098 return map; 2099 } 2100 2101 /* 2102 * Constructor for a new device. 
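 *
 * Typical caller (e.g. the device-mapper ioctl code does roughly this;
 * illustrative sketch only):
 *
 *	struct mapped_device *md;
 *	int r = dm_create(DM_ANY_MINOR, &md);
 *
 *	if (r)
 *		return r;
 *	...
 *	dm_put(md);	<- drop the holder reference when finished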
2103 */ 2104 int dm_create(int minor, struct mapped_device **result) 2105 { 2106 struct mapped_device *md; 2107 2108 md = alloc_dev(minor); 2109 if (!md) 2110 return -ENXIO; 2111 2112 dm_sysfs_init(md); 2113 2114 *result = md; 2115 return 0; 2116 } 2117 2118 static struct mapped_device *dm_find_md(dev_t dev) 2119 { 2120 struct mapped_device *md; 2121 unsigned minor = MINOR(dev); 2122 2123 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2124 return NULL; 2125 2126 spin_lock(&_minor_lock); 2127 2128 md = idr_find(&_minor_idr, minor); 2129 if (md && (md == MINOR_ALLOCED || 2130 (MINOR(disk_devt(dm_disk(md))) != minor) || 2131 test_bit(DMF_FREEING, &md->flags))) { 2132 md = NULL; 2133 goto out; 2134 } 2135 2136 out: 2137 spin_unlock(&_minor_lock); 2138 2139 return md; 2140 } 2141 2142 struct mapped_device *dm_get_md(dev_t dev) 2143 { 2144 struct mapped_device *md = dm_find_md(dev); 2145 2146 if (md) 2147 dm_get(md); 2148 2149 return md; 2150 } 2151 2152 void *dm_get_mdptr(struct mapped_device *md) 2153 { 2154 return md->interface_ptr; 2155 } 2156 2157 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2158 { 2159 md->interface_ptr = ptr; 2160 } 2161 2162 void dm_get(struct mapped_device *md) 2163 { 2164 atomic_inc(&md->holders); 2165 } 2166 2167 const char *dm_device_name(struct mapped_device *md) 2168 { 2169 return md->name; 2170 } 2171 EXPORT_SYMBOL_GPL(dm_device_name); 2172 2173 void dm_put(struct mapped_device *md) 2174 { 2175 struct dm_table *map; 2176 2177 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2178 2179 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { 2180 map = dm_get_live_table(md); 2181 idr_replace(&_minor_idr, MINOR_ALLOCED, 2182 MINOR(disk_devt(dm_disk(md)))); 2183 set_bit(DMF_FREEING, &md->flags); 2184 spin_unlock(&_minor_lock); 2185 if (!dm_suspended(md)) { 2186 dm_table_presuspend_targets(map); 2187 dm_table_postsuspend_targets(map); 2188 } 2189 dm_sysfs_exit(md); 2190 dm_table_put(map); 2191 dm_table_destroy(__unbind(md)); 2192 free_dev(md); 2193 } 2194 } 2195 EXPORT_SYMBOL_GPL(dm_put); 2196 2197 static int dm_wait_for_completion(struct mapped_device *md, int interruptible) 2198 { 2199 int r = 0; 2200 DECLARE_WAITQUEUE(wait, current); 2201 2202 dm_unplug_all(md->queue); 2203 2204 add_wait_queue(&md->wait, &wait); 2205 2206 while (1) { 2207 set_current_state(interruptible); 2208 2209 smp_mb(); 2210 if (!md_in_flight(md)) 2211 break; 2212 2213 if (interruptible == TASK_INTERRUPTIBLE && 2214 signal_pending(current)) { 2215 r = -EINTR; 2216 break; 2217 } 2218 2219 io_schedule(); 2220 } 2221 set_current_state(TASK_RUNNING); 2222 2223 remove_wait_queue(&md->wait, &wait); 2224 2225 return r; 2226 } 2227 2228 static void dm_flush(struct mapped_device *md) 2229 { 2230 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2231 2232 bio_init(&md->barrier_bio); 2233 md->barrier_bio.bi_bdev = md->bdev; 2234 md->barrier_bio.bi_rw = WRITE_BARRIER; 2235 __split_and_process_bio(md, &md->barrier_bio); 2236 2237 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2238 } 2239 2240 static void process_barrier(struct mapped_device *md, struct bio *bio) 2241 { 2242 md->barrier_error = 0; 2243 2244 dm_flush(md); 2245 2246 if (!bio_empty_barrier(bio)) { 2247 __split_and_process_bio(md, bio); 2248 dm_flush(md); 2249 } 2250 2251 if (md->barrier_error != DM_ENDIO_REQUEUE) 2252 bio_endio(bio, md->barrier_error); 2253 else { 2254 spin_lock_irq(&md->deferred_lock); 2255 bio_list_add_head(&md->deferred, bio); 2256 spin_unlock_irq(&md->deferred_lock); 2257 } 2258 } 2259 2260 /* 2261 * Process 

/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;

	down_write(&md->io_lock);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c) {
			clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
			break;
		}

		up_write(&md->io_lock);

		if (dm_request_based(md))
			generic_make_request(c);
		else {
			if (bio_rw_flagged(c, BIO_RW_BARRIER))
				process_barrier(md, c);
			else
				__split_and_process_bio(md, c);
		}

		down_write(&md->io_lock);
	}

	up_write(&md->io_lock);
}

static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_clear_bit();
	queue_work(md->wq, &md->work);
}

static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr)
{
	struct dm_rq_target_io *tio = clone->end_io_data;

	tio->info.flush_request = flush_nr;
}

/* Issue barrier requests to targets and wait for their completion. */
static int dm_rq_barrier(struct mapped_device *md)
{
	int i, j;
	struct dm_table *map = dm_get_live_table(md);
	unsigned num_targets = dm_table_get_num_targets(map);
	struct dm_target *ti;
	struct request *clone;

	md->barrier_error = 0;

	for (i = 0; i < num_targets; i++) {
		ti = dm_table_get_target(map, i);
		for (j = 0; j < ti->num_flush_requests; j++) {
			clone = clone_rq(md->flush_request, md, GFP_NOIO);
			dm_rq_set_flush_nr(clone, j);
			atomic_inc(&md->pending[rq_data_dir(clone)]);
			map_request(ti, clone, md);
		}
	}

	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
	dm_table_put(map);

	return md->barrier_error;
}

static void dm_rq_barrier_work(struct work_struct *work)
{
	int error;
	struct mapped_device *md = container_of(work, struct mapped_device,
						barrier_work);
	struct request_queue *q = md->queue;
	struct request *rq;
	unsigned long flags;

	/*
	 * Hold the md reference here until the end of the function so that
	 * the md can't be deleted by a device opener while the barrier
	 * request is being processed.
	 */
	dm_get(md);

	error = dm_rq_barrier(md);

	rq = md->flush_request;
	md->flush_request = NULL;

	if (error == DM_ENDIO_REQUEUE) {
		spin_lock_irqsave(q->queue_lock, flags);
		blk_requeue_request(q, rq);
		spin_unlock_irqrestore(q->queue_lock, flags);
	} else
		blk_end_request_all(rq, error);

	blk_run_queue(q);

	dm_put(md);
}
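
/*
 * Example (illustrative sketch of the deferral hand-off used by the
 * suspend/resume code below): while DMF_QUEUE_IO_TO_THREAD is set, new
 * bios are parked on md->deferred instead of being mapped; the resume
 * side replays them roughly like this:
 *
 *	down_write(&md->io_lock);
 *	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
 *	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
 *	up_write(&md->io_lock);
 *	...
 *	dm_queue_flush(md);	clears DMF_BLOCK_IO_FOR_SUSPEND and kicks
 *				dm_wq_work() to resubmit each deferred bio
 */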

/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct dm_table *map = ERR_PTR(-EINVAL);
	struct queue_limits limits;
	int r;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended(md))
		goto out;

	r = dm_calculate_queue_limits(table, &limits);
	if (r) {
		map = ERR_PTR(r);
		goto out;
	}

	/* cannot change the device type once a table is bound */
	if (md->map &&
	    (dm_table_get_type(md->map) != dm_table_get_type(table))) {
		DMWARN("can't change the device type after a table is bound");
		goto out;
	}

	map = __bind(md, table, &limits);

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}
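
/*
 * Example (illustrative sketch, hypothetical caller): a table change is
 * always bracketed by suspend and resume, and the displaced table is the
 * caller's to destroy:
 *
 *	dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 *	old_map = dm_swap_table(md, new_table);
 *	if (!IS_ERR(old_map) && old_map)
 *		dm_table_destroy(old_map);
 *	dm_resume(md);
 */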

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem. For example we might want to move some data in
 * the background. Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	mutex_lock(&md->suspend_lock);

	if (dm_suspended(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	map = dm_get_live_table(md);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers, i.e. that no one is executing
	 * __split_and_process_bio, which is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_request, we set
	 * DMF_QUEUE_IO_TO_THREAD.
	 *
	 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
	 * and call flush_workqueue(md->wq). flush_workqueue will wait until
	 * dm_wq_work exits, and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
	 * further calls to __split_and_process_bio from dm_wq_work.
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
	up_write(&md->io_lock);

	/*
	 * Request-based dm uses md->wq for barriers (dm_rq_barrier_work),
	 * which can still be kicked until md->queue is stopped, so stop
	 * md->queue before flushing md->wq.
	 */
	if (dm_request_based(md))
		stop_queue(md->queue);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

	down_write(&md->io_lock);
	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	up_write(&md->io_lock);

	/* were we interrupted? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			start_queue(md->queue);

		unlock_fs(md);
		goto out; /* pushback list is already flushed, so skip flush */
	}

	/*
	 * If dm_wait_for_completion returned 0, the device is completely
	 * quiescent now. There is no request-processing activity. All new
	 * requests are being added to md->deferred list.
	 */

	dm_table_postsuspend_targets(map);

	set_bit(DMF_SUSPENDED, &md->flags);

out:
	dm_table_put(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}

int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

	mutex_lock(&md->suspend_lock);
	if (!dm_suspended(md))
		goto out;

	map = dm_get_live_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that the targets can map them correctly.
	 * Request-based dm queues the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		start_queue(md->queue);

	unlock_fs(md);

	clear_bit(DMF_SUSPENDED, &md->flags);

	dm_table_unplug_all(map);
	r = 0;
out:
	dm_table_put(map);
	mutex_unlock(&md->suspend_lock);

	return r;
}
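
/*
 * Example (illustrative sketch, hypothetical caller): dm_resume() only
 * succeeds on a suspended device with a non-empty live table; on failure
 * the device simply stays suspended:
 *
 *	r = dm_resume(md);
 *	if (r)
 *		DMWARN("%s: resume failed", dm_device_name(md));
 */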

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		       unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj;
}

/*
 * struct mapped_device should not be exported outside of dm.c,
 * so use this check to verify that the kobj is part of an md structure.
 */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj);
	if (&md->kobj != kobj)
		return NULL;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_noflush_suspending(struct dm_target *ti)
{
	struct mapped_device *md = dm_table_get_md(ti->table);
	int r = __noflush_suspending(md);

	dm_put(md);

	return r;
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
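
/*
 * Example (illustrative sketch, hypothetical target code): a target's
 * end_io hook can use dm_noflush_suspending() to push failed I/O back
 * for requeueing instead of completing it with an error during a
 * noflush suspend:
 *
 *	static int example_end_io(struct dm_target *ti, struct bio *bio,
 *				  int error, union map_info *map_context)
 *	{
 *		if (error && dm_noflush_suspending(ti))
 *			return DM_ENDIO_REQUEUE;
 *
 *		return error;
 *	}
 */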

struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
{
	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);

	if (!pools)
		return NULL;

	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
	if (!pools->io_pool)
		goto free_pools_and_out;

	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
	if (!pools->tio_pool)
		goto free_io_pool_and_out;

	pools->bs = (type == DM_TYPE_BIO_BASED) ?
		    bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
	if (!pools->bs)
		goto free_tio_pool_and_out;

	return pools;

free_tio_pool_and_out:
	mempool_destroy(pools->tio_pool);

free_io_pool_and_out:
	mempool_destroy(pools->io_pool);

free_pools_and_out:
	kfree(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	if (pools->io_pool)
		mempool_destroy(pools->io_pool);

	if (pools->tio_pool)
		mempool_destroy(pools->tio_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");