/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/smp_lock.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>

#include <trace/events/block.h>

#define DM_MSG_PREFIX "core"

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_SPINLOCK(_minor_lock);

/*
 * For bio-based dm.
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;
	int error;
	atomic_t io_count;
	struct bio *bio;
	unsigned long start_time;
	spinlock_t endio_lock;
};

/*
 * For bio-based dm.
 * One of these is allocated per target within a bio. Hopefully
 * this will be simplified out one day.
 */
struct dm_target_io {
	struct dm_io *io;
	struct dm_target *ti;
	union map_info info;
};

/*
 * For request-based dm.
 * One of these is allocated per request.
 */
struct dm_rq_target_io {
	struct mapped_device *md;
	struct dm_target *ti;
	struct request *orig, clone;
	int error;
	union map_info info;
};

/*
 * For request-based dm.
 * One of these is allocated per bio.
 */
struct dm_rq_clone_bio_info {
	struct bio *orig;
	struct dm_rq_target_io *tio;
};

union map_info *dm_get_mapinfo(struct bio *bio)
{
	if (bio && bio->bi_private)
		return &((struct dm_target_io *)bio->bi_private)->info;
	return NULL;
}

union map_info *dm_get_rq_mapinfo(struct request *rq)
{
	if (rq && rq->end_io_data)
		return &((struct dm_rq_target_io *)rq->end_io_data)->info;
	return NULL;
}
EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);

#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_QUEUE_IO_TO_THREAD 6

/*
 * Work processed by per-device workqueue.
 */
struct mapped_device {
	struct rw_semaphore io_lock;
	struct mutex suspend_lock;
	rwlock_t map_lock;
	atomic_t holders;
	atomic_t open_count;

	unsigned long flags;

	struct request_queue *queue;
	struct gendisk *disk;
	char name[16];

	void *interface_ptr;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending[2];
	wait_queue_head_t wait;
	struct work_struct work;
	struct bio_list deferred;
	spinlock_t deferred_lock;

	/*
	 * An error from the barrier request currently being processed.
	 */
	int barrier_error;

	/*
	 * Protect barrier_error from concurrent endio processing
	 * in request-based dm.
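	 * Bio-based dm handles at most one barrier at a time off the
	 * workqueue and updates barrier_error locklessly; request-based
	 * completions can run concurrently, so store_barrier_error()
	 * takes this lock.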
	 */
	spinlock_t barrier_error_lock;

	/*
	 * Processing queue (flush/barriers)
	 */
	struct workqueue_struct *wq;
	struct work_struct barrier_work;

	/* A pointer to the currently processing pre/post flush request */
	struct request *flush_request;

	/*
	 * The current mapping.
	 */
	struct dm_table *map;

	/*
	 * io objects are allocated from here.
	 */
	mempool_t *io_pool;
	mempool_t *tio_pool;

	struct bio_set *bs;

	/*
	 * Event handling.
	 */
	atomic_t event_nr;
	wait_queue_head_t eventq;
	atomic_t uevent_seq;
	struct list_head uevent_list;
	spinlock_t uevent_lock; /* Protect access to uevent_list */

	/*
	 * freeze/thaw support requires holding onto a super block
	 */
	struct super_block *frozen_sb;
	struct block_device *bdev;

	/* forced geometry settings */
	struct hd_geometry geometry;

	/* For saving the address of __make_request for request based dm */
	make_request_fn *saved_make_request_fn;

	/* sysfs handle */
	struct kobject kobj;

	/* zero-length barrier that will be cloned and submitted to targets */
	struct bio barrier_bio;
};

/*
 * For mempool pre-allocation at table load time.
 */
struct dm_md_mempools {
	mempool_t *io_pool;
	mempool_t *tio_pool;
	struct bio_set *bs;
};

#define MIN_IOS 256
static struct kmem_cache *_io_cache;
static struct kmem_cache *_tio_cache;
static struct kmem_cache *_rq_tio_cache;
static struct kmem_cache *_rq_bio_info_cache;

static int __init local_init(void)
{
	int r = -ENOMEM;

	/* allocate a slab for the dm_ios */
	_io_cache = KMEM_CACHE(dm_io, 0);
	if (!_io_cache)
		return r;

	/* allocate a slab for the target ios */
	_tio_cache = KMEM_CACHE(dm_target_io, 0);
	if (!_tio_cache)
		goto out_free_io_cache;

	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
	if (!_rq_tio_cache)
		goto out_free_tio_cache;

	_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
	if (!_rq_bio_info_cache)
		goto out_free_rq_tio_cache;

	r = dm_uevent_init();
	if (r)
		goto out_free_rq_bio_info_cache;

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0)
		goto out_uevent_exit;

	if (!_major)
		_major = r;

	return 0;

out_uevent_exit:
	dm_uevent_exit();
out_free_rq_bio_info_cache:
	kmem_cache_destroy(_rq_bio_info_cache);
out_free_rq_tio_cache:
	kmem_cache_destroy(_rq_tio_cache);
out_free_tio_cache:
	kmem_cache_destroy(_tio_cache);
out_free_io_cache:
	kmem_cache_destroy(_io_cache);

	return r;
}

static void local_exit(void)
{
	kmem_cache_destroy(_rq_bio_info_cache);
	kmem_cache_destroy(_rq_tio_cache);
	kmem_cache_destroy(_tio_cache);
	kmem_cache_destroy(_io_cache);
	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}

static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_io_init,
	dm_kcopyd_init,
	dm_interface_init,
};

static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_io_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);

	int r, i;

	for (i = 0; i
< count; i++) { 308 r = _inits[i](); 309 if (r) 310 goto bad; 311 } 312 313 return 0; 314 315 bad: 316 while (i--) 317 _exits[i](); 318 319 return r; 320 } 321 322 static void __exit dm_exit(void) 323 { 324 int i = ARRAY_SIZE(_exits); 325 326 while (i--) 327 _exits[i](); 328 } 329 330 /* 331 * Block device functions 332 */ 333 int dm_deleting_md(struct mapped_device *md) 334 { 335 return test_bit(DMF_DELETING, &md->flags); 336 } 337 338 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 339 { 340 struct mapped_device *md; 341 342 lock_kernel(); 343 spin_lock(&_minor_lock); 344 345 md = bdev->bd_disk->private_data; 346 if (!md) 347 goto out; 348 349 if (test_bit(DMF_FREEING, &md->flags) || 350 dm_deleting_md(md)) { 351 md = NULL; 352 goto out; 353 } 354 355 dm_get(md); 356 atomic_inc(&md->open_count); 357 358 out: 359 spin_unlock(&_minor_lock); 360 unlock_kernel(); 361 362 return md ? 0 : -ENXIO; 363 } 364 365 static int dm_blk_close(struct gendisk *disk, fmode_t mode) 366 { 367 struct mapped_device *md = disk->private_data; 368 369 lock_kernel(); 370 atomic_dec(&md->open_count); 371 dm_put(md); 372 unlock_kernel(); 373 374 return 0; 375 } 376 377 int dm_open_count(struct mapped_device *md) 378 { 379 return atomic_read(&md->open_count); 380 } 381 382 /* 383 * Guarantees nothing is using the device before it's deleted. 384 */ 385 int dm_lock_for_deletion(struct mapped_device *md) 386 { 387 int r = 0; 388 389 spin_lock(&_minor_lock); 390 391 if (dm_open_count(md)) 392 r = -EBUSY; 393 else 394 set_bit(DMF_DELETING, &md->flags); 395 396 spin_unlock(&_minor_lock); 397 398 return r; 399 } 400 401 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 402 { 403 struct mapped_device *md = bdev->bd_disk->private_data; 404 405 return dm_get_geometry(md, geo); 406 } 407 408 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 409 unsigned int cmd, unsigned long arg) 410 { 411 struct mapped_device *md = bdev->bd_disk->private_data; 412 struct dm_table *map = dm_get_live_table(md); 413 struct dm_target *tgt; 414 int r = -ENOTTY; 415 416 if (!map || !dm_table_get_size(map)) 417 goto out; 418 419 /* We only support devices that have a single target */ 420 if (dm_table_get_num_targets(map) != 1) 421 goto out; 422 423 tgt = dm_table_get_target(map, 0); 424 425 if (dm_suspended_md(md)) { 426 r = -EAGAIN; 427 goto out; 428 } 429 430 if (tgt->type->ioctl) 431 r = tgt->type->ioctl(tgt, cmd, arg); 432 433 out: 434 dm_table_put(map); 435 436 return r; 437 } 438 439 static struct dm_io *alloc_io(struct mapped_device *md) 440 { 441 return mempool_alloc(md->io_pool, GFP_NOIO); 442 } 443 444 static void free_io(struct mapped_device *md, struct dm_io *io) 445 { 446 mempool_free(io, md->io_pool); 447 } 448 449 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 450 { 451 mempool_free(tio, md->tio_pool); 452 } 453 454 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 455 gfp_t gfp_mask) 456 { 457 return mempool_alloc(md->tio_pool, gfp_mask); 458 } 459 460 static void free_rq_tio(struct dm_rq_target_io *tio) 461 { 462 mempool_free(tio, tio->md->tio_pool); 463 } 464 465 static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) 466 { 467 return mempool_alloc(md->io_pool, GFP_ATOMIC); 468 } 469 470 static void free_bio_info(struct dm_rq_clone_bio_info *info) 471 { 472 mempool_free(info, info->tio->md->io_pool); 473 } 474 475 static int md_in_flight(struct mapped_device *md) 476 { 477 return 
atomic_read(&md->pending[READ]) + 478 atomic_read(&md->pending[WRITE]); 479 } 480 481 static void start_io_acct(struct dm_io *io) 482 { 483 struct mapped_device *md = io->md; 484 int cpu; 485 int rw = bio_data_dir(io->bio); 486 487 io->start_time = jiffies; 488 489 cpu = part_stat_lock(); 490 part_round_stats(cpu, &dm_disk(md)->part0); 491 part_stat_unlock(); 492 dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]); 493 } 494 495 static void end_io_acct(struct dm_io *io) 496 { 497 struct mapped_device *md = io->md; 498 struct bio *bio = io->bio; 499 unsigned long duration = jiffies - io->start_time; 500 int pending, cpu; 501 int rw = bio_data_dir(bio); 502 503 cpu = part_stat_lock(); 504 part_round_stats(cpu, &dm_disk(md)->part0); 505 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); 506 part_stat_unlock(); 507 508 /* 509 * After this is decremented the bio must not be touched if it is 510 * a barrier. 511 */ 512 dm_disk(md)->part0.in_flight[rw] = pending = 513 atomic_dec_return(&md->pending[rw]); 514 pending += atomic_read(&md->pending[rw^0x1]); 515 516 /* nudge anyone waiting on suspend queue */ 517 if (!pending) 518 wake_up(&md->wait); 519 } 520 521 /* 522 * Add the bio to the list of deferred io. 523 */ 524 static void queue_io(struct mapped_device *md, struct bio *bio) 525 { 526 down_write(&md->io_lock); 527 528 spin_lock_irq(&md->deferred_lock); 529 bio_list_add(&md->deferred, bio); 530 spin_unlock_irq(&md->deferred_lock); 531 532 if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) 533 queue_work(md->wq, &md->work); 534 535 up_write(&md->io_lock); 536 } 537 538 /* 539 * Everyone (including functions in this file), should use this 540 * function to access the md->map field, and make sure they call 541 * dm_table_put() when finished. 542 */ 543 struct dm_table *dm_get_live_table(struct mapped_device *md) 544 { 545 struct dm_table *t; 546 unsigned long flags; 547 548 read_lock_irqsave(&md->map_lock, flags); 549 t = md->map; 550 if (t) 551 dm_table_get(t); 552 read_unlock_irqrestore(&md->map_lock, flags); 553 554 return t; 555 } 556 557 /* 558 * Get the geometry associated with a dm device 559 */ 560 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 561 { 562 *geo = md->geometry; 563 564 return 0; 565 } 566 567 /* 568 * Set the geometry of a device. 569 */ 570 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 571 { 572 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 573 574 if (geo->start > sz) { 575 DMWARN("Start sector is beyond the geometry limits."); 576 return -EINVAL; 577 } 578 579 md->geometry = *geo; 580 581 return 0; 582 } 583 584 /*----------------------------------------------------------------- 585 * CRUD START: 586 * A more elegant soln is in the works that uses the queue 587 * merge fn, unfortunately there are a couple of changes to 588 * the block layer that I want to make for this. So in the 589 * interests of getting something for people to use I give 590 * you this clearly demarcated crap. 591 *---------------------------------------------------------------*/ 592 593 static int __noflush_suspending(struct mapped_device *md) 594 { 595 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 596 } 597 598 /* 599 * Decrements the number of outstanding ios that a bio has been 600 * cloned into, completing the original io if necc. 
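 * If the target returned DM_ENDIO_REQUEUE and a noflush suspend is in
 * progress, the bio is pushed back onto md->deferred instead of being
 * completed; errors on barrier bios are recorded in md->barrier_error
 * rather than ended here.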
 */
static void dec_pending(struct dm_io *io, int error)
{
	unsigned long flags;
	int io_error;
	struct bio *bio;
	struct mapped_device *md = io->md;

	/* Push-back supersedes any I/O errors */
	if (unlikely(error)) {
		spin_lock_irqsave(&io->endio_lock, flags);
		if (!(io->error > 0 && __noflush_suspending(md)))
			io->error = error;
		spin_unlock_irqrestore(&io->endio_lock, flags);
	}

	if (atomic_dec_and_test(&io->io_count)) {
		if (io->error == DM_ENDIO_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 */
			spin_lock_irqsave(&md->deferred_lock, flags);
			if (__noflush_suspending(md)) {
				if (!(io->bio->bi_rw & REQ_HARDBARRIER))
					bio_list_add_head(&md->deferred,
							  io->bio);
			} else
				/* noflush suspend was interrupted. */
				io->error = -EIO;
			spin_unlock_irqrestore(&md->deferred_lock, flags);
		}

		io_error = io->error;
		bio = io->bio;

		if (bio->bi_rw & REQ_HARDBARRIER) {
			/*
			 * There can be just one barrier request so we use
			 * a per-device variable for error reporting.
			 * Note that you can't touch the bio after end_io_acct().
			 */
			if (!md->barrier_error && io_error != -EOPNOTSUPP)
				md->barrier_error = io_error;
			end_io_acct(io);
			free_io(md, io);
		} else {
			end_io_acct(io);
			free_io(md, io);

			if (io_error != DM_ENDIO_REQUEUE) {
				trace_block_bio_complete(md->queue, bio);

				bio_endio(bio, io_error);
			}
		}
	}
}

static void clone_endio(struct bio *bio, int error)
{
	int r = 0;
	struct dm_target_io *tio = bio->bi_private;
	struct dm_io *io = tio->io;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;

	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
		error = -EIO;

	if (endio) {
		r = endio(tio->ti, bio, error, &tio->info);
		if (r < 0 || r == DM_ENDIO_REQUEUE)
			/*
			 * error and requeue request are handled
			 * in dec_pending().
			 */
			error = r;
		else if (r == DM_ENDIO_INCOMPLETE)
			/* The target will handle the io */
			return;
		else if (r) {
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	/*
	 * Store md for cleanup instead of tio which is about to get freed.
	 */
	bio->bi_private = md->bs;

	free_tio(md, tio);
	bio_put(bio);
	dec_pending(io, error);
}

/*
 * Partial completion handling for request-based dm
 */
static void end_clone_bio(struct bio *clone, int error)
{
	struct dm_rq_clone_bio_info *info = clone->bi_private;
	struct dm_rq_target_io *tio = info->tio;
	struct bio *bio = info->orig;
	unsigned int nr_bytes = info->orig->bi_size;

	bio_put(clone);

	if (tio->error)
		/*
		 * An error has already been detected on the request.
		 * Once an error has occurred, just let clone->end_io()
		 * handle the remainder.
		 */
		return;
	else if (error) {
		/*
		 * Don't report the error to the upper layer yet.
		 * The error handling decision is made by the target driver,
		 * when the request is completed.
		 */
		tio->error = error;
		return;
	}

	/*
	 * I/O for the bio successfully completed.
	 * Report the data completion to the upper layer.
	 */

	/*
	 * bios are processed from the head of the list.
	 * So the completing bio should always be rq->bio.
	 * If it's not, something wrong is happening.
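	 * (The check below only logs the anomaly; blk_update_request()
	 * still advances the original request by nr_bytes so the upper
	 * layer sees the partial completion.)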
735 */ 736 if (tio->orig->bio != bio) 737 DMERR("bio completion is going in the middle of the request"); 738 739 /* 740 * Update the original request. 741 * Do not use blk_end_request() here, because it may complete 742 * the original request before the clone, and break the ordering. 743 */ 744 blk_update_request(tio->orig, 0, nr_bytes); 745 } 746 747 static void store_barrier_error(struct mapped_device *md, int error) 748 { 749 unsigned long flags; 750 751 spin_lock_irqsave(&md->barrier_error_lock, flags); 752 /* 753 * Basically, the first error is taken, but: 754 * -EOPNOTSUPP supersedes any I/O error. 755 * Requeue request supersedes any I/O error but -EOPNOTSUPP. 756 */ 757 if (!md->barrier_error || error == -EOPNOTSUPP || 758 (md->barrier_error != -EOPNOTSUPP && 759 error == DM_ENDIO_REQUEUE)) 760 md->barrier_error = error; 761 spin_unlock_irqrestore(&md->barrier_error_lock, flags); 762 } 763 764 /* 765 * Don't touch any member of the md after calling this function because 766 * the md may be freed in dm_put() at the end of this function. 767 * Or do dm_get() before calling this function and dm_put() later. 768 */ 769 static void rq_completed(struct mapped_device *md, int rw, int run_queue) 770 { 771 atomic_dec(&md->pending[rw]); 772 773 /* nudge anyone waiting on suspend queue */ 774 if (!md_in_flight(md)) 775 wake_up(&md->wait); 776 777 if (run_queue) 778 blk_run_queue(md->queue); 779 780 /* 781 * dm_put() must be at the end of this function. See the comment above 782 */ 783 dm_put(md); 784 } 785 786 static void free_rq_clone(struct request *clone) 787 { 788 struct dm_rq_target_io *tio = clone->end_io_data; 789 790 blk_rq_unprep_clone(clone); 791 free_rq_tio(tio); 792 } 793 794 /* 795 * Complete the clone and the original request. 796 * Must be called without queue lock. 797 */ 798 static void dm_end_request(struct request *clone, int error) 799 { 800 int rw = rq_data_dir(clone); 801 int run_queue = 1; 802 bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER; 803 struct dm_rq_target_io *tio = clone->end_io_data; 804 struct mapped_device *md = tio->md; 805 struct request *rq = tio->orig; 806 807 if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) { 808 rq->errors = clone->errors; 809 rq->resid_len = clone->resid_len; 810 811 if (rq->sense) 812 /* 813 * We are using the sense buffer of the original 814 * request. 815 * So setting the length of the sense data is enough. 816 */ 817 rq->sense_len = clone->sense_len; 818 } 819 820 free_rq_clone(clone); 821 822 if (unlikely(is_barrier)) { 823 if (unlikely(error)) 824 store_barrier_error(md, error); 825 run_queue = 0; 826 } else 827 blk_end_request_all(rq, error); 828 829 rq_completed(md, rw, run_queue); 830 } 831 832 static void dm_unprep_request(struct request *rq) 833 { 834 struct request *clone = rq->special; 835 836 rq->special = NULL; 837 rq->cmd_flags &= ~REQ_DONTPREP; 838 839 free_rq_clone(clone); 840 } 841 842 /* 843 * Requeue the original request of a clone. 844 */ 845 void dm_requeue_unmapped_request(struct request *clone) 846 { 847 int rw = rq_data_dir(clone); 848 struct dm_rq_target_io *tio = clone->end_io_data; 849 struct mapped_device *md = tio->md; 850 struct request *rq = tio->orig; 851 struct request_queue *q = rq->q; 852 unsigned long flags; 853 854 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { 855 /* 856 * Barrier clones share an original request. 857 * Leave it to dm_end_request(), which handles this special 858 * case. 
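		 * (dm_end_request() records DM_ENDIO_REQUEUE via
		 * store_barrier_error(), and dm_rq_barrier_work() then
		 * requeues the original flush request as a whole.)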
859 */ 860 dm_end_request(clone, DM_ENDIO_REQUEUE); 861 return; 862 } 863 864 dm_unprep_request(rq); 865 866 spin_lock_irqsave(q->queue_lock, flags); 867 if (elv_queue_empty(q)) 868 blk_plug_device(q); 869 blk_requeue_request(q, rq); 870 spin_unlock_irqrestore(q->queue_lock, flags); 871 872 rq_completed(md, rw, 0); 873 } 874 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 875 876 static void __stop_queue(struct request_queue *q) 877 { 878 blk_stop_queue(q); 879 } 880 881 static void stop_queue(struct request_queue *q) 882 { 883 unsigned long flags; 884 885 spin_lock_irqsave(q->queue_lock, flags); 886 __stop_queue(q); 887 spin_unlock_irqrestore(q->queue_lock, flags); 888 } 889 890 static void __start_queue(struct request_queue *q) 891 { 892 if (blk_queue_stopped(q)) 893 blk_start_queue(q); 894 } 895 896 static void start_queue(struct request_queue *q) 897 { 898 unsigned long flags; 899 900 spin_lock_irqsave(q->queue_lock, flags); 901 __start_queue(q); 902 spin_unlock_irqrestore(q->queue_lock, flags); 903 } 904 905 static void dm_done(struct request *clone, int error, bool mapped) 906 { 907 int r = error; 908 struct dm_rq_target_io *tio = clone->end_io_data; 909 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; 910 911 if (mapped && rq_end_io) 912 r = rq_end_io(tio->ti, clone, error, &tio->info); 913 914 if (r <= 0) 915 /* The target wants to complete the I/O */ 916 dm_end_request(clone, r); 917 else if (r == DM_ENDIO_INCOMPLETE) 918 /* The target will handle the I/O */ 919 return; 920 else if (r == DM_ENDIO_REQUEUE) 921 /* The target wants to requeue the I/O */ 922 dm_requeue_unmapped_request(clone); 923 else { 924 DMWARN("unimplemented target endio return value: %d", r); 925 BUG(); 926 } 927 } 928 929 /* 930 * Request completion handler for request-based dm 931 */ 932 static void dm_softirq_done(struct request *rq) 933 { 934 bool mapped = true; 935 struct request *clone = rq->completion_data; 936 struct dm_rq_target_io *tio = clone->end_io_data; 937 938 if (rq->cmd_flags & REQ_FAILED) 939 mapped = false; 940 941 dm_done(clone, tio->error, mapped); 942 } 943 944 /* 945 * Complete the clone and the original request with the error status 946 * through softirq context. 947 */ 948 static void dm_complete_request(struct request *clone, int error) 949 { 950 struct dm_rq_target_io *tio = clone->end_io_data; 951 struct request *rq = tio->orig; 952 953 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { 954 /* 955 * Barrier clones share an original request. So can't use 956 * softirq_done with the original. 957 * Pass the clone to dm_done() directly in this special case. 958 * It is safe (even if clone->q->queue_lock is held here) 959 * because there is no I/O dispatching during the completion 960 * of barrier clone. 961 */ 962 dm_done(clone, error, true); 963 return; 964 } 965 966 tio->error = error; 967 rq->completion_data = clone; 968 blk_complete_request(rq); 969 } 970 971 /* 972 * Complete the not-mapped clone and the original request with the error status 973 * through softirq context. 974 * Target's rq_end_io() function isn't called. 975 * This may be used when the target's map_rq() function fails. 976 */ 977 void dm_kill_unmapped_request(struct request *clone, int error) 978 { 979 struct dm_rq_target_io *tio = clone->end_io_data; 980 struct request *rq = tio->orig; 981 982 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { 983 /* 984 * Barrier clones share an original request. 985 * Leave it to dm_end_request(), which handles this special 986 * case. 
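		 * (For non-barrier clones, REQ_FAILED is set below so that
		 * dm_softirq_done() passes mapped == false and the target's
		 * rq_end_io() is skipped.)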
987 */ 988 BUG_ON(error > 0); 989 dm_end_request(clone, error); 990 return; 991 } 992 993 rq->cmd_flags |= REQ_FAILED; 994 dm_complete_request(clone, error); 995 } 996 EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); 997 998 /* 999 * Called with the queue lock held 1000 */ 1001 static void end_clone_request(struct request *clone, int error) 1002 { 1003 /* 1004 * For just cleaning up the information of the queue in which 1005 * the clone was dispatched. 1006 * The clone is *NOT* freed actually here because it is alloced from 1007 * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. 1008 */ 1009 __blk_put_request(clone->q, clone); 1010 1011 /* 1012 * Actual request completion is done in a softirq context which doesn't 1013 * hold the queue lock. Otherwise, deadlock could occur because: 1014 * - another request may be submitted by the upper level driver 1015 * of the stacking during the completion 1016 * - the submission which requires queue lock may be done 1017 * against this queue 1018 */ 1019 dm_complete_request(clone, error); 1020 } 1021 1022 static sector_t max_io_len(struct mapped_device *md, 1023 sector_t sector, struct dm_target *ti) 1024 { 1025 sector_t offset = sector - ti->begin; 1026 sector_t len = ti->len - offset; 1027 1028 /* 1029 * Does the target need to split even further ? 1030 */ 1031 if (ti->split_io) { 1032 sector_t boundary; 1033 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) 1034 - offset; 1035 if (len > boundary) 1036 len = boundary; 1037 } 1038 1039 return len; 1040 } 1041 1042 static void __map_bio(struct dm_target *ti, struct bio *clone, 1043 struct dm_target_io *tio) 1044 { 1045 int r; 1046 sector_t sector; 1047 struct mapped_device *md; 1048 1049 clone->bi_end_io = clone_endio; 1050 clone->bi_private = tio; 1051 1052 /* 1053 * Map the clone. If r == 0 we don't need to do 1054 * anything, the target has assumed ownership of 1055 * this io. 1056 */ 1057 atomic_inc(&tio->io->io_count); 1058 sector = clone->bi_sector; 1059 r = ti->type->map(ti, clone, &tio->info); 1060 if (r == DM_MAPIO_REMAPPED) { 1061 /* the bio has been remapped so dispatch it */ 1062 1063 trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, 1064 tio->io->bio->bi_bdev->bd_dev, sector); 1065 1066 generic_make_request(clone); 1067 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1068 /* error the io and bail out, or requeue it if needed */ 1069 md = tio->io->md; 1070 dec_pending(tio->io, r); 1071 /* 1072 * Store bio_set for cleanup. 1073 */ 1074 clone->bi_private = md->bs; 1075 bio_put(clone); 1076 free_tio(md, tio); 1077 } else if (r) { 1078 DMWARN("unimplemented target map return value: %d", r); 1079 BUG(); 1080 } 1081 } 1082 1083 struct clone_info { 1084 struct mapped_device *md; 1085 struct dm_table *map; 1086 struct bio *bio; 1087 struct dm_io *io; 1088 sector_t sector; 1089 sector_t sector_count; 1090 unsigned short idx; 1091 }; 1092 1093 static void dm_bio_destructor(struct bio *bio) 1094 { 1095 struct bio_set *bs = bio->bi_private; 1096 1097 bio_free(bio, bs); 1098 } 1099 1100 /* 1101 * Creates a little bio that is just does part of a bvec. 
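 * The clone carries a single copied bio_vec and is marked BIO_CLONED;
 * REQ_HARDBARRIER is cleared on the clone since barrier ordering is
 * driven by the dm core rather than by individual clones.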
1102 */ 1103 static struct bio *split_bvec(struct bio *bio, sector_t sector, 1104 unsigned short idx, unsigned int offset, 1105 unsigned int len, struct bio_set *bs) 1106 { 1107 struct bio *clone; 1108 struct bio_vec *bv = bio->bi_io_vec + idx; 1109 1110 clone = bio_alloc_bioset(GFP_NOIO, 1, bs); 1111 clone->bi_destructor = dm_bio_destructor; 1112 *clone->bi_io_vec = *bv; 1113 1114 clone->bi_sector = sector; 1115 clone->bi_bdev = bio->bi_bdev; 1116 clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER; 1117 clone->bi_vcnt = 1; 1118 clone->bi_size = to_bytes(len); 1119 clone->bi_io_vec->bv_offset = offset; 1120 clone->bi_io_vec->bv_len = clone->bi_size; 1121 clone->bi_flags |= 1 << BIO_CLONED; 1122 1123 if (bio_integrity(bio)) { 1124 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1125 bio_integrity_trim(clone, 1126 bio_sector_offset(bio, idx, offset), len); 1127 } 1128 1129 return clone; 1130 } 1131 1132 /* 1133 * Creates a bio that consists of range of complete bvecs. 1134 */ 1135 static struct bio *clone_bio(struct bio *bio, sector_t sector, 1136 unsigned short idx, unsigned short bv_count, 1137 unsigned int len, struct bio_set *bs) 1138 { 1139 struct bio *clone; 1140 1141 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); 1142 __bio_clone(clone, bio); 1143 clone->bi_rw &= ~REQ_HARDBARRIER; 1144 clone->bi_destructor = dm_bio_destructor; 1145 clone->bi_sector = sector; 1146 clone->bi_idx = idx; 1147 clone->bi_vcnt = idx + bv_count; 1148 clone->bi_size = to_bytes(len); 1149 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 1150 1151 if (bio_integrity(bio)) { 1152 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1153 1154 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1155 bio_integrity_trim(clone, 1156 bio_sector_offset(bio, idx, 0), len); 1157 } 1158 1159 return clone; 1160 } 1161 1162 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1163 struct dm_target *ti) 1164 { 1165 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); 1166 1167 tio->io = ci->io; 1168 tio->ti = ti; 1169 memset(&tio->info, 0, sizeof(tio->info)); 1170 1171 return tio; 1172 } 1173 1174 static void __flush_target(struct clone_info *ci, struct dm_target *ti, 1175 unsigned flush_nr) 1176 { 1177 struct dm_target_io *tio = alloc_tio(ci, ti); 1178 struct bio *clone; 1179 1180 tio->info.flush_request = flush_nr; 1181 1182 clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); 1183 __bio_clone(clone, ci->bio); 1184 clone->bi_destructor = dm_bio_destructor; 1185 1186 __map_bio(ti, clone, tio); 1187 } 1188 1189 static int __clone_and_map_empty_barrier(struct clone_info *ci) 1190 { 1191 unsigned target_nr = 0, flush_nr; 1192 struct dm_target *ti; 1193 1194 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1195 for (flush_nr = 0; flush_nr < ti->num_flush_requests; 1196 flush_nr++) 1197 __flush_target(ci, ti, flush_nr); 1198 1199 ci->sector_count = 0; 1200 1201 return 0; 1202 } 1203 1204 static int __clone_and_map(struct clone_info *ci) 1205 { 1206 struct bio *clone, *bio = ci->bio; 1207 struct dm_target *ti; 1208 sector_t len = 0, max; 1209 struct dm_target_io *tio; 1210 1211 if (unlikely(bio_empty_barrier(bio))) 1212 return __clone_and_map_empty_barrier(ci); 1213 1214 ti = dm_table_find_target(ci->map, ci->sector); 1215 if (!dm_target_is_valid(ti)) 1216 return -EIO; 1217 1218 max = max_io_len(ci->md, ci->sector, ti); 1219 1220 /* 1221 * Allocate a target io object. 
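	 * One tio is needed per clone submitted to a target; the
	 * split-bvec case below allocates further tios as it crosses
	 * target boundaries.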
1222 */ 1223 tio = alloc_tio(ci, ti); 1224 1225 if (ci->sector_count <= max) { 1226 /* 1227 * Optimise for the simple case where we can do all of 1228 * the remaining io with a single clone. 1229 */ 1230 clone = clone_bio(bio, ci->sector, ci->idx, 1231 bio->bi_vcnt - ci->idx, ci->sector_count, 1232 ci->md->bs); 1233 __map_bio(ti, clone, tio); 1234 ci->sector_count = 0; 1235 1236 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { 1237 /* 1238 * There are some bvecs that don't span targets. 1239 * Do as many of these as possible. 1240 */ 1241 int i; 1242 sector_t remaining = max; 1243 sector_t bv_len; 1244 1245 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { 1246 bv_len = to_sector(bio->bi_io_vec[i].bv_len); 1247 1248 if (bv_len > remaining) 1249 break; 1250 1251 remaining -= bv_len; 1252 len += bv_len; 1253 } 1254 1255 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, 1256 ci->md->bs); 1257 __map_bio(ti, clone, tio); 1258 1259 ci->sector += len; 1260 ci->sector_count -= len; 1261 ci->idx = i; 1262 1263 } else { 1264 /* 1265 * Handle a bvec that must be split between two or more targets. 1266 */ 1267 struct bio_vec *bv = bio->bi_io_vec + ci->idx; 1268 sector_t remaining = to_sector(bv->bv_len); 1269 unsigned int offset = 0; 1270 1271 do { 1272 if (offset) { 1273 ti = dm_table_find_target(ci->map, ci->sector); 1274 if (!dm_target_is_valid(ti)) 1275 return -EIO; 1276 1277 max = max_io_len(ci->md, ci->sector, ti); 1278 1279 tio = alloc_tio(ci, ti); 1280 } 1281 1282 len = min(remaining, max); 1283 1284 clone = split_bvec(bio, ci->sector, ci->idx, 1285 bv->bv_offset + offset, len, 1286 ci->md->bs); 1287 1288 __map_bio(ti, clone, tio); 1289 1290 ci->sector += len; 1291 ci->sector_count -= len; 1292 offset += to_bytes(len); 1293 } while (remaining -= len); 1294 1295 ci->idx++; 1296 } 1297 1298 return 0; 1299 } 1300 1301 /* 1302 * Split the bio into several clones and submit it to targets. 
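 * The dm_io is created with an extra reference; the final dec_pending()
 * drops it and completes the original bio once every clone has finished.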
1303 */ 1304 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) 1305 { 1306 struct clone_info ci; 1307 int error = 0; 1308 1309 ci.map = dm_get_live_table(md); 1310 if (unlikely(!ci.map)) { 1311 if (!(bio->bi_rw & REQ_HARDBARRIER)) 1312 bio_io_error(bio); 1313 else 1314 if (!md->barrier_error) 1315 md->barrier_error = -EIO; 1316 return; 1317 } 1318 1319 ci.md = md; 1320 ci.bio = bio; 1321 ci.io = alloc_io(md); 1322 ci.io->error = 0; 1323 atomic_set(&ci.io->io_count, 1); 1324 ci.io->bio = bio; 1325 ci.io->md = md; 1326 spin_lock_init(&ci.io->endio_lock); 1327 ci.sector = bio->bi_sector; 1328 ci.sector_count = bio_sectors(bio); 1329 if (unlikely(bio_empty_barrier(bio))) 1330 ci.sector_count = 1; 1331 ci.idx = bio->bi_idx; 1332 1333 start_io_acct(ci.io); 1334 while (ci.sector_count && !error) 1335 error = __clone_and_map(&ci); 1336 1337 /* drop the extra reference count */ 1338 dec_pending(ci.io, error); 1339 dm_table_put(ci.map); 1340 } 1341 /*----------------------------------------------------------------- 1342 * CRUD END 1343 *---------------------------------------------------------------*/ 1344 1345 static int dm_merge_bvec(struct request_queue *q, 1346 struct bvec_merge_data *bvm, 1347 struct bio_vec *biovec) 1348 { 1349 struct mapped_device *md = q->queuedata; 1350 struct dm_table *map = dm_get_live_table(md); 1351 struct dm_target *ti; 1352 sector_t max_sectors; 1353 int max_size = 0; 1354 1355 if (unlikely(!map)) 1356 goto out; 1357 1358 ti = dm_table_find_target(map, bvm->bi_sector); 1359 if (!dm_target_is_valid(ti)) 1360 goto out_table; 1361 1362 /* 1363 * Find maximum amount of I/O that won't need splitting 1364 */ 1365 max_sectors = min(max_io_len(md, bvm->bi_sector, ti), 1366 (sector_t) BIO_MAX_SECTORS); 1367 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1368 if (max_size < 0) 1369 max_size = 0; 1370 1371 /* 1372 * merge_bvec_fn() returns number of bytes 1373 * it can accept at this offset 1374 * max is precomputed maximal io size 1375 */ 1376 if (max_size && ti->type->merge) 1377 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1378 /* 1379 * If the target doesn't support merge method and some of the devices 1380 * provided their merge_bvec method (we know this by looking at 1381 * queue_max_hw_sectors), then we can't allow bios with multiple vector 1382 * entries. So always set max_size to 0, and the code below allows 1383 * just one page. 1384 */ 1385 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1386 1387 max_size = 0; 1388 1389 out_table: 1390 dm_table_put(map); 1391 1392 out: 1393 /* 1394 * Always allow an entire first page 1395 */ 1396 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1397 max_size = biovec->bv_len; 1398 1399 return max_size; 1400 } 1401 1402 /* 1403 * The request function that just remaps the bio built up by 1404 * dm_merge_bvec. 1405 */ 1406 static int _dm_request(struct request_queue *q, struct bio *bio) 1407 { 1408 int rw = bio_data_dir(bio); 1409 struct mapped_device *md = q->queuedata; 1410 int cpu; 1411 1412 down_read(&md->io_lock); 1413 1414 cpu = part_stat_lock(); 1415 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); 1416 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); 1417 part_stat_unlock(); 1418 1419 /* 1420 * If we're suspended or the thread is processing barriers 1421 * we have to queue this io for later. 
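	 * (READA bios are failed immediately while I/O is blocked for
	 * suspend rather than deferred, since readahead can safely be
	 * dropped.)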
1422 */ 1423 if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || 1424 unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 1425 up_read(&md->io_lock); 1426 1427 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && 1428 bio_rw(bio) == READA) { 1429 bio_io_error(bio); 1430 return 0; 1431 } 1432 1433 queue_io(md, bio); 1434 1435 return 0; 1436 } 1437 1438 __split_and_process_bio(md, bio); 1439 up_read(&md->io_lock); 1440 return 0; 1441 } 1442 1443 static int dm_make_request(struct request_queue *q, struct bio *bio) 1444 { 1445 struct mapped_device *md = q->queuedata; 1446 1447 return md->saved_make_request_fn(q, bio); /* call __make_request() */ 1448 } 1449 1450 static int dm_request_based(struct mapped_device *md) 1451 { 1452 return blk_queue_stackable(md->queue); 1453 } 1454 1455 static int dm_request(struct request_queue *q, struct bio *bio) 1456 { 1457 struct mapped_device *md = q->queuedata; 1458 1459 if (dm_request_based(md)) 1460 return dm_make_request(q, bio); 1461 1462 return _dm_request(q, bio); 1463 } 1464 1465 static bool dm_rq_is_flush_request(struct request *rq) 1466 { 1467 if (rq->cmd_flags & REQ_FLUSH) 1468 return true; 1469 else 1470 return false; 1471 } 1472 1473 void dm_dispatch_request(struct request *rq) 1474 { 1475 int r; 1476 1477 if (blk_queue_io_stat(rq->q)) 1478 rq->cmd_flags |= REQ_IO_STAT; 1479 1480 rq->start_time = jiffies; 1481 r = blk_insert_cloned_request(rq->q, rq); 1482 if (r) 1483 dm_complete_request(rq, r); 1484 } 1485 EXPORT_SYMBOL_GPL(dm_dispatch_request); 1486 1487 static void dm_rq_bio_destructor(struct bio *bio) 1488 { 1489 struct dm_rq_clone_bio_info *info = bio->bi_private; 1490 struct mapped_device *md = info->tio->md; 1491 1492 free_bio_info(info); 1493 bio_free(bio, md->bs); 1494 } 1495 1496 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1497 void *data) 1498 { 1499 struct dm_rq_target_io *tio = data; 1500 struct mapped_device *md = tio->md; 1501 struct dm_rq_clone_bio_info *info = alloc_bio_info(md); 1502 1503 if (!info) 1504 return -ENOMEM; 1505 1506 info->orig = bio_orig; 1507 info->tio = tio; 1508 bio->bi_end_io = end_clone_bio; 1509 bio->bi_private = info; 1510 bio->bi_destructor = dm_rq_bio_destructor; 1511 1512 return 0; 1513 } 1514 1515 static int setup_clone(struct request *clone, struct request *rq, 1516 struct dm_rq_target_io *tio) 1517 { 1518 int r; 1519 1520 if (dm_rq_is_flush_request(rq)) { 1521 blk_rq_init(NULL, clone); 1522 clone->cmd_type = REQ_TYPE_FS; 1523 clone->cmd_flags |= (REQ_HARDBARRIER | WRITE); 1524 } else { 1525 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1526 dm_rq_bio_constructor, tio); 1527 if (r) 1528 return r; 1529 1530 clone->cmd = rq->cmd; 1531 clone->cmd_len = rq->cmd_len; 1532 clone->sense = rq->sense; 1533 clone->buffer = rq->buffer; 1534 } 1535 1536 clone->end_io = end_clone_request; 1537 clone->end_io_data = tio; 1538 1539 return 0; 1540 } 1541 1542 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1543 gfp_t gfp_mask) 1544 { 1545 struct request *clone; 1546 struct dm_rq_target_io *tio; 1547 1548 tio = alloc_rq_tio(md, gfp_mask); 1549 if (!tio) 1550 return NULL; 1551 1552 tio->md = md; 1553 tio->ti = NULL; 1554 tio->orig = rq; 1555 tio->error = 0; 1556 memset(&tio->info, 0, sizeof(tio->info)); 1557 1558 clone = &tio->clone; 1559 if (setup_clone(clone, rq, tio)) { 1560 /* -ENOMEM */ 1561 free_rq_tio(tio); 1562 return NULL; 1563 } 1564 1565 return clone; 1566 } 1567 1568 /* 1569 * Called with the queue lock held. 
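 * dm_prep_fn() attaches a clone to rq->special using GFP_ATOMIC;
 * if that allocation fails it returns BLKPREP_DEFER so the block
 * layer retries the request later.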
1570 */ 1571 static int dm_prep_fn(struct request_queue *q, struct request *rq) 1572 { 1573 struct mapped_device *md = q->queuedata; 1574 struct request *clone; 1575 1576 if (unlikely(dm_rq_is_flush_request(rq))) 1577 return BLKPREP_OK; 1578 1579 if (unlikely(rq->special)) { 1580 DMWARN("Already has something in rq->special."); 1581 return BLKPREP_KILL; 1582 } 1583 1584 clone = clone_rq(rq, md, GFP_ATOMIC); 1585 if (!clone) 1586 return BLKPREP_DEFER; 1587 1588 rq->special = clone; 1589 rq->cmd_flags |= REQ_DONTPREP; 1590 1591 return BLKPREP_OK; 1592 } 1593 1594 /* 1595 * Returns: 1596 * 0 : the request has been processed (not requeued) 1597 * !0 : the request has been requeued 1598 */ 1599 static int map_request(struct dm_target *ti, struct request *clone, 1600 struct mapped_device *md) 1601 { 1602 int r, requeued = 0; 1603 struct dm_rq_target_io *tio = clone->end_io_data; 1604 1605 /* 1606 * Hold the md reference here for the in-flight I/O. 1607 * We can't rely on the reference count by device opener, 1608 * because the device may be closed during the request completion 1609 * when all bios are completed. 1610 * See the comment in rq_completed() too. 1611 */ 1612 dm_get(md); 1613 1614 tio->ti = ti; 1615 r = ti->type->map_rq(ti, clone, &tio->info); 1616 switch (r) { 1617 case DM_MAPIO_SUBMITTED: 1618 /* The target has taken the I/O to submit by itself later */ 1619 break; 1620 case DM_MAPIO_REMAPPED: 1621 /* The target has remapped the I/O so dispatch it */ 1622 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1623 blk_rq_pos(tio->orig)); 1624 dm_dispatch_request(clone); 1625 break; 1626 case DM_MAPIO_REQUEUE: 1627 /* The target wants to requeue the I/O */ 1628 dm_requeue_unmapped_request(clone); 1629 requeued = 1; 1630 break; 1631 default: 1632 if (r > 0) { 1633 DMWARN("unimplemented target map return value: %d", r); 1634 BUG(); 1635 } 1636 1637 /* The target wants to complete the I/O */ 1638 dm_kill_unmapped_request(clone, r); 1639 break; 1640 } 1641 1642 return requeued; 1643 } 1644 1645 /* 1646 * q->request_fn for request-based dm. 1647 * Called with the queue lock held. 1648 */ 1649 static void dm_request_fn(struct request_queue *q) 1650 { 1651 struct mapped_device *md = q->queuedata; 1652 struct dm_table *map = dm_get_live_table(md); 1653 struct dm_target *ti; 1654 struct request *rq, *clone; 1655 1656 /* 1657 * For suspend, check blk_queue_stopped() and increment 1658 * ->pending within a single queue_lock not to increment the 1659 * number of in-flight I/Os after the queue is stopped in 1660 * dm_suspend(). 
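	 * The queue lock is dropped around map_request() while the clone
	 * is dispatched and re-acquired before peeking at the next request.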
1661 */ 1662 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { 1663 rq = blk_peek_request(q); 1664 if (!rq) 1665 goto plug_and_out; 1666 1667 if (unlikely(dm_rq_is_flush_request(rq))) { 1668 BUG_ON(md->flush_request); 1669 md->flush_request = rq; 1670 blk_start_request(rq); 1671 queue_work(md->wq, &md->barrier_work); 1672 goto out; 1673 } 1674 1675 ti = dm_table_find_target(map, blk_rq_pos(rq)); 1676 if (ti->type->busy && ti->type->busy(ti)) 1677 goto plug_and_out; 1678 1679 blk_start_request(rq); 1680 clone = rq->special; 1681 atomic_inc(&md->pending[rq_data_dir(clone)]); 1682 1683 spin_unlock(q->queue_lock); 1684 if (map_request(ti, clone, md)) 1685 goto requeued; 1686 1687 spin_lock_irq(q->queue_lock); 1688 } 1689 1690 goto out; 1691 1692 requeued: 1693 spin_lock_irq(q->queue_lock); 1694 1695 plug_and_out: 1696 if (!elv_queue_empty(q)) 1697 /* Some requests still remain, retry later */ 1698 blk_plug_device(q); 1699 1700 out: 1701 dm_table_put(map); 1702 1703 return; 1704 } 1705 1706 int dm_underlying_device_busy(struct request_queue *q) 1707 { 1708 return blk_lld_busy(q); 1709 } 1710 EXPORT_SYMBOL_GPL(dm_underlying_device_busy); 1711 1712 static int dm_lld_busy(struct request_queue *q) 1713 { 1714 int r; 1715 struct mapped_device *md = q->queuedata; 1716 struct dm_table *map = dm_get_live_table(md); 1717 1718 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1719 r = 1; 1720 else 1721 r = dm_table_any_busy_target(map); 1722 1723 dm_table_put(map); 1724 1725 return r; 1726 } 1727 1728 static void dm_unplug_all(struct request_queue *q) 1729 { 1730 struct mapped_device *md = q->queuedata; 1731 struct dm_table *map = dm_get_live_table(md); 1732 1733 if (map) { 1734 if (dm_request_based(md)) 1735 generic_unplug_device(q); 1736 1737 dm_table_unplug_all(map); 1738 dm_table_put(map); 1739 } 1740 } 1741 1742 static int dm_any_congested(void *congested_data, int bdi_bits) 1743 { 1744 int r = bdi_bits; 1745 struct mapped_device *md = congested_data; 1746 struct dm_table *map; 1747 1748 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1749 map = dm_get_live_table(md); 1750 if (map) { 1751 /* 1752 * Request-based dm cares about only own queue for 1753 * the query about congestion status of request_queue 1754 */ 1755 if (dm_request_based(md)) 1756 r = md->queue->backing_dev_info.state & 1757 bdi_bits; 1758 else 1759 r = dm_table_any_congested(map, bdi_bits); 1760 1761 dm_table_put(map); 1762 } 1763 } 1764 1765 return r; 1766 } 1767 1768 /*----------------------------------------------------------------- 1769 * An IDR is used to keep track of allocated minor numbers. 1770 *---------------------------------------------------------------*/ 1771 static DEFINE_IDR(_minor_idr); 1772 1773 static void free_minor(int minor) 1774 { 1775 spin_lock(&_minor_lock); 1776 idr_remove(&_minor_idr, minor); 1777 spin_unlock(&_minor_lock); 1778 } 1779 1780 /* 1781 * See if the device with a specific minor # is free. 
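 * The minor is reserved in _minor_idr (as MINOR_ALLOCED) under
 * _minor_lock; -EBUSY is returned if it is already in use.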
1782 */ 1783 static int specific_minor(int minor) 1784 { 1785 int r, m; 1786 1787 if (minor >= (1 << MINORBITS)) 1788 return -EINVAL; 1789 1790 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1791 if (!r) 1792 return -ENOMEM; 1793 1794 spin_lock(&_minor_lock); 1795 1796 if (idr_find(&_minor_idr, minor)) { 1797 r = -EBUSY; 1798 goto out; 1799 } 1800 1801 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); 1802 if (r) 1803 goto out; 1804 1805 if (m != minor) { 1806 idr_remove(&_minor_idr, m); 1807 r = -EBUSY; 1808 goto out; 1809 } 1810 1811 out: 1812 spin_unlock(&_minor_lock); 1813 return r; 1814 } 1815 1816 static int next_free_minor(int *minor) 1817 { 1818 int r, m; 1819 1820 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1821 if (!r) 1822 return -ENOMEM; 1823 1824 spin_lock(&_minor_lock); 1825 1826 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); 1827 if (r) 1828 goto out; 1829 1830 if (m >= (1 << MINORBITS)) { 1831 idr_remove(&_minor_idr, m); 1832 r = -ENOSPC; 1833 goto out; 1834 } 1835 1836 *minor = m; 1837 1838 out: 1839 spin_unlock(&_minor_lock); 1840 return r; 1841 } 1842 1843 static const struct block_device_operations dm_blk_dops; 1844 1845 static void dm_wq_work(struct work_struct *work); 1846 static void dm_rq_barrier_work(struct work_struct *work); 1847 1848 /* 1849 * Allocate and initialise a blank device with a given minor. 1850 */ 1851 static struct mapped_device *alloc_dev(int minor) 1852 { 1853 int r; 1854 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 1855 void *old_md; 1856 1857 if (!md) { 1858 DMWARN("unable to allocate device, out of memory."); 1859 return NULL; 1860 } 1861 1862 if (!try_module_get(THIS_MODULE)) 1863 goto bad_module_get; 1864 1865 /* get a minor number for the dev */ 1866 if (minor == DM_ANY_MINOR) 1867 r = next_free_minor(&minor); 1868 else 1869 r = specific_minor(minor); 1870 if (r < 0) 1871 goto bad_minor; 1872 1873 init_rwsem(&md->io_lock); 1874 mutex_init(&md->suspend_lock); 1875 spin_lock_init(&md->deferred_lock); 1876 spin_lock_init(&md->barrier_error_lock); 1877 rwlock_init(&md->map_lock); 1878 atomic_set(&md->holders, 1); 1879 atomic_set(&md->open_count, 0); 1880 atomic_set(&md->event_nr, 0); 1881 atomic_set(&md->uevent_seq, 0); 1882 INIT_LIST_HEAD(&md->uevent_list); 1883 spin_lock_init(&md->uevent_lock); 1884 1885 md->queue = blk_init_queue(dm_request_fn, NULL); 1886 if (!md->queue) 1887 goto bad_queue; 1888 1889 /* 1890 * Request-based dm devices cannot be stacked on top of bio-based dm 1891 * devices. The type of this dm device has not been decided yet, 1892 * although we initialized the queue using blk_init_queue(). 1893 * The type is decided at the first table loading time. 1894 * To prevent problematic device stacking, clear the queue flag 1895 * for request stacking support until then. 1896 * 1897 * This queue is new, so no concurrency on the queue_flags. 
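	 * (blk_queue_make_request() below installs dm_request(), which
	 * routes bios through the saved __make_request path for
	 * request-based devices and through __split_and_process_bio for
	 * bio-based ones.)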
1898 */ 1899 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 1900 md->saved_make_request_fn = md->queue->make_request_fn; 1901 md->queue->queuedata = md; 1902 md->queue->backing_dev_info.congested_fn = dm_any_congested; 1903 md->queue->backing_dev_info.congested_data = md; 1904 blk_queue_make_request(md->queue, dm_request); 1905 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1906 md->queue->unplug_fn = dm_unplug_all; 1907 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1908 blk_queue_softirq_done(md->queue, dm_softirq_done); 1909 blk_queue_prep_rq(md->queue, dm_prep_fn); 1910 blk_queue_lld_busy(md->queue, dm_lld_busy); 1911 blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH); 1912 1913 md->disk = alloc_disk(1); 1914 if (!md->disk) 1915 goto bad_disk; 1916 1917 atomic_set(&md->pending[0], 0); 1918 atomic_set(&md->pending[1], 0); 1919 init_waitqueue_head(&md->wait); 1920 INIT_WORK(&md->work, dm_wq_work); 1921 INIT_WORK(&md->barrier_work, dm_rq_barrier_work); 1922 init_waitqueue_head(&md->eventq); 1923 1924 md->disk->major = _major; 1925 md->disk->first_minor = minor; 1926 md->disk->fops = &dm_blk_dops; 1927 md->disk->queue = md->queue; 1928 md->disk->private_data = md; 1929 sprintf(md->disk->disk_name, "dm-%d", minor); 1930 add_disk(md->disk); 1931 format_dev_t(md->name, MKDEV(_major, minor)); 1932 1933 md->wq = create_singlethread_workqueue("kdmflush"); 1934 if (!md->wq) 1935 goto bad_thread; 1936 1937 md->bdev = bdget_disk(md->disk, 0); 1938 if (!md->bdev) 1939 goto bad_bdev; 1940 1941 /* Populate the mapping, nobody knows we exist yet */ 1942 spin_lock(&_minor_lock); 1943 old_md = idr_replace(&_minor_idr, md, minor); 1944 spin_unlock(&_minor_lock); 1945 1946 BUG_ON(old_md != MINOR_ALLOCED); 1947 1948 return md; 1949 1950 bad_bdev: 1951 destroy_workqueue(md->wq); 1952 bad_thread: 1953 del_gendisk(md->disk); 1954 put_disk(md->disk); 1955 bad_disk: 1956 blk_cleanup_queue(md->queue); 1957 bad_queue: 1958 free_minor(minor); 1959 bad_minor: 1960 module_put(THIS_MODULE); 1961 bad_module_get: 1962 kfree(md); 1963 return NULL; 1964 } 1965 1966 static void unlock_fs(struct mapped_device *md); 1967 1968 static void free_dev(struct mapped_device *md) 1969 { 1970 int minor = MINOR(disk_devt(md->disk)); 1971 1972 unlock_fs(md); 1973 bdput(md->bdev); 1974 destroy_workqueue(md->wq); 1975 if (md->tio_pool) 1976 mempool_destroy(md->tio_pool); 1977 if (md->io_pool) 1978 mempool_destroy(md->io_pool); 1979 if (md->bs) 1980 bioset_free(md->bs); 1981 blk_integrity_unregister(md->disk); 1982 del_gendisk(md->disk); 1983 free_minor(minor); 1984 1985 spin_lock(&_minor_lock); 1986 md->disk->private_data = NULL; 1987 spin_unlock(&_minor_lock); 1988 1989 put_disk(md->disk); 1990 blk_cleanup_queue(md->queue); 1991 module_put(THIS_MODULE); 1992 kfree(md); 1993 } 1994 1995 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 1996 { 1997 struct dm_md_mempools *p; 1998 1999 if (md->io_pool && md->tio_pool && md->bs) 2000 /* the md already has necessary mempools */ 2001 goto out; 2002 2003 p = dm_table_get_md_mempools(t); 2004 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); 2005 2006 md->io_pool = p->io_pool; 2007 p->io_pool = NULL; 2008 md->tio_pool = p->tio_pool; 2009 p->tio_pool = NULL; 2010 md->bs = p->bs; 2011 p->bs = NULL; 2012 2013 out: 2014 /* mempool bind completed, now no need any mempools in the table */ 2015 dm_table_free_md_mempools(t); 2016 } 2017 2018 /* 2019 * Bind a table to the device. 
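 * (event_callback() and __set_size() below are helpers used by __bind(),
 * which performs the actual binding under md->map_lock.)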
2020 */ 2021 static void event_callback(void *context) 2022 { 2023 unsigned long flags; 2024 LIST_HEAD(uevents); 2025 struct mapped_device *md = (struct mapped_device *) context; 2026 2027 spin_lock_irqsave(&md->uevent_lock, flags); 2028 list_splice_init(&md->uevent_list, &uevents); 2029 spin_unlock_irqrestore(&md->uevent_lock, flags); 2030 2031 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 2032 2033 atomic_inc(&md->event_nr); 2034 wake_up(&md->eventq); 2035 } 2036 2037 static void __set_size(struct mapped_device *md, sector_t size) 2038 { 2039 set_capacity(md->disk, size); 2040 2041 mutex_lock(&md->bdev->bd_inode->i_mutex); 2042 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 2043 mutex_unlock(&md->bdev->bd_inode->i_mutex); 2044 } 2045 2046 /* 2047 * Returns old map, which caller must destroy. 2048 */ 2049 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2050 struct queue_limits *limits) 2051 { 2052 struct dm_table *old_map; 2053 struct request_queue *q = md->queue; 2054 sector_t size; 2055 unsigned long flags; 2056 2057 size = dm_table_get_size(t); 2058 2059 /* 2060 * Wipe any geometry if the size of the table changed. 2061 */ 2062 if (size != get_capacity(md->disk)) 2063 memset(&md->geometry, 0, sizeof(md->geometry)); 2064 2065 __set_size(md, size); 2066 2067 dm_table_event_callback(t, event_callback, md); 2068 2069 /* 2070 * The queue hasn't been stopped yet, if the old table type wasn't 2071 * for request-based during suspension. So stop it to prevent 2072 * I/O mapping before resume. 2073 * This must be done before setting the queue restrictions, 2074 * because request-based dm may be run just after the setting. 2075 */ 2076 if (dm_table_request_based(t) && !blk_queue_stopped(q)) 2077 stop_queue(q); 2078 2079 __bind_mempools(md, t); 2080 2081 write_lock_irqsave(&md->map_lock, flags); 2082 old_map = md->map; 2083 md->map = t; 2084 dm_table_set_restrictions(t, q, limits); 2085 write_unlock_irqrestore(&md->map_lock, flags); 2086 2087 return old_map; 2088 } 2089 2090 /* 2091 * Returns unbound table for the caller to free. 2092 */ 2093 static struct dm_table *__unbind(struct mapped_device *md) 2094 { 2095 struct dm_table *map = md->map; 2096 unsigned long flags; 2097 2098 if (!map) 2099 return NULL; 2100 2101 dm_table_event_callback(map, NULL, NULL); 2102 write_lock_irqsave(&md->map_lock, flags); 2103 md->map = NULL; 2104 write_unlock_irqrestore(&md->map_lock, flags); 2105 2106 return map; 2107 } 2108 2109 /* 2110 * Constructor for a new device. 
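 * Allocates the mapped_device via alloc_dev() and registers it with
 * sysfs; no table is bound at this point.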
2111 */ 2112 int dm_create(int minor, struct mapped_device **result) 2113 { 2114 struct mapped_device *md; 2115 2116 md = alloc_dev(minor); 2117 if (!md) 2118 return -ENXIO; 2119 2120 dm_sysfs_init(md); 2121 2122 *result = md; 2123 return 0; 2124 } 2125 2126 static struct mapped_device *dm_find_md(dev_t dev) 2127 { 2128 struct mapped_device *md; 2129 unsigned minor = MINOR(dev); 2130 2131 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2132 return NULL; 2133 2134 spin_lock(&_minor_lock); 2135 2136 md = idr_find(&_minor_idr, minor); 2137 if (md && (md == MINOR_ALLOCED || 2138 (MINOR(disk_devt(dm_disk(md))) != minor) || 2139 dm_deleting_md(md) || 2140 test_bit(DMF_FREEING, &md->flags))) { 2141 md = NULL; 2142 goto out; 2143 } 2144 2145 out: 2146 spin_unlock(&_minor_lock); 2147 2148 return md; 2149 } 2150 2151 struct mapped_device *dm_get_md(dev_t dev) 2152 { 2153 struct mapped_device *md = dm_find_md(dev); 2154 2155 if (md) 2156 dm_get(md); 2157 2158 return md; 2159 } 2160 2161 void *dm_get_mdptr(struct mapped_device *md) 2162 { 2163 return md->interface_ptr; 2164 } 2165 2166 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2167 { 2168 md->interface_ptr = ptr; 2169 } 2170 2171 void dm_get(struct mapped_device *md) 2172 { 2173 atomic_inc(&md->holders); 2174 } 2175 2176 const char *dm_device_name(struct mapped_device *md) 2177 { 2178 return md->name; 2179 } 2180 EXPORT_SYMBOL_GPL(dm_device_name); 2181 2182 void dm_put(struct mapped_device *md) 2183 { 2184 struct dm_table *map; 2185 2186 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2187 2188 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { 2189 map = dm_get_live_table(md); 2190 idr_replace(&_minor_idr, MINOR_ALLOCED, 2191 MINOR(disk_devt(dm_disk(md)))); 2192 set_bit(DMF_FREEING, &md->flags); 2193 spin_unlock(&_minor_lock); 2194 if (!dm_suspended_md(md)) { 2195 dm_table_presuspend_targets(map); 2196 dm_table_postsuspend_targets(map); 2197 } 2198 dm_sysfs_exit(md); 2199 dm_table_put(map); 2200 dm_table_destroy(__unbind(md)); 2201 free_dev(md); 2202 } 2203 } 2204 EXPORT_SYMBOL_GPL(dm_put); 2205 2206 static int dm_wait_for_completion(struct mapped_device *md, int interruptible) 2207 { 2208 int r = 0; 2209 DECLARE_WAITQUEUE(wait, current); 2210 2211 dm_unplug_all(md->queue); 2212 2213 add_wait_queue(&md->wait, &wait); 2214 2215 while (1) { 2216 set_current_state(interruptible); 2217 2218 smp_mb(); 2219 if (!md_in_flight(md)) 2220 break; 2221 2222 if (interruptible == TASK_INTERRUPTIBLE && 2223 signal_pending(current)) { 2224 r = -EINTR; 2225 break; 2226 } 2227 2228 io_schedule(); 2229 } 2230 set_current_state(TASK_RUNNING); 2231 2232 remove_wait_queue(&md->wait, &wait); 2233 2234 return r; 2235 } 2236 2237 static void dm_flush(struct mapped_device *md) 2238 { 2239 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2240 2241 bio_init(&md->barrier_bio); 2242 md->barrier_bio.bi_bdev = md->bdev; 2243 md->barrier_bio.bi_rw = WRITE_BARRIER; 2244 __split_and_process_bio(md, &md->barrier_bio); 2245 2246 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2247 } 2248 2249 static void process_barrier(struct mapped_device *md, struct bio *bio) 2250 { 2251 md->barrier_error = 0; 2252 2253 dm_flush(md); 2254 2255 if (!bio_empty_barrier(bio)) { 2256 __split_and_process_bio(md, bio); 2257 dm_flush(md); 2258 } 2259 2260 if (md->barrier_error != DM_ENDIO_REQUEUE) 2261 bio_endio(bio, md->barrier_error); 2262 else { 2263 spin_lock_irq(&md->deferred_lock); 2264 bio_list_add_head(&md->deferred, bio); 2265 spin_unlock_irq(&md->deferred_lock); 2266 } 2267 } 
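
/*
 * Barrier handling for bio-based dm, in short: dm_wq_work() (below) pops
 * deferred bios and hands REQ_HARDBARRIER bios to process_barrier()
 * above, which waits for in-flight I/O, sends a zero-length barrier to
 * every target (dm_flush()), processes the data portion and flushes
 * again.  A DM_ENDIO_REQUEUE result puts the bio back on md->deferred
 * instead of completing it.
 */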

/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;

	down_write(&md->io_lock);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c) {
			clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
			break;
		}

		up_write(&md->io_lock);

		if (dm_request_based(md))
			generic_make_request(c);
		else {
			if (c->bi_rw & REQ_HARDBARRIER)
				process_barrier(md, c);
			else
				__split_and_process_bio(md, c);
		}

		down_write(&md->io_lock);
	}

	up_write(&md->io_lock);
}

static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_clear_bit();
	queue_work(md->wq, &md->work);
}

static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr)
{
	struct dm_rq_target_io *tio = clone->end_io_data;

	tio->info.flush_request = flush_nr;
}

/* Issue barrier requests to targets and wait for their completion. */
static int dm_rq_barrier(struct mapped_device *md)
{
	int i, j;
	struct dm_table *map = dm_get_live_table(md);
	unsigned num_targets = dm_table_get_num_targets(map);
	struct dm_target *ti;
	struct request *clone;

	md->barrier_error = 0;

	for (i = 0; i < num_targets; i++) {
		ti = dm_table_get_target(map, i);
		for (j = 0; j < ti->num_flush_requests; j++) {
			clone = clone_rq(md->flush_request, md, GFP_NOIO);
			dm_rq_set_flush_nr(clone, j);
			atomic_inc(&md->pending[rq_data_dir(clone)]);
			map_request(ti, clone, md);
		}
	}

	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
	dm_table_put(map);

	return md->barrier_error;
}

static void dm_rq_barrier_work(struct work_struct *work)
{
	int error;
	struct mapped_device *md = container_of(work, struct mapped_device,
						barrier_work);
	struct request_queue *q = md->queue;
	struct request *rq;
	unsigned long flags;

	/*
	 * Hold the md reference here and release it only at the end, so
	 * that the md can't be deleted by a device opener while the
	 * barrier request is being processed.
	 */
	dm_get(md);

	error = dm_rq_barrier(md);

	rq = md->flush_request;
	md->flush_request = NULL;

	if (error == DM_ENDIO_REQUEUE) {
		spin_lock_irqsave(q->queue_lock, flags);
		blk_requeue_request(q, rq);
		spin_unlock_irqrestore(q->queue_lock, flags);
	} else
		blk_end_request_all(rq, error);

	blk_run_queue(q);

	dm_put(md);
}
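
/*
 * Illustrative sketch only (compiled out): roughly what the bio submission
 * path does when DMF_QUEUE_IO_TO_THREAD is set - park the bio on
 * md->deferred and kick the per-device workqueue so that dm_wq_work()
 * above picks it up once DMF_BLOCK_IO_FOR_SUSPEND is clear.  The real
 * producer lives earlier in this file; the helper name is hypothetical.
 */
#if 0
static void dm_example_defer_bio(struct mapped_device *md, struct bio *bio)
{
	spin_lock_irq(&md->deferred_lock);
	bio_list_add(&md->deferred, bio);
	spin_unlock_irq(&md->deferred_lock);

	queue_work(md->wq, &md->work);
}
#endif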

/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct dm_table *map = ERR_PTR(-EINVAL);
	struct queue_limits limits;
	int r;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended_md(md))
		goto out;

	r = dm_calculate_queue_limits(table, &limits);
	if (r) {
		map = ERR_PTR(r);
		goto out;
	}

	/* cannot change the device type, once a table is bound */
	if (md->map &&
	    (dm_table_get_type(md->map) != dm_table_get_type(table))) {
		DMWARN("can't change the device type after a table is bound");
		goto out;
	}

	map = __bind(md, table, &limits);

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}
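
/*
 * Illustrative sketch only (compiled out): how a caller of dm_swap_table()
 * above is expected to handle its return value - the device must already
 * be suspended, errors come back as ERR_PTR(), and any previous live table
 * is handed back for the caller to destroy.  The function name is
 * hypothetical; a real caller would also dispose of 'new_map' on failure.
 */
#if 0
static int dm_example_replace_table(struct mapped_device *md,
				    struct dm_table *new_map)
{
	struct dm_table *old_map;

	old_map = dm_swap_table(md, new_map);
	if (IS_ERR(old_map))
		return PTR_ERR(old_map);

	if (old_map)
		dm_table_destroy(old_map);

	return 0;
}
#endif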

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	mutex_lock(&md->suspend_lock);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	map = dm_get_live_table(md);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_request, we set
	 * DMF_QUEUE_IO_TO_THREAD.
	 *
	 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
	 * and call flush_workqueue(md->wq). flush_workqueue will wait until
	 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
	 * further calls to __split_and_process_bio from dm_wq_work.
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
	up_write(&md->io_lock);

	/*
	 * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
	 * can be kicked until md->queue is stopped.  So stop md->queue before
	 * flushing md->wq.
	 */
	if (dm_request_based(md))
		stop_queue(md->queue);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

	down_write(&md->io_lock);
	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	up_write(&md->io_lock);

	/* were we interrupted ? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			start_queue(md->queue);

		unlock_fs(md);
		goto out; /* pushback list is already flushed, so skip flush */
	}

	/*
	 * If dm_wait_for_completion returned 0, the device is completely
	 * quiescent now. There is no request-processing activity. All new
	 * requests are being added to md->deferred list.
	 */

	set_bit(DMF_SUSPENDED, &md->flags);

	dm_table_postsuspend_targets(map);

out:
	dm_table_put(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}

int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

	mutex_lock(&md->suspend_lock);
	if (!dm_suspended_md(md))
		goto out;

	map = dm_get_live_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		start_queue(md->queue);

	unlock_fs(md);

	clear_bit(DMF_SUSPENDED, &md->flags);

	dm_table_unplug_all(map);
	r = 0;
out:
	dm_table_put(map);
	mutex_unlock(&md->suspend_lock);

	return r;
}
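
/*
 * Illustrative sketch only (compiled out): the intended pairing of
 * dm_suspend() and dm_resume() around a table change or other maintenance.
 * DM_SUSPEND_LOCKFS_FLAG asks lock_fs() to freeze any mounted filesystem
 * first; DM_SUSPEND_NOFLUSH_FLAG requests no-flush behaviour instead and
 * takes precedence over lockfs.  The function name is hypothetical.
 */
#if 0
static int dm_example_quiesce(struct mapped_device *md)
{
	int r;

	r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
	if (r)
		return r;

	/* ... no I/O reaches the targets here; e.g. call dm_swap_table() ... */

	return dm_resume(md);
}
#endif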

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj;
}

/*
 * struct mapped_device should not be exported outside of dm.c
 * so use this check to verify that kobj is part of md structure
 */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj);
	if (&md->kobj != kobj)
		return NULL;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
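
/*
 * Illustrative sketch only (compiled out): how the event counter and wait
 * queue above are meant to be used together - sample the counter with
 * dm_get_event_nr(), then sleep in dm_wait_event() until it changes (or a
 * signal arrives, in which case -ERESTARTSYS is returned).  The function
 * name is hypothetical.
 */
#if 0
static int dm_example_wait_for_event(struct mapped_device *md)
{
	uint32_t seen = dm_get_event_nr(md);

	return dm_wait_event(md, seen);
}
#endif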

struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
{
	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);

	if (!pools)
		return NULL;

	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
	if (!pools->io_pool)
		goto free_pools_and_out;

	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
	if (!pools->tio_pool)
		goto free_io_pool_and_out;

	pools->bs = (type == DM_TYPE_BIO_BASED) ?
		    bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
	if (!pools->bs)
		goto free_tio_pool_and_out;

	return pools;

free_tio_pool_and_out:
	mempool_destroy(pools->tio_pool);

free_io_pool_and_out:
	mempool_destroy(pools->io_pool);

free_pools_and_out:
	kfree(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	if (pools->io_pool)
		mempool_destroy(pools->io_pool);

	if (pools->tio_pool)
		mempool_destroy(pools->tio_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");