1 /* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm-core.h" 9 #include "dm-rq.h" 10 #include "dm-uevent.h" 11 12 #include <linux/init.h> 13 #include <linux/module.h> 14 #include <linux/mutex.h> 15 #include <linux/sched/signal.h> 16 #include <linux/blkpg.h> 17 #include <linux/bio.h> 18 #include <linux/mempool.h> 19 #include <linux/dax.h> 20 #include <linux/slab.h> 21 #include <linux/idr.h> 22 #include <linux/hdreg.h> 23 #include <linux/delay.h> 24 #include <linux/wait.h> 25 #include <linux/pr.h> 26 27 #define DM_MSG_PREFIX "core" 28 29 #ifdef CONFIG_PRINTK 30 /* 31 * ratelimit state to be used in DMXXX_LIMIT(). 32 */ 33 DEFINE_RATELIMIT_STATE(dm_ratelimit_state, 34 DEFAULT_RATELIMIT_INTERVAL, 35 DEFAULT_RATELIMIT_BURST); 36 EXPORT_SYMBOL(dm_ratelimit_state); 37 #endif 38 39 /* 40 * Cookies are numeric values sent with CHANGE and REMOVE 41 * uevents while resuming, removing or renaming the device. 42 */ 43 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 44 #define DM_COOKIE_LENGTH 24 45 46 static const char *_name = DM_NAME; 47 48 static unsigned int major = 0; 49 static unsigned int _major = 0; 50 51 static DEFINE_IDR(_minor_idr); 52 53 static DEFINE_SPINLOCK(_minor_lock); 54 55 static void do_deferred_remove(struct work_struct *w); 56 57 static DECLARE_WORK(deferred_remove_work, do_deferred_remove); 58 59 static struct workqueue_struct *deferred_remove_workqueue; 60 61 atomic_t dm_global_event_nr = ATOMIC_INIT(0); 62 DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq); 63 64 /* 65 * One of these is allocated per bio. 66 */ 67 struct dm_io { 68 struct mapped_device *md; 69 blk_status_t status; 70 atomic_t io_count; 71 struct bio *bio; 72 unsigned long start_time; 73 spinlock_t endio_lock; 74 struct dm_stats_aux stats_aux; 75 }; 76 77 #define MINOR_ALLOCED ((void *)-1) 78 79 /* 80 * Bits for the md->flags field. 81 */ 82 #define DMF_BLOCK_IO_FOR_SUSPEND 0 83 #define DMF_SUSPENDED 1 84 #define DMF_FROZEN 2 85 #define DMF_FREEING 3 86 #define DMF_DELETING 4 87 #define DMF_NOFLUSH_SUSPENDING 5 88 #define DMF_DEFERRED_REMOVE 6 89 #define DMF_SUSPENDED_INTERNALLY 7 90 91 #define DM_NUMA_NODE NUMA_NO_NODE 92 static int dm_numa_node = DM_NUMA_NODE; 93 94 /* 95 * For mempools pre-allocation at the table loading time. 96 */ 97 struct dm_md_mempools { 98 mempool_t *io_pool; 99 struct bio_set *bs; 100 }; 101 102 struct table_device { 103 struct list_head list; 104 atomic_t count; 105 struct dm_dev dm_dev; 106 }; 107 108 static struct kmem_cache *_io_cache; 109 static struct kmem_cache *_rq_tio_cache; 110 static struct kmem_cache *_rq_cache; 111 112 /* 113 * Bio-based DM's mempools' reserved IOs set by the user. 
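 * The value is sanitized on every read: dm_get_reserved_bio_based_ios()
 * goes through __dm_get_module_param(), so a stored value of 0 falls back
 * to the default of RESERVED_BIO_BASED_IOS and anything above
 * DM_RESERVED_MAX_IOS is clamped to that maximum.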
114 */ 115 #define RESERVED_BIO_BASED_IOS 16 116 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; 117 118 static int __dm_get_module_param_int(int *module_param, int min, int max) 119 { 120 int param = ACCESS_ONCE(*module_param); 121 int modified_param = 0; 122 bool modified = true; 123 124 if (param < min) 125 modified_param = min; 126 else if (param > max) 127 modified_param = max; 128 else 129 modified = false; 130 131 if (modified) { 132 (void)cmpxchg(module_param, param, modified_param); 133 param = modified_param; 134 } 135 136 return param; 137 } 138 139 unsigned __dm_get_module_param(unsigned *module_param, 140 unsigned def, unsigned max) 141 { 142 unsigned param = ACCESS_ONCE(*module_param); 143 unsigned modified_param = 0; 144 145 if (!param) 146 modified_param = def; 147 else if (param > max) 148 modified_param = max; 149 150 if (modified_param) { 151 (void)cmpxchg(module_param, param, modified_param); 152 param = modified_param; 153 } 154 155 return param; 156 } 157 158 unsigned dm_get_reserved_bio_based_ios(void) 159 { 160 return __dm_get_module_param(&reserved_bio_based_ios, 161 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS); 162 } 163 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); 164 165 static unsigned dm_get_numa_node(void) 166 { 167 return __dm_get_module_param_int(&dm_numa_node, 168 DM_NUMA_NODE, num_online_nodes() - 1); 169 } 170 171 static int __init local_init(void) 172 { 173 int r = -ENOMEM; 174 175 /* allocate a slab for the dm_ios */ 176 _io_cache = KMEM_CACHE(dm_io, 0); 177 if (!_io_cache) 178 return r; 179 180 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 181 if (!_rq_tio_cache) 182 goto out_free_io_cache; 183 184 _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request), 185 __alignof__(struct request), 0, NULL); 186 if (!_rq_cache) 187 goto out_free_rq_tio_cache; 188 189 r = dm_uevent_init(); 190 if (r) 191 goto out_free_rq_cache; 192 193 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); 194 if (!deferred_remove_workqueue) { 195 r = -ENOMEM; 196 goto out_uevent_exit; 197 } 198 199 _major = major; 200 r = register_blkdev(_major, _name); 201 if (r < 0) 202 goto out_free_workqueue; 203 204 if (!_major) 205 _major = r; 206 207 return 0; 208 209 out_free_workqueue: 210 destroy_workqueue(deferred_remove_workqueue); 211 out_uevent_exit: 212 dm_uevent_exit(); 213 out_free_rq_cache: 214 kmem_cache_destroy(_rq_cache); 215 out_free_rq_tio_cache: 216 kmem_cache_destroy(_rq_tio_cache); 217 out_free_io_cache: 218 kmem_cache_destroy(_io_cache); 219 220 return r; 221 } 222 223 static void local_exit(void) 224 { 225 flush_scheduled_work(); 226 destroy_workqueue(deferred_remove_workqueue); 227 228 kmem_cache_destroy(_rq_cache); 229 kmem_cache_destroy(_rq_tio_cache); 230 kmem_cache_destroy(_io_cache); 231 unregister_blkdev(_major, _name); 232 dm_uevent_exit(); 233 234 _major = 0; 235 236 DMINFO("cleaned up"); 237 } 238 239 static int (*_inits[])(void) __initdata = { 240 local_init, 241 dm_target_init, 242 dm_linear_init, 243 dm_stripe_init, 244 dm_io_init, 245 dm_kcopyd_init, 246 dm_interface_init, 247 dm_statistics_init, 248 }; 249 250 static void (*_exits[])(void) = { 251 local_exit, 252 dm_target_exit, 253 dm_linear_exit, 254 dm_stripe_exit, 255 dm_io_exit, 256 dm_kcopyd_exit, 257 dm_interface_exit, 258 dm_statistics_exit, 259 }; 260 261 static int __init dm_init(void) 262 { 263 const int count = ARRAY_SIZE(_inits); 264 265 int r, i; 266 267 for (i = 0; i < count; i++) { 268 r = _inits[i](); 269 if (r) 270 goto bad; 
271 } 272 273 return 0; 274 275 bad: 276 while (i--) 277 _exits[i](); 278 279 return r; 280 } 281 282 static void __exit dm_exit(void) 283 { 284 int i = ARRAY_SIZE(_exits); 285 286 while (i--) 287 _exits[i](); 288 289 /* 290 * Should be empty by this point. 291 */ 292 idr_destroy(&_minor_idr); 293 } 294 295 /* 296 * Block device functions 297 */ 298 int dm_deleting_md(struct mapped_device *md) 299 { 300 return test_bit(DMF_DELETING, &md->flags); 301 } 302 303 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 304 { 305 struct mapped_device *md; 306 307 spin_lock(&_minor_lock); 308 309 md = bdev->bd_disk->private_data; 310 if (!md) 311 goto out; 312 313 if (test_bit(DMF_FREEING, &md->flags) || 314 dm_deleting_md(md)) { 315 md = NULL; 316 goto out; 317 } 318 319 dm_get(md); 320 atomic_inc(&md->open_count); 321 out: 322 spin_unlock(&_minor_lock); 323 324 return md ? 0 : -ENXIO; 325 } 326 327 static void dm_blk_close(struct gendisk *disk, fmode_t mode) 328 { 329 struct mapped_device *md; 330 331 spin_lock(&_minor_lock); 332 333 md = disk->private_data; 334 if (WARN_ON(!md)) 335 goto out; 336 337 if (atomic_dec_and_test(&md->open_count) && 338 (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) 339 queue_work(deferred_remove_workqueue, &deferred_remove_work); 340 341 dm_put(md); 342 out: 343 spin_unlock(&_minor_lock); 344 } 345 346 int dm_open_count(struct mapped_device *md) 347 { 348 return atomic_read(&md->open_count); 349 } 350 351 /* 352 * Guarantees nothing is using the device before it's deleted. 353 */ 354 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) 355 { 356 int r = 0; 357 358 spin_lock(&_minor_lock); 359 360 if (dm_open_count(md)) { 361 r = -EBUSY; 362 if (mark_deferred) 363 set_bit(DMF_DEFERRED_REMOVE, &md->flags); 364 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) 365 r = -EEXIST; 366 else 367 set_bit(DMF_DELETING, &md->flags); 368 369 spin_unlock(&_minor_lock); 370 371 return r; 372 } 373 374 int dm_cancel_deferred_remove(struct mapped_device *md) 375 { 376 int r = 0; 377 378 spin_lock(&_minor_lock); 379 380 if (test_bit(DMF_DELETING, &md->flags)) 381 r = -EBUSY; 382 else 383 clear_bit(DMF_DEFERRED_REMOVE, &md->flags); 384 385 spin_unlock(&_minor_lock); 386 387 return r; 388 } 389 390 static void do_deferred_remove(struct work_struct *w) 391 { 392 dm_deferred_remove(); 393 } 394 395 sector_t dm_get_size(struct mapped_device *md) 396 { 397 return get_capacity(md->disk); 398 } 399 400 struct request_queue *dm_get_md_queue(struct mapped_device *md) 401 { 402 return md->queue; 403 } 404 405 struct dm_stats *dm_get_stats(struct mapped_device *md) 406 { 407 return &md->stats; 408 } 409 410 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 411 { 412 struct mapped_device *md = bdev->bd_disk->private_data; 413 414 return dm_get_geometry(md, geo); 415 } 416 417 static int dm_grab_bdev_for_ioctl(struct mapped_device *md, 418 struct block_device **bdev, 419 fmode_t *mode) 420 { 421 struct dm_target *tgt; 422 struct dm_table *map; 423 int srcu_idx, r; 424 425 retry: 426 r = -ENOTTY; 427 map = dm_get_live_table(md, &srcu_idx); 428 if (!map || !dm_table_get_size(map)) 429 goto out; 430 431 /* We only support devices that have a single target */ 432 if (dm_table_get_num_targets(map) != 1) 433 goto out; 434 435 tgt = dm_table_get_target(map, 0); 436 if (!tgt->type->prepare_ioctl) 437 goto out; 438 439 if (dm_suspended_md(md)) { 440 r = -EAGAIN; 441 goto out; 442 } 443 444 r = 
tgt->type->prepare_ioctl(tgt, bdev, mode); 445 if (r < 0) 446 goto out; 447 448 bdgrab(*bdev); 449 dm_put_live_table(md, srcu_idx); 450 return r; 451 452 out: 453 dm_put_live_table(md, srcu_idx); 454 if (r == -ENOTCONN && !fatal_signal_pending(current)) { 455 msleep(10); 456 goto retry; 457 } 458 return r; 459 } 460 461 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 462 unsigned int cmd, unsigned long arg) 463 { 464 struct mapped_device *md = bdev->bd_disk->private_data; 465 int r; 466 467 r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); 468 if (r < 0) 469 return r; 470 471 if (r > 0) { 472 /* 473 * Target determined this ioctl is being issued against a 474 * subset of the parent bdev; require extra privileges. 475 */ 476 if (!capable(CAP_SYS_RAWIO)) { 477 DMWARN_LIMIT( 478 "%s: sending ioctl %x to DM device without required privilege.", 479 current->comm, cmd); 480 r = -ENOIOCTLCMD; 481 goto out; 482 } 483 } 484 485 r = __blkdev_driver_ioctl(bdev, mode, cmd, arg); 486 out: 487 bdput(bdev); 488 return r; 489 } 490 491 static struct dm_io *alloc_io(struct mapped_device *md) 492 { 493 return mempool_alloc(md->io_pool, GFP_NOIO); 494 } 495 496 static void free_io(struct mapped_device *md, struct dm_io *io) 497 { 498 mempool_free(io, md->io_pool); 499 } 500 501 static void free_tio(struct dm_target_io *tio) 502 { 503 bio_put(&tio->clone); 504 } 505 506 int md_in_flight(struct mapped_device *md) 507 { 508 return atomic_read(&md->pending[READ]) + 509 atomic_read(&md->pending[WRITE]); 510 } 511 512 static void start_io_acct(struct dm_io *io) 513 { 514 struct mapped_device *md = io->md; 515 struct bio *bio = io->bio; 516 int cpu; 517 int rw = bio_data_dir(bio); 518 519 io->start_time = jiffies; 520 521 cpu = part_stat_lock(); 522 part_round_stats(cpu, &dm_disk(md)->part0); 523 part_stat_unlock(); 524 atomic_set(&dm_disk(md)->part0.in_flight[rw], 525 atomic_inc_return(&md->pending[rw])); 526 527 if (unlikely(dm_stats_used(&md->stats))) 528 dm_stats_account_io(&md->stats, bio_data_dir(bio), 529 bio->bi_iter.bi_sector, bio_sectors(bio), 530 false, 0, &io->stats_aux); 531 } 532 533 static void end_io_acct(struct dm_io *io) 534 { 535 struct mapped_device *md = io->md; 536 struct bio *bio = io->bio; 537 unsigned long duration = jiffies - io->start_time; 538 int pending; 539 int rw = bio_data_dir(bio); 540 541 generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time); 542 543 if (unlikely(dm_stats_used(&md->stats))) 544 dm_stats_account_io(&md->stats, bio_data_dir(bio), 545 bio->bi_iter.bi_sector, bio_sectors(bio), 546 true, duration, &io->stats_aux); 547 548 /* 549 * After this is decremented the bio must not be touched if it is 550 * a flush. 551 */ 552 pending = atomic_dec_return(&md->pending[rw]); 553 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); 554 pending += atomic_read(&md->pending[rw^0x1]); 555 556 /* nudge anyone waiting on suspend queue */ 557 if (!pending) 558 wake_up(&md->wait); 559 } 560 561 /* 562 * Add the bio to the list of deferred io. 563 */ 564 static void queue_io(struct mapped_device *md, struct bio *bio) 565 { 566 unsigned long flags; 567 568 spin_lock_irqsave(&md->deferred_lock, flags); 569 bio_list_add(&md->deferred, bio); 570 spin_unlock_irqrestore(&md->deferred_lock, flags); 571 queue_work(md->wq, &md->work); 572 } 573 574 /* 575 * Everyone (including functions in this file), should use this 576 * function to access the md->map field, and make sure they call 577 * dm_put_live_table() when finished. 
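 *
 * A minimal sketch of the intended calling pattern (illustrative only;
 * see e.g. dm_grab_bdev_for_ioctl() and dm_wq_work() in this file):
 *
 *	int srcu_idx;
 *	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 *
 *	if (map) {
 *		... look up targets, map I/O, etc. ...
 *	}
 *	dm_put_live_table(md, srcu_idx);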
578 */ 579 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) 580 { 581 *srcu_idx = srcu_read_lock(&md->io_barrier); 582 583 return srcu_dereference(md->map, &md->io_barrier); 584 } 585 586 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) 587 { 588 srcu_read_unlock(&md->io_barrier, srcu_idx); 589 } 590 591 void dm_sync_table(struct mapped_device *md) 592 { 593 synchronize_srcu(&md->io_barrier); 594 synchronize_rcu_expedited(); 595 } 596 597 /* 598 * A fast alternative to dm_get_live_table/dm_put_live_table. 599 * The caller must not block between these two functions. 600 */ 601 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) 602 { 603 rcu_read_lock(); 604 return rcu_dereference(md->map); 605 } 606 607 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) 608 { 609 rcu_read_unlock(); 610 } 611 612 /* 613 * Open a table device so we can use it as a map destination. 614 */ 615 static int open_table_device(struct table_device *td, dev_t dev, 616 struct mapped_device *md) 617 { 618 static char *_claim_ptr = "I belong to device-mapper"; 619 struct block_device *bdev; 620 621 int r; 622 623 BUG_ON(td->dm_dev.bdev); 624 625 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr); 626 if (IS_ERR(bdev)) 627 return PTR_ERR(bdev); 628 629 r = bd_link_disk_holder(bdev, dm_disk(md)); 630 if (r) { 631 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); 632 return r; 633 } 634 635 td->dm_dev.bdev = bdev; 636 td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); 637 return 0; 638 } 639 640 /* 641 * Close a table device that we've been using. 642 */ 643 static void close_table_device(struct table_device *td, struct mapped_device *md) 644 { 645 if (!td->dm_dev.bdev) 646 return; 647 648 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); 649 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); 650 put_dax(td->dm_dev.dax_dev); 651 td->dm_dev.bdev = NULL; 652 td->dm_dev.dax_dev = NULL; 653 } 654 655 static struct table_device *find_table_device(struct list_head *l, dev_t dev, 656 fmode_t mode) { 657 struct table_device *td; 658 659 list_for_each_entry(td, l, list) 660 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) 661 return td; 662 663 return NULL; 664 } 665 666 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, 667 struct dm_dev **result) { 668 int r; 669 struct table_device *td; 670 671 mutex_lock(&md->table_devices_lock); 672 td = find_table_device(&md->table_devices, dev, mode); 673 if (!td) { 674 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); 675 if (!td) { 676 mutex_unlock(&md->table_devices_lock); 677 return -ENOMEM; 678 } 679 680 td->dm_dev.mode = mode; 681 td->dm_dev.bdev = NULL; 682 683 if ((r = open_table_device(td, dev, md))) { 684 mutex_unlock(&md->table_devices_lock); 685 kfree(td); 686 return r; 687 } 688 689 format_dev_t(td->dm_dev.name, dev); 690 691 atomic_set(&td->count, 0); 692 list_add(&td->list, &md->table_devices); 693 } 694 atomic_inc(&td->count); 695 mutex_unlock(&md->table_devices_lock); 696 697 *result = &td->dm_dev; 698 return 0; 699 } 700 EXPORT_SYMBOL_GPL(dm_get_table_device); 701 702 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) 703 { 704 struct table_device *td = container_of(d, struct table_device, dm_dev); 705 706 mutex_lock(&md->table_devices_lock); 707 if (atomic_dec_and_test(&td->count)) { 708 
close_table_device(td, md); 709 list_del(&td->list); 710 kfree(td); 711 } 712 mutex_unlock(&md->table_devices_lock); 713 } 714 EXPORT_SYMBOL(dm_put_table_device); 715 716 static void free_table_devices(struct list_head *devices) 717 { 718 struct list_head *tmp, *next; 719 720 list_for_each_safe(tmp, next, devices) { 721 struct table_device *td = list_entry(tmp, struct table_device, list); 722 723 DMWARN("dm_destroy: %s still exists with %d references", 724 td->dm_dev.name, atomic_read(&td->count)); 725 kfree(td); 726 } 727 } 728 729 /* 730 * Get the geometry associated with a dm device 731 */ 732 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 733 { 734 *geo = md->geometry; 735 736 return 0; 737 } 738 739 /* 740 * Set the geometry of a device. 741 */ 742 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 743 { 744 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 745 746 if (geo->start > sz) { 747 DMWARN("Start sector is beyond the geometry limits."); 748 return -EINVAL; 749 } 750 751 md->geometry = *geo; 752 753 return 0; 754 } 755 756 /*----------------------------------------------------------------- 757 * CRUD START: 758 * A more elegant soln is in the works that uses the queue 759 * merge fn, unfortunately there are a couple of changes to 760 * the block layer that I want to make for this. So in the 761 * interests of getting something for people to use I give 762 * you this clearly demarcated crap. 763 *---------------------------------------------------------------*/ 764 765 static int __noflush_suspending(struct mapped_device *md) 766 { 767 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 768 } 769 770 /* 771 * Decrements the number of outstanding ios that a bio has been 772 * cloned into, completing the original io if necc. 773 */ 774 static void dec_pending(struct dm_io *io, blk_status_t error) 775 { 776 unsigned long flags; 777 blk_status_t io_error; 778 struct bio *bio; 779 struct mapped_device *md = io->md; 780 781 /* Push-back supersedes any I/O errors */ 782 if (unlikely(error)) { 783 spin_lock_irqsave(&io->endio_lock, flags); 784 if (!(io->status == BLK_STS_DM_REQUEUE && 785 __noflush_suspending(md))) 786 io->status = error; 787 spin_unlock_irqrestore(&io->endio_lock, flags); 788 } 789 790 if (atomic_dec_and_test(&io->io_count)) { 791 if (io->status == BLK_STS_DM_REQUEUE) { 792 /* 793 * Target requested pushing back the I/O. 794 */ 795 spin_lock_irqsave(&md->deferred_lock, flags); 796 if (__noflush_suspending(md)) 797 bio_list_add_head(&md->deferred, io->bio); 798 else 799 /* noflush suspend was interrupted. */ 800 io->status = BLK_STS_IOERR; 801 spin_unlock_irqrestore(&md->deferred_lock, flags); 802 } 803 804 io_error = io->status; 805 bio = io->bio; 806 end_io_acct(io); 807 free_io(md, io); 808 809 if (io_error == BLK_STS_DM_REQUEUE) 810 return; 811 812 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) { 813 /* 814 * Preflush done for flush with data, reissue 815 * without REQ_PREFLUSH. 
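 * (__split_and_process_bio() sent an empty flush first; now that it has
 * completed, the original bio is requeued so that its data payload is
 * processed as a normal write.)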
816 */ 817 bio->bi_opf &= ~REQ_PREFLUSH; 818 queue_io(md, bio); 819 } else { 820 /* done with normal IO or empty flush */ 821 bio->bi_status = io_error; 822 bio_endio(bio); 823 } 824 } 825 } 826 827 void disable_write_same(struct mapped_device *md) 828 { 829 struct queue_limits *limits = dm_get_queue_limits(md); 830 831 /* device doesn't really support WRITE SAME, disable it */ 832 limits->max_write_same_sectors = 0; 833 } 834 835 void disable_write_zeroes(struct mapped_device *md) 836 { 837 struct queue_limits *limits = dm_get_queue_limits(md); 838 839 /* device doesn't really support WRITE ZEROES, disable it */ 840 limits->max_write_zeroes_sectors = 0; 841 } 842 843 static void clone_endio(struct bio *bio) 844 { 845 blk_status_t error = bio->bi_status; 846 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); 847 struct dm_io *io = tio->io; 848 struct mapped_device *md = tio->io->md; 849 dm_endio_fn endio = tio->ti->type->end_io; 850 851 if (unlikely(error == BLK_STS_TARGET)) { 852 if (bio_op(bio) == REQ_OP_WRITE_SAME && 853 !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) 854 disable_write_same(md); 855 if (bio_op(bio) == REQ_OP_WRITE_ZEROES && 856 !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors) 857 disable_write_zeroes(md); 858 } 859 860 if (endio) { 861 int r = endio(tio->ti, bio, &error); 862 switch (r) { 863 case DM_ENDIO_REQUEUE: 864 error = BLK_STS_DM_REQUEUE; 865 /*FALLTHRU*/ 866 case DM_ENDIO_DONE: 867 break; 868 case DM_ENDIO_INCOMPLETE: 869 /* The target will handle the io */ 870 return; 871 default: 872 DMWARN("unimplemented target endio return value: %d", r); 873 BUG(); 874 } 875 } 876 877 free_tio(tio); 878 dec_pending(io, error); 879 } 880 881 /* 882 * Return maximum size of I/O possible at the supplied sector up to the current 883 * target boundary. 884 */ 885 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) 886 { 887 sector_t target_offset = dm_target_offset(ti, sector); 888 889 return ti->len - target_offset; 890 } 891 892 static sector_t max_io_len(sector_t sector, struct dm_target *ti) 893 { 894 sector_t len = max_io_len_target_boundary(sector, ti); 895 sector_t offset, max_len; 896 897 /* 898 * Does the target need to split even further? 
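 *
 * Worked example: with ti->max_io_len == 256 and the supplied sector
 * lying 300 sectors into the target, 300 % 256 == 44, so at most
 * 256 - 44 == 212 sectors can be issued before the next 256-sector
 * boundary; len is reduced to 212 if it was larger.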
899 */ 900 if (ti->max_io_len) { 901 offset = dm_target_offset(ti, sector); 902 if (unlikely(ti->max_io_len & (ti->max_io_len - 1))) 903 max_len = sector_div(offset, ti->max_io_len); 904 else 905 max_len = offset & (ti->max_io_len - 1); 906 max_len = ti->max_io_len - max_len; 907 908 if (len > max_len) 909 len = max_len; 910 } 911 912 return len; 913 } 914 915 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) 916 { 917 if (len > UINT_MAX) { 918 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", 919 (unsigned long long)len, UINT_MAX); 920 ti->error = "Maximum size of target IO is too large"; 921 return -EINVAL; 922 } 923 924 ti->max_io_len = (uint32_t) len; 925 926 return 0; 927 } 928 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); 929 930 static struct dm_target *dm_dax_get_live_target(struct mapped_device *md, 931 sector_t sector, int *srcu_idx) 932 { 933 struct dm_table *map; 934 struct dm_target *ti; 935 936 map = dm_get_live_table(md, srcu_idx); 937 if (!map) 938 return NULL; 939 940 ti = dm_table_find_target(map, sector); 941 if (!dm_target_is_valid(ti)) 942 return NULL; 943 944 return ti; 945 } 946 947 static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, 948 long nr_pages, void **kaddr, pfn_t *pfn) 949 { 950 struct mapped_device *md = dax_get_private(dax_dev); 951 sector_t sector = pgoff * PAGE_SECTORS; 952 struct dm_target *ti; 953 long len, ret = -EIO; 954 int srcu_idx; 955 956 ti = dm_dax_get_live_target(md, sector, &srcu_idx); 957 958 if (!ti) 959 goto out; 960 if (!ti->type->direct_access) 961 goto out; 962 len = max_io_len(sector, ti) / PAGE_SECTORS; 963 if (len < 1) 964 goto out; 965 nr_pages = min(len, nr_pages); 966 if (ti->type->direct_access) 967 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn); 968 969 out: 970 dm_put_live_table(md, srcu_idx); 971 972 return ret; 973 } 974 975 /* 976 * A target may call dm_accept_partial_bio only from the map routine. It is 977 * allowed for all bio types except REQ_PREFLUSH. 978 * 979 * dm_accept_partial_bio informs the dm that the target only wants to process 980 * additional n_sectors sectors of the bio and the rest of the data should be 981 * sent in a next bio. 982 * 983 * A diagram that explains the arithmetics: 984 * +--------------------+---------------+-------+ 985 * | 1 | 2 | 3 | 986 * +--------------------+---------------+-------+ 987 * 988 * <-------------- *tio->len_ptr ---------------> 989 * <------- bi_size -------> 990 * <-- n_sectors --> 991 * 992 * Region 1 was already iterated over with bio_advance or similar function. 993 * (it may be empty if the target doesn't use bio_advance) 994 * Region 2 is the remaining bio size that the target wants to process. 995 * (it may be empty if region 1 is non-empty, although there is no reason 996 * to make it empty) 997 * The target requires that region 3 is to be sent in the next bio. 998 * 999 * If the target wants to receive multiple copies of the bio (via num_*bios, etc), 1000 * the partially processed part (the sum of regions 1+2) must be the same for all 1001 * copies of the bio. 
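 *
 * Illustrative sketch only (not code from this file; example_map,
 * struct example_ctx and ec->dev are hypothetical) of a map method that
 * processes at most eight sectors per bio and lets dm core send the
 * remainder in a follow-up bio:
 *
 *	static int example_map(struct dm_target *ti, struct bio *bio)
 *	{
 *		struct example_ctx *ec = ti->private;
 *
 *		if (bio_sectors(bio) > 8)
 *			dm_accept_partial_bio(bio, 8);
 *		bio->bi_bdev = ec->dev->bdev;
 *		return DM_MAPIO_REMAPPED;
 *	}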
 */
void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
{
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
	BUG_ON(bio->bi_opf & REQ_PREFLUSH);
	BUG_ON(bi_size > *tio->len_ptr);
	BUG_ON(n_sectors > bi_size);
	*tio->len_ptr -= bi_size - n_sectors;
	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);

/*
 * Flush current->bio_list when the target map method blocks.
 * This fixes deadlocks in snapshot and possibly in other targets.
 */
struct dm_offload {
	struct blk_plug plug;
	struct blk_plug_cb cb;
};

static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
{
	struct dm_offload *o = container_of(cb, struct dm_offload, cb);
	struct bio_list list;
	struct bio *bio;
	int i;

	INIT_LIST_HEAD(&o->cb.list);

	if (unlikely(!current->bio_list))
		return;

	for (i = 0; i < 2; i++) {
		list = current->bio_list[i];
		bio_list_init(&current->bio_list[i]);

		while ((bio = bio_list_pop(&list))) {
			struct bio_set *bs = bio->bi_pool;
			if (unlikely(!bs) || bs == fs_bio_set ||
			    !bs->rescue_workqueue) {
				bio_list_add(&current->bio_list[i], bio);
				continue;
			}

			spin_lock(&bs->rescue_lock);
			bio_list_add(&bs->rescue_list, bio);
			queue_work(bs->rescue_workqueue, &bs->rescue_work);
			spin_unlock(&bs->rescue_lock);
		}
	}
}

static void dm_offload_start(struct dm_offload *o)
{
	blk_start_plug(&o->plug);
	o->cb.callback = flush_current_bio_list;
	list_add(&o->cb.list, &current->plug->cb_list);
}

static void dm_offload_end(struct dm_offload *o)
{
	list_del(&o->cb.list);
	blk_finish_plug(&o->plug);
}

static void __map_bio(struct dm_target_io *tio)
{
	int r;
	sector_t sector;
	struct dm_offload o;
	struct bio *clone = &tio->clone;
	struct dm_target *ti = tio->ti;

	clone->bi_end_io = clone_endio;

	/*
	 * Map the clone. If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&tio->io->io_count);
	sector = clone->bi_iter.bi_sector;

	dm_offload_start(&o);
	r = ti->type->map(ti, clone);
	dm_offload_end(&o);

	switch (r) {
	case DM_MAPIO_SUBMITTED:
		break;
	case DM_MAPIO_REMAPPED:
		/* the bio has been remapped so dispatch it */
		trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
				      tio->io->bio->bi_bdev->bd_dev, sector);
		generic_make_request(clone);
		break;
	case DM_MAPIO_KILL:
		dec_pending(tio->io, BLK_STS_IOERR);
		free_tio(tio);
		break;
	case DM_MAPIO_REQUEUE:
		dec_pending(tio->io, BLK_STS_DM_REQUEUE);
		free_tio(tio);
		break;
	default:
		DMWARN("unimplemented target map return value: %d", r);
		BUG();
	}
}

struct clone_info {
	struct mapped_device *md;
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	unsigned sector_count;
};

static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
{
	bio->bi_iter.bi_sector = sector;
	bio->bi_iter.bi_size = to_bytes(len);
}

/*
 * Creates a bio that consists of a range of complete bvecs.
1131 */ 1132 static int clone_bio(struct dm_target_io *tio, struct bio *bio, 1133 sector_t sector, unsigned len) 1134 { 1135 struct bio *clone = &tio->clone; 1136 1137 __bio_clone_fast(clone, bio); 1138 1139 if (unlikely(bio_integrity(bio) != NULL)) { 1140 int r; 1141 1142 if (unlikely(!dm_target_has_integrity(tio->ti->type) && 1143 !dm_target_passes_integrity(tio->ti->type))) { 1144 DMWARN("%s: the target %s doesn't support integrity data.", 1145 dm_device_name(tio->io->md), 1146 tio->ti->type->name); 1147 return -EIO; 1148 } 1149 1150 r = bio_integrity_clone(clone, bio, GFP_NOIO); 1151 if (r < 0) 1152 return r; 1153 } 1154 1155 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); 1156 clone->bi_iter.bi_size = to_bytes(len); 1157 1158 if (unlikely(bio_integrity(bio) != NULL)) 1159 bio_integrity_trim(clone, 0, len); 1160 1161 return 0; 1162 } 1163 1164 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1165 struct dm_target *ti, 1166 unsigned target_bio_nr) 1167 { 1168 struct dm_target_io *tio; 1169 struct bio *clone; 1170 1171 clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); 1172 tio = container_of(clone, struct dm_target_io, clone); 1173 1174 tio->io = ci->io; 1175 tio->ti = ti; 1176 tio->target_bio_nr = target_bio_nr; 1177 1178 return tio; 1179 } 1180 1181 static void __clone_and_map_simple_bio(struct clone_info *ci, 1182 struct dm_target *ti, 1183 unsigned target_bio_nr, unsigned *len) 1184 { 1185 struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr); 1186 struct bio *clone = &tio->clone; 1187 1188 tio->len_ptr = len; 1189 1190 __bio_clone_fast(clone, ci->bio); 1191 if (len) 1192 bio_setup_sector(clone, ci->sector, *len); 1193 1194 __map_bio(tio); 1195 } 1196 1197 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, 1198 unsigned num_bios, unsigned *len) 1199 { 1200 unsigned target_bio_nr; 1201 1202 for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++) 1203 __clone_and_map_simple_bio(ci, ti, target_bio_nr, len); 1204 } 1205 1206 static int __send_empty_flush(struct clone_info *ci) 1207 { 1208 unsigned target_nr = 0; 1209 struct dm_target *ti; 1210 1211 BUG_ON(bio_has_data(ci->bio)); 1212 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1213 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); 1214 1215 return 0; 1216 } 1217 1218 static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, 1219 sector_t sector, unsigned *len) 1220 { 1221 struct bio *bio = ci->bio; 1222 struct dm_target_io *tio; 1223 unsigned target_bio_nr; 1224 unsigned num_target_bios = 1; 1225 int r = 0; 1226 1227 /* 1228 * Does the target want to receive duplicate copies of the bio? 
1229 */ 1230 if (bio_data_dir(bio) == WRITE && ti->num_write_bios) 1231 num_target_bios = ti->num_write_bios(ti, bio); 1232 1233 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { 1234 tio = alloc_tio(ci, ti, target_bio_nr); 1235 tio->len_ptr = len; 1236 r = clone_bio(tio, bio, sector, *len); 1237 if (r < 0) { 1238 free_tio(tio); 1239 break; 1240 } 1241 __map_bio(tio); 1242 } 1243 1244 return r; 1245 } 1246 1247 typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); 1248 1249 static unsigned get_num_discard_bios(struct dm_target *ti) 1250 { 1251 return ti->num_discard_bios; 1252 } 1253 1254 static unsigned get_num_write_same_bios(struct dm_target *ti) 1255 { 1256 return ti->num_write_same_bios; 1257 } 1258 1259 static unsigned get_num_write_zeroes_bios(struct dm_target *ti) 1260 { 1261 return ti->num_write_zeroes_bios; 1262 } 1263 1264 typedef bool (*is_split_required_fn)(struct dm_target *ti); 1265 1266 static bool is_split_required_for_discard(struct dm_target *ti) 1267 { 1268 return ti->split_discard_bios; 1269 } 1270 1271 static int __send_changing_extent_only(struct clone_info *ci, 1272 get_num_bios_fn get_num_bios, 1273 is_split_required_fn is_split_required) 1274 { 1275 struct dm_target *ti; 1276 unsigned len; 1277 unsigned num_bios; 1278 1279 do { 1280 ti = dm_table_find_target(ci->map, ci->sector); 1281 if (!dm_target_is_valid(ti)) 1282 return -EIO; 1283 1284 /* 1285 * Even though the device advertised support for this type of 1286 * request, that does not mean every target supports it, and 1287 * reconfiguration might also have changed that since the 1288 * check was performed. 1289 */ 1290 num_bios = get_num_bios ? get_num_bios(ti) : 0; 1291 if (!num_bios) 1292 return -EOPNOTSUPP; 1293 1294 if (is_split_required && !is_split_required(ti)) 1295 len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1296 else 1297 len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); 1298 1299 __send_duplicate_bios(ci, ti, num_bios, &len); 1300 1301 ci->sector += len; 1302 } while (ci->sector_count -= len); 1303 1304 return 0; 1305 } 1306 1307 static int __send_discard(struct clone_info *ci) 1308 { 1309 return __send_changing_extent_only(ci, get_num_discard_bios, 1310 is_split_required_for_discard); 1311 } 1312 1313 static int __send_write_same(struct clone_info *ci) 1314 { 1315 return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); 1316 } 1317 1318 static int __send_write_zeroes(struct clone_info *ci) 1319 { 1320 return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL); 1321 } 1322 1323 /* 1324 * Select the correct strategy for processing a non-flush bio. 
1325 */ 1326 static int __split_and_process_non_flush(struct clone_info *ci) 1327 { 1328 struct bio *bio = ci->bio; 1329 struct dm_target *ti; 1330 unsigned len; 1331 int r; 1332 1333 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) 1334 return __send_discard(ci); 1335 else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) 1336 return __send_write_same(ci); 1337 else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES)) 1338 return __send_write_zeroes(ci); 1339 1340 ti = dm_table_find_target(ci->map, ci->sector); 1341 if (!dm_target_is_valid(ti)) 1342 return -EIO; 1343 1344 len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); 1345 1346 r = __clone_and_map_data_bio(ci, ti, ci->sector, &len); 1347 if (r < 0) 1348 return r; 1349 1350 ci->sector += len; 1351 ci->sector_count -= len; 1352 1353 return 0; 1354 } 1355 1356 /* 1357 * Entry point to split a bio into clones and submit them to the targets. 1358 */ 1359 static void __split_and_process_bio(struct mapped_device *md, 1360 struct dm_table *map, struct bio *bio) 1361 { 1362 struct clone_info ci; 1363 int error = 0; 1364 1365 if (unlikely(!map)) { 1366 bio_io_error(bio); 1367 return; 1368 } 1369 1370 ci.map = map; 1371 ci.md = md; 1372 ci.io = alloc_io(md); 1373 ci.io->status = 0; 1374 atomic_set(&ci.io->io_count, 1); 1375 ci.io->bio = bio; 1376 ci.io->md = md; 1377 spin_lock_init(&ci.io->endio_lock); 1378 ci.sector = bio->bi_iter.bi_sector; 1379 1380 start_io_acct(ci.io); 1381 1382 if (bio->bi_opf & REQ_PREFLUSH) { 1383 ci.bio = &ci.md->flush_bio; 1384 ci.sector_count = 0; 1385 error = __send_empty_flush(&ci); 1386 /* dec_pending submits any data associated with flush */ 1387 } else { 1388 ci.bio = bio; 1389 ci.sector_count = bio_sectors(bio); 1390 while (ci.sector_count && !error) 1391 error = __split_and_process_non_flush(&ci); 1392 } 1393 1394 /* drop the extra reference count */ 1395 dec_pending(ci.io, error); 1396 } 1397 /*----------------------------------------------------------------- 1398 * CRUD END 1399 *---------------------------------------------------------------*/ 1400 1401 /* 1402 * The request function that just remaps the bio built up by 1403 * dm_merge_bvec. 1404 */ 1405 static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) 1406 { 1407 int rw = bio_data_dir(bio); 1408 struct mapped_device *md = q->queuedata; 1409 int srcu_idx; 1410 struct dm_table *map; 1411 1412 map = dm_get_live_table(md, &srcu_idx); 1413 1414 generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0); 1415 1416 /* if we're suspended, we have to queue this io for later */ 1417 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1418 dm_put_live_table(md, srcu_idx); 1419 1420 if (!(bio->bi_opf & REQ_RAHEAD)) 1421 queue_io(md, bio); 1422 else 1423 bio_io_error(bio); 1424 return BLK_QC_T_NONE; 1425 } 1426 1427 __split_and_process_bio(md, map, bio); 1428 dm_put_live_table(md, srcu_idx); 1429 return BLK_QC_T_NONE; 1430 } 1431 1432 static int dm_any_congested(void *congested_data, int bdi_bits) 1433 { 1434 int r = bdi_bits; 1435 struct mapped_device *md = congested_data; 1436 struct dm_table *map; 1437 1438 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1439 if (dm_request_based(md)) { 1440 /* 1441 * With request-based DM we only need to check the 1442 * top-level queue for congestion. 
1443 */ 1444 r = md->queue->backing_dev_info->wb.state & bdi_bits; 1445 } else { 1446 map = dm_get_live_table_fast(md); 1447 if (map) 1448 r = dm_table_any_congested(map, bdi_bits); 1449 dm_put_live_table_fast(md); 1450 } 1451 } 1452 1453 return r; 1454 } 1455 1456 /*----------------------------------------------------------------- 1457 * An IDR is used to keep track of allocated minor numbers. 1458 *---------------------------------------------------------------*/ 1459 static void free_minor(int minor) 1460 { 1461 spin_lock(&_minor_lock); 1462 idr_remove(&_minor_idr, minor); 1463 spin_unlock(&_minor_lock); 1464 } 1465 1466 /* 1467 * See if the device with a specific minor # is free. 1468 */ 1469 static int specific_minor(int minor) 1470 { 1471 int r; 1472 1473 if (minor >= (1 << MINORBITS)) 1474 return -EINVAL; 1475 1476 idr_preload(GFP_KERNEL); 1477 spin_lock(&_minor_lock); 1478 1479 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); 1480 1481 spin_unlock(&_minor_lock); 1482 idr_preload_end(); 1483 if (r < 0) 1484 return r == -ENOSPC ? -EBUSY : r; 1485 return 0; 1486 } 1487 1488 static int next_free_minor(int *minor) 1489 { 1490 int r; 1491 1492 idr_preload(GFP_KERNEL); 1493 spin_lock(&_minor_lock); 1494 1495 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); 1496 1497 spin_unlock(&_minor_lock); 1498 idr_preload_end(); 1499 if (r < 0) 1500 return r; 1501 *minor = r; 1502 return 0; 1503 } 1504 1505 static const struct block_device_operations dm_blk_dops; 1506 static const struct dax_operations dm_dax_ops; 1507 1508 static void dm_wq_work(struct work_struct *work); 1509 1510 void dm_init_md_queue(struct mapped_device *md) 1511 { 1512 /* 1513 * Request-based dm devices cannot be stacked on top of bio-based dm 1514 * devices. The type of this dm device may not have been decided yet. 1515 * The type is decided at the first table loading time. 1516 * To prevent problematic device stacking, clear the queue flag 1517 * for request stacking support until then. 1518 * 1519 * This queue is new, so no concurrency on the queue_flags. 
1520 */ 1521 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 1522 1523 /* 1524 * Initialize data that will only be used by a non-blk-mq DM queue 1525 * - must do so here (in alloc_dev callchain) before queue is used 1526 */ 1527 md->queue->queuedata = md; 1528 md->queue->backing_dev_info->congested_data = md; 1529 } 1530 1531 void dm_init_normal_md_queue(struct mapped_device *md) 1532 { 1533 md->use_blk_mq = false; 1534 dm_init_md_queue(md); 1535 1536 /* 1537 * Initialize aspects of queue that aren't relevant for blk-mq 1538 */ 1539 md->queue->backing_dev_info->congested_fn = dm_any_congested; 1540 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1541 } 1542 1543 static void cleanup_mapped_device(struct mapped_device *md) 1544 { 1545 if (md->wq) 1546 destroy_workqueue(md->wq); 1547 if (md->kworker_task) 1548 kthread_stop(md->kworker_task); 1549 mempool_destroy(md->io_pool); 1550 if (md->bs) 1551 bioset_free(md->bs); 1552 1553 if (md->dax_dev) { 1554 kill_dax(md->dax_dev); 1555 put_dax(md->dax_dev); 1556 md->dax_dev = NULL; 1557 } 1558 1559 if (md->disk) { 1560 spin_lock(&_minor_lock); 1561 md->disk->private_data = NULL; 1562 spin_unlock(&_minor_lock); 1563 del_gendisk(md->disk); 1564 put_disk(md->disk); 1565 } 1566 1567 if (md->queue) 1568 blk_cleanup_queue(md->queue); 1569 1570 cleanup_srcu_struct(&md->io_barrier); 1571 1572 if (md->bdev) { 1573 bdput(md->bdev); 1574 md->bdev = NULL; 1575 } 1576 1577 dm_mq_cleanup_mapped_device(md); 1578 } 1579 1580 /* 1581 * Allocate and initialise a blank device with a given minor. 1582 */ 1583 static struct mapped_device *alloc_dev(int minor) 1584 { 1585 int r, numa_node_id = dm_get_numa_node(); 1586 struct dax_device *dax_dev; 1587 struct mapped_device *md; 1588 void *old_md; 1589 1590 md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); 1591 if (!md) { 1592 DMWARN("unable to allocate device, out of memory."); 1593 return NULL; 1594 } 1595 1596 if (!try_module_get(THIS_MODULE)) 1597 goto bad_module_get; 1598 1599 /* get a minor number for the dev */ 1600 if (minor == DM_ANY_MINOR) 1601 r = next_free_minor(&minor); 1602 else 1603 r = specific_minor(minor); 1604 if (r < 0) 1605 goto bad_minor; 1606 1607 r = init_srcu_struct(&md->io_barrier); 1608 if (r < 0) 1609 goto bad_io_barrier; 1610 1611 md->numa_node_id = numa_node_id; 1612 md->use_blk_mq = dm_use_blk_mq_default(); 1613 md->init_tio_pdu = false; 1614 md->type = DM_TYPE_NONE; 1615 mutex_init(&md->suspend_lock); 1616 mutex_init(&md->type_lock); 1617 mutex_init(&md->table_devices_lock); 1618 spin_lock_init(&md->deferred_lock); 1619 atomic_set(&md->holders, 1); 1620 atomic_set(&md->open_count, 0); 1621 atomic_set(&md->event_nr, 0); 1622 atomic_set(&md->uevent_seq, 0); 1623 INIT_LIST_HEAD(&md->uevent_list); 1624 INIT_LIST_HEAD(&md->table_devices); 1625 spin_lock_init(&md->uevent_lock); 1626 1627 md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); 1628 if (!md->queue) 1629 goto bad; 1630 1631 dm_init_md_queue(md); 1632 1633 md->disk = alloc_disk_node(1, numa_node_id); 1634 if (!md->disk) 1635 goto bad; 1636 1637 atomic_set(&md->pending[0], 0); 1638 atomic_set(&md->pending[1], 0); 1639 init_waitqueue_head(&md->wait); 1640 INIT_WORK(&md->work, dm_wq_work); 1641 init_waitqueue_head(&md->eventq); 1642 init_completion(&md->kobj_holder.completion); 1643 md->kworker_task = NULL; 1644 1645 md->disk->major = _major; 1646 md->disk->first_minor = minor; 1647 md->disk->fops = &dm_blk_dops; 1648 md->disk->queue = md->queue; 1649 md->disk->private_data = md; 1650 
sprintf(md->disk->disk_name, "dm-%d", minor); 1651 1652 dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); 1653 if (!dax_dev) 1654 goto bad; 1655 md->dax_dev = dax_dev; 1656 1657 add_disk(md->disk); 1658 format_dev_t(md->name, MKDEV(_major, minor)); 1659 1660 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); 1661 if (!md->wq) 1662 goto bad; 1663 1664 md->bdev = bdget_disk(md->disk, 0); 1665 if (!md->bdev) 1666 goto bad; 1667 1668 bio_init(&md->flush_bio, NULL, 0); 1669 md->flush_bio.bi_bdev = md->bdev; 1670 md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; 1671 1672 dm_stats_init(&md->stats); 1673 1674 /* Populate the mapping, nobody knows we exist yet */ 1675 spin_lock(&_minor_lock); 1676 old_md = idr_replace(&_minor_idr, md, minor); 1677 spin_unlock(&_minor_lock); 1678 1679 BUG_ON(old_md != MINOR_ALLOCED); 1680 1681 return md; 1682 1683 bad: 1684 cleanup_mapped_device(md); 1685 bad_io_barrier: 1686 free_minor(minor); 1687 bad_minor: 1688 module_put(THIS_MODULE); 1689 bad_module_get: 1690 kfree(md); 1691 return NULL; 1692 } 1693 1694 static void unlock_fs(struct mapped_device *md); 1695 1696 static void free_dev(struct mapped_device *md) 1697 { 1698 int minor = MINOR(disk_devt(md->disk)); 1699 1700 unlock_fs(md); 1701 1702 cleanup_mapped_device(md); 1703 1704 free_table_devices(&md->table_devices); 1705 dm_stats_cleanup(&md->stats); 1706 free_minor(minor); 1707 1708 module_put(THIS_MODULE); 1709 kfree(md); 1710 } 1711 1712 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 1713 { 1714 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 1715 1716 if (md->bs) { 1717 /* The md already has necessary mempools. */ 1718 if (dm_table_bio_based(t)) { 1719 /* 1720 * Reload bioset because front_pad may have changed 1721 * because a different table was loaded. 1722 */ 1723 bioset_free(md->bs); 1724 md->bs = p->bs; 1725 p->bs = NULL; 1726 } 1727 /* 1728 * There's no need to reload with request-based dm 1729 * because the size of front_pad doesn't change. 1730 * Note for future: If you are to reload bioset, 1731 * prep-ed requests in the queue may refer 1732 * to bio from the old bioset, so you must walk 1733 * through the queue to unprep. 1734 */ 1735 goto out; 1736 } 1737 1738 BUG_ON(!p || md->io_pool || md->bs); 1739 1740 md->io_pool = p->io_pool; 1741 p->io_pool = NULL; 1742 md->bs = p->bs; 1743 p->bs = NULL; 1744 1745 out: 1746 /* mempool bind completed, no longer need any mempools in the table */ 1747 dm_table_free_md_mempools(t); 1748 } 1749 1750 /* 1751 * Bind a table to the device. 1752 */ 1753 static void event_callback(void *context) 1754 { 1755 unsigned long flags; 1756 LIST_HEAD(uevents); 1757 struct mapped_device *md = (struct mapped_device *) context; 1758 1759 spin_lock_irqsave(&md->uevent_lock, flags); 1760 list_splice_init(&md->uevent_list, &uevents); 1761 spin_unlock_irqrestore(&md->uevent_lock, flags); 1762 1763 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 1764 1765 atomic_inc(&md->event_nr); 1766 atomic_inc(&dm_global_event_nr); 1767 wake_up(&md->eventq); 1768 wake_up(&dm_global_eventq); 1769 } 1770 1771 /* 1772 * Protected by md->suspend_lock obtained by dm_swap_table(). 1773 */ 1774 static void __set_size(struct mapped_device *md, sector_t size) 1775 { 1776 lockdep_assert_held(&md->suspend_lock); 1777 1778 set_capacity(md->disk, size); 1779 1780 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 1781 } 1782 1783 /* 1784 * Returns old map, which caller must destroy. 
1785 */ 1786 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 1787 struct queue_limits *limits) 1788 { 1789 struct dm_table *old_map; 1790 struct request_queue *q = md->queue; 1791 sector_t size; 1792 1793 lockdep_assert_held(&md->suspend_lock); 1794 1795 size = dm_table_get_size(t); 1796 1797 /* 1798 * Wipe any geometry if the size of the table changed. 1799 */ 1800 if (size != dm_get_size(md)) 1801 memset(&md->geometry, 0, sizeof(md->geometry)); 1802 1803 __set_size(md, size); 1804 1805 dm_table_event_callback(t, event_callback, md); 1806 1807 /* 1808 * The queue hasn't been stopped yet, if the old table type wasn't 1809 * for request-based during suspension. So stop it to prevent 1810 * I/O mapping before resume. 1811 * This must be done before setting the queue restrictions, 1812 * because request-based dm may be run just after the setting. 1813 */ 1814 if (dm_table_request_based(t)) { 1815 dm_stop_queue(q); 1816 /* 1817 * Leverage the fact that request-based DM targets are 1818 * immutable singletons and establish md->immutable_target 1819 * - used to optimize both dm_request_fn and dm_mq_queue_rq 1820 */ 1821 md->immutable_target = dm_table_get_immutable_target(t); 1822 } 1823 1824 __bind_mempools(md, t); 1825 1826 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 1827 rcu_assign_pointer(md->map, (void *)t); 1828 md->immutable_target_type = dm_table_get_immutable_target_type(t); 1829 1830 dm_table_set_restrictions(t, q, limits); 1831 if (old_map) 1832 dm_sync_table(md); 1833 1834 return old_map; 1835 } 1836 1837 /* 1838 * Returns unbound table for the caller to free. 1839 */ 1840 static struct dm_table *__unbind(struct mapped_device *md) 1841 { 1842 struct dm_table *map = rcu_dereference_protected(md->map, 1); 1843 1844 if (!map) 1845 return NULL; 1846 1847 dm_table_event_callback(map, NULL, NULL); 1848 RCU_INIT_POINTER(md->map, NULL); 1849 dm_sync_table(md); 1850 1851 return map; 1852 } 1853 1854 /* 1855 * Constructor for a new device. 1856 */ 1857 int dm_create(int minor, struct mapped_device **result) 1858 { 1859 struct mapped_device *md; 1860 1861 md = alloc_dev(minor); 1862 if (!md) 1863 return -ENXIO; 1864 1865 dm_sysfs_init(md); 1866 1867 *result = md; 1868 return 0; 1869 } 1870 1871 /* 1872 * Functions to manage md->type. 1873 * All are required to hold md->type_lock. 1874 */ 1875 void dm_lock_md_type(struct mapped_device *md) 1876 { 1877 mutex_lock(&md->type_lock); 1878 } 1879 1880 void dm_unlock_md_type(struct mapped_device *md) 1881 { 1882 mutex_unlock(&md->type_lock); 1883 } 1884 1885 void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type) 1886 { 1887 BUG_ON(!mutex_is_locked(&md->type_lock)); 1888 md->type = type; 1889 } 1890 1891 enum dm_queue_mode dm_get_md_type(struct mapped_device *md) 1892 { 1893 return md->type; 1894 } 1895 1896 struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 1897 { 1898 return md->immutable_target_type; 1899 } 1900 1901 /* 1902 * The queue_limits are only valid as long as you have a reference 1903 * count on 'md'. 
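 * (A reference is taken with dm_get()/dm_hold() and dropped with dm_put();
 * dm_get_queue_limits() asserts that at least one holder exists.)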
1904 */ 1905 struct queue_limits *dm_get_queue_limits(struct mapped_device *md) 1906 { 1907 BUG_ON(!atomic_read(&md->holders)); 1908 return &md->queue->limits; 1909 } 1910 EXPORT_SYMBOL_GPL(dm_get_queue_limits); 1911 1912 /* 1913 * Setup the DM device's queue based on md's type 1914 */ 1915 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) 1916 { 1917 int r; 1918 enum dm_queue_mode type = dm_get_md_type(md); 1919 1920 switch (type) { 1921 case DM_TYPE_REQUEST_BASED: 1922 r = dm_old_init_request_queue(md, t); 1923 if (r) { 1924 DMERR("Cannot initialize queue for request-based mapped device"); 1925 return r; 1926 } 1927 break; 1928 case DM_TYPE_MQ_REQUEST_BASED: 1929 r = dm_mq_init_request_queue(md, t); 1930 if (r) { 1931 DMERR("Cannot initialize queue for request-based dm-mq mapped device"); 1932 return r; 1933 } 1934 break; 1935 case DM_TYPE_BIO_BASED: 1936 case DM_TYPE_DAX_BIO_BASED: 1937 dm_init_normal_md_queue(md); 1938 blk_queue_make_request(md->queue, dm_make_request); 1939 /* 1940 * DM handles splitting bios as needed. Free the bio_split bioset 1941 * since it won't be used (saves 1 process per bio-based DM device). 1942 */ 1943 bioset_free(md->queue->bio_split); 1944 md->queue->bio_split = NULL; 1945 1946 if (type == DM_TYPE_DAX_BIO_BASED) 1947 queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue); 1948 break; 1949 case DM_TYPE_NONE: 1950 WARN_ON_ONCE(true); 1951 break; 1952 } 1953 1954 return 0; 1955 } 1956 1957 struct mapped_device *dm_get_md(dev_t dev) 1958 { 1959 struct mapped_device *md; 1960 unsigned minor = MINOR(dev); 1961 1962 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 1963 return NULL; 1964 1965 spin_lock(&_minor_lock); 1966 1967 md = idr_find(&_minor_idr, minor); 1968 if (md) { 1969 if ((md == MINOR_ALLOCED || 1970 (MINOR(disk_devt(dm_disk(md))) != minor) || 1971 dm_deleting_md(md) || 1972 test_bit(DMF_FREEING, &md->flags))) { 1973 md = NULL; 1974 goto out; 1975 } 1976 dm_get(md); 1977 } 1978 1979 out: 1980 spin_unlock(&_minor_lock); 1981 1982 return md; 1983 } 1984 EXPORT_SYMBOL_GPL(dm_get_md); 1985 1986 void *dm_get_mdptr(struct mapped_device *md) 1987 { 1988 return md->interface_ptr; 1989 } 1990 1991 void dm_set_mdptr(struct mapped_device *md, void *ptr) 1992 { 1993 md->interface_ptr = ptr; 1994 } 1995 1996 void dm_get(struct mapped_device *md) 1997 { 1998 atomic_inc(&md->holders); 1999 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2000 } 2001 2002 int dm_hold(struct mapped_device *md) 2003 { 2004 spin_lock(&_minor_lock); 2005 if (test_bit(DMF_FREEING, &md->flags)) { 2006 spin_unlock(&_minor_lock); 2007 return -EBUSY; 2008 } 2009 dm_get(md); 2010 spin_unlock(&_minor_lock); 2011 return 0; 2012 } 2013 EXPORT_SYMBOL_GPL(dm_hold); 2014 2015 const char *dm_device_name(struct mapped_device *md) 2016 { 2017 return md->name; 2018 } 2019 EXPORT_SYMBOL_GPL(dm_device_name); 2020 2021 static void __dm_destroy(struct mapped_device *md, bool wait) 2022 { 2023 struct request_queue *q = dm_get_md_queue(md); 2024 struct dm_table *map; 2025 int srcu_idx; 2026 2027 might_sleep(); 2028 2029 spin_lock(&_minor_lock); 2030 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2031 set_bit(DMF_FREEING, &md->flags); 2032 spin_unlock(&_minor_lock); 2033 2034 blk_set_queue_dying(q); 2035 2036 if (dm_request_based(md) && md->kworker_task) 2037 kthread_flush_worker(&md->kworker); 2038 2039 /* 2040 * Take suspend_lock so that presuspend and postsuspend methods 2041 * do not race with internal suspend. 
2042 */ 2043 mutex_lock(&md->suspend_lock); 2044 map = dm_get_live_table(md, &srcu_idx); 2045 if (!dm_suspended_md(md)) { 2046 dm_table_presuspend_targets(map); 2047 dm_table_postsuspend_targets(map); 2048 } 2049 /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ 2050 dm_put_live_table(md, srcu_idx); 2051 mutex_unlock(&md->suspend_lock); 2052 2053 /* 2054 * Rare, but there may be I/O requests still going to complete, 2055 * for example. Wait for all references to disappear. 2056 * No one should increment the reference count of the mapped_device, 2057 * after the mapped_device state becomes DMF_FREEING. 2058 */ 2059 if (wait) 2060 while (atomic_read(&md->holders)) 2061 msleep(1); 2062 else if (atomic_read(&md->holders)) 2063 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", 2064 dm_device_name(md), atomic_read(&md->holders)); 2065 2066 dm_sysfs_exit(md); 2067 dm_table_destroy(__unbind(md)); 2068 free_dev(md); 2069 } 2070 2071 void dm_destroy(struct mapped_device *md) 2072 { 2073 __dm_destroy(md, true); 2074 } 2075 2076 void dm_destroy_immediate(struct mapped_device *md) 2077 { 2078 __dm_destroy(md, false); 2079 } 2080 2081 void dm_put(struct mapped_device *md) 2082 { 2083 atomic_dec(&md->holders); 2084 } 2085 EXPORT_SYMBOL_GPL(dm_put); 2086 2087 static int dm_wait_for_completion(struct mapped_device *md, long task_state) 2088 { 2089 int r = 0; 2090 DEFINE_WAIT(wait); 2091 2092 while (1) { 2093 prepare_to_wait(&md->wait, &wait, task_state); 2094 2095 if (!md_in_flight(md)) 2096 break; 2097 2098 if (signal_pending_state(task_state, current)) { 2099 r = -EINTR; 2100 break; 2101 } 2102 2103 io_schedule(); 2104 } 2105 finish_wait(&md->wait, &wait); 2106 2107 return r; 2108 } 2109 2110 /* 2111 * Process the deferred bios 2112 */ 2113 static void dm_wq_work(struct work_struct *work) 2114 { 2115 struct mapped_device *md = container_of(work, struct mapped_device, 2116 work); 2117 struct bio *c; 2118 int srcu_idx; 2119 struct dm_table *map; 2120 2121 map = dm_get_live_table(md, &srcu_idx); 2122 2123 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2124 spin_lock_irq(&md->deferred_lock); 2125 c = bio_list_pop(&md->deferred); 2126 spin_unlock_irq(&md->deferred_lock); 2127 2128 if (!c) 2129 break; 2130 2131 if (dm_request_based(md)) 2132 generic_make_request(c); 2133 else 2134 __split_and_process_bio(md, map, c); 2135 } 2136 2137 dm_put_live_table(md, srcu_idx); 2138 } 2139 2140 static void dm_queue_flush(struct mapped_device *md) 2141 { 2142 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2143 smp_mb__after_atomic(); 2144 queue_work(md->wq, &md->work); 2145 } 2146 2147 /* 2148 * Swap in a new table, returning the old one for the caller to destroy. 2149 */ 2150 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2151 { 2152 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); 2153 struct queue_limits limits; 2154 int r; 2155 2156 mutex_lock(&md->suspend_lock); 2157 2158 /* device must be suspended */ 2159 if (!dm_suspended_md(md)) 2160 goto out; 2161 2162 /* 2163 * If the new table has no data devices, retain the existing limits. 2164 * This helps multipath with queue_if_no_path if all paths disappear, 2165 * then new I/O is queued based on these limits, and then some paths 2166 * reappear. 
2167 */ 2168 if (dm_table_has_no_data_devices(table)) { 2169 live_map = dm_get_live_table_fast(md); 2170 if (live_map) 2171 limits = md->queue->limits; 2172 dm_put_live_table_fast(md); 2173 } 2174 2175 if (!live_map) { 2176 r = dm_calculate_queue_limits(table, &limits); 2177 if (r) { 2178 map = ERR_PTR(r); 2179 goto out; 2180 } 2181 } 2182 2183 map = __bind(md, table, &limits); 2184 2185 out: 2186 mutex_unlock(&md->suspend_lock); 2187 return map; 2188 } 2189 2190 /* 2191 * Functions to lock and unlock any filesystem running on the 2192 * device. 2193 */ 2194 static int lock_fs(struct mapped_device *md) 2195 { 2196 int r; 2197 2198 WARN_ON(md->frozen_sb); 2199 2200 md->frozen_sb = freeze_bdev(md->bdev); 2201 if (IS_ERR(md->frozen_sb)) { 2202 r = PTR_ERR(md->frozen_sb); 2203 md->frozen_sb = NULL; 2204 return r; 2205 } 2206 2207 set_bit(DMF_FROZEN, &md->flags); 2208 2209 return 0; 2210 } 2211 2212 static void unlock_fs(struct mapped_device *md) 2213 { 2214 if (!test_bit(DMF_FROZEN, &md->flags)) 2215 return; 2216 2217 thaw_bdev(md->bdev, md->frozen_sb); 2218 md->frozen_sb = NULL; 2219 clear_bit(DMF_FROZEN, &md->flags); 2220 } 2221 2222 /* 2223 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG 2224 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE 2225 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY 2226 * 2227 * If __dm_suspend returns 0, the device is completely quiescent 2228 * now. There is no request-processing activity. All new requests 2229 * are being added to md->deferred list. 2230 */ 2231 static int __dm_suspend(struct mapped_device *md, struct dm_table *map, 2232 unsigned suspend_flags, long task_state, 2233 int dmf_suspended_flag) 2234 { 2235 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; 2236 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; 2237 int r; 2238 2239 lockdep_assert_held(&md->suspend_lock); 2240 2241 /* 2242 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2243 * This flag is cleared before dm_suspend returns. 2244 */ 2245 if (noflush) 2246 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2247 else 2248 pr_debug("%s: suspending with flush\n", dm_device_name(md)); 2249 2250 /* 2251 * This gets reverted if there's an error later and the targets 2252 * provide the .presuspend_undo hook. 2253 */ 2254 dm_table_presuspend_targets(map); 2255 2256 /* 2257 * Flush I/O to the device. 2258 * Any I/O submitted after lock_fs() may not be flushed. 2259 * noflush takes precedence over do_lockfs. 2260 * (lock_fs() flushes I/Os and waits for them to complete.) 2261 */ 2262 if (!noflush && do_lockfs) { 2263 r = lock_fs(md); 2264 if (r) { 2265 dm_table_presuspend_undo_targets(map); 2266 return r; 2267 } 2268 } 2269 2270 /* 2271 * Here we must make sure that no processes are submitting requests 2272 * to target drivers i.e. no one may be executing 2273 * __split_and_process_bio. This is called from dm_request and 2274 * dm_wq_work. 2275 * 2276 * To get all processes out of __split_and_process_bio in dm_request, 2277 * we take the write lock. To prevent any process from reentering 2278 * __split_and_process_bio from dm_request and quiesce the thread 2279 * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call 2280 * flush_workqueue(md->wq). 2281 */ 2282 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2283 if (map) 2284 synchronize_srcu(&md->io_barrier); 2285 2286 /* 2287 * Stop md->queue before flushing md->wq in case request-based 2288 * dm defers requests to md->wq from md->queue. 
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example, we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in-flight
 * bios and ensure that any further I/O gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
        struct dm_table *map = NULL;
        int r = 0;

retry:
        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

        if (dm_suspended_md(md)) {
                r = -EINVAL;
                goto out_unlock;
        }

        if (dm_suspended_internally_md(md)) {
                /* already internally suspended, wait for internal resume */
                mutex_unlock(&md->suspend_lock);
                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
                if (r)
                        return r;
                goto retry;
        }

        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

        r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
        if (r)
                goto out_unlock;

        dm_table_postsuspend_targets(map);

out_unlock:
        mutex_unlock(&md->suspend_lock);
        return r;
}
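/*
 * Illustrative sketch (not a real caller): the table-swap sequence the ioctl
 * layer is expected to follow around dm_swap_table(), using only functions
 * defined in this file:
 *
 *      r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);     // quiesce + defer new I/O
 *      if (!r) {
 *              old_map = dm_swap_table(md, new_table); // requires a suspended md
 *              if (!IS_ERR(old_map) && old_map)
 *                      dm_table_destroy(old_map);      // caller destroys the old table
 *              r = dm_resume(md);                      // replay deferred I/O onto the new table
 *      }
 */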
static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
        if (map) {
                int r = dm_table_resume_targets(map);
                if (r)
                        return r;
        }

        dm_queue_flush(md);

        /*
         * Flushing deferred I/Os must be done after targets are resumed
         * so that mapping of targets can work correctly.
         * Request-based dm is queueing the deferred I/Os in its request_queue.
         */
        if (dm_request_based(md))
                dm_start_queue(md->queue);

        unlock_fs(md);

        return 0;
}

int dm_resume(struct mapped_device *md)
{
        int r;
        struct dm_table *map = NULL;

retry:
        r = -EINVAL;
        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

        if (!dm_suspended_md(md))
                goto out;

        if (dm_suspended_internally_md(md)) {
                /* already internally suspended, wait for internal resume */
                mutex_unlock(&md->suspend_lock);
                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
                if (r)
                        return r;
                goto retry;
        }

        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
        if (!map || !dm_table_get_size(map))
                goto out;

        r = __dm_resume(md, map);
        if (r)
                goto out;

        clear_bit(DMF_SUSPENDED, &md->flags);
out:
        mutex_unlock(&md->suspend_lock);

        return r;
}

/*
 * Internal suspend/resume works like userspace-driven suspend.  It waits
 * until all bios finish and prevents issuing new bios to the target drivers.
 * It may be used only from the kernel.
 */

static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
        struct dm_table *map = NULL;

        lockdep_assert_held(&md->suspend_lock);

        if (md->internal_suspend_count++)
                return; /* nested internal suspend */

        if (dm_suspended_md(md)) {
                set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
                return; /* nest suspend */
        }

        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

        /*
         * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
         * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
         * would require changing .presuspend to return an error -- avoid this
         * until there is a need for more elaborate variants of internal suspend.
         */
        (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
                            DMF_SUSPENDED_INTERNALLY);

        dm_table_postsuspend_targets(map);
}

static void __dm_internal_resume(struct mapped_device *md)
{
        BUG_ON(!md->internal_suspend_count);

        if (--md->internal_suspend_count)
                return; /* resume from nested internal suspend */

        if (dm_suspended_md(md))
                goto done; /* resume from nested suspend */

        /*
         * NOTE: existing callers don't need to call dm_table_resume_targets
         * (which may fail -- so best to avoid it for now by passing NULL map)
         */
        (void) __dm_resume(md, NULL);

done:
        clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
        smp_mb__after_atomic();
        wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
}

void dm_internal_suspend_noflush(struct mapped_device *md)
{
        mutex_lock(&md->suspend_lock);
        __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
        mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);

void dm_internal_resume(struct mapped_device *md)
{
        mutex_lock(&md->suspend_lock);
        __dm_internal_resume(md);
        mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume);
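/*
 * dm_internal_suspend_noflush() and dm_internal_resume() are meant to be used
 * in matched pairs; internal_suspend_count lets them nest, so only the
 * outermost pair actually suspends and resumes the device.
 */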
/*
 * Fast variants of internal suspend/resume hold md->suspend_lock,
 * which prevents interaction with userspace-driven suspend.
 */

void dm_internal_suspend_fast(struct mapped_device *md)
{
        mutex_lock(&md->suspend_lock);
        if (dm_suspended_md(md) || dm_suspended_internally_md(md))
                return;

        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
        synchronize_srcu(&md->io_barrier);
        flush_workqueue(md->wq);
        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);

void dm_internal_resume_fast(struct mapped_device *md)
{
        if (dm_suspended_md(md) || dm_suspended_internally_md(md))
                goto done;

        dm_queue_flush(md);

done:
        mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume_fast);

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
                      unsigned cookie)
{
        char udev_cookie[DM_COOKIE_LENGTH];
        char *envp[] = { udev_cookie, NULL };

        if (!cookie)
                return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
        else {
                snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
                         DM_COOKIE_ENV_VAR_NAME, cookie);
                return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
                                          action, envp);
        }
}
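/*
 * When a cookie is supplied, it reaches userspace as an environment variable
 * of the form "DM_COOKIE=<value>" on the emitted uevent, which udev rules can
 * use to match the event to the operation that triggered it.
 */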
uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
        return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
        return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
        return wait_event_interruptible(md->eventq,
                        (event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
        unsigned long flags;

        spin_lock_irqsave(&md->uevent_lock, flags);
        list_add(elist, &md->uevent_list);
        spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
        return md->disk;
}
EXPORT_SYMBOL_GPL(dm_disk);

struct kobject *dm_kobject(struct mapped_device *md)
{
        return &md->kobj_holder.kobj;
}

struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
        struct mapped_device *md;

        md = container_of(kobj, struct mapped_device, kobj_holder.kobj);

        if (test_bit(DMF_FREEING, &md->flags) ||
            dm_deleting_md(md))
                return NULL;

        dm_get(md);
        return md;
}

int dm_suspended_md(struct mapped_device *md)
{
        return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended_internally_md(struct mapped_device *md)
{
        return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
}

int dm_test_deferred_remove_flag(struct mapped_device *md)
{
        return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
        return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
        return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
                                            unsigned integrity, unsigned per_io_data_size)
{
        struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
        unsigned int pool_size = 0;
        unsigned int front_pad;

        if (!pools)
                return NULL;

        switch (type) {
        case DM_TYPE_BIO_BASED:
        case DM_TYPE_DAX_BIO_BASED:
                pool_size = dm_get_reserved_bio_based_ios();
                front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);

                pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
                if (!pools->io_pool)
                        goto out;
                break;
        case DM_TYPE_REQUEST_BASED:
        case DM_TYPE_MQ_REQUEST_BASED:
                pool_size = dm_get_reserved_rq_based_ios();
                front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
                /* per_io_data_size is used for blk-mq pdu at queue allocation */
                break;
        default:
                BUG();
        }

        pools->bs = bioset_create(pool_size, front_pad, BIOSET_NEED_RESCUER);
        if (!pools->bs)
                goto out;

        if (integrity && bioset_integrity_create(pools->bs, pool_size))
                goto out;

        return pools;

out:
        dm_free_md_mempools(pools);

        return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
        if (!pools)
                return;

        mempool_destroy(pools->io_pool);

        if (pools->bs)
                bioset_free(pools->bs);

        kfree(pools);
}
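/*
 * For bio-based tables the bio_set front_pad appears to reserve room in front
 * of every cloned bio for the target's per_io_data_size followed by a
 * struct dm_target_io, so per-I/O state comes from the same allocation as the
 * clone; the roundup keeps struct dm_target_io suitably aligned.
 */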
struct dm_pr {
        u64     old_key;
        u64     new_key;
        u32     flags;
        bool    fail_early;
};

static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
                      void *data)
{
        struct mapped_device *md = bdev->bd_disk->private_data;
        struct dm_table *table;
        struct dm_target *ti;
        int ret = -ENOTTY, srcu_idx;

        table = dm_get_live_table(md, &srcu_idx);
        if (!table || !dm_table_get_size(table))
                goto out;

        /* We only support devices that have a single target */
        if (dm_table_get_num_targets(table) != 1)
                goto out;
        ti = dm_table_get_target(table, 0);

        ret = -EINVAL;
        if (!ti->type->iterate_devices)
                goto out;

        ret = ti->type->iterate_devices(ti, fn, data);
out:
        dm_put_live_table(md, srcu_idx);
        return ret;
}

/*
 * For register / unregister we need to manually call out to every path.
 */
static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
                            sector_t start, sector_t len, void *data)
{
        struct dm_pr *pr = data;
        const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;

        if (!ops || !ops->pr_register)
                return -EOPNOTSUPP;
        return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
}

static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
                          u32 flags)
{
        struct dm_pr pr = {
                .old_key        = old_key,
                .new_key        = new_key,
                .flags          = flags,
                .fail_early     = true,
        };
        int ret;

        ret = dm_call_pr(bdev, __dm_pr_register, &pr);
        if (ret && new_key) {
                /* unregister all paths if we failed to register any path */
                pr.old_key = new_key;
                pr.new_key = 0;
                pr.flags = 0;
                pr.fail_early = false;
                dm_call_pr(bdev, __dm_pr_register, &pr);
        }

        return ret;
}

static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
                         u32 flags)
{
        struct mapped_device *md = bdev->bd_disk->private_data;
        const struct pr_ops *ops;
        fmode_t mode;
        int r;

        r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
        if (r < 0)
                return r;

        ops = bdev->bd_disk->fops->pr_ops;
        if (ops && ops->pr_reserve)
                r = ops->pr_reserve(bdev, key, type, flags);
        else
                r = -EOPNOTSUPP;

        bdput(bdev);
        return r;
}

static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
        struct mapped_device *md = bdev->bd_disk->private_data;
        const struct pr_ops *ops;
        fmode_t mode;
        int r;

        r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
        if (r < 0)
                return r;

        ops = bdev->bd_disk->fops->pr_ops;
        if (ops && ops->pr_release)
                r = ops->pr_release(bdev, key, type);
        else
                r = -EOPNOTSUPP;

        bdput(bdev);
        return r;
}

static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
                         enum pr_type type, bool abort)
{
        struct mapped_device *md = bdev->bd_disk->private_data;
        const struct pr_ops *ops;
        fmode_t mode;
        int r;

        r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
        if (r < 0)
                return r;

        ops = bdev->bd_disk->fops->pr_ops;
        if (ops && ops->pr_preempt)
                r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
        else
                r = -EOPNOTSUPP;

        bdput(bdev);
        return r;
}

static int dm_pr_clear(struct block_device *bdev, u64 key)
{
        struct mapped_device *md = bdev->bd_disk->private_data;
        const struct pr_ops *ops;
        fmode_t mode;
        int r;

        r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
        if (r < 0)
                return r;

        ops = bdev->bd_disk->fops->pr_ops;
        if (ops && ops->pr_clear)
                r = ops->pr_clear(bdev, key);
        else
                r = -EOPNOTSUPP;

        bdput(bdev);
        return r;
}

static const struct pr_ops dm_pr_ops = {
        .pr_register    = dm_pr_register,
        .pr_reserve     = dm_pr_reserve,
        .pr_release     = dm_pr_release,
        .pr_preempt     = dm_pr_preempt,
        .pr_clear       = dm_pr_clear,
};
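/*
 * Only pr_register fans out to every underlying path via iterate_devices;
 * reserve/release/preempt/clear are forwarded to the single underlying device
 * resolved by dm_grab_bdev_for_ioctl(), which seems sufficient because a
 * registration has to exist on all paths before those operations can succeed
 * through any one of them.
 */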
static const struct block_device_operations dm_blk_dops = {
        .open = dm_blk_open,
        .release = dm_blk_close,
        .ioctl = dm_blk_ioctl,
        .getgeo = dm_blk_getgeo,
        .pr_ops = &dm_pr_ops,
        .owner = THIS_MODULE
};

static const struct dax_operations dm_dax_ops = {
        .direct_access = dm_dax_direct_access,
};

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
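/*
 * reserved_bio_based_ios and dm_numa_node are declared with S_IRUGO | S_IWUSR,
 * so (assuming the usual dm_mod module name) they should be readable and
 * writable at runtime under /sys/module/dm_mod/parameters/, while "major"
 * can only be given at load time, e.g. "modprobe dm_mod major=<n>".
 */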