/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-core.h"
#include "dm-rq.h"
#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/sched/signal.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/uio.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/pr.h>

#define DM_MSG_PREFIX "core"

#ifdef CONFIG_PRINTK
/*
 * ratelimit state to be used in DMXXX_LIMIT().
 */
DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
		       DEFAULT_RATELIMIT_INTERVAL,
		       DEFAULT_RATELIMIT_BURST);
EXPORT_SYMBOL(dm_ratelimit_state);
#endif

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_IDR(_minor_idr);

static DEFINE_SPINLOCK(_minor_lock);

static void do_deferred_remove(struct work_struct *w);

static DECLARE_WORK(deferred_remove_work, do_deferred_remove);

static struct workqueue_struct *deferred_remove_workqueue;

atomic_t dm_global_event_nr = ATOMIC_INIT(0);
DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);

/*
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;
	blk_status_t status;
	atomic_t io_count;
	struct bio *bio;
	unsigned long start_time;
	spinlock_t endio_lock;
	struct dm_stats_aux stats_aux;
};

#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_DEFERRED_REMOVE 6
#define DMF_SUSPENDED_INTERNALLY 7

#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;

/*
 * For mempools pre-allocation at the table loading time.
 */
struct dm_md_mempools {
	mempool_t *io_pool;
	struct bio_set *bs;
};

struct table_device {
	struct list_head list;
	atomic_t count;
	struct dm_dev dm_dev;
};

static struct kmem_cache *_io_cache;
static struct kmem_cache *_rq_tio_cache;
static struct kmem_cache *_rq_cache;

/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 */
#define RESERVED_BIO_BASED_IOS		16
static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;

static int __dm_get_module_param_int(int *module_param, int min, int max)
{
	int param = ACCESS_ONCE(*module_param);
	int modified_param = 0;
	bool modified = true;

	if (param < min)
		modified_param = min;
	else if (param > max)
		modified_param = max;
	else
		modified = false;

	if (modified) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned __dm_get_module_param(unsigned *module_param,
			       unsigned def, unsigned max)
{
	unsigned param = ACCESS_ONCE(*module_param);
	unsigned modified_param = 0;

	if (!param)
		modified_param = def;
	else if (param > max)
		modified_param = max;

	if (modified_param) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned dm_get_reserved_bio_based_ios(void)
{
	return __dm_get_module_param(&reserved_bio_based_ios,
				     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
}
EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);

static unsigned dm_get_numa_node(void)
{
	return __dm_get_module_param_int(&dm_numa_node,
					 DM_NUMA_NODE, num_online_nodes() - 1);
}

static int __init local_init(void)
{
	int r = -ENOMEM;

	/* allocate a slab for the dm_ios */
	_io_cache = KMEM_CACHE(dm_io, 0);
	if (!_io_cache)
		return r;

	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
	if (!_rq_tio_cache)
		goto out_free_io_cache;

	_rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
				      __alignof__(struct request), 0, NULL);
	if (!_rq_cache)
		goto out_free_rq_tio_cache;

	r = dm_uevent_init();
	if (r)
		goto out_free_rq_cache;

	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
	if (!deferred_remove_workqueue) {
		r = -ENOMEM;
		goto out_uevent_exit;
	}

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0)
		goto out_free_workqueue;

	if (!_major)
		_major = r;

	return 0;

out_free_workqueue:
	destroy_workqueue(deferred_remove_workqueue);
out_uevent_exit:
	dm_uevent_exit();
out_free_rq_cache:
	kmem_cache_destroy(_rq_cache);
out_free_rq_tio_cache:
	kmem_cache_destroy(_rq_tio_cache);
out_free_io_cache:
	kmem_cache_destroy(_io_cache);

	return r;
}

static void local_exit(void)
{
	flush_scheduled_work();
	destroy_workqueue(deferred_remove_workqueue);

	kmem_cache_destroy(_rq_cache);
	kmem_cache_destroy(_rq_tio_cache);
	kmem_cache_destroy(_io_cache);
	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}

static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_io_init,
	dm_kcopyd_init,
	dm_interface_init,
	dm_statistics_init,
};

static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_io_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
	dm_statistics_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);

	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;

bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();

	/*
	 * Should be empty by this point.
	 */
	idr_destroy(&_minor_idr);
}

/*
 * Block device functions
 */
int dm_deleting_md(struct mapped_device *md)
{
	return test_bit(DMF_DELETING, &md->flags);
}

static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = bdev->bd_disk->private_data;
	if (!md)
		goto out;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}

	dm_get(md);
	atomic_inc(&md->open_count);
out:
	spin_unlock(&_minor_lock);

	return md ? 0 : -ENXIO;
}

static void dm_blk_close(struct gendisk *disk, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = disk->private_data;
	if (WARN_ON(!md))
		goto out;

	if (atomic_dec_and_test(&md->open_count) &&
	    (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
		queue_work(deferred_remove_workqueue, &deferred_remove_work);

	dm_put(md);
out:
	spin_unlock(&_minor_lock);
}

int dm_open_count(struct mapped_device *md)
{
	return atomic_read(&md->open_count);
}

/*
 * Guarantees nothing is using the device before it's deleted.
 */
int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md)) {
		r = -EBUSY;
		if (mark_deferred)
			set_bit(DMF_DEFERRED_REMOVE, &md->flags);
	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
		r = -EEXIST;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

int dm_cancel_deferred_remove(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (test_bit(DMF_DELETING, &md->flags))
		r = -EBUSY;
	else
		clear_bit(DMF_DEFERRED_REMOVE, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

static void do_deferred_remove(struct work_struct *w)
{
	dm_deferred_remove();
}

sector_t dm_get_size(struct mapped_device *md)
{
	return get_capacity(md->disk);
}

struct request_queue *dm_get_md_queue(struct mapped_device *md)
{
	return md->queue;
}

struct dm_stats *dm_get_stats(struct mapped_device *md)
{
	return &md->stats;
}

static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mapped_device *md = bdev->bd_disk->private_data;

	return dm_get_geometry(md, geo);
}

static int dm_grab_bdev_for_ioctl(struct mapped_device *md,
				  struct block_device **bdev,
				  fmode_t *mode)
{
	struct dm_target *tgt;
	struct dm_table *map;
	int srcu_idx, r;

retry:
	r = -ENOTTY;
	map = dm_get_live_table(md, &srcu_idx);
	if (!map || !dm_table_get_size(map))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(map) != 1)
		goto out;

	tgt = dm_table_get_target(map, 0);
	if (!tgt->type->prepare_ioctl)
		goto out;

	if (dm_suspended_md(md)) {
		r = -EAGAIN;
		goto out;
	}

	r = tgt->type->prepare_ioctl(tgt, bdev, mode);
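	/*
	 * Note on prepare_ioctl's return convention (comment added for
	 * clarity): a negative value is an error, 0 means the ioctl may be
	 * forwarded to the returned bdev as-is, and a positive value means
	 * the target maps only a subset of that bdev, so dm_blk_ioctl()
	 * below additionally requires CAP_SYS_RAWIO before forwarding.
	 */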
	if (r < 0)
		goto out;

	bdgrab(*bdev);
	dm_put_live_table(md, srcu_idx);
	return r;

out:
	dm_put_live_table(md, srcu_idx);
	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
		msleep(10);
		goto retry;
	}
	return r;
}

static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	int r;

	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	if (r > 0) {
		/*
		 * Target determined this ioctl is being issued against a
		 * subset of the parent bdev; require extra privileges.
		 */
		if (!capable(CAP_SYS_RAWIO)) {
			DMWARN_LIMIT(
				"%s: sending ioctl %x to DM device without required privilege.",
				current->comm, cmd);
			r = -ENOIOCTLCMD;
			goto out;
		}
	}

	r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
out:
	bdput(bdev);
	return r;
}

static struct dm_io *alloc_io(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_NOIO);
}

static void free_io(struct mapped_device *md, struct dm_io *io)
{
	mempool_free(io, md->io_pool);
}

static void free_tio(struct dm_target_io *tio)
{
	bio_put(&tio->clone);
}

int md_in_flight(struct mapped_device *md)
{
	return atomic_read(&md->pending[READ]) +
	       atomic_read(&md->pending[WRITE]);
}

static void start_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	int cpu;
	int rw = bio_data_dir(bio);

	io->start_time = jiffies;

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_unlock();
	atomic_set(&dm_disk(md)->part0.in_flight[rw],
		   atomic_inc_return(&md->pending[rw]));

	if (unlikely(dm_stats_used(&md->stats)))
		dm_stats_account_io(&md->stats, bio_data_dir(bio),
				    bio->bi_iter.bi_sector, bio_sectors(bio),
				    false, 0, &io->stats_aux);
}

static void end_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	unsigned long duration = jiffies - io->start_time;
	int pending;
	int rw = bio_data_dir(bio);

	generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);

	if (unlikely(dm_stats_used(&md->stats)))
		dm_stats_account_io(&md->stats, bio_data_dir(bio),
				    bio->bi_iter.bi_sector, bio_sectors(bio),
				    true, duration, &io->stats_aux);

	/*
	 * After this is decremented the bio must not be touched if it is
	 * a flush.
	 */
	pending = atomic_dec_return(&md->pending[rw]);
	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
	pending += atomic_read(&md->pending[rw^0x1]);

	/* nudge anyone waiting on suspend queue */
	if (!pending)
		wake_up(&md->wait);
}

/*
 * Add the bio to the list of deferred io.
 */
static void queue_io(struct mapped_device *md, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&md->deferred_lock, flags);
	bio_list_add(&md->deferred, bio);
	spin_unlock_irqrestore(&md->deferred_lock, flags);
	queue_work(md->wq, &md->work);
}

/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished.
 */
struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
{
	*srcu_idx = srcu_read_lock(&md->io_barrier);

	return srcu_dereference(md->map, &md->io_barrier);
}

void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
{
	srcu_read_unlock(&md->io_barrier, srcu_idx);
}

void dm_sync_table(struct mapped_device *md)
{
	synchronize_srcu(&md->io_barrier);
	synchronize_rcu_expedited();
}

/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */
static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
{
	rcu_read_lock();
	return rcu_dereference(md->map);
}

static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
{
	rcu_read_unlock();
}

/*
 * Open a table device so we can use it as a map destination.
 */
static int open_table_device(struct table_device *td, dev_t dev,
			     struct mapped_device *md)
{
	static char *_claim_ptr = "I belong to device-mapper";
	struct block_device *bdev;

	int r;

	BUG_ON(td->dm_dev.bdev);

	bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	r = bd_link_disk_holder(bdev, dm_disk(md));
	if (r) {
		blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
		return r;
	}

	td->dm_dev.bdev = bdev;
	td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
	return 0;
}

/*
 * Close a table device that we've been using.
 */
static void close_table_device(struct table_device *td, struct mapped_device *md)
{
	if (!td->dm_dev.bdev)
		return;

	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
	put_dax(td->dm_dev.dax_dev);
	td->dm_dev.bdev = NULL;
	td->dm_dev.dax_dev = NULL;
}

static struct table_device *find_table_device(struct list_head *l, dev_t dev,
					      fmode_t mode) {
	struct table_device *td;

	list_for_each_entry(td, l, list)
		if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
			return td;

	return NULL;
}

int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
			struct dm_dev **result) {
	int r;
	struct table_device *td;

	mutex_lock(&md->table_devices_lock);
	td = find_table_device(&md->table_devices, dev, mode);
	if (!td) {
		td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
		if (!td) {
			mutex_unlock(&md->table_devices_lock);
			return -ENOMEM;
		}

		td->dm_dev.mode = mode;
		td->dm_dev.bdev = NULL;

		if ((r = open_table_device(td, dev, md))) {
			mutex_unlock(&md->table_devices_lock);
			kfree(td);
			return r;
		}

		format_dev_t(td->dm_dev.name, dev);

		atomic_set(&td->count, 0);
		list_add(&td->list, &md->table_devices);
	}
	atomic_inc(&td->count);
	mutex_unlock(&md->table_devices_lock);

	*result = &td->dm_dev;
	return 0;
}
EXPORT_SYMBOL_GPL(dm_get_table_device);

void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
{
	struct table_device *td = container_of(d, struct table_device, dm_dev);

	mutex_lock(&md->table_devices_lock);
	if (atomic_dec_and_test(&td->count)) {
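		/*
		 * Last reference to this table device: unlink the holder,
		 * release the underlying block device and free the entry
		 * (see close_table_device() above).
		 */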
		close_table_device(td, md);
		list_del(&td->list);
		kfree(td);
	}
	mutex_unlock(&md->table_devices_lock);
}
EXPORT_SYMBOL(dm_put_table_device);

static void free_table_devices(struct list_head *devices)
{
	struct list_head *tmp, *next;

	list_for_each_safe(tmp, next, devices) {
		struct table_device *td = list_entry(tmp, struct table_device, list);

		DMWARN("dm_destroy: %s still exists with %d references",
		       td->dm_dev.name, atomic_read(&td->count));
		kfree(td);
	}
}

/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMWARN("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}

/*-----------------------------------------------------------------
 * CRUD START:
 * A more elegant soln is in the works that uses the queue
 * merge fn, unfortunately there are a couple of changes to
 * the block layer that I want to make for this. So in the
 * interests of getting something for people to use I give
 * you this clearly demarcated crap.
 *---------------------------------------------------------------*/

static int __noflush_suspending(struct mapped_device *md)
{
	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
static void dec_pending(struct dm_io *io, blk_status_t error)
{
	unsigned long flags;
	blk_status_t io_error;
	struct bio *bio;
	struct mapped_device *md = io->md;

	/* Push-back supersedes any I/O errors */
	if (unlikely(error)) {
		spin_lock_irqsave(&io->endio_lock, flags);
		if (!(io->status == BLK_STS_DM_REQUEUE &&
		      __noflush_suspending(md)))
			io->status = error;
		spin_unlock_irqrestore(&io->endio_lock, flags);
	}

	if (atomic_dec_and_test(&io->io_count)) {
		if (io->status == BLK_STS_DM_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 */
			spin_lock_irqsave(&md->deferred_lock, flags);
			if (__noflush_suspending(md))
				bio_list_add_head(&md->deferred, io->bio);
			else
				/* noflush suspend was interrupted. */
				io->status = BLK_STS_IOERR;
			spin_unlock_irqrestore(&md->deferred_lock, flags);
		}

		io_error = io->status;
		bio = io->bio;
		end_io_acct(io);
		free_io(md, io);

		if (io_error == BLK_STS_DM_REQUEUE)
			return;

		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
			/*
			 * Preflush done for flush with data, reissue
			 * without REQ_PREFLUSH.
			 */
			bio->bi_opf &= ~REQ_PREFLUSH;
			queue_io(md, bio);
		} else {
			/* done with normal IO or empty flush */
			bio->bi_status = io_error;
			bio_endio(bio);
		}
	}
}

void disable_write_same(struct mapped_device *md)
{
	struct queue_limits *limits = dm_get_queue_limits(md);

	/* device doesn't really support WRITE SAME, disable it */
	limits->max_write_same_sectors = 0;
}

void disable_write_zeroes(struct mapped_device *md)
{
	struct queue_limits *limits = dm_get_queue_limits(md);

	/* device doesn't really support WRITE ZEROES, disable it */
	limits->max_write_zeroes_sectors = 0;
}

static void clone_endio(struct bio *bio)
{
	blk_status_t error = bio->bi_status;
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	struct dm_io *io = tio->io;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;

	if (unlikely(error == BLK_STS_TARGET)) {
		if (bio_op(bio) == REQ_OP_WRITE_SAME &&
		    !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
			disable_write_same(md);
		if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
		    !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
			disable_write_zeroes(md);
	}

	if (endio) {
		int r = endio(tio->ti, bio, &error);
		switch (r) {
		case DM_ENDIO_REQUEUE:
			error = BLK_STS_DM_REQUEUE;
			/*FALLTHRU*/
		case DM_ENDIO_DONE:
			break;
		case DM_ENDIO_INCOMPLETE:
			/* The target will handle the io */
			return;
		default:
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	free_tio(tio);
	dec_pending(io, error);
}

/*
 * Return maximum size of I/O possible at the supplied sector up to the current
 * target boundary.
 */
static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
{
	sector_t target_offset = dm_target_offset(ti, sector);

	return ti->len - target_offset;
}

static sector_t max_io_len(sector_t sector, struct dm_target *ti)
{
	sector_t len = max_io_len_target_boundary(sector, ti);
	sector_t offset, max_len;

	/*
	 * Does the target need to split even further?
	 */
	if (ti->max_io_len) {
		offset = dm_target_offset(ti, sector);
		if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
			max_len = sector_div(offset, ti->max_io_len);
		else
			max_len = offset & (ti->max_io_len - 1);
		max_len = ti->max_io_len - max_len;

		if (len > max_len)
			len = max_len;
	}

	return len;
}

int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
{
	if (len > UINT_MAX) {
		DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
		      (unsigned long long)len, UINT_MAX);
		ti->error = "Maximum size of target IO is too large";
		return -EINVAL;
	}

	ti->max_io_len = (uint32_t) len;

	return 0;
}
EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);

static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
						sector_t sector, int *srcu_idx)
{
	struct dm_table *map;
	struct dm_target *ti;

	map = dm_get_live_table(md, srcu_idx);
	if (!map)
		return NULL;

	ti = dm_table_find_target(map, sector);
	if (!dm_target_is_valid(ti))
		return NULL;

	return ti;
}

static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
				 long nr_pages, void **kaddr, pfn_t *pfn)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	long len, ret = -EIO;
	int srcu_idx;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);

	if (!ti)
		goto out;
	if (!ti->type->direct_access)
		goto out;
	len = max_io_len(sector, ti) / PAGE_SECTORS;
	if (len < 1)
		goto out;
	nr_pages = min(len, nr_pages);
	if (ti->type->direct_access)
		ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);

out:
	dm_put_live_table(md, srcu_idx);

	return ret;
}

static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
				    void *addr, size_t bytes, struct iov_iter *i)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	long ret = 0;
	int srcu_idx;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);

	if (!ti)
		goto out;
	if (!ti->type->dax_copy_from_iter) {
		ret = copy_from_iter(addr, bytes, i);
		goto out;
	}
	ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
out:
	dm_put_live_table(md, srcu_idx);

	return ret;
}

static void dm_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
			 size_t size)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	int srcu_idx;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);

	if (!ti)
		goto out;
	if (ti->type->dax_flush)
		ti->type->dax_flush(ti, pgoff, addr, size);
out:
	dm_put_live_table(md, srcu_idx);
}

/*
 * A target may call dm_accept_partial_bio only from the map routine. It is
 * allowed for all bio types except REQ_PREFLUSH.
 *
 * dm_accept_partial_bio informs the dm that the target only wants to process
 * additional n_sectors sectors of the bio and the rest of the data should be
 * sent in a next bio.
 *
 * A diagram that explains the arithmetic:
 * +--------------------+---------------+-------+
 * |         1          |       2       |   3   |
 * +--------------------+---------------+-------+
 *
 * <-------------- *tio->len_ptr --------------->
 *                      <------- bi_size ------->
 *                      <-- n_sectors -->
 *
 * Region 1 was already iterated over with bio_advance or similar function.
 *	(it may be empty if the target doesn't use bio_advance)
 * Region 2 is the remaining bio size that the target wants to process.
 *	(it may be empty if region 1 is non-empty, although there is no reason
 *	 to make it empty)
 * The target requires that region 3 is to be sent in the next bio.
 *
 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
 * the partially processed part (the sum of regions 1+2) must be the same for all
 * copies of the bio.
 */
void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
{
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
	BUG_ON(bio->bi_opf & REQ_PREFLUSH);
	BUG_ON(bi_size > *tio->len_ptr);
	BUG_ON(n_sectors > bi_size);
	*tio->len_ptr -= bi_size - n_sectors;
	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);

/*
 * The zone descriptors obtained with a zone report indicate
 * zone positions within the target device. The zone descriptors
 * must be remapped to match their position within the dm device.
 * A target may call dm_remap_zone_report after completion of a
 * REQ_OP_ZONE_REPORT bio to remap the zone descriptors obtained
 * from the target device mapping to the dm device.
 */
void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
{
#ifdef CONFIG_BLK_DEV_ZONED
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	struct bio *report_bio = tio->io->bio;
	struct blk_zone_report_hdr *hdr = NULL;
	struct blk_zone *zone;
	unsigned int nr_rep = 0;
	unsigned int ofst;
	struct bio_vec bvec;
	struct bvec_iter iter;
	void *addr;

	if (bio->bi_status)
		return;

	/*
	 * Remap the start sector of the reported zones. For sequential zones,
	 * also remap the write pointer position.
	 */
	bio_for_each_segment(bvec, report_bio, iter) {
		addr = kmap_atomic(bvec.bv_page);

		/* Remember the report header in the first page */
		if (!hdr) {
			hdr = addr;
			ofst = sizeof(struct blk_zone_report_hdr);
		} else
			ofst = 0;

		/* Set zones start sector */
		while (hdr->nr_zones && ofst < bvec.bv_len) {
			zone = addr + ofst;
			if (zone->start >= start + ti->len) {
				hdr->nr_zones = 0;
				break;
			}
			zone->start = zone->start + ti->begin - start;
			if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
				if (zone->cond == BLK_ZONE_COND_FULL)
					zone->wp = zone->start + zone->len;
				else if (zone->cond == BLK_ZONE_COND_EMPTY)
					zone->wp = zone->start;
				else
					zone->wp = zone->wp + ti->begin - start;
			}
			ofst += sizeof(struct blk_zone);
			hdr->nr_zones--;
			nr_rep++;
		}

		if (addr != hdr)
			kunmap_atomic(addr);

		if (!hdr->nr_zones)
			break;
	}

	if (hdr) {
		hdr->nr_zones = nr_rep;
		kunmap_atomic(hdr);
	}

	bio_advance(report_bio, report_bio->bi_iter.bi_size);

#else /* !CONFIG_BLK_DEV_ZONED */
	bio->bi_status = BLK_STS_NOTSUPP;
#endif
}
EXPORT_SYMBOL_GPL(dm_remap_zone_report);

/*
 * Flush current->bio_list when the target map method blocks.
 * This fixes deadlocks in snapshot and possibly in other targets.
 */
struct dm_offload {
	struct blk_plug plug;
	struct blk_plug_cb cb;
};

static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
{
	struct dm_offload *o = container_of(cb, struct dm_offload, cb);
	struct bio_list list;
	struct bio *bio;
	int i;

	INIT_LIST_HEAD(&o->cb.list);

	if (unlikely(!current->bio_list))
		return;

	for (i = 0; i < 2; i++) {
		list = current->bio_list[i];
		bio_list_init(&current->bio_list[i]);

		while ((bio = bio_list_pop(&list))) {
			struct bio_set *bs = bio->bi_pool;
			if (unlikely(!bs) || bs == fs_bio_set ||
			    !bs->rescue_workqueue) {
				bio_list_add(&current->bio_list[i], bio);
				continue;
			}

			spin_lock(&bs->rescue_lock);
			bio_list_add(&bs->rescue_list, bio);
			queue_work(bs->rescue_workqueue, &bs->rescue_work);
			spin_unlock(&bs->rescue_lock);
		}
	}
}

static void dm_offload_start(struct dm_offload *o)
{
	blk_start_plug(&o->plug);
	o->cb.callback = flush_current_bio_list;
	list_add(&o->cb.list, &current->plug->cb_list);
}

static void dm_offload_end(struct dm_offload *o)
{
	list_del(&o->cb.list);
	blk_finish_plug(&o->plug);
}

static void __map_bio(struct dm_target_io *tio)
{
	int r;
	sector_t sector;
	struct dm_offload o;
	struct bio *clone = &tio->clone;
	struct dm_target *ti = tio->ti;

	clone->bi_end_io = clone_endio;

	/*
	 * Map the clone. If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&tio->io->io_count);
	sector = clone->bi_iter.bi_sector;

	dm_offload_start(&o);
	r = ti->type->map(ti, clone);
	dm_offload_end(&o);

	switch (r) {
	case DM_MAPIO_SUBMITTED:
		break;
	case DM_MAPIO_REMAPPED:
		/* the bio has been remapped so dispatch it */
		trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
				      tio->io->bio->bi_bdev->bd_dev, sector);
		generic_make_request(clone);
		break;
	case DM_MAPIO_KILL:
		dec_pending(tio->io, BLK_STS_IOERR);
		free_tio(tio);
		break;
	case DM_MAPIO_REQUEUE:
		dec_pending(tio->io, BLK_STS_DM_REQUEUE);
		free_tio(tio);
		break;
	default:
		DMWARN("unimplemented target map return value: %d", r);
		BUG();
	}
}

struct clone_info {
	struct mapped_device *md;
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	unsigned sector_count;
};

static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
{
	bio->bi_iter.bi_sector = sector;
	bio->bi_iter.bi_size = to_bytes(len);
}

/*
 * Creates a bio that consists of range of complete bvecs.
 */
static int clone_bio(struct dm_target_io *tio, struct bio *bio,
		     sector_t sector, unsigned len)
{
	struct bio *clone = &tio->clone;

	__bio_clone_fast(clone, bio);

	if (unlikely(bio_integrity(bio) != NULL)) {
		int r;

		if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
			     !dm_target_passes_integrity(tio->ti->type))) {
			DMWARN("%s: the target %s doesn't support integrity data.",
				dm_device_name(tio->io->md),
				tio->ti->type->name);
			return -EIO;
		}

		r = bio_integrity_clone(clone, bio, GFP_NOIO);
		if (r < 0)
			return r;
	}

	if (bio_op(bio) != REQ_OP_ZONE_REPORT)
		bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
	clone->bi_iter.bi_size = to_bytes(len);

	if (unlikely(bio_integrity(bio) != NULL))
		bio_integrity_trim(clone);

	return 0;
}

static struct dm_target_io *alloc_tio(struct clone_info *ci,
				      struct dm_target *ti,
				      unsigned target_bio_nr)
{
	struct dm_target_io *tio;
	struct bio *clone;

	clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
	tio = container_of(clone, struct dm_target_io, clone);

	tio->io = ci->io;
	tio->ti = ti;
	tio->target_bio_nr = target_bio_nr;

	return tio;
}

static void __clone_and_map_simple_bio(struct clone_info *ci,
				       struct dm_target *ti,
				       unsigned target_bio_nr, unsigned *len)
{
	struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr);
	struct bio *clone = &tio->clone;

	tio->len_ptr = len;

	__bio_clone_fast(clone, ci->bio);
	if (len)
		bio_setup_sector(clone, ci->sector, *len);

	__map_bio(tio);
}

static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
				  unsigned num_bios, unsigned *len)
{
	unsigned target_bio_nr;

	for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
		__clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
}

static int __send_empty_flush(struct clone_info *ci)
{
	unsigned target_nr = 0;
	struct dm_target *ti;

	BUG_ON(bio_has_data(ci->bio));
	while ((ti = dm_table_get_target(ci->map, target_nr++)))
		__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);

	return 0;
}

static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
				    sector_t sector, unsigned *len)
{
	struct bio *bio = ci->bio;
	struct dm_target_io *tio;
	unsigned target_bio_nr;
	unsigned num_target_bios = 1;
	int r = 0;

	/*
	 * Does the target want to receive duplicate copies of the bio?
	 */
	if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
		num_target_bios = ti->num_write_bios(ti, bio);

	for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
		tio = alloc_tio(ci, ti, target_bio_nr);
		tio->len_ptr = len;
		r = clone_bio(tio, bio, sector, *len);
		if (r < 0) {
			free_tio(tio);
			break;
		}
		__map_bio(tio);
	}

	return r;
}

typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);

static unsigned get_num_discard_bios(struct dm_target *ti)
{
	return ti->num_discard_bios;
}

static unsigned get_num_write_same_bios(struct dm_target *ti)
{
	return ti->num_write_same_bios;
}

static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
{
	return ti->num_write_zeroes_bios;
}

typedef bool (*is_split_required_fn)(struct dm_target *ti);

static bool is_split_required_for_discard(struct dm_target *ti)
{
	return ti->split_discard_bios;
}

static int __send_changing_extent_only(struct clone_info *ci,
				       get_num_bios_fn get_num_bios,
				       is_split_required_fn is_split_required)
{
	struct dm_target *ti;
	unsigned len;
	unsigned num_bios;

	do {
		ti = dm_table_find_target(ci->map, ci->sector);
		if (!dm_target_is_valid(ti))
			return -EIO;

		/*
		 * Even though the device advertised support for this type of
		 * request, that does not mean every target supports it, and
		 * reconfiguration might also have changed that since the
		 * check was performed.
		 */
		num_bios = get_num_bios ? get_num_bios(ti) : 0;
		if (!num_bios)
			return -EOPNOTSUPP;

		if (is_split_required && !is_split_required(ti))
			len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
		else
			len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));

		__send_duplicate_bios(ci, ti, num_bios, &len);

		ci->sector += len;
	} while (ci->sector_count -= len);

	return 0;
}

static int __send_discard(struct clone_info *ci)
{
	return __send_changing_extent_only(ci, get_num_discard_bios,
					   is_split_required_for_discard);
}

static int __send_write_same(struct clone_info *ci)
{
	return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
}

static int __send_write_zeroes(struct clone_info *ci)
{
	return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL);
}

/*
 * Select the correct strategy for processing a non-flush bio.
 */
static int __split_and_process_non_flush(struct clone_info *ci)
{
	struct bio *bio = ci->bio;
	struct dm_target *ti;
	unsigned len;
	int r;

	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
		return __send_discard(ci);
	else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
		return __send_write_same(ci);
	else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
		return __send_write_zeroes(ci);

	ti = dm_table_find_target(ci->map, ci->sector);
	if (!dm_target_is_valid(ti))
		return -EIO;

	if (bio_op(bio) == REQ_OP_ZONE_REPORT)
		len = ci->sector_count;
	else
		len = min_t(sector_t, max_io_len(ci->sector, ti),
			    ci->sector_count);

	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
	if (r < 0)
		return r;

	ci->sector += len;
	ci->sector_count -= len;

	return 0;
}

/*
 * Entry point to split a bio into clones and submit them to the targets.
 */
static void __split_and_process_bio(struct mapped_device *md,
				    struct dm_table *map, struct bio *bio)
{
	struct clone_info ci;
	int error = 0;

	if (unlikely(!map)) {
		bio_io_error(bio);
		return;
	}

	ci.map = map;
	ci.md = md;
	ci.io = alloc_io(md);
	ci.io->status = 0;
	atomic_set(&ci.io->io_count, 1);
	ci.io->bio = bio;
	ci.io->md = md;
	spin_lock_init(&ci.io->endio_lock);
	ci.sector = bio->bi_iter.bi_sector;

	start_io_acct(ci.io);

	if (bio->bi_opf & REQ_PREFLUSH) {
		ci.bio = &ci.md->flush_bio;
		ci.sector_count = 0;
		error = __send_empty_flush(&ci);
		/* dec_pending submits any data associated with flush */
	} else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
		ci.bio = bio;
		ci.sector_count = 0;
		error = __split_and_process_non_flush(&ci);
	} else {
		ci.bio = bio;
		ci.sector_count = bio_sectors(bio);
		while (ci.sector_count && !error)
			error = __split_and_process_non_flush(&ci);
	}

	/* drop the extra reference count */
	dec_pending(ci.io, error);
}
/*-----------------------------------------------------------------
 * CRUD END
 *---------------------------------------------------------------*/

/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 */
static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
{
	int rw = bio_data_dir(bio);
	struct mapped_device *md = q->queuedata;
	int srcu_idx;
	struct dm_table *map;

	map = dm_get_live_table(md, &srcu_idx);

	generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);

	/* if we're suspended, we have to queue this io for later */
	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
		dm_put_live_table(md, srcu_idx);

		if (!(bio->bi_opf & REQ_RAHEAD))
			queue_io(md, bio);
		else
			bio_io_error(bio);
		return BLK_QC_T_NONE;
	}

	__split_and_process_bio(md, map, bio);
	dm_put_live_table(md, srcu_idx);
	return BLK_QC_T_NONE;
}

static int dm_any_congested(void *congested_data, int bdi_bits)
{
	int r = bdi_bits;
	struct mapped_device *md = congested_data;
	struct dm_table *map;

	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		if (dm_request_based(md)) {
			/*
			 * With request-based DM we only need to check the
			 * top-level queue for congestion.
			 */
			r = md->queue->backing_dev_info->wb.state & bdi_bits;
		} else {
			map = dm_get_live_table_fast(md);
			if (map)
				r = dm_table_any_congested(map, bdi_bits);
			dm_put_live_table_fast(md);
		}
	}

	return r;
}

/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	spin_unlock(&_minor_lock);
}

/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(int minor)
{
	int r;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();
	if (r < 0)
		return r == -ENOSPC ? -EBUSY : r;
	return 0;
}

static int next_free_minor(int *minor)
{
	int r;

	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();
	if (r < 0)
		return r;
	*minor = r;
	return 0;
}

static const struct block_device_operations dm_blk_dops;
static const struct dax_operations dm_dax_ops;

static void dm_wq_work(struct work_struct *work);

void dm_init_md_queue(struct mapped_device *md)
{
	/*
	 * Request-based dm devices cannot be stacked on top of bio-based dm
	 * devices. The type of this dm device may not have been decided yet.
	 * The type is decided at the first table loading time.
	 * To prevent problematic device stacking, clear the queue flag
	 * for request stacking support until then.
	 *
	 * This queue is new, so no concurrency on the queue_flags.
	 */
	queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);

	/*
	 * Initialize data that will only be used by a non-blk-mq DM queue
	 * - must do so here (in alloc_dev callchain) before queue is used
	 */
	md->queue->queuedata = md;
	md->queue->backing_dev_info->congested_data = md;
}

void dm_init_normal_md_queue(struct mapped_device *md)
{
	md->use_blk_mq = false;
	dm_init_md_queue(md);

	/*
	 * Initialize aspects of queue that aren't relevant for blk-mq
	 */
	md->queue->backing_dev_info->congested_fn = dm_any_congested;
}

static void cleanup_mapped_device(struct mapped_device *md)
{
	if (md->wq)
		destroy_workqueue(md->wq);
	if (md->kworker_task)
		kthread_stop(md->kworker_task);
	mempool_destroy(md->io_pool);
	if (md->bs)
		bioset_free(md->bs);

	if (md->dax_dev) {
		kill_dax(md->dax_dev);
		put_dax(md->dax_dev);
		md->dax_dev = NULL;
	}

	if (md->disk) {
		spin_lock(&_minor_lock);
		md->disk->private_data = NULL;
		spin_unlock(&_minor_lock);
		del_gendisk(md->disk);
		put_disk(md->disk);
	}

	if (md->queue)
		blk_cleanup_queue(md->queue);

	cleanup_srcu_struct(&md->io_barrier);

	if (md->bdev) {
		bdput(md->bdev);
		md->bdev = NULL;
	}

	dm_mq_cleanup_mapped_device(md);
}

/*
 * Allocate and initialise a blank device with a given minor.
 */
static struct mapped_device *alloc_dev(int minor)
{
	int r, numa_node_id = dm_get_numa_node();
	struct dax_device *dax_dev;
	struct mapped_device *md;
	void *old_md;

	md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

	if (!try_module_get(THIS_MODULE))
		goto bad_module_get;

	/* get a minor number for the dev */
	if (minor == DM_ANY_MINOR)
		r = next_free_minor(&minor);
	else
		r = specific_minor(minor);
	if (r < 0)
		goto bad_minor;

	r = init_srcu_struct(&md->io_barrier);
	if (r < 0)
		goto bad_io_barrier;

	md->numa_node_id = numa_node_id;
	md->use_blk_mq = dm_use_blk_mq_default();
	md->init_tio_pdu = false;
	md->type = DM_TYPE_NONE;
	mutex_init(&md->suspend_lock);
	mutex_init(&md->type_lock);
	mutex_init(&md->table_devices_lock);
	spin_lock_init(&md->deferred_lock);
	atomic_set(&md->holders, 1);
	atomic_set(&md->open_count, 0);
	atomic_set(&md->event_nr, 0);
	atomic_set(&md->uevent_seq, 0);
	INIT_LIST_HEAD(&md->uevent_list);
	INIT_LIST_HEAD(&md->table_devices);
	spin_lock_init(&md->uevent_lock);

	md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
	if (!md->queue)
		goto bad;

	dm_init_md_queue(md);

	md->disk = alloc_disk_node(1, numa_node_id);
	if (!md->disk)
		goto bad;

	atomic_set(&md->pending[0], 0);
	atomic_set(&md->pending[1], 0);
	init_waitqueue_head(&md->wait);
	INIT_WORK(&md->work, dm_wq_work);
	init_waitqueue_head(&md->eventq);
	init_completion(&md->kobj_holder.completion);
	md->kworker_task = NULL;

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);

	dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
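	/*
	 * The dax_device registered here routes direct-access requests
	 * through dm_dax_ops (dm_dax_direct_access() and friends above)
	 * to whichever target owns the sector in the live table.
	 */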
	if (!dax_dev)
		goto bad;
	md->dax_dev = dax_dev;

	add_disk(md->disk);
	format_dev_t(md->name, MKDEV(_major, minor));

	md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
	if (!md->wq)
		goto bad;

	md->bdev = bdget_disk(md->disk, 0);
	if (!md->bdev)
		goto bad;

	bio_init(&md->flush_bio, NULL, 0);
	md->flush_bio.bi_bdev = md->bdev;
	md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;

	dm_stats_init(&md->stats);

	/* Populate the mapping, nobody knows we exist yet */
	spin_lock(&_minor_lock);
	old_md = idr_replace(&_minor_idr, md, minor);
	spin_unlock(&_minor_lock);

	BUG_ON(old_md != MINOR_ALLOCED);

	return md;

bad:
	cleanup_mapped_device(md);
bad_io_barrier:
	free_minor(minor);
bad_minor:
	module_put(THIS_MODULE);
bad_module_get:
	kfree(md);
	return NULL;
}

static void unlock_fs(struct mapped_device *md);

static void free_dev(struct mapped_device *md)
{
	int minor = MINOR(disk_devt(md->disk));

	unlock_fs(md);

	cleanup_mapped_device(md);

	free_table_devices(&md->table_devices);
	dm_stats_cleanup(&md->stats);
	free_minor(minor);

	module_put(THIS_MODULE);
	kfree(md);
}

static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
	struct dm_md_mempools *p = dm_table_get_md_mempools(t);

	if (md->bs) {
		/* The md already has necessary mempools. */
		if (dm_table_bio_based(t)) {
			/*
			 * Reload bioset because front_pad may have changed
			 * because a different table was loaded.
			 */
			bioset_free(md->bs);
			md->bs = p->bs;
			p->bs = NULL;
		}
		/*
		 * There's no need to reload with request-based dm
		 * because the size of front_pad doesn't change.
		 * Note for future: If you are to reload bioset,
		 * prep-ed requests in the queue may refer
		 * to bio from the old bioset, so you must walk
		 * through the queue to unprep.
		 */
		goto out;
	}

	BUG_ON(!p || md->io_pool || md->bs);

	md->io_pool = p->io_pool;
	p->io_pool = NULL;
	md->bs = p->bs;
	p->bs = NULL;

out:
	/* mempool bind completed, no longer need any mempools in the table */
	dm_table_free_md_mempools(t);
}

/*
 * Bind a table to the device.
 */
static void event_callback(void *context)
{
	unsigned long flags;
	LIST_HEAD(uevents);
	struct mapped_device *md = (struct mapped_device *) context;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_splice_init(&md->uevent_list, &uevents);
	spin_unlock_irqrestore(&md->uevent_lock, flags);

	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);

	atomic_inc(&md->event_nr);
	atomic_inc(&dm_global_event_nr);
	wake_up(&md->eventq);
	wake_up(&dm_global_eventq);
}

/*
 * Protected by md->suspend_lock obtained by dm_swap_table().
 */
static void __set_size(struct mapped_device *md, sector_t size)
{
	lockdep_assert_held(&md->suspend_lock);

	set_capacity(md->disk, size);

	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
}

/*
 * Returns old map, which caller must destroy.
 */
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
			       struct queue_limits *limits)
{
	struct dm_table *old_map;
	struct request_queue *q = md->queue;
	sector_t size;

	lockdep_assert_held(&md->suspend_lock);

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != dm_get_size(md))
		memset(&md->geometry, 0, sizeof(md->geometry));

	__set_size(md, size);

	dm_table_event_callback(t, event_callback, md);

	/*
	 * The queue hasn't been stopped yet, if the old table type wasn't
	 * for request-based during suspension. So stop it to prevent
	 * I/O mapping before resume.
	 * This must be done before setting the queue restrictions,
	 * because request-based dm may be run just after the setting.
	 */
	if (dm_table_request_based(t)) {
		dm_stop_queue(q);
		/*
		 * Leverage the fact that request-based DM targets are
		 * immutable singletons and establish md->immutable_target
		 * - used to optimize both dm_request_fn and dm_mq_queue_rq
		 */
		md->immutable_target = dm_table_get_immutable_target(t);
	}

	__bind_mempools(md, t);

	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	rcu_assign_pointer(md->map, (void *)t);
	md->immutable_target_type = dm_table_get_immutable_target_type(t);

	dm_table_set_restrictions(t, q, limits);
	if (old_map)
		dm_sync_table(md);

	return old_map;
}

/*
 * Returns unbound table for the caller to free.
 */
static struct dm_table *__unbind(struct mapped_device *md)
{
	struct dm_table *map = rcu_dereference_protected(md->map, 1);

	if (!map)
		return NULL;

	dm_table_event_callback(map, NULL, NULL);
	RCU_INIT_POINTER(md->map, NULL);
	dm_sync_table(md);

	return map;
}

/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct mapped_device **result)
{
	struct mapped_device *md;

	md = alloc_dev(minor);
	if (!md)
		return -ENXIO;

	dm_sysfs_init(md);

	*result = md;
	return 0;
}

/*
 * Functions to manage md->type.
 * All are required to hold md->type_lock.
 */
void dm_lock_md_type(struct mapped_device *md)
{
	mutex_lock(&md->type_lock);
}

void dm_unlock_md_type(struct mapped_device *md)
{
	mutex_unlock(&md->type_lock);
}

void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
{
	BUG_ON(!mutex_is_locked(&md->type_lock));
	md->type = type;
}

enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
{
	return md->type;
}

struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
{
	return md->immutable_target_type;
}

/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.
 */
struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
{
	BUG_ON(!atomic_read(&md->holders));
	return &md->queue->limits;
}
EXPORT_SYMBOL_GPL(dm_get_queue_limits);

/*
 * Setup the DM device's queue based on md's type
 */
int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
{
	int r;
	enum dm_queue_mode type = dm_get_md_type(md);

	switch (type) {
	case DM_TYPE_REQUEST_BASED:
		r = dm_old_init_request_queue(md, t);
		if (r) {
			DMERR("Cannot initialize queue for request-based mapped device");
			return r;
		}
		break;
	case DM_TYPE_MQ_REQUEST_BASED:
		r = dm_mq_init_request_queue(md, t);
		if (r) {
			DMERR("Cannot initialize queue for request-based dm-mq mapped device");
			return r;
		}
		break;
	case DM_TYPE_BIO_BASED:
	case DM_TYPE_DAX_BIO_BASED:
		dm_init_normal_md_queue(md);
		blk_queue_make_request(md->queue, dm_make_request);
		/*
		 * DM handles splitting bios as needed.  Free the bio_split bioset
		 * since it won't be used (saves 1 process per bio-based DM device).
		 */
		bioset_free(md->queue->bio_split);
		md->queue->bio_split = NULL;

		if (type == DM_TYPE_DAX_BIO_BASED)
			queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
		break;
	case DM_TYPE_NONE:
		WARN_ON_ONCE(true);
		break;
	}

	return 0;
}

struct mapped_device *dm_get_md(dev_t dev)
{
	struct mapped_device *md;
	unsigned minor = MINOR(dev);

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

	spin_lock(&_minor_lock);

	md = idr_find(&_minor_idr, minor);
	if (md) {
		if ((md == MINOR_ALLOCED ||
		     (MINOR(disk_devt(dm_disk(md))) != minor) ||
		     dm_deleting_md(md) ||
		     test_bit(DMF_FREEING, &md->flags))) {
			md = NULL;
			goto out;
		}
		dm_get(md);
	}

out:
	spin_unlock(&_minor_lock);

	return md;
}
EXPORT_SYMBOL_GPL(dm_get_md);

void *dm_get_mdptr(struct mapped_device *md)
{
	return md->interface_ptr;
}

void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}

void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
	BUG_ON(test_bit(DMF_FREEING, &md->flags));
}

int dm_hold(struct mapped_device *md)
{
	spin_lock(&_minor_lock);
	if (test_bit(DMF_FREEING, &md->flags)) {
		spin_unlock(&_minor_lock);
		return -EBUSY;
	}
	dm_get(md);
	spin_unlock(&_minor_lock);
	return 0;
}
EXPORT_SYMBOL_GPL(dm_hold);

const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);

static void __dm_destroy(struct mapped_device *md, bool wait)
{
	struct request_queue *q = dm_get_md_queue(md);
	struct dm_table *map;
	int srcu_idx;

	might_sleep();

	spin_lock(&_minor_lock);
	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
	set_bit(DMF_FREEING, &md->flags);
	spin_unlock(&_minor_lock);

	blk_set_queue_dying(q);

	if (dm_request_based(md) && md->kworker_task)
		kthread_flush_worker(&md->kworker);

	/*
	 * Take suspend_lock so that presuspend and postsuspend methods
	 * do not race with internal suspend.

static void __dm_destroy(struct mapped_device *md, bool wait)
{
	struct request_queue *q = dm_get_md_queue(md);
	struct dm_table *map;
	int srcu_idx;

	might_sleep();

	spin_lock(&_minor_lock);
	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
	set_bit(DMF_FREEING, &md->flags);
	spin_unlock(&_minor_lock);

	blk_set_queue_dying(q);

	if (dm_request_based(md) && md->kworker_task)
		kthread_flush_worker(&md->kworker);

	/*
	 * Take suspend_lock so that presuspend and postsuspend methods
	 * do not race with internal suspend.
	 */
	mutex_lock(&md->suspend_lock);
	map = dm_get_live_table(md, &srcu_idx);
	if (!dm_suspended_md(md)) {
		dm_table_presuspend_targets(map);
		dm_table_postsuspend_targets(map);
	}
	/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
	dm_put_live_table(md, srcu_idx);
	mutex_unlock(&md->suspend_lock);

	/*
	 * Rare, but there may still be I/O requests that have yet to
	 * complete.  Wait for all references to disappear.
	 * No one should increment the reference count of the mapped_device
	 * after the mapped_device state becomes DMF_FREEING.
	 */
	if (wait)
		while (atomic_read(&md->holders))
			msleep(1);
	else if (atomic_read(&md->holders))
		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
		       dm_device_name(md), atomic_read(&md->holders));

	dm_sysfs_exit(md);
	dm_table_destroy(__unbind(md));
	free_dev(md);
}

void dm_destroy(struct mapped_device *md)
{
	__dm_destroy(md, true);
}

void dm_destroy_immediate(struct mapped_device *md)
{
	__dm_destroy(md, false);
}

void dm_put(struct mapped_device *md)
{
	atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);

static int dm_wait_for_completion(struct mapped_device *md, long task_state)
{
	int r = 0;
	DEFINE_WAIT(wait);

	while (1) {
		prepare_to_wait(&md->wait, &wait, task_state);

		if (!md_in_flight(md))
			break;

		if (signal_pending_state(task_state, current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	finish_wait(&md->wait, &wait);

	return r;
}

/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;
	int srcu_idx;
	struct dm_table *map;

	map = dm_get_live_table(md, &srcu_idx);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c)
			break;

		if (dm_request_based(md))
			generic_make_request(c);
		else
			__split_and_process_bio(md, map, c);
	}

	dm_put_live_table(md, srcu_idx);
}

static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_atomic();
	queue_work(md->wq, &md->work);
}

/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
	struct queue_limits limits;
	int r;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended_md(md))
		goto out;

	/*
	 * If the new table has no data devices, retain the existing limits.
	 * This helps multipath with queue_if_no_path if all paths disappear,
	 * then new I/O is queued based on these limits, and then some paths
	 * reappear.
	 */
	if (dm_table_has_no_data_devices(table)) {
		live_map = dm_get_live_table_fast(md);
		if (live_map)
			limits = md->queue->limits;
		dm_put_live_table_fast(md);
	}

	if (!live_map) {
		r = dm_calculate_queue_limits(table, &limits);
		if (r) {
			map = ERR_PTR(r);
			goto out;
		}
	}

	map = __bind(md, table, &limits);

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
 *
 * If __dm_suspend returns 0, the device is completely quiescent
 * now. There is no request-processing activity. All new requests
 * are being added to md->deferred list.
 */
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
			unsigned suspend_flags, long task_state,
			int dmf_suspended_flag)
{
	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
	int r;

	lockdep_assert_held(&md->suspend_lock);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	else
		pr_debug("%s: suspending with flush\n", dm_device_name(md));

	/*
	 * This gets reverted if there's an error later and the targets
	 * provide the .presuspend_undo hook.
	 */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r) {
			dm_table_presuspend_undo_targets(map);
			return r;
		}
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_make_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_make_request,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_make_request and quiesce the thread
	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
	 * flush_workqueue(md->wq).
	 */
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md)) {
		dm_stop_queue(md->queue);
		if (md->kworker_task)
			kthread_flush_worker(&md->kworker);
	}

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, task_state);
	if (!r)
		set_bit(dmf_suspended_flag, &md->flags);

	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/* were we interrupted ? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			dm_start_queue(md->queue);

		unlock_fs(md);
		dm_table_presuspend_undo_targets(map);
		/* pushback list is already flushed, so skip flush */
	}

	return r;
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem. For example we might want to move some data in
 * the background. Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in-flight
 * bios and ensure that any further I/O gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;

retry:
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
	if (r)
		goto out_unlock;

	dm_table_postsuspend_targets(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}
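
/*
 * Illustrative sketch (hypothetical caller, error handling abbreviated): the
 * suspend / swap / resume sequence described in the comments above, roughly
 * what the DM ioctl path does when a new table is loaded.  'new_table' is
 * assumed to have been created and populated elsewhere.
 */
static int __maybe_unused example_replace_table(struct mapped_device *md,
						struct dm_table *new_table)
{
	struct dm_table *old_map;
	int r;

	r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);	/* flush and defer I/O */
	if (r)
		return r;

	old_map = dm_swap_table(md, new_table);		/* returns the old table */
	if (IS_ERR(old_map)) {
		dm_resume(md);
		return PTR_ERR(old_map);
	}
	if (old_map)
		dm_table_destroy(old_map);		/* caller owns the old table */

	return dm_resume(md);				/* start deferred I/O again */
}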

static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
	if (map) {
		int r = dm_table_resume_targets(map);
		if (r)
			return r;
	}

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		dm_start_queue(md->queue);

	unlock_fs(md);

	return 0;
}

int dm_resume(struct mapped_device *md)
{
	int r;
	struct dm_table *map = NULL;

retry:
	r = -EINVAL;
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (!dm_suspended_md(md))
		goto out;

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	if (!map || !dm_table_get_size(map))
		goto out;

	r = __dm_resume(md, map);
	if (r)
		goto out;

	clear_bit(DMF_SUSPENDED, &md->flags);
out:
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*
 * Internal suspend/resume works like userspace-driven suspend. It waits
 * until all bios finish and prevents issuing new bios to the target drivers.
 * It may be used only from the kernel.
 */

static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;

	lockdep_assert_held(&md->suspend_lock);

	if (md->internal_suspend_count++)
		return; /* nested internal suspend */

	if (dm_suspended_md(md)) {
		set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
		return; /* nest suspend */
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	/*
	 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
	 * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
	 * would require changing .presuspend to return an error -- avoid this
	 * until there is a need for more elaborate variants of internal suspend.
	 */
	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
			    DMF_SUSPENDED_INTERNALLY);

	dm_table_postsuspend_targets(map);
}

static void __dm_internal_resume(struct mapped_device *md)
{
	BUG_ON(!md->internal_suspend_count);

	if (--md->internal_suspend_count)
		return; /* resume from nested internal suspend */

	if (dm_suspended_md(md))
		goto done; /* resume from nested suspend */

	/*
	 * NOTE: existing callers don't need to call dm_table_resume_targets
	 * (which may fail -- so best to avoid it for now by passing NULL map)
	 */
	(void) __dm_resume(md, NULL);

done:
	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
	smp_mb__after_atomic();
	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
}

void dm_internal_suspend_noflush(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);

void dm_internal_resume(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_resume(md);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume);
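
/*
 * Illustrative sketch (hypothetical in-kernel caller): the exported noflush
 * internal suspend/resume pair quiesces the device without any involvement
 * from userspace.  The _fast variants below additionally keep
 * md->suspend_lock held between the two calls.
 */
static void __maybe_unused example_quiesce(struct mapped_device *md)
{
	dm_internal_suspend_noflush(md);
	/* ... device is quiescent here; new bios are deferred ... */
	dm_internal_resume(md);
}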

/*
 * Fast variants of internal suspend/resume hold md->suspend_lock,
 * which prevents interaction with userspace-driven suspend.
 */

void dm_internal_suspend_fast(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		return;

	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	synchronize_srcu(&md->io_barrier);
	flush_workqueue(md->wq);
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);

void dm_internal_resume_fast(struct mapped_device *md)
{
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		goto done;

	dm_queue_flush(md);

done:
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume_fast);

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}
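
/*
 * Illustrative sketch (hypothetical caller): the event counter is sampled
 * first and handed back later, so a change between the two calls is never
 * missed.  This mirrors how the DM_DEV_WAIT ioctl path uses these helpers.
 */
static int __maybe_unused example_wait_for_change(struct mapped_device *md)
{
	uint32_t seen = dm_get_event_nr(md);

	/* ... report 'seen' to the caller, then later block on it ... */
	if (dm_wait_event(md, seen))
		return -ERESTARTSYS;	/* interrupted by a signal */

	return 0;
}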

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}
EXPORT_SYMBOL_GPL(dm_disk);

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj_holder.kobj;
}

struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended_internally_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
}

int dm_test_deferred_remove_flag(struct mapped_device *md)
{
	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
					    unsigned integrity, unsigned per_io_data_size)
{
	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
	unsigned int pool_size = 0;
	unsigned int front_pad;

	if (!pools)
		return NULL;

	switch (type) {
	case DM_TYPE_BIO_BASED:
	case DM_TYPE_DAX_BIO_BASED:
		pool_size = dm_get_reserved_bio_based_ios();
		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);

		pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
		if (!pools->io_pool)
			goto out;
		break;
	case DM_TYPE_REQUEST_BASED:
	case DM_TYPE_MQ_REQUEST_BASED:
		pool_size = dm_get_reserved_rq_based_ios();
		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
		/* per_io_data_size is used for blk-mq pdu at queue allocation */
		break;
	default:
		BUG();
	}

	pools->bs = bioset_create(pool_size, front_pad, BIOSET_NEED_RESCUER);
	if (!pools->bs)
		goto out;

	if (integrity && bioset_integrity_create(pools->bs, pool_size))
		goto out;

	return pools;

out:
	dm_free_md_mempools(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	mempool_destroy(pools->io_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}
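
/*
 * Illustrative sketch (hypothetical caller, sizes made up): mempools are
 * allocated once the table type and the largest per-bio data size are known,
 * typically at table-load time, and freed together with the table.
 */
static int __maybe_unused example_mempools(struct mapped_device *md)
{
	struct dm_md_mempools *pools;

	pools = dm_alloc_md_mempools(md, DM_TYPE_BIO_BASED,
				     0 /* integrity */, 512 /* per_io_data_size */);
	if (!pools)
		return -ENOMEM;

	/* ... hand 'pools' to the bound table; when the table goes away: */
	dm_free_md_mempools(pools);

	return 0;
}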

struct dm_pr {
	u64	old_key;
	u64	new_key;
	u32	flags;
	bool	fail_early;
};

static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
		      void *data)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *table;
	struct dm_target *ti;
	int ret = -ENOTTY, srcu_idx;

	table = dm_get_live_table(md, &srcu_idx);
	if (!table || !dm_table_get_size(table))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(table) != 1)
		goto out;
	ti = dm_table_get_target(table, 0);

	ret = -EINVAL;
	if (!ti->type->iterate_devices)
		goto out;

	ret = ti->type->iterate_devices(ti, fn, data);
out:
	dm_put_live_table(md, srcu_idx);
	return ret;
}

/*
 * For register / unregister we need to manually call out to every path.
 */
static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
			    sector_t start, sector_t len, void *data)
{
	struct dm_pr *pr = data;
	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;

	if (!ops || !ops->pr_register)
		return -EOPNOTSUPP;
	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
}

static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
			  u32 flags)
{
	struct dm_pr pr = {
		.old_key	= old_key,
		.new_key	= new_key,
		.flags		= flags,
		.fail_early	= true,
	};
	int ret;

	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
	if (ret && new_key) {
		/* unregister all paths if we failed to register any path */
		pr.old_key = new_key;
		pr.new_key = 0;
		pr.flags = 0;
		pr.fail_early = false;
		dm_call_pr(bdev, __dm_pr_register, &pr);
	}

	return ret;
}

static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
			 u32 flags)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_reserve)
		r = ops->pr_reserve(bdev, key, type, flags);
	else
		r = -EOPNOTSUPP;

	bdput(bdev);
	return r;
}

static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_release)
		r = ops->pr_release(bdev, key, type);
	else
		r = -EOPNOTSUPP;

	bdput(bdev);
	return r;
}

static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
			 enum pr_type type, bool abort)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_preempt)
		r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
	else
		r = -EOPNOTSUPP;

	bdput(bdev);
	return r;
}

static int dm_pr_clear(struct block_device *bdev, u64 key)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_clear)
		r = ops->pr_clear(bdev, key);
	else
		r = -EOPNOTSUPP;

	bdput(bdev);
	return r;
}

static const struct pr_ops dm_pr_ops = {
	.pr_register	= dm_pr_register,
	.pr_reserve	= dm_pr_reserve,
	.pr_release	= dm_pr_release,
	.pr_preempt	= dm_pr_preempt,
	.pr_clear	= dm_pr_clear,
};
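
/*
 * Illustrative userspace-side sketch (a separate program, not part of this
 * kernel file): the pr_ops installed above are reached through the block
 * layer's generic persistent-reservation ioctls from <linux/pr.h>.  The
 * device path and key value are made up.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/pr.h>
 *
 *	int main(void)
 *	{
 *		struct pr_registration reg = {
 *			.old_key = 0,
 *			.new_key = 0x123abc,	// register this key on every path
 *		};
 *		int fd = open("/dev/dm-0", O_RDWR);
 *
 *		if (fd < 0)
 *			return 1;
 *		if (ioctl(fd, IOC_PR_REGISTER, &reg))	// ends up in dm_pr_register()
 *			perror("IOC_PR_REGISTER");
 *		close(fd);
 *		return 0;
 *	}
 */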

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};

static const struct dax_operations dm_dax_ops = {
	.direct_access = dm_dax_direct_access,
	.copy_from_iter = dm_dax_copy_from_iter,
	.flush = dm_dax_flush,
};

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");