/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-core.h"
#include "dm-rq.h"
#include "dm-uevent.h"
#include "dm-ima.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/uio.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/pr.h>
#include <linux/refcount.h>
#include <linux/part_stat.h>
#include <linux/blk-crypto.h>
#include <linux/blk-crypto-profile.h>

#define DM_MSG_PREFIX "core"

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

/*
 * For REQ_POLLED fs bio, this flag is set if we link mapped underlying
 * dm_io into one list, and reuse bio->bi_private as the list head. Before
 * ending this fs bio, we will recover its ->bi_private.
 */
#define REQ_DM_POLL_LIST	REQ_DRV

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_IDR(_minor_idr);

static DEFINE_SPINLOCK(_minor_lock);

static void do_deferred_remove(struct work_struct *w);

static DECLARE_WORK(deferred_remove_work, do_deferred_remove);

static struct workqueue_struct *deferred_remove_workqueue;

atomic_t dm_global_event_nr = ATOMIC_INIT(0);
DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);

void dm_issue_global_event(void)
{
	atomic_inc(&dm_global_event_nr);
	wake_up(&dm_global_eventq);
}

/*
 * One of these is allocated (on-stack) per original bio.
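 * It also tracks how much of that bio is left to process: ->sector and
 * ->sector_count advance as clones are mapped, and any remainder is split
 * off and resubmitted by dm_split_and_process_bio().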
 */
struct clone_info {
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	unsigned sector_count;
	bool submit_as_polled;
};

#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
#define DM_IO_BIO_OFFSET \
	(offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio))

static inline struct dm_target_io *clone_to_tio(struct bio *clone)
{
	return container_of(clone, struct dm_target_io, clone);
}

void *dm_per_bio_data(struct bio *bio, size_t data_size)
{
	if (!clone_to_tio(bio)->inside_dm_io)
		return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size;
	return (char *)bio - DM_IO_BIO_OFFSET - data_size;
}
EXPORT_SYMBOL_GPL(dm_per_bio_data);

struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
{
	struct dm_io *io = (struct dm_io *)((char *)data + data_size);
	if (io->magic == DM_IO_MAGIC)
		return (struct bio *)((char *)io + DM_IO_BIO_OFFSET);
	BUG_ON(io->magic != DM_TIO_MAGIC);
	return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET);
}
EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);

unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
{
	return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
}
EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);

#define MINOR_ALLOCED ((void *)-1)

#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;

#define DEFAULT_SWAP_BIOS	(8 * 1048576 / PAGE_SIZE)
static int swap_bios = DEFAULT_SWAP_BIOS;
static int get_swap_bios(void)
{
	int latch = READ_ONCE(swap_bios);
	if (unlikely(latch <= 0))
		latch = DEFAULT_SWAP_BIOS;
	return latch;
}

/*
 * For mempools pre-allocation at the table loading time.
 */
struct dm_md_mempools {
	struct bio_set bs;
	struct bio_set io_bs;
};

struct table_device {
	struct list_head list;
	refcount_t count;
	struct dm_dev dm_dev;
};

/*
 * Bio-based DM's mempools' reserved IOs set by the user.
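 * A value of 0 falls back to RESERVED_BIO_BASED_IOS and anything above
 * DM_RESERVED_MAX_IOS is capped; see dm_get_reserved_bio_based_ios() below.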
 */
#define RESERVED_BIO_BASED_IOS		16
static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;

static int __dm_get_module_param_int(int *module_param, int min, int max)
{
	int param = READ_ONCE(*module_param);
	int modified_param = 0;
	bool modified = true;

	if (param < min)
		modified_param = min;
	else if (param > max)
		modified_param = max;
	else
		modified = false;

	if (modified) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned __dm_get_module_param(unsigned *module_param,
			       unsigned def, unsigned max)
{
	unsigned param = READ_ONCE(*module_param);
	unsigned modified_param = 0;

	if (!param)
		modified_param = def;
	else if (param > max)
		modified_param = max;

	if (modified_param) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned dm_get_reserved_bio_based_ios(void)
{
	return __dm_get_module_param(&reserved_bio_based_ios,
				     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
}
EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);

static unsigned dm_get_numa_node(void)
{
	return __dm_get_module_param_int(&dm_numa_node,
					 DM_NUMA_NODE, num_online_nodes() - 1);
}

static int __init local_init(void)
{
	int r;

	r = dm_uevent_init();
	if (r)
		return r;

	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
	if (!deferred_remove_workqueue) {
		r = -ENOMEM;
		goto out_uevent_exit;
	}

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0)
		goto out_free_workqueue;

	if (!_major)
		_major = r;

	return 0;

out_free_workqueue:
	destroy_workqueue(deferred_remove_workqueue);
out_uevent_exit:
	dm_uevent_exit();

	return r;
}

static void local_exit(void)
{
	flush_scheduled_work();
	destroy_workqueue(deferred_remove_workqueue);

	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}

static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_io_init,
	dm_kcopyd_init,
	dm_interface_init,
	dm_statistics_init,
};

static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_io_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
	dm_statistics_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);
	int r, i;

#if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE))
	DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled."
	       " Duplicate IMA measurements will not be recorded in the IMA log.");
#endif

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;
bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();

	/*
	 * Should be empty by this point.
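	 * Every mapped_device drops its minor via free_dev() -> free_minor(),
	 * so only the IDR bookkeeping itself remains to be torn down here.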
	 */
	idr_destroy(&_minor_idr);
}

/*
 * Block device functions
 */
int dm_deleting_md(struct mapped_device *md)
{
	return test_bit(DMF_DELETING, &md->flags);
}

static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = bdev->bd_disk->private_data;
	if (!md)
		goto out;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}

	dm_get(md);
	atomic_inc(&md->open_count);
out:
	spin_unlock(&_minor_lock);

	return md ? 0 : -ENXIO;
}

static void dm_blk_close(struct gendisk *disk, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = disk->private_data;
	if (WARN_ON(!md))
		goto out;

	if (atomic_dec_and_test(&md->open_count) &&
	    (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
		queue_work(deferred_remove_workqueue, &deferred_remove_work);

	dm_put(md);
out:
	spin_unlock(&_minor_lock);
}

int dm_open_count(struct mapped_device *md)
{
	return atomic_read(&md->open_count);
}

/*
 * Guarantees nothing is using the device before it's deleted.
 */
int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md)) {
		r = -EBUSY;
		if (mark_deferred)
			set_bit(DMF_DEFERRED_REMOVE, &md->flags);
	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
		r = -EEXIST;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

int dm_cancel_deferred_remove(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (test_bit(DMF_DELETING, &md->flags))
		r = -EBUSY;
	else
		clear_bit(DMF_DEFERRED_REMOVE, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

static void do_deferred_remove(struct work_struct *w)
{
	dm_deferred_remove();
}

static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mapped_device *md = bdev->bd_disk->private_data;

	return dm_get_geometry(md, geo);
}

static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
			    struct block_device **bdev)
{
	struct dm_target *tgt;
	struct dm_table *map;
	int r;

retry:
	r = -ENOTTY;
	map = dm_get_live_table(md, srcu_idx);
	if (!map || !dm_table_get_size(map))
		return r;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(map) != 1)
		return r;

	tgt = dm_table_get_target(map, 0);
	if (!tgt->type->prepare_ioctl)
		return r;

	if (dm_suspended_md(md))
		return -EAGAIN;

	r = tgt->type->prepare_ioctl(tgt, bdev);
	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
		dm_put_live_table(md, *srcu_idx);
		msleep(10);
		goto retry;
	}

	return r;
}

static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
{
	dm_put_live_table(md, srcu_idx);
}

static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	if (r > 0) {
		/*
		 * Target determined this ioctl is being issued against a
		 * subset of the parent bdev; require extra privileges.
		 */
		if (!capable(CAP_SYS_RAWIO)) {
			DMDEBUG_LIMIT(
	"%s: sending ioctl %x to DM device without required privilege.",
				current->comm, cmd);
			r = -ENOIOCTLCMD;
			goto out;
		}
	}

	if (!bdev->bd_disk->fops->ioctl)
		r = -ENOTTY;
	else
		r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

u64 dm_start_time_ns_from_clone(struct bio *bio)
{
	return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
}
EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);

static bool bio_is_flush_with_data(struct bio *bio)
{
	return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
}

static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio,
		       unsigned long start_time, struct dm_stats_aux *stats_aux)
{
	bool is_flush_with_data;
	unsigned int bi_size;

	/* If REQ_PREFLUSH set save any payload but do not account it */
	is_flush_with_data = bio_is_flush_with_data(bio);
	if (is_flush_with_data) {
		bi_size = bio->bi_iter.bi_size;
		bio->bi_iter.bi_size = 0;
	}

	if (!end)
		bio_start_io_acct_time(bio, start_time);
	else
		bio_end_io_acct(bio, start_time);

	if (unlikely(dm_stats_used(&md->stats)))
		dm_stats_account_io(&md->stats, bio_data_dir(bio),
				    bio->bi_iter.bi_sector, bio_sectors(bio),
				    end, start_time, stats_aux);

	/* Restore bio's payload so it does get accounted upon requeue */
	if (is_flush_with_data)
		bio->bi_iter.bi_size = bi_size;
}

static void __dm_start_io_acct(struct dm_io *io, struct bio *bio)
{
	dm_io_acct(false, io->md, bio, io->start_time, &io->stats_aux);
}

static void dm_start_io_acct(struct dm_io *io, struct bio *clone)
{
	/* Must account IO to DM device in terms of orig_bio */
	struct bio *bio = io->orig_bio;

	/*
	 * Ensure IO accounting is only ever started once.
	 * Expect no possibility for race unless is_duplicate_bio.
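	 * Duplicate bios issued by __send_duplicate_bios() can reach this
	 * point concurrently, which is why that case uses xchg() on
	 * ->was_accounted below instead of a plain store.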
	 */
	if (!clone || likely(!clone_to_tio(clone)->is_duplicate_bio)) {
		if (WARN_ON_ONCE(io->was_accounted))
			return;
		io->was_accounted = 1;
	} else if (xchg(&io->was_accounted, 1) == 1)
		return;

	__dm_start_io_acct(io, bio);
}

static void dm_end_io_acct(struct dm_io *io, struct bio *bio)
{
	dm_io_acct(true, io->md, bio, io->start_time, &io->stats_aux);
}

static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
{
	struct dm_io *io;
	struct dm_target_io *tio;
	struct bio *clone;

	clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs);

	tio = clone_to_tio(clone);
	tio->inside_dm_io = true;
	tio->io = NULL;

	io = container_of(tio, struct dm_io, tio);
	io->magic = DM_IO_MAGIC;
	io->status = 0;
	atomic_set(&io->io_count, 1);
	this_cpu_inc(*md->pending_io);
	io->orig_bio = NULL;
	io->md = md;
	io->map_task = current;
	spin_lock_init(&io->endio_lock);

	io->start_time = jiffies;
	io->start_io_acct = false;
	io->was_accounted = 0;

	dm_stats_record_start(&md->stats, &io->stats_aux);

	return io;
}

static void free_io(struct dm_io *io)
{
	bio_put(&io->tio.clone);
}

static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
			     unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
{
	struct dm_target_io *tio;
	struct bio *clone;

	if (!ci->io->tio.io) {
		/* the dm_target_io embedded in ci->io is available */
		tio = &ci->io->tio;
		/* alloc_io() already initialized embedded clone */
		clone = &tio->clone;
	} else {
		clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio,
					gfp_mask, &ci->io->md->bs);
		if (!clone)
			return NULL;

		/* REQ_DM_POLL_LIST shouldn't be inherited */
		clone->bi_opf &= ~REQ_DM_POLL_LIST;

		tio = clone_to_tio(clone);
		tio->inside_dm_io = false;
	}

	tio->magic = DM_TIO_MAGIC;
	tio->io = ci->io;
	tio->ti = ti;
	tio->target_bio_nr = target_bio_nr;
	tio->is_duplicate_bio = false;
	tio->len_ptr = len;
	tio->old_sector = 0;

	if (len) {
		clone->bi_iter.bi_size = to_bytes(*len);
		if (bio_integrity(clone))
			bio_integrity_trim(clone);
	}

	return clone;
}

static void free_tio(struct bio *clone)
{
	if (clone_to_tio(clone)->inside_dm_io)
		return;
	bio_put(clone);
}

/*
 * Add the bio to the list of deferred io.
 */
static void queue_io(struct mapped_device *md, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&md->deferred_lock, flags);
	bio_list_add(&md->deferred, bio);
	spin_unlock_irqrestore(&md->deferred_lock, flags);
	queue_work(md->wq, &md->work);
}

/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished.
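 *
 * A minimal usage sketch (the same pattern dm_prepare_ioctl() uses):
 *
 *	int srcu_idx;
 *	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 *
 *	if (map)
 *		... inspect or iterate over the table ...
 *	dm_put_live_table(md, srcu_idx);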
 */
struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
{
	*srcu_idx = srcu_read_lock(&md->io_barrier);

	return srcu_dereference(md->map, &md->io_barrier);
}

void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
{
	srcu_read_unlock(&md->io_barrier, srcu_idx);
}

void dm_sync_table(struct mapped_device *md)
{
	synchronize_srcu(&md->io_barrier);
	synchronize_rcu_expedited();
}

/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */
static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
{
	rcu_read_lock();
	return rcu_dereference(md->map);
}

static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
{
	rcu_read_unlock();
}

static char *_dm_claim_ptr = "I belong to device-mapper";

/*
 * Open a table device so we can use it as a map destination.
 */
static int open_table_device(struct table_device *td, dev_t dev,
			     struct mapped_device *md)
{
	struct block_device *bdev;
	u64 part_off;
	int r;

	BUG_ON(td->dm_dev.bdev);

	bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	r = bd_link_disk_holder(bdev, dm_disk(md));
	if (r) {
		blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
		return r;
	}

	td->dm_dev.bdev = bdev;
	td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off);
	return 0;
}

/*
 * Close a table device that we've been using.
 */
static void close_table_device(struct table_device *td, struct mapped_device *md)
{
	if (!td->dm_dev.bdev)
		return;

	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
	put_dax(td->dm_dev.dax_dev);
	td->dm_dev.bdev = NULL;
	td->dm_dev.dax_dev = NULL;
}

static struct table_device *find_table_device(struct list_head *l, dev_t dev,
					      fmode_t mode)
{
	struct table_device *td;

	list_for_each_entry(td, l, list)
		if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
			return td;

	return NULL;
}

int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
			struct dm_dev **result)
{
	int r;
	struct table_device *td;

	mutex_lock(&md->table_devices_lock);
	td = find_table_device(&md->table_devices, dev, mode);
	if (!td) {
		td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
		if (!td) {
			mutex_unlock(&md->table_devices_lock);
			return -ENOMEM;
		}

		td->dm_dev.mode = mode;
		td->dm_dev.bdev = NULL;

		if ((r = open_table_device(td, dev, md))) {
			mutex_unlock(&md->table_devices_lock);
			kfree(td);
			return r;
		}

		format_dev_t(td->dm_dev.name, dev);

		refcount_set(&td->count, 1);
		list_add(&td->list, &md->table_devices);
	} else {
		refcount_inc(&td->count);
	}
	mutex_unlock(&md->table_devices_lock);

	*result = &td->dm_dev;
	return 0;
}

void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
{
	struct table_device *td = container_of(d, struct table_device, dm_dev);

	mutex_lock(&md->table_devices_lock);
	if (refcount_dec_and_test(&td->count)) {
		close_table_device(td, md);
		list_del(&td->list);
		kfree(td);
	}
	mutex_unlock(&md->table_devices_lock);
}

static void free_table_devices(struct list_head *devices)
{
	struct list_head *tmp, *next;

	list_for_each_safe(tmp, next, devices) {
		struct table_device *td = list_entry(tmp, struct table_device, list);

		DMWARN("dm_destroy: %s still exists with %d references",
		       td->dm_dev.name, refcount_read(&td->count));
		kfree(td);
	}
}

/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMWARN("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}

static int __noflush_suspending(struct mapped_device *md)
{
	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}

static void dm_io_complete(struct dm_io *io)
{
	blk_status_t io_error;
	struct mapped_device *md = io->md;
	struct bio *bio = io->orig_bio;

	if (io->status == BLK_STS_DM_REQUEUE) {
		unsigned long flags;
		/*
		 * Target requested pushing back the I/O.
		 */
		spin_lock_irqsave(&md->deferred_lock, flags);
		if (__noflush_suspending(md) &&
		    !WARN_ON_ONCE(dm_is_zone_write(md, bio))) {
			/* NOTE early return due to BLK_STS_DM_REQUEUE below */
			bio_list_add_head(&md->deferred, bio);
		} else {
			/*
			 * noflush suspend was interrupted or this is
			 * a write to a zoned target.
			 */
			io->status = BLK_STS_IOERR;
		}
		spin_unlock_irqrestore(&md->deferred_lock, flags);
	}

	io_error = io->status;
	if (io->was_accounted)
		dm_end_io_acct(io, bio);
	else if (!io_error) {
		/*
		 * Must handle target that DM_MAPIO_SUBMITTED only to
		 * then bio_endio() rather than dm_submit_bio_remap()
		 */
		__dm_start_io_acct(io, bio);
		dm_end_io_acct(io, bio);
	}
	free_io(io);
	smp_wmb();
	this_cpu_dec(*md->pending_io);

	/* nudge anyone waiting on suspend queue */
	if (unlikely(wq_has_sleeper(&md->wait)))
		wake_up(&md->wait);

	if (io_error == BLK_STS_DM_REQUEUE) {
		/*
		 * Upper layer won't help us poll split bio, io->orig_bio
		 * may only reflect a subset of the pre-split original,
		 * so clear REQ_POLLED in case of requeue
		 */
		bio->bi_opf &= ~REQ_POLLED;
		return;
	}

	if (bio_is_flush_with_data(bio)) {
		/*
		 * Preflush done for flush with data, reissue
		 * without REQ_PREFLUSH.
		 */
		bio->bi_opf &= ~REQ_PREFLUSH;
		queue_io(md, bio);
	} else {
		/* done with normal IO or empty flush */
		if (io_error)
			bio->bi_status = io_error;
		bio_endio(bio);
	}
}

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
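 *
 * Errors reported by clones are recorded in io->status under io->endio_lock;
 * a BLK_STS_DM_REQUEUE push-back is only preserved while a noflush suspend is
 * in progress (see __noflush_suspending()).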
 */
void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
{
	/* Push-back supersedes any I/O errors */
	if (unlikely(error)) {
		unsigned long flags;
		spin_lock_irqsave(&io->endio_lock, flags);
		if (!(io->status == BLK_STS_DM_REQUEUE &&
		      __noflush_suspending(io->md)))
			io->status = error;
		spin_unlock_irqrestore(&io->endio_lock, flags);
	}

	if (atomic_dec_and_test(&io->io_count))
		dm_io_complete(io);
}

void disable_discard(struct mapped_device *md)
{
	struct queue_limits *limits = dm_get_queue_limits(md);

	/* device doesn't really support DISCARD, disable it */
	limits->max_discard_sectors = 0;
	blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
}

void disable_write_same(struct mapped_device *md)
{
	struct queue_limits *limits = dm_get_queue_limits(md);

	/* device doesn't really support WRITE SAME, disable it */
	limits->max_write_same_sectors = 0;
}

void disable_write_zeroes(struct mapped_device *md)
{
	struct queue_limits *limits = dm_get_queue_limits(md);

	/* device doesn't really support WRITE ZEROES, disable it */
	limits->max_write_zeroes_sectors = 0;
}

static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
{
	return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
}

static void clone_endio(struct bio *bio)
{
	blk_status_t error = bio->bi_status;
	struct dm_target_io *tio = clone_to_tio(bio);
	struct dm_io *io = tio->io;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;

	if (unlikely(error == BLK_STS_TARGET)) {
		if (bio_op(bio) == REQ_OP_DISCARD &&
		    !q->limits.max_discard_sectors)
			disable_discard(md);
		else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
			 !q->limits.max_write_same_sectors)
			disable_write_same(md);
		else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
			 !q->limits.max_write_zeroes_sectors)
			disable_write_zeroes(md);
	}

	if (blk_queue_is_zoned(q))
		dm_zone_endio(io, bio);

	if (endio) {
		int r = endio(tio->ti, bio, &error);
		switch (r) {
		case DM_ENDIO_REQUEUE:
			/*
			 * Requeuing writes to a sequential zone of a zoned
			 * target will break the sequential write pattern:
			 * fail such IO.
			 */
			if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
				error = BLK_STS_IOERR;
			else
				error = BLK_STS_DM_REQUEUE;
			fallthrough;
		case DM_ENDIO_DONE:
			break;
		case DM_ENDIO_INCOMPLETE:
			/* The target will handle the io */
			return;
		default:
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	if (unlikely(swap_bios_limit(tio->ti, bio))) {
		struct mapped_device *md = io->md;
		up(&md->swap_bios_semaphore);
	}

	free_tio(bio);
	dm_io_dec_pending(io, error);
}

/*
 * Return maximum size of I/O possible at the supplied sector up to the current
 * target boundary.
 */
static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
						  sector_t target_offset)
{
	return ti->len - target_offset;
}

static sector_t max_io_len(struct dm_target *ti, sector_t sector)
{
	sector_t target_offset = dm_target_offset(ti, sector);
	sector_t len = max_io_len_target_boundary(ti, target_offset);
	sector_t max_len;

	/*
	 * Does the target need to split IO even further?
	 * - varied (per target) IO splitting is a tenet of DM; this
	 *   explains why stacked chunk_sectors based splitting via
	 *   blk_max_size_offset() isn't possible here. So pass in
	 *   ti->max_io_len to override stacked chunk_sectors.
	 */
	if (ti->max_io_len) {
		max_len = blk_max_size_offset(ti->table->md->queue,
					      target_offset, ti->max_io_len);
		if (len > max_len)
			len = max_len;
	}

	return len;
}

int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
{
	if (len > UINT_MAX) {
		DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
		      (unsigned long long)len, UINT_MAX);
		ti->error = "Maximum size of target IO is too large";
		return -EINVAL;
	}

	ti->max_io_len = (uint32_t) len;

	return 0;
}
EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);

static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
						sector_t sector, int *srcu_idx)
	__acquires(md->io_barrier)
{
	struct dm_table *map;
	struct dm_target *ti;

	map = dm_get_live_table(md, srcu_idx);
	if (!map)
		return NULL;

	ti = dm_table_find_target(map, sector);
	if (!ti)
		return NULL;

	return ti;
}

static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
				 long nr_pages, void **kaddr, pfn_t *pfn)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	long len, ret = -EIO;
	int srcu_idx;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);

	if (!ti)
		goto out;
	if (!ti->type->direct_access)
		goto out;
	len = max_io_len(ti, sector) / PAGE_SECTORS;
	if (len < 1)
		goto out;
	nr_pages = min(len, nr_pages);
	ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);

 out:
	dm_put_live_table(md, srcu_idx);

	return ret;
}

static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
				  size_t nr_pages)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	int ret = -EIO;
	int srcu_idx;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);

	if (!ti)
		goto out;
	if (WARN_ON(!ti->type->dax_zero_page_range)) {
		/*
		 * ->zero_page_range() is mandatory dax operation. If we are
		 *  here, something is wrong.
		 */
		goto out;
	}
	ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
 out:
	dm_put_live_table(md, srcu_idx);

	return ret;
}

/*
 * A target may call dm_accept_partial_bio only from the map routine.  It is
 * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
 * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by
 * __send_duplicate_bios().
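 *
 * For example (hypothetical target code): a ->map method that can only handle
 * the first N sectors of the clone it was handed might call
 * dm_accept_partial_bio(bio, N) before returning DM_MAPIO_REMAPPED; DM core
 * then sends the remaining sectors to the target in a follow-up bio.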
 *
 * dm_accept_partial_bio informs the dm that the target only wants to process
 * additional n_sectors sectors of the bio and the rest of the data should be
 * sent in a next bio.
 *
 * A diagram that explains the arithmetic:
 * +--------------------+---------------+-------+
 * |         1          |       2       |   3   |
 * +--------------------+---------------+-------+
 *
 * <-------------- *tio->len_ptr --------------->
 *                      <------- bi_size ------->
 *                      <-- n_sectors -->
 *
 * Region 1 was already iterated over with bio_advance or similar function.
 *	(it may be empty if the target doesn't use bio_advance)
 * Region 2 is the remaining bio size that the target wants to process.
 *	(it may be empty if region 1 is non-empty, although there is no reason
 *	 to make it empty)
 * The target requires that region 3 is to be sent in the next bio.
 *
 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
 * the partially processed part (the sum of regions 1+2) must be the same for all
 * copies of the bio.
 */
void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
{
	struct dm_target_io *tio = clone_to_tio(bio);
	unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;

	BUG_ON(tio->is_duplicate_bio);
	BUG_ON(op_is_zone_mgmt(bio_op(bio)));
	BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
	BUG_ON(bi_size > *tio->len_ptr);
	BUG_ON(n_sectors > bi_size);

	*tio->len_ptr -= bi_size - n_sectors;
	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);

static inline void __dm_submit_bio_remap(struct bio *clone,
					 dev_t dev, sector_t old_sector)
{
	trace_block_bio_remap(clone, dev, old_sector);
	submit_bio_noacct(clone);
}

/*
 * @clone: clone bio that DM core passed to target's .map function
 * @tgt_clone: clone of @clone bio that target needs submitted
 *
 * Targets should use this interface to submit bios they take
 * ownership of when returning DM_MAPIO_SUBMITTED.
 *
 * Target should also enable ti->accounts_remapped_io
 */
void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone)
{
	struct dm_target_io *tio = clone_to_tio(clone);
	struct dm_io *io = tio->io;

	WARN_ON_ONCE(!tio->ti->accounts_remapped_io);

	/* establish bio that will get submitted */
	if (!tgt_clone)
		tgt_clone = clone;

	/*
	 * Account io->orig_bio to DM dev on behalf of target
	 * that took ownership of IO with DM_MAPIO_SUBMITTED.
	 */
	if (io->map_task == current) {
		/* Still in target's map function */
		io->start_io_acct = true;
	} else {
		/*
		 * Called by another thread, managed by DM target,
		 * wait for dm_split_and_process_bio() to store
		 * io->orig_bio
		 */
		while (unlikely(!smp_load_acquire(&io->orig_bio)))
			msleep(1);
		dm_start_io_acct(io, clone);
	}

	__dm_submit_bio_remap(tgt_clone, disk_devt(io->md->disk),
			      tio->old_sector);
}
EXPORT_SYMBOL_GPL(dm_submit_bio_remap);

static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
{
	mutex_lock(&md->swap_bios_lock);
	while (latch < md->swap_bios) {
		cond_resched();
		down(&md->swap_bios_semaphore);
		md->swap_bios--;
	}
	while (latch > md->swap_bios) {
		cond_resched();
		up(&md->swap_bios_semaphore);
		md->swap_bios++;
	}
	mutex_unlock(&md->swap_bios_lock);
}

static void __map_bio(struct bio *clone)
{
	struct dm_target_io *tio = clone_to_tio(clone);
	int r;
	struct dm_io *io = tio->io;
	struct dm_target *ti = tio->ti;

	clone->bi_end_io = clone_endio;

	/*
	 * Map the clone.
	 */
	dm_io_inc_pending(io);
	tio->old_sector = clone->bi_iter.bi_sector;

	if (unlikely(swap_bios_limit(ti, clone))) {
		struct mapped_device *md = io->md;
		int latch = get_swap_bios();
		if (unlikely(latch != md->swap_bios))
			__set_swap_bios_limit(md, latch);
		down(&md->swap_bios_semaphore);
	}

	/*
	 * Check if the IO needs a special mapping due to zone append emulation
	 * on zoned target. In this case, dm_zone_map_bio() calls the target
	 * map operation.
	 */
	if (dm_emulate_zone_append(io->md))
		r = dm_zone_map_bio(tio);
	else
		r = ti->type->map(ti, clone);

	switch (r) {
	case DM_MAPIO_SUBMITTED:
		/* target has assumed ownership of this io */
		if (!ti->accounts_remapped_io)
			io->start_io_acct = true;
		break;
	case DM_MAPIO_REMAPPED:
		/*
		 * the bio has been remapped so dispatch it, but defer
		 * dm_start_io_acct() until after possible bio_split().
		 */
		__dm_submit_bio_remap(clone, disk_devt(io->md->disk),
				      tio->old_sector);
		io->start_io_acct = true;
		break;
	case DM_MAPIO_KILL:
	case DM_MAPIO_REQUEUE:
		if (unlikely(swap_bios_limit(ti, clone)))
			up(&io->md->swap_bios_semaphore);
		free_tio(clone);
		if (r == DM_MAPIO_KILL)
			dm_io_dec_pending(io, BLK_STS_IOERR);
		else
			dm_io_dec_pending(io, BLK_STS_DM_REQUEUE);
		break;
	default:
		DMWARN("unimplemented target map return value: %d", r);
		BUG();
	}
}

static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
				struct dm_target *ti, unsigned num_bios,
				unsigned *len)
{
	struct bio *bio;
	int try;

	for (try = 0; try < 2; try++) {
		int bio_nr;

		if (try)
			mutex_lock(&ci->io->md->table_devices_lock);
		for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
			bio = alloc_tio(ci, ti, bio_nr, len,
					try ? GFP_NOIO : GFP_NOWAIT);
			if (!bio)
				break;

			bio_list_add(blist, bio);
		}
		if (try)
			mutex_unlock(&ci->io->md->table_devices_lock);
		if (bio_nr == num_bios)
			return;

		while ((bio = bio_list_pop(blist)))
			free_tio(bio);
	}
}

static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
				  unsigned num_bios, unsigned *len)
{
	struct bio_list blist = BIO_EMPTY_LIST;
	struct bio *clone;

	switch (num_bios) {
	case 0:
		break;
	case 1:
		clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
		clone_to_tio(clone)->is_duplicate_bio = true;
		__map_bio(clone);
		break;
	default:
		alloc_multiple_bios(&blist, ci, ti, num_bios, len);
		while ((clone = bio_list_pop(&blist))) {
			clone_to_tio(clone)->is_duplicate_bio = true;
			__map_bio(clone);
		}
		break;
	}
}

static void __send_empty_flush(struct clone_info *ci)
{
	unsigned target_nr = 0;
	struct dm_target *ti;
	struct bio flush_bio;

	/*
	 * Use an on-stack bio for this, it's safe since we don't
	 * need to reference it after submit. It's just used as
	 * the basis for the clone(s).
	 */
	bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
		 REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);

	ci->bio = &flush_bio;
	ci->sector_count = 0;

	while ((ti = dm_table_get_target(ci->map, target_nr++)))
		__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);

	bio_uninit(ci->bio);
}

static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
					unsigned num_bios)
{
	unsigned len;

	len = min_t(sector_t, ci->sector_count,
		    max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));

	/*
	 * dm_accept_partial_bio cannot be used with duplicate bios,
	 * so update clone_info cursor before __send_duplicate_bios().
	 */
	ci->sector += len;
	ci->sector_count -= len;

	__send_duplicate_bios(ci, ti, num_bios, &len);
}

static bool is_abnormal_io(struct bio *bio)
{
	bool r = false;

	switch (bio_op(bio)) {
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_SAME:
	case REQ_OP_WRITE_ZEROES:
		r = true;
		break;
	}

	return r;
}

static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
				  int *result)
{
	unsigned num_bios = 0;

	switch (bio_op(ci->bio)) {
	case REQ_OP_DISCARD:
		num_bios = ti->num_discard_bios;
		break;
	case REQ_OP_SECURE_ERASE:
		num_bios = ti->num_secure_erase_bios;
		break;
	case REQ_OP_WRITE_SAME:
		num_bios = ti->num_write_same_bios;
		break;
	case REQ_OP_WRITE_ZEROES:
		num_bios = ti->num_write_zeroes_bios;
		break;
	default:
		return false;
	}

	/*
	 * Even though the device advertised support for this type of
	 * request, that does not mean every target supports it, and
	 * reconfiguration might also have changed that since the
	 * check was performed.
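	 * In that case the bio is failed with -EOPNOTSUPP below rather
	 * than being passed to the target.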
	 */
	if (!num_bios)
		*result = -EOPNOTSUPP;
	else {
		__send_changing_extent_only(ci, ti, num_bios);
		*result = 0;
	}
	return true;
}

/*
 * Reuse ->bi_private as hlist head for storing all dm_io instances
 * associated with this bio, and this bio's bi_private needs to be
 * stored in dm_io->data before the reuse.
 *
 * bio->bi_private is owned by fs or upper layer, so block layer won't
 * touch it after splitting. Meantime it won't be changed by anyone after
 * bio is submitted. So this reuse is safe.
 */
static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio)
{
	return (struct hlist_head *)&bio->bi_private;
}

static void dm_queue_poll_io(struct bio *bio, struct dm_io *io)
{
	struct hlist_head *head = dm_get_bio_hlist_head(bio);

	if (!(bio->bi_opf & REQ_DM_POLL_LIST)) {
		bio->bi_opf |= REQ_DM_POLL_LIST;
		/*
		 * Save .bi_private into dm_io, so that we can reuse
		 * .bi_private as hlist head for storing dm_io list
		 */
		io->data = bio->bi_private;

		INIT_HLIST_HEAD(head);

		/* tell block layer to poll for completion */
		bio->bi_cookie = ~BLK_QC_T_NONE;
	} else {
		/*
		 * bio recursed due to split, reuse original poll list,
		 * and save bio->bi_private too.
		 */
		io->data = hlist_entry(head->first, struct dm_io, node)->data;
	}

	hlist_add_head(&io->node, head);
}

/*
 * Select the correct strategy for processing a non-flush bio.
 */
static int __split_and_process_bio(struct clone_info *ci)
{
	struct bio *clone;
	struct dm_target *ti;
	unsigned len;
	int r;

	ti = dm_table_find_target(ci->map, ci->sector);
	if (!ti)
		return -EIO;

	if (__process_abnormal_io(ci, ti, &r))
		return r;

	/*
	 * Only support bio polling for normal IO, and the target io is
	 * exactly inside the dm_io instance (verified in dm_poll_dm_io)
	 */
	ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED;

	len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
	clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
	__map_bio(clone);

	ci->sector += len;
	ci->sector_count -= len;

	return 0;
}

static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
			    struct dm_table *map, struct bio *bio)
{
	ci->map = map;
	ci->io = alloc_io(md, bio);
	ci->bio = bio;
	ci->submit_as_polled = false;
	ci->sector = bio->bi_iter.bi_sector;
	ci->sector_count = bio_sectors(bio);

	/* Shouldn't happen but sector_count was being set to 0 so... */
	if (WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count))
		ci->sector_count = 0;
}

/*
 * Entry point to split a bio into clones and submit them to the targets.
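 * Flushes are fanned out to every target via __send_empty_flush(); anything
 * else is mapped to the first target it touches, and any remainder beyond
 * that target (or beyond max_io_len()) is split off and resubmitted with
 * submit_bio_noacct().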
 */
static void dm_split_and_process_bio(struct mapped_device *md,
				     struct dm_table *map, struct bio *bio)
{
	struct clone_info ci;
	struct bio *orig_bio = NULL;
	int error = 0;

	init_clone_info(&ci, md, map, bio);

	if (bio->bi_opf & REQ_PREFLUSH) {
		__send_empty_flush(&ci);
		/* dm_io_complete submits any data associated with flush */
		goto out;
	}

	error = __split_and_process_bio(&ci);
	ci.io->map_task = NULL;
	if (error || !ci.sector_count)
		goto out;

	/*
	 * Remainder must be passed to submit_bio_noacct() so it gets handled
	 * *after* bios already submitted have been completely processed.
	 * We take a clone of the original to store in ci.io->orig_bio to be
	 * used by dm_end_io_acct() and for dm_io_complete() to use for
	 * completion handling.
	 */
	orig_bio = bio_split(bio, bio_sectors(bio) - ci.sector_count,
			     GFP_NOIO, &md->queue->bio_split);
	bio_chain(orig_bio, bio);
	trace_block_split(orig_bio, bio->bi_iter.bi_sector);
	submit_bio_noacct(bio);
out:
	if (!orig_bio)
		orig_bio = bio;
	smp_store_release(&ci.io->orig_bio, orig_bio);
	if (ci.io->start_io_acct)
		dm_start_io_acct(ci.io, NULL);

	/*
	 * Drop the extra reference count for non-POLLED bio, and hold one
	 * reference for POLLED bio, which will be released in dm_poll_bio
	 *
	 * Add every dm_io instance into the hlist_head which is stored in
	 * bio->bi_private, so that dm_poll_bio can poll them all.
	 */
	if (error || !ci.submit_as_polled)
		dm_io_dec_pending(ci.io, errno_to_blk_status(error));
	else
		dm_queue_poll_io(bio, ci.io);
}

static void dm_submit_bio(struct bio *bio)
{
	struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
	int srcu_idx;
	struct dm_table *map;

	map = dm_get_live_table(md, &srcu_idx);

	/* If suspended, or map not yet available, queue this IO for later */
	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) ||
	    unlikely(!map)) {
		if (bio->bi_opf & REQ_NOWAIT)
			bio_wouldblock_error(bio);
		else if (bio->bi_opf & REQ_RAHEAD)
			bio_io_error(bio);
		else
			queue_io(md, bio);
		goto out;
	}

	/*
	 * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc)
	 * otherwise associated queue_limits won't be imposed.
	 */
	if (is_abnormal_io(bio))
		blk_queue_split(&bio);

	dm_split_and_process_bio(md, map, bio);
out:
	dm_put_live_table(md, srcu_idx);
}

static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob,
			  unsigned int flags)
{
	WARN_ON_ONCE(!io->tio.inside_dm_io);

	/* don't poll if the mapped io is done */
	if (atomic_read(&io->io_count) > 1)
		bio_poll(&io->tio.clone, iob, flags);

	/* bio_poll holds the last reference */
	return atomic_read(&io->io_count) == 1;
}

static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob,
		       unsigned int flags)
{
	struct hlist_head *head = dm_get_bio_hlist_head(bio);
	struct hlist_head tmp = HLIST_HEAD_INIT;
	struct hlist_node *next;
	struct dm_io *io;

	/* Only poll normal bio which was marked as REQ_DM_POLL_LIST */
	if (!(bio->bi_opf & REQ_DM_POLL_LIST))
		return 0;

	WARN_ON_ONCE(hlist_empty(head));

	hlist_move_list(head, &tmp);

	/*
	 * Restore .bi_private before possibly completing dm_io.
	 *
	 * bio_poll() is only possible once @bio has been completely
	 * submitted via submit_bio_noacct()'s depth-first submission.
	 * So there is no dm_queue_poll_io() race associated with
	 * clearing REQ_DM_POLL_LIST here.
	 */
	bio->bi_opf &= ~REQ_DM_POLL_LIST;
	bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data;

	hlist_for_each_entry_safe(io, next, &tmp, node) {
		if (dm_poll_dm_io(io, iob, flags)) {
			hlist_del_init(&io->node);
			/*
			 * clone_endio() has already occurred, so passing
			 * error as 0 here doesn't override io->status
			 */
			dm_io_dec_pending(io, 0);
		}
	}

	/* Not done? */
	if (!hlist_empty(&tmp)) {
		bio->bi_opf |= REQ_DM_POLL_LIST;
		/* Reset bio->bi_private to dm_io list head */
		hlist_move_list(&tmp, head);
		return 0;
	}
	return 1;
}

/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	spin_unlock(&_minor_lock);
}

/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(int minor)
{
	int r;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();
	if (r < 0)
		return r == -ENOSPC ? -EBUSY : r;
	return 0;
}

static int next_free_minor(int *minor)
{
	int r;

	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();
	if (r < 0)
		return r;
	*minor = r;
	return 0;
}

static const struct block_device_operations dm_blk_dops;
static const struct block_device_operations dm_rq_blk_dops;
static const struct dax_operations dm_dax_ops;

static void dm_wq_work(struct work_struct *work);

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
static void dm_queue_destroy_crypto_profile(struct request_queue *q)
{
	dm_destroy_crypto_profile(q->crypto_profile);
}

#else /* CONFIG_BLK_INLINE_ENCRYPTION */

static inline void dm_queue_destroy_crypto_profile(struct request_queue *q)
{
}
#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */

static void cleanup_mapped_device(struct mapped_device *md)
{
	if (md->wq)
		destroy_workqueue(md->wq);
	bioset_exit(&md->bs);
	bioset_exit(&md->io_bs);

	if (md->dax_dev) {
		dax_remove_host(md->disk);
		kill_dax(md->dax_dev);
		put_dax(md->dax_dev);
		md->dax_dev = NULL;
	}

	dm_cleanup_zoned_dev(md);
	if (md->disk) {
		spin_lock(&_minor_lock);
		md->disk->private_data = NULL;
		spin_unlock(&_minor_lock);
		if (dm_get_md_type(md) != DM_TYPE_NONE) {
			dm_sysfs_exit(md);
			del_gendisk(md->disk);
		}
		dm_queue_destroy_crypto_profile(md->queue);
		blk_cleanup_disk(md->disk);
	}

	if (md->pending_io) {
		free_percpu(md->pending_io);
		md->pending_io = NULL;
	}

	cleanup_srcu_struct(&md->io_barrier);

	mutex_destroy(&md->suspend_lock);
	mutex_destroy(&md->type_lock);
	mutex_destroy(&md->table_devices_lock);
	mutex_destroy(&md->swap_bios_lock);

	dm_mq_cleanup_mapped_device(md);
}

/*
 * Allocate and initialise a blank device with a given minor.
 */
static struct mapped_device *alloc_dev(int minor)
{
	int r, numa_node_id = dm_get_numa_node();
	struct mapped_device *md;
	void *old_md;

	md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

	if (!try_module_get(THIS_MODULE))
		goto bad_module_get;

	/* get a minor number for the dev */
	if (minor == DM_ANY_MINOR)
		r = next_free_minor(&minor);
	else
		r = specific_minor(minor);
	if (r < 0)
		goto bad_minor;

	r = init_srcu_struct(&md->io_barrier);
	if (r < 0)
		goto bad_io_barrier;

	md->numa_node_id = numa_node_id;
	md->init_tio_pdu = false;
	md->type = DM_TYPE_NONE;
	mutex_init(&md->suspend_lock);
	mutex_init(&md->type_lock);
	mutex_init(&md->table_devices_lock);
	spin_lock_init(&md->deferred_lock);
	atomic_set(&md->holders, 1);
	atomic_set(&md->open_count, 0);
	atomic_set(&md->event_nr, 0);
	atomic_set(&md->uevent_seq, 0);
	INIT_LIST_HEAD(&md->uevent_list);
	INIT_LIST_HEAD(&md->table_devices);
	spin_lock_init(&md->uevent_lock);

	/*
	 * default to bio-based until DM table is loaded and md->type
	 * established. If request-based table is loaded: blk-mq will
	 * override accordingly.
	 */
	md->disk = blk_alloc_disk(md->numa_node_id);
	if (!md->disk)
		goto bad;
	md->queue = md->disk->queue;

	init_waitqueue_head(&md->wait);
	INIT_WORK(&md->work, dm_wq_work);
	init_waitqueue_head(&md->eventq);
	init_completion(&md->kobj_holder.completion);

	md->swap_bios = get_swap_bios();
	sema_init(&md->swap_bios_semaphore, md->swap_bios);
	mutex_init(&md->swap_bios_lock);

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->minors = 1;
	md->disk->flags |= GENHD_FL_NO_PART;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);

	if (IS_ENABLED(CONFIG_FS_DAX)) {
		md->dax_dev = alloc_dax(md, &dm_dax_ops);
		if (IS_ERR(md->dax_dev)) {
			md->dax_dev = NULL;
			goto bad;
		}
		set_dax_nocache(md->dax_dev);
		set_dax_nomc(md->dax_dev);
		if (dax_add_host(md->dax_dev, md->disk))
			goto bad;
	}

	format_dev_t(md->name, MKDEV(_major, minor));

	md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name);
	if (!md->wq)
		goto bad;

	md->pending_io = alloc_percpu(unsigned long);
	if (!md->pending_io)
		goto bad;

	dm_stats_init(&md->stats);

	/* Populate the mapping, nobody knows we exist yet */
	spin_lock(&_minor_lock);
	old_md = idr_replace(&_minor_idr, md, minor);
	spin_unlock(&_minor_lock);

	BUG_ON(old_md != MINOR_ALLOCED);

	return md;

bad:
	cleanup_mapped_device(md);
bad_io_barrier:
	free_minor(minor);
bad_minor:
	module_put(THIS_MODULE);
bad_module_get:
	kvfree(md);
	return NULL;
}

static void unlock_fs(struct mapped_device *md);

static void free_dev(struct mapped_device *md)
{
	int minor = MINOR(disk_devt(md->disk));

	unlock_fs(md);

	cleanup_mapped_device(md);

	free_table_devices(&md->table_devices);
	dm_stats_cleanup(&md->stats);
	free_minor(minor);

	module_put(THIS_MODULE);
	kvfree(md);
}

static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
	struct dm_md_mempools *p = dm_table_get_md_mempools(t);
	int ret = 0;

	if (dm_table_bio_based(t)) {
		/*
		 * The md may already have mempools that need changing.
		 * If so, reload bioset because front_pad may have changed
		 * because a different table was loaded.
		 */
		bioset_exit(&md->bs);
		bioset_exit(&md->io_bs);

	} else if (bioset_initialized(&md->bs)) {
		/*
		 * There's no need to reload with request-based dm
		 * because the size of front_pad doesn't change.
		 * Note for future: If you are to reload bioset,
		 * prep-ed requests in the queue may refer
		 * to bio from the old bioset, so you must walk
		 * through the queue to unprep.
		 */
		goto out;
	}

	BUG_ON(!p ||
	       bioset_initialized(&md->bs) ||
	       bioset_initialized(&md->io_bs));

	ret = bioset_init_from_src(&md->bs, &p->bs);
	if (ret)
		goto out;
	ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
	if (ret)
		bioset_exit(&md->bs);
out:
	/* mempool bind completed, no longer need any mempools in the table */
	dm_table_free_md_mempools(t);
	return ret;
}

/*
 * Bind a table to the device.
 */
static void event_callback(void *context)
{
	unsigned long flags;
	LIST_HEAD(uevents);
	struct mapped_device *md = (struct mapped_device *) context;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_splice_init(&md->uevent_list, &uevents);
	spin_unlock_irqrestore(&md->uevent_lock, flags);

	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
	dm_issue_global_event();
}

/*
 * Returns old map, which caller must destroy.
 */
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
			       struct queue_limits *limits)
{
	struct dm_table *old_map;
	sector_t size;
	int ret;

	lockdep_assert_held(&md->suspend_lock);

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != dm_get_size(md))
		memset(&md->geometry, 0, sizeof(md->geometry));

	if (!get_capacity(md->disk))
		set_capacity(md->disk, size);
	else
		set_capacity_and_notify(md->disk, size);

	dm_table_event_callback(t, event_callback, md);

	if (dm_table_request_based(t)) {
		/*
		 * Leverage the fact that request-based DM targets are
		 * immutable singletons - used to optimize dm_mq_queue_rq.
		 */
		md->immutable_target = dm_table_get_immutable_target(t);
	}

	ret = __bind_mempools(md, t);
	if (ret) {
		old_map = ERR_PTR(ret);
		goto out;
	}

	ret = dm_table_set_restrictions(t, md->queue, limits);
	if (ret) {
		old_map = ERR_PTR(ret);
		goto out;
	}

	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	rcu_assign_pointer(md->map, (void *)t);
	md->immutable_target_type = dm_table_get_immutable_target_type(t);

	if (old_map)
		dm_sync_table(md);
out:
	return old_map;
}

/*
 * Returns unbound table for the caller to free.
 */
static struct dm_table *__unbind(struct mapped_device *md)
{
	struct dm_table *map = rcu_dereference_protected(md->map, 1);

	if (!map)
		return NULL;

	dm_table_event_callback(map, NULL, NULL);
	RCU_INIT_POINTER(md->map, NULL);
	dm_sync_table(md);

	return map;
}

/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct mapped_device **result)
{
	struct mapped_device *md;

	md = alloc_dev(minor);
	if (!md)
		return -ENXIO;

	dm_ima_reset_data(md);

	*result = md;
	return 0;
}

/*
 * Functions to manage md->type.
 * All are required to hold md->type_lock.
 */
void dm_lock_md_type(struct mapped_device *md)
{
	mutex_lock(&md->type_lock);
}

void dm_unlock_md_type(struct mapped_device *md)
{
	mutex_unlock(&md->type_lock);
}

void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
{
	BUG_ON(!mutex_is_locked(&md->type_lock));
	md->type = type;
}

enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
{
	return md->type;
}

struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
{
	return md->immutable_target_type;
}

/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.
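 * Callers such as disable_discard() above rely on this to tweak the limits
 * in place when a target turns out not to support an advertised operation.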
2134 */ 2135 struct queue_limits *dm_get_queue_limits(struct mapped_device *md) 2136 { 2137 BUG_ON(!atomic_read(&md->holders)); 2138 return &md->queue->limits; 2139 } 2140 EXPORT_SYMBOL_GPL(dm_get_queue_limits); 2141 2142 /* 2143 * Setup the DM device's queue based on md's type 2144 */ 2145 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) 2146 { 2147 enum dm_queue_mode type = dm_table_get_type(t); 2148 struct queue_limits limits; 2149 int r; 2150 2151 switch (type) { 2152 case DM_TYPE_REQUEST_BASED: 2153 md->disk->fops = &dm_rq_blk_dops; 2154 r = dm_mq_init_request_queue(md, t); 2155 if (r) { 2156 DMERR("Cannot initialize queue for request-based dm mapped device"); 2157 return r; 2158 } 2159 break; 2160 case DM_TYPE_BIO_BASED: 2161 case DM_TYPE_DAX_BIO_BASED: 2162 break; 2163 case DM_TYPE_NONE: 2164 WARN_ON_ONCE(true); 2165 break; 2166 } 2167 2168 r = dm_calculate_queue_limits(t, &limits); 2169 if (r) { 2170 DMERR("Cannot calculate initial queue limits"); 2171 return r; 2172 } 2173 r = dm_table_set_restrictions(t, md->queue, &limits); 2174 if (r) 2175 return r; 2176 2177 r = add_disk(md->disk); 2178 if (r) 2179 return r; 2180 2181 r = dm_sysfs_init(md); 2182 if (r) { 2183 del_gendisk(md->disk); 2184 return r; 2185 } 2186 md->type = type; 2187 return 0; 2188 } 2189 2190 struct mapped_device *dm_get_md(dev_t dev) 2191 { 2192 struct mapped_device *md; 2193 unsigned minor = MINOR(dev); 2194 2195 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2196 return NULL; 2197 2198 spin_lock(&_minor_lock); 2199 2200 md = idr_find(&_minor_idr, minor); 2201 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) || 2202 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { 2203 md = NULL; 2204 goto out; 2205 } 2206 dm_get(md); 2207 out: 2208 spin_unlock(&_minor_lock); 2209 2210 return md; 2211 } 2212 EXPORT_SYMBOL_GPL(dm_get_md); 2213 2214 void *dm_get_mdptr(struct mapped_device *md) 2215 { 2216 return md->interface_ptr; 2217 } 2218 2219 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2220 { 2221 md->interface_ptr = ptr; 2222 } 2223 2224 void dm_get(struct mapped_device *md) 2225 { 2226 atomic_inc(&md->holders); 2227 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2228 } 2229 2230 int dm_hold(struct mapped_device *md) 2231 { 2232 spin_lock(&_minor_lock); 2233 if (test_bit(DMF_FREEING, &md->flags)) { 2234 spin_unlock(&_minor_lock); 2235 return -EBUSY; 2236 } 2237 dm_get(md); 2238 spin_unlock(&_minor_lock); 2239 return 0; 2240 } 2241 EXPORT_SYMBOL_GPL(dm_hold); 2242 2243 const char *dm_device_name(struct mapped_device *md) 2244 { 2245 return md->name; 2246 } 2247 EXPORT_SYMBOL_GPL(dm_device_name); 2248 2249 static void __dm_destroy(struct mapped_device *md, bool wait) 2250 { 2251 struct dm_table *map; 2252 int srcu_idx; 2253 2254 might_sleep(); 2255 2256 spin_lock(&_minor_lock); 2257 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2258 set_bit(DMF_FREEING, &md->flags); 2259 spin_unlock(&_minor_lock); 2260 2261 blk_set_queue_dying(md->queue); 2262 2263 /* 2264 * Take suspend_lock so that presuspend and postsuspend methods 2265 * do not race with internal suspend. 
2266 */ 2267 mutex_lock(&md->suspend_lock); 2268 map = dm_get_live_table(md, &srcu_idx); 2269 if (!dm_suspended_md(md)) { 2270 dm_table_presuspend_targets(map); 2271 set_bit(DMF_SUSPENDED, &md->flags); 2272 set_bit(DMF_POST_SUSPENDING, &md->flags); 2273 dm_table_postsuspend_targets(map); 2274 } 2275 /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ 2276 dm_put_live_table(md, srcu_idx); 2277 mutex_unlock(&md->suspend_lock); 2278 2279 /* 2280 * Rare, but there may be I/O requests still going to complete, 2281 * for example. Wait for all references to disappear. 2282 * No one should increment the reference count of the mapped_device, 2283 * after the mapped_device state becomes DMF_FREEING. 2284 */ 2285 if (wait) 2286 while (atomic_read(&md->holders)) 2287 msleep(1); 2288 else if (atomic_read(&md->holders)) 2289 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", 2290 dm_device_name(md), atomic_read(&md->holders)); 2291 2292 dm_table_destroy(__unbind(md)); 2293 free_dev(md); 2294 } 2295 2296 void dm_destroy(struct mapped_device *md) 2297 { 2298 __dm_destroy(md, true); 2299 } 2300 2301 void dm_destroy_immediate(struct mapped_device *md) 2302 { 2303 __dm_destroy(md, false); 2304 } 2305 2306 void dm_put(struct mapped_device *md) 2307 { 2308 atomic_dec(&md->holders); 2309 } 2310 EXPORT_SYMBOL_GPL(dm_put); 2311 2312 static bool dm_in_flight_bios(struct mapped_device *md) 2313 { 2314 int cpu; 2315 unsigned long sum = 0; 2316 2317 for_each_possible_cpu(cpu) 2318 sum += *per_cpu_ptr(md->pending_io, cpu); 2319 2320 return sum != 0; 2321 } 2322 2323 static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state) 2324 { 2325 int r = 0; 2326 DEFINE_WAIT(wait); 2327 2328 while (true) { 2329 prepare_to_wait(&md->wait, &wait, task_state); 2330 2331 if (!dm_in_flight_bios(md)) 2332 break; 2333 2334 if (signal_pending_state(task_state, current)) { 2335 r = -EINTR; 2336 break; 2337 } 2338 2339 io_schedule(); 2340 } 2341 finish_wait(&md->wait, &wait); 2342 2343 smp_rmb(); 2344 2345 return r; 2346 } 2347 2348 static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state) 2349 { 2350 int r = 0; 2351 2352 if (!queue_is_mq(md->queue)) 2353 return dm_wait_for_bios_completion(md, task_state); 2354 2355 while (true) { 2356 if (!blk_mq_queue_inflight(md->queue)) 2357 break; 2358 2359 if (signal_pending_state(task_state, current)) { 2360 r = -EINTR; 2361 break; 2362 } 2363 2364 msleep(5); 2365 } 2366 2367 return r; 2368 } 2369 2370 /* 2371 * Process the deferred bios 2372 */ 2373 static void dm_wq_work(struct work_struct *work) 2374 { 2375 struct mapped_device *md = container_of(work, struct mapped_device, work); 2376 struct bio *bio; 2377 2378 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2379 spin_lock_irq(&md->deferred_lock); 2380 bio = bio_list_pop(&md->deferred); 2381 spin_unlock_irq(&md->deferred_lock); 2382 2383 if (!bio) 2384 break; 2385 2386 submit_bio_noacct(bio); 2387 } 2388 } 2389 2390 static void dm_queue_flush(struct mapped_device *md) 2391 { 2392 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2393 smp_mb__after_atomic(); 2394 queue_work(md->wq, &md->work); 2395 } 2396 2397 /* 2398 * Swap in a new table, returning the old one for the caller to destroy. 
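 *
 * A minimal sketch of the calling sequence (hypothetical caller with error
 * handling abbreviated; in-tree this is driven from the ioctl layer): the
 * device must already be suspended, and the returned old map belongs to the
 * caller.
 *
 *	r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 *	old_map = dm_swap_table(md, new_table);
 *	if (IS_ERR(old_map))
 *		return PTR_ERR(old_map);
 *	r = dm_resume(md);
 *	if (old_map)
 *		dm_table_destroy(old_map);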
2399 */ 2400 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2401 { 2402 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); 2403 struct queue_limits limits; 2404 int r; 2405 2406 mutex_lock(&md->suspend_lock); 2407 2408 /* device must be suspended */ 2409 if (!dm_suspended_md(md)) 2410 goto out; 2411 2412 /* 2413 * If the new table has no data devices, retain the existing limits. 2414 * This helps multipath with queue_if_no_path if all paths disappear, 2415 * then new I/O is queued based on these limits, and then some paths 2416 * reappear. 2417 */ 2418 if (dm_table_has_no_data_devices(table)) { 2419 live_map = dm_get_live_table_fast(md); 2420 if (live_map) 2421 limits = md->queue->limits; 2422 dm_put_live_table_fast(md); 2423 } 2424 2425 if (!live_map) { 2426 r = dm_calculate_queue_limits(table, &limits); 2427 if (r) { 2428 map = ERR_PTR(r); 2429 goto out; 2430 } 2431 } 2432 2433 map = __bind(md, table, &limits); 2434 dm_issue_global_event(); 2435 2436 out: 2437 mutex_unlock(&md->suspend_lock); 2438 return map; 2439 } 2440 2441 /* 2442 * Functions to lock and unlock any filesystem running on the 2443 * device. 2444 */ 2445 static int lock_fs(struct mapped_device *md) 2446 { 2447 int r; 2448 2449 WARN_ON(test_bit(DMF_FROZEN, &md->flags)); 2450 2451 r = freeze_bdev(md->disk->part0); 2452 if (!r) 2453 set_bit(DMF_FROZEN, &md->flags); 2454 return r; 2455 } 2456 2457 static void unlock_fs(struct mapped_device *md) 2458 { 2459 if (!test_bit(DMF_FROZEN, &md->flags)) 2460 return; 2461 thaw_bdev(md->disk->part0); 2462 clear_bit(DMF_FROZEN, &md->flags); 2463 } 2464 2465 /* 2466 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG 2467 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE 2468 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY 2469 * 2470 * If __dm_suspend returns 0, the device is completely quiescent 2471 * now. There is no request-processing activity. All new requests 2472 * are being added to md->deferred list. 2473 */ 2474 static int __dm_suspend(struct mapped_device *md, struct dm_table *map, 2475 unsigned suspend_flags, unsigned int task_state, 2476 int dmf_suspended_flag) 2477 { 2478 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; 2479 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; 2480 int r; 2481 2482 lockdep_assert_held(&md->suspend_lock); 2483 2484 /* 2485 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2486 * This flag is cleared before dm_suspend returns. 2487 */ 2488 if (noflush) 2489 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2490 else 2491 DMDEBUG("%s: suspending with flush", dm_device_name(md)); 2492 2493 /* 2494 * This gets reverted if there's an error later and the targets 2495 * provide the .presuspend_undo hook. 2496 */ 2497 dm_table_presuspend_targets(map); 2498 2499 /* 2500 * Flush I/O to the device. 2501 * Any I/O submitted after lock_fs() may not be flushed. 2502 * noflush takes precedence over do_lockfs. 2503 * (lock_fs() flushes I/Os and waits for them to complete.) 2504 */ 2505 if (!noflush && do_lockfs) { 2506 r = lock_fs(md); 2507 if (r) { 2508 dm_table_presuspend_undo_targets(map); 2509 return r; 2510 } 2511 } 2512 2513 /* 2514 * Here we must make sure that no processes are submitting requests 2515 * to target drivers i.e. no one may be executing 2516 * dm_split_and_process_bio from dm_submit_bio. 2517 * 2518 * To get all processes out of dm_split_and_process_bio in dm_submit_bio, 2519 * we take the write lock. 
To prevent any process from reentering 2520 * dm_split_and_process_bio from dm_submit_bio and quiesce the thread 2521 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call 2522 * flush_workqueue(md->wq). 2523 */ 2524 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2525 if (map) 2526 synchronize_srcu(&md->io_barrier); 2527 2528 /* 2529 * Stop md->queue before flushing md->wq in case request-based 2530 * dm defers requests to md->wq from md->queue. 2531 */ 2532 if (dm_request_based(md)) 2533 dm_stop_queue(md->queue); 2534 2535 flush_workqueue(md->wq); 2536 2537 /* 2538 * At this point no more requests are entering target request routines. 2539 * We call dm_wait_for_completion to wait for all existing requests 2540 * to finish. 2541 */ 2542 r = dm_wait_for_completion(md, task_state); 2543 if (!r) 2544 set_bit(dmf_suspended_flag, &md->flags); 2545 2546 if (noflush) 2547 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2548 if (map) 2549 synchronize_srcu(&md->io_barrier); 2550 2551 /* were we interrupted ? */ 2552 if (r < 0) { 2553 dm_queue_flush(md); 2554 2555 if (dm_request_based(md)) 2556 dm_start_queue(md->queue); 2557 2558 unlock_fs(md); 2559 dm_table_presuspend_undo_targets(map); 2560 /* pushback list is already flushed, so skip flush */ 2561 } 2562 2563 return r; 2564 } 2565 2566 /* 2567 * We need to be able to change a mapping table under a mounted 2568 * filesystem. For example we might want to move some data in 2569 * the background. Before the table can be swapped with 2570 * dm_bind_table, dm_suspend must be called to flush any in 2571 * flight bios and ensure that any further io gets deferred. 2572 */ 2573 /* 2574 * Suspend mechanism in request-based dm. 2575 * 2576 * 1. Flush all I/Os by lock_fs() if needed. 2577 * 2. Stop dispatching any I/O by stopping the request_queue. 2578 * 3. Wait for all in-flight I/Os to be completed or requeued. 2579 * 2580 * To abort suspend, start the request_queue. 2581 */ 2582 int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2583 { 2584 struct dm_table *map = NULL; 2585 int r = 0; 2586 2587 retry: 2588 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 2589 2590 if (dm_suspended_md(md)) { 2591 r = -EINVAL; 2592 goto out_unlock; 2593 } 2594 2595 if (dm_suspended_internally_md(md)) { 2596 /* already internally suspended, wait for internal resume */ 2597 mutex_unlock(&md->suspend_lock); 2598 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 2599 if (r) 2600 return r; 2601 goto retry; 2602 } 2603 2604 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2605 2606 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED); 2607 if (r) 2608 goto out_unlock; 2609 2610 set_bit(DMF_POST_SUSPENDING, &md->flags); 2611 dm_table_postsuspend_targets(map); 2612 clear_bit(DMF_POST_SUSPENDING, &md->flags); 2613 2614 out_unlock: 2615 mutex_unlock(&md->suspend_lock); 2616 return r; 2617 } 2618 2619 static int __dm_resume(struct mapped_device *md, struct dm_table *map) 2620 { 2621 if (map) { 2622 int r = dm_table_resume_targets(map); 2623 if (r) 2624 return r; 2625 } 2626 2627 dm_queue_flush(md); 2628 2629 /* 2630 * Flushing deferred I/Os must be done after targets are resumed 2631 * so that mapping of targets can work correctly. 2632 * Request-based dm is queueing the deferred I/Os in its request_queue. 
2633 */ 2634 if (dm_request_based(md)) 2635 dm_start_queue(md->queue); 2636 2637 unlock_fs(md); 2638 2639 return 0; 2640 } 2641 2642 int dm_resume(struct mapped_device *md) 2643 { 2644 int r; 2645 struct dm_table *map = NULL; 2646 2647 retry: 2648 r = -EINVAL; 2649 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 2650 2651 if (!dm_suspended_md(md)) 2652 goto out; 2653 2654 if (dm_suspended_internally_md(md)) { 2655 /* already internally suspended, wait for internal resume */ 2656 mutex_unlock(&md->suspend_lock); 2657 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 2658 if (r) 2659 return r; 2660 goto retry; 2661 } 2662 2663 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2664 if (!map || !dm_table_get_size(map)) 2665 goto out; 2666 2667 r = __dm_resume(md, map); 2668 if (r) 2669 goto out; 2670 2671 clear_bit(DMF_SUSPENDED, &md->flags); 2672 out: 2673 mutex_unlock(&md->suspend_lock); 2674 2675 return r; 2676 } 2677 2678 /* 2679 * Internal suspend/resume works like userspace-driven suspend. It waits 2680 * until all bios finish and prevents issuing new bios to the target drivers. 2681 * It may be used only from the kernel. 2682 */ 2683 2684 static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) 2685 { 2686 struct dm_table *map = NULL; 2687 2688 lockdep_assert_held(&md->suspend_lock); 2689 2690 if (md->internal_suspend_count++) 2691 return; /* nested internal suspend */ 2692 2693 if (dm_suspended_md(md)) { 2694 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2695 return; /* nest suspend */ 2696 } 2697 2698 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2699 2700 /* 2701 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is 2702 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend 2703 * would require changing .presuspend to return an error -- avoid this 2704 * until there is a need for more elaborate variants of internal suspend. 
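 *
 * For illustration only (hypothetical in-kernel caller, not part of this
 * file), internal suspend is used as a matched, nestable pair:
 *
 *	dm_internal_suspend_noflush(md);
 *	... operate on the quiesced device ...
 *	dm_internal_resume(md);
 *
 * Nesting is counted, so only the outermost dm_internal_resume() call
 * actually resumes the device.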
2705 */ 2706 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE, 2707 DMF_SUSPENDED_INTERNALLY); 2708 2709 set_bit(DMF_POST_SUSPENDING, &md->flags); 2710 dm_table_postsuspend_targets(map); 2711 clear_bit(DMF_POST_SUSPENDING, &md->flags); 2712 } 2713 2714 static void __dm_internal_resume(struct mapped_device *md) 2715 { 2716 BUG_ON(!md->internal_suspend_count); 2717 2718 if (--md->internal_suspend_count) 2719 return; /* resume from nested internal suspend */ 2720 2721 if (dm_suspended_md(md)) 2722 goto done; /* resume from nested suspend */ 2723 2724 /* 2725 * NOTE: existing callers don't need to call dm_table_resume_targets 2726 * (which may fail -- so best to avoid it for now by passing NULL map) 2727 */ 2728 (void) __dm_resume(md, NULL); 2729 2730 done: 2731 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2732 smp_mb__after_atomic(); 2733 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); 2734 } 2735 2736 void dm_internal_suspend_noflush(struct mapped_device *md) 2737 { 2738 mutex_lock(&md->suspend_lock); 2739 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); 2740 mutex_unlock(&md->suspend_lock); 2741 } 2742 EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); 2743 2744 void dm_internal_resume(struct mapped_device *md) 2745 { 2746 mutex_lock(&md->suspend_lock); 2747 __dm_internal_resume(md); 2748 mutex_unlock(&md->suspend_lock); 2749 } 2750 EXPORT_SYMBOL_GPL(dm_internal_resume); 2751 2752 /* 2753 * Fast variants of internal suspend/resume hold md->suspend_lock, 2754 * which prevents interaction with userspace-driven suspend. 2755 */ 2756 2757 void dm_internal_suspend_fast(struct mapped_device *md) 2758 { 2759 mutex_lock(&md->suspend_lock); 2760 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 2761 return; 2762 2763 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2764 synchronize_srcu(&md->io_barrier); 2765 flush_workqueue(md->wq); 2766 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2767 } 2768 EXPORT_SYMBOL_GPL(dm_internal_suspend_fast); 2769 2770 void dm_internal_resume_fast(struct mapped_device *md) 2771 { 2772 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 2773 goto done; 2774 2775 dm_queue_flush(md); 2776 2777 done: 2778 mutex_unlock(&md->suspend_lock); 2779 } 2780 EXPORT_SYMBOL_GPL(dm_internal_resume_fast); 2781 2782 /*----------------------------------------------------------------- 2783 * Event notification. 
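 *
 * A hedged sketch of how the event counter is meant to be consumed
 * (hypothetical caller; the in-tree DEV_WAIT ioctl path follows this
 * pattern): sample the counter, kick off whatever will generate an event,
 * then sleep until the counter moves on.
 *
 *	uint32_t ev = dm_get_event_nr(md);
 *
 *	... trigger table events ...
 *	if (dm_wait_event(md, ev))
 *		return -ERESTARTSYS;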
2784 *---------------------------------------------------------------*/ 2785 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 2786 unsigned cookie) 2787 { 2788 int r; 2789 unsigned noio_flag; 2790 char udev_cookie[DM_COOKIE_LENGTH]; 2791 char *envp[] = { udev_cookie, NULL }; 2792 2793 noio_flag = memalloc_noio_save(); 2794 2795 if (!cookie) 2796 r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 2797 else { 2798 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 2799 DM_COOKIE_ENV_VAR_NAME, cookie); 2800 r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, 2801 action, envp); 2802 } 2803 2804 memalloc_noio_restore(noio_flag); 2805 2806 return r; 2807 } 2808 2809 uint32_t dm_next_uevent_seq(struct mapped_device *md) 2810 { 2811 return atomic_add_return(1, &md->uevent_seq); 2812 } 2813 2814 uint32_t dm_get_event_nr(struct mapped_device *md) 2815 { 2816 return atomic_read(&md->event_nr); 2817 } 2818 2819 int dm_wait_event(struct mapped_device *md, int event_nr) 2820 { 2821 return wait_event_interruptible(md->eventq, 2822 (event_nr != atomic_read(&md->event_nr))); 2823 } 2824 2825 void dm_uevent_add(struct mapped_device *md, struct list_head *elist) 2826 { 2827 unsigned long flags; 2828 2829 spin_lock_irqsave(&md->uevent_lock, flags); 2830 list_add(elist, &md->uevent_list); 2831 spin_unlock_irqrestore(&md->uevent_lock, flags); 2832 } 2833 2834 /* 2835 * The gendisk is only valid as long as you have a reference 2836 * count on 'md'. 2837 */ 2838 struct gendisk *dm_disk(struct mapped_device *md) 2839 { 2840 return md->disk; 2841 } 2842 EXPORT_SYMBOL_GPL(dm_disk); 2843 2844 struct kobject *dm_kobject(struct mapped_device *md) 2845 { 2846 return &md->kobj_holder.kobj; 2847 } 2848 2849 struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 2850 { 2851 struct mapped_device *md; 2852 2853 md = container_of(kobj, struct mapped_device, kobj_holder.kobj); 2854 2855 spin_lock(&_minor_lock); 2856 if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { 2857 md = NULL; 2858 goto out; 2859 } 2860 dm_get(md); 2861 out: 2862 spin_unlock(&_minor_lock); 2863 2864 return md; 2865 } 2866 2867 int dm_suspended_md(struct mapped_device *md) 2868 { 2869 return test_bit(DMF_SUSPENDED, &md->flags); 2870 } 2871 2872 static int dm_post_suspending_md(struct mapped_device *md) 2873 { 2874 return test_bit(DMF_POST_SUSPENDING, &md->flags); 2875 } 2876 2877 int dm_suspended_internally_md(struct mapped_device *md) 2878 { 2879 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2880 } 2881 2882 int dm_test_deferred_remove_flag(struct mapped_device *md) 2883 { 2884 return test_bit(DMF_DEFERRED_REMOVE, &md->flags); 2885 } 2886 2887 int dm_suspended(struct dm_target *ti) 2888 { 2889 return dm_suspended_md(ti->table->md); 2890 } 2891 EXPORT_SYMBOL_GPL(dm_suspended); 2892 2893 int dm_post_suspending(struct dm_target *ti) 2894 { 2895 return dm_post_suspending_md(ti->table->md); 2896 } 2897 EXPORT_SYMBOL_GPL(dm_post_suspending); 2898 2899 int dm_noflush_suspending(struct dm_target *ti) 2900 { 2901 return __noflush_suspending(ti->table->md); 2902 } 2903 EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2904 2905 struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, 2906 unsigned integrity, unsigned per_io_data_size, 2907 unsigned min_pool_size) 2908 { 2909 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); 2910 unsigned int pool_size = 0; 2911 unsigned int front_pad, io_front_pad; 2912 int ret; 2913 2914 if 
(!pools) 2915 return NULL; 2916 2917 switch (type) { 2918 case DM_TYPE_BIO_BASED: 2919 case DM_TYPE_DAX_BIO_BASED: 2920 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); 2921 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; 2922 io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; 2923 ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0); 2924 if (ret) 2925 goto out; 2926 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size)) 2927 goto out; 2928 break; 2929 case DM_TYPE_REQUEST_BASED: 2930 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size); 2931 front_pad = offsetof(struct dm_rq_clone_bio_info, clone); 2932 /* per_io_data_size is used for blk-mq pdu at queue allocation */ 2933 break; 2934 default: 2935 BUG(); 2936 } 2937 2938 ret = bioset_init(&pools->bs, pool_size, front_pad, 0); 2939 if (ret) 2940 goto out; 2941 2942 if (integrity && bioset_integrity_create(&pools->bs, pool_size)) 2943 goto out; 2944 2945 return pools; 2946 2947 out: 2948 dm_free_md_mempools(pools); 2949 2950 return NULL; 2951 } 2952 2953 void dm_free_md_mempools(struct dm_md_mempools *pools) 2954 { 2955 if (!pools) 2956 return; 2957 2958 bioset_exit(&pools->bs); 2959 bioset_exit(&pools->io_bs); 2960 2961 kfree(pools); 2962 } 2963 2964 struct dm_pr { 2965 u64 old_key; 2966 u64 new_key; 2967 u32 flags; 2968 bool fail_early; 2969 }; 2970 2971 static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn, 2972 void *data) 2973 { 2974 struct mapped_device *md = bdev->bd_disk->private_data; 2975 struct dm_table *table; 2976 struct dm_target *ti; 2977 int ret = -ENOTTY, srcu_idx; 2978 2979 table = dm_get_live_table(md, &srcu_idx); 2980 if (!table || !dm_table_get_size(table)) 2981 goto out; 2982 2983 /* We only support devices that have a single target */ 2984 if (dm_table_get_num_targets(table) != 1) 2985 goto out; 2986 ti = dm_table_get_target(table, 0); 2987 2988 ret = -EINVAL; 2989 if (!ti->type->iterate_devices) 2990 goto out; 2991 2992 ret = ti->type->iterate_devices(ti, fn, data); 2993 out: 2994 dm_put_live_table(md, srcu_idx); 2995 return ret; 2996 } 2997 2998 /* 2999 * For register / unregister we need to manually call out to every path. 
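 *
 * A hedged sketch of the callout contract used with dm_call_pr() above
 * ("my_pr_callout" and "issue_pr_op" are hypothetical names; the real
 * callout is __dm_pr_register() below): the callout is invoked once per
 * underlying device of the table's single target, and by convention a
 * non-zero return value stops the walk.
 *
 *	static int my_pr_callout(struct dm_target *ti, struct dm_dev *dev,
 *				 sector_t start, sector_t len, void *data)
 *	{
 *		struct dm_pr *pr = data;
 *
 *		return issue_pr_op(dev->bdev, pr);
 *	}
 *
 *	ret = dm_call_pr(bdev, my_pr_callout, &pr);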
3000 */ 3001 static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev, 3002 sector_t start, sector_t len, void *data) 3003 { 3004 struct dm_pr *pr = data; 3005 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; 3006 3007 if (!ops || !ops->pr_register) 3008 return -EOPNOTSUPP; 3009 return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags); 3010 } 3011 3012 static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, 3013 u32 flags) 3014 { 3015 struct dm_pr pr = { 3016 .old_key = old_key, 3017 .new_key = new_key, 3018 .flags = flags, 3019 .fail_early = true, 3020 }; 3021 int ret; 3022 3023 ret = dm_call_pr(bdev, __dm_pr_register, &pr); 3024 if (ret && new_key) { 3025 /* unregister all paths if we failed to register any path */ 3026 pr.old_key = new_key; 3027 pr.new_key = 0; 3028 pr.flags = 0; 3029 pr.fail_early = false; 3030 dm_call_pr(bdev, __dm_pr_register, &pr); 3031 } 3032 3033 return ret; 3034 } 3035 3036 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, 3037 u32 flags) 3038 { 3039 struct mapped_device *md = bdev->bd_disk->private_data; 3040 const struct pr_ops *ops; 3041 int r, srcu_idx; 3042 3043 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3044 if (r < 0) 3045 goto out; 3046 3047 ops = bdev->bd_disk->fops->pr_ops; 3048 if (ops && ops->pr_reserve) 3049 r = ops->pr_reserve(bdev, key, type, flags); 3050 else 3051 r = -EOPNOTSUPP; 3052 out: 3053 dm_unprepare_ioctl(md, srcu_idx); 3054 return r; 3055 } 3056 3057 static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) 3058 { 3059 struct mapped_device *md = bdev->bd_disk->private_data; 3060 const struct pr_ops *ops; 3061 int r, srcu_idx; 3062 3063 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3064 if (r < 0) 3065 goto out; 3066 3067 ops = bdev->bd_disk->fops->pr_ops; 3068 if (ops && ops->pr_release) 3069 r = ops->pr_release(bdev, key, type); 3070 else 3071 r = -EOPNOTSUPP; 3072 out: 3073 dm_unprepare_ioctl(md, srcu_idx); 3074 return r; 3075 } 3076 3077 static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, 3078 enum pr_type type, bool abort) 3079 { 3080 struct mapped_device *md = bdev->bd_disk->private_data; 3081 const struct pr_ops *ops; 3082 int r, srcu_idx; 3083 3084 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3085 if (r < 0) 3086 goto out; 3087 3088 ops = bdev->bd_disk->fops->pr_ops; 3089 if (ops && ops->pr_preempt) 3090 r = ops->pr_preempt(bdev, old_key, new_key, type, abort); 3091 else 3092 r = -EOPNOTSUPP; 3093 out: 3094 dm_unprepare_ioctl(md, srcu_idx); 3095 return r; 3096 } 3097 3098 static int dm_pr_clear(struct block_device *bdev, u64 key) 3099 { 3100 struct mapped_device *md = bdev->bd_disk->private_data; 3101 const struct pr_ops *ops; 3102 int r, srcu_idx; 3103 3104 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3105 if (r < 0) 3106 goto out; 3107 3108 ops = bdev->bd_disk->fops->pr_ops; 3109 if (ops && ops->pr_clear) 3110 r = ops->pr_clear(bdev, key); 3111 else 3112 r = -EOPNOTSUPP; 3113 out: 3114 dm_unprepare_ioctl(md, srcu_idx); 3115 return r; 3116 } 3117 3118 static const struct pr_ops dm_pr_ops = { 3119 .pr_register = dm_pr_register, 3120 .pr_reserve = dm_pr_reserve, 3121 .pr_release = dm_pr_release, 3122 .pr_preempt = dm_pr_preempt, 3123 .pr_clear = dm_pr_clear, 3124 }; 3125 3126 static const struct block_device_operations dm_blk_dops = { 3127 .submit_bio = dm_submit_bio, 3128 .poll_bio = dm_poll_bio, 3129 .open = dm_blk_open, 3130 .release = dm_blk_close, 3131 .ioctl = dm_blk_ioctl, 3132 .getgeo = 
dm_blk_getgeo, 3133 .report_zones = dm_blk_report_zones, 3134 .pr_ops = &dm_pr_ops, 3135 .owner = THIS_MODULE 3136 }; 3137 3138 static const struct block_device_operations dm_rq_blk_dops = { 3139 .open = dm_blk_open, 3140 .release = dm_blk_close, 3141 .ioctl = dm_blk_ioctl, 3142 .getgeo = dm_blk_getgeo, 3143 .pr_ops = &dm_pr_ops, 3144 .owner = THIS_MODULE 3145 }; 3146 3147 static const struct dax_operations dm_dax_ops = { 3148 .direct_access = dm_dax_direct_access, 3149 .zero_page_range = dm_dax_zero_page_range, 3150 }; 3151 3152 /* 3153 * module hooks 3154 */ 3155 module_init(dm_init); 3156 module_exit(dm_exit); 3157 3158 module_param(major, uint, 0); 3159 MODULE_PARM_DESC(major, "The major number of the device mapper"); 3160 3161 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR); 3162 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); 3163 3164 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR); 3165 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations"); 3166 3167 module_param(swap_bios, int, S_IRUGO | S_IWUSR); 3168 MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs"); 3169 3170 MODULE_DESCRIPTION(DM_NAME " driver"); 3171 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 3172 MODULE_LICENSE("GPL"); 3173