1 /* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm-core.h" 9 #include "dm-rq.h" 10 #include "dm-uevent.h" 11 #include "dm-ima.h" 12 13 #include <linux/init.h> 14 #include <linux/module.h> 15 #include <linux/mutex.h> 16 #include <linux/sched/mm.h> 17 #include <linux/sched/signal.h> 18 #include <linux/blkpg.h> 19 #include <linux/bio.h> 20 #include <linux/mempool.h> 21 #include <linux/dax.h> 22 #include <linux/slab.h> 23 #include <linux/idr.h> 24 #include <linux/uio.h> 25 #include <linux/hdreg.h> 26 #include <linux/delay.h> 27 #include <linux/wait.h> 28 #include <linux/pr.h> 29 #include <linux/refcount.h> 30 #include <linux/part_stat.h> 31 #include <linux/blk-crypto.h> 32 #include <linux/blk-crypto-profile.h> 33 34 #define DM_MSG_PREFIX "core" 35 36 /* 37 * Cookies are numeric values sent with CHANGE and REMOVE 38 * uevents while resuming, removing or renaming the device. 39 */ 40 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 41 #define DM_COOKIE_LENGTH 24 42 43 /* 44 * For REQ_POLLED fs bio, this flag is set if we link mapped underlying 45 * dm_io into one list, and reuse bio->bi_private as the list head. Before 46 * ending this fs bio, we will recover its ->bi_private. 47 */ 48 #define REQ_DM_POLL_LIST REQ_DRV 49 50 static const char *_name = DM_NAME; 51 52 static unsigned int major = 0; 53 static unsigned int _major = 0; 54 55 static DEFINE_IDR(_minor_idr); 56 57 static DEFINE_SPINLOCK(_minor_lock); 58 59 static void do_deferred_remove(struct work_struct *w); 60 61 static DECLARE_WORK(deferred_remove_work, do_deferred_remove); 62 63 static struct workqueue_struct *deferred_remove_workqueue; 64 65 atomic_t dm_global_event_nr = ATOMIC_INIT(0); 66 DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq); 67 68 void dm_issue_global_event(void) 69 { 70 atomic_inc(&dm_global_event_nr); 71 wake_up(&dm_global_eventq); 72 } 73 74 /* 75 * One of these is allocated (on-stack) per original bio. 
76 */ 77 struct clone_info { 78 struct dm_table *map; 79 struct bio *bio; 80 struct dm_io *io; 81 sector_t sector; 82 unsigned sector_count; 83 bool submit_as_polled; 84 }; 85 86 #define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) 87 #define DM_IO_BIO_OFFSET \ 88 (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio)) 89 90 static inline struct dm_target_io *clone_to_tio(struct bio *clone) 91 { 92 return container_of(clone, struct dm_target_io, clone); 93 } 94 95 void *dm_per_bio_data(struct bio *bio, size_t data_size) 96 { 97 if (!clone_to_tio(bio)->inside_dm_io) 98 return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size; 99 return (char *)bio - DM_IO_BIO_OFFSET - data_size; 100 } 101 EXPORT_SYMBOL_GPL(dm_per_bio_data); 102 103 struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size) 104 { 105 struct dm_io *io = (struct dm_io *)((char *)data + data_size); 106 if (io->magic == DM_IO_MAGIC) 107 return (struct bio *)((char *)io + DM_IO_BIO_OFFSET); 108 BUG_ON(io->magic != DM_TIO_MAGIC); 109 return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET); 110 } 111 EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data); 112 113 unsigned dm_bio_get_target_bio_nr(const struct bio *bio) 114 { 115 return container_of(bio, struct dm_target_io, clone)->target_bio_nr; 116 } 117 EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr); 118 119 #define MINOR_ALLOCED ((void *)-1) 120 121 #define DM_NUMA_NODE NUMA_NO_NODE 122 static int dm_numa_node = DM_NUMA_NODE; 123 124 #define DEFAULT_SWAP_BIOS (8 * 1048576 / PAGE_SIZE) 125 static int swap_bios = DEFAULT_SWAP_BIOS; 126 static int get_swap_bios(void) 127 { 128 int latch = READ_ONCE(swap_bios); 129 if (unlikely(latch <= 0)) 130 latch = DEFAULT_SWAP_BIOS; 131 return latch; 132 } 133 134 /* 135 * For mempools pre-allocation at the table loading time. 136 */ 137 struct dm_md_mempools { 138 struct bio_set bs; 139 struct bio_set io_bs; 140 }; 141 142 struct table_device { 143 struct list_head list; 144 refcount_t count; 145 struct dm_dev dm_dev; 146 }; 147 148 /* 149 * Bio-based DM's mempools' reserved IOs set by the user. 
150 */ 151 #define RESERVED_BIO_BASED_IOS 16 152 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; 153 154 static int __dm_get_module_param_int(int *module_param, int min, int max) 155 { 156 int param = READ_ONCE(*module_param); 157 int modified_param = 0; 158 bool modified = true; 159 160 if (param < min) 161 modified_param = min; 162 else if (param > max) 163 modified_param = max; 164 else 165 modified = false; 166 167 if (modified) { 168 (void)cmpxchg(module_param, param, modified_param); 169 param = modified_param; 170 } 171 172 return param; 173 } 174 175 unsigned __dm_get_module_param(unsigned *module_param, 176 unsigned def, unsigned max) 177 { 178 unsigned param = READ_ONCE(*module_param); 179 unsigned modified_param = 0; 180 181 if (!param) 182 modified_param = def; 183 else if (param > max) 184 modified_param = max; 185 186 if (modified_param) { 187 (void)cmpxchg(module_param, param, modified_param); 188 param = modified_param; 189 } 190 191 return param; 192 } 193 194 unsigned dm_get_reserved_bio_based_ios(void) 195 { 196 return __dm_get_module_param(&reserved_bio_based_ios, 197 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS); 198 } 199 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); 200 201 static unsigned dm_get_numa_node(void) 202 { 203 return __dm_get_module_param_int(&dm_numa_node, 204 DM_NUMA_NODE, num_online_nodes() - 1); 205 } 206 207 static int __init local_init(void) 208 { 209 int r; 210 211 r = dm_uevent_init(); 212 if (r) 213 return r; 214 215 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); 216 if (!deferred_remove_workqueue) { 217 r = -ENOMEM; 218 goto out_uevent_exit; 219 } 220 221 _major = major; 222 r = register_blkdev(_major, _name); 223 if (r < 0) 224 goto out_free_workqueue; 225 226 if (!_major) 227 _major = r; 228 229 return 0; 230 231 out_free_workqueue: 232 destroy_workqueue(deferred_remove_workqueue); 233 out_uevent_exit: 234 dm_uevent_exit(); 235 236 return r; 237 } 238 239 static void local_exit(void) 240 { 241 flush_scheduled_work(); 242 destroy_workqueue(deferred_remove_workqueue); 243 244 unregister_blkdev(_major, _name); 245 dm_uevent_exit(); 246 247 _major = 0; 248 249 DMINFO("cleaned up"); 250 } 251 252 static int (*_inits[])(void) __initdata = { 253 local_init, 254 dm_target_init, 255 dm_linear_init, 256 dm_stripe_init, 257 dm_io_init, 258 dm_kcopyd_init, 259 dm_interface_init, 260 dm_statistics_init, 261 }; 262 263 static void (*_exits[])(void) = { 264 local_exit, 265 dm_target_exit, 266 dm_linear_exit, 267 dm_stripe_exit, 268 dm_io_exit, 269 dm_kcopyd_exit, 270 dm_interface_exit, 271 dm_statistics_exit, 272 }; 273 274 static int __init dm_init(void) 275 { 276 const int count = ARRAY_SIZE(_inits); 277 int r, i; 278 279 #if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)) 280 DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled." 281 " Duplicate IMA measurements will not be recorded in the IMA log."); 282 #endif 283 284 for (i = 0; i < count; i++) { 285 r = _inits[i](); 286 if (r) 287 goto bad; 288 } 289 290 return 0; 291 bad: 292 while (i--) 293 _exits[i](); 294 295 return r; 296 } 297 298 static void __exit dm_exit(void) 299 { 300 int i = ARRAY_SIZE(_exits); 301 302 while (i--) 303 _exits[i](); 304 305 /* 306 * Should be empty by this point. 
307 */ 308 idr_destroy(&_minor_idr); 309 } 310 311 /* 312 * Block device functions 313 */ 314 int dm_deleting_md(struct mapped_device *md) 315 { 316 return test_bit(DMF_DELETING, &md->flags); 317 } 318 319 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 320 { 321 struct mapped_device *md; 322 323 spin_lock(&_minor_lock); 324 325 md = bdev->bd_disk->private_data; 326 if (!md) 327 goto out; 328 329 if (test_bit(DMF_FREEING, &md->flags) || 330 dm_deleting_md(md)) { 331 md = NULL; 332 goto out; 333 } 334 335 dm_get(md); 336 atomic_inc(&md->open_count); 337 out: 338 spin_unlock(&_minor_lock); 339 340 return md ? 0 : -ENXIO; 341 } 342 343 static void dm_blk_close(struct gendisk *disk, fmode_t mode) 344 { 345 struct mapped_device *md; 346 347 spin_lock(&_minor_lock); 348 349 md = disk->private_data; 350 if (WARN_ON(!md)) 351 goto out; 352 353 if (atomic_dec_and_test(&md->open_count) && 354 (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) 355 queue_work(deferred_remove_workqueue, &deferred_remove_work); 356 357 dm_put(md); 358 out: 359 spin_unlock(&_minor_lock); 360 } 361 362 int dm_open_count(struct mapped_device *md) 363 { 364 return atomic_read(&md->open_count); 365 } 366 367 /* 368 * Guarantees nothing is using the device before it's deleted. 369 */ 370 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) 371 { 372 int r = 0; 373 374 spin_lock(&_minor_lock); 375 376 if (dm_open_count(md)) { 377 r = -EBUSY; 378 if (mark_deferred) 379 set_bit(DMF_DEFERRED_REMOVE, &md->flags); 380 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) 381 r = -EEXIST; 382 else 383 set_bit(DMF_DELETING, &md->flags); 384 385 spin_unlock(&_minor_lock); 386 387 return r; 388 } 389 390 int dm_cancel_deferred_remove(struct mapped_device *md) 391 { 392 int r = 0; 393 394 spin_lock(&_minor_lock); 395 396 if (test_bit(DMF_DELETING, &md->flags)) 397 r = -EBUSY; 398 else 399 clear_bit(DMF_DEFERRED_REMOVE, &md->flags); 400 401 spin_unlock(&_minor_lock); 402 403 return r; 404 } 405 406 static void do_deferred_remove(struct work_struct *w) 407 { 408 dm_deferred_remove(); 409 } 410 411 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 412 { 413 struct mapped_device *md = bdev->bd_disk->private_data; 414 415 return dm_get_geometry(md, geo); 416 } 417 418 static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, 419 struct block_device **bdev) 420 { 421 struct dm_target *tgt; 422 struct dm_table *map; 423 int r; 424 425 retry: 426 r = -ENOTTY; 427 map = dm_get_live_table(md, srcu_idx); 428 if (!map || !dm_table_get_size(map)) 429 return r; 430 431 /* We only support devices that have a single target */ 432 if (dm_table_get_num_targets(map) != 1) 433 return r; 434 435 tgt = dm_table_get_target(map, 0); 436 if (!tgt->type->prepare_ioctl) 437 return r; 438 439 if (dm_suspended_md(md)) 440 return -EAGAIN; 441 442 r = tgt->type->prepare_ioctl(tgt, bdev); 443 if (r == -ENOTCONN && !fatal_signal_pending(current)) { 444 dm_put_live_table(md, *srcu_idx); 445 msleep(10); 446 goto retry; 447 } 448 449 return r; 450 } 451 452 static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx) 453 { 454 dm_put_live_table(md, srcu_idx); 455 } 456 457 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 458 unsigned int cmd, unsigned long arg) 459 { 460 struct mapped_device *md = bdev->bd_disk->private_data; 461 int r, srcu_idx; 462 463 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 464 if (r < 0) 465 goto out; 466 467 if (r > 0) 
{ 468 /* 469 * Target determined this ioctl is being issued against a 470 * subset of the parent bdev; require extra privileges. 471 */ 472 if (!capable(CAP_SYS_RAWIO)) { 473 DMDEBUG_LIMIT( 474 "%s: sending ioctl %x to DM device without required privilege.", 475 current->comm, cmd); 476 r = -ENOIOCTLCMD; 477 goto out; 478 } 479 } 480 481 if (!bdev->bd_disk->fops->ioctl) 482 r = -ENOTTY; 483 else 484 r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); 485 out: 486 dm_unprepare_ioctl(md, srcu_idx); 487 return r; 488 } 489 490 u64 dm_start_time_ns_from_clone(struct bio *bio) 491 { 492 return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time); 493 } 494 EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); 495 496 static bool bio_is_flush_with_data(struct bio *bio) 497 { 498 return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size); 499 } 500 501 static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio, 502 unsigned long start_time, struct dm_stats_aux *stats_aux) 503 { 504 bool is_flush_with_data; 505 unsigned int bi_size; 506 507 /* If REQ_PREFLUSH set save any payload but do not account it */ 508 is_flush_with_data = bio_is_flush_with_data(bio); 509 if (is_flush_with_data) { 510 bi_size = bio->bi_iter.bi_size; 511 bio->bi_iter.bi_size = 0; 512 } 513 514 if (!end) 515 bio_start_io_acct_time(bio, start_time); 516 else 517 bio_end_io_acct(bio, start_time); 518 519 if (unlikely(dm_stats_used(&md->stats))) 520 dm_stats_account_io(&md->stats, bio_data_dir(bio), 521 bio->bi_iter.bi_sector, bio_sectors(bio), 522 end, start_time, stats_aux); 523 524 /* Restore bio's payload so it does get accounted upon requeue */ 525 if (is_flush_with_data) 526 bio->bi_iter.bi_size = bi_size; 527 } 528 529 static void __dm_start_io_acct(struct dm_io *io, struct bio *bio) 530 { 531 dm_io_acct(false, io->md, bio, io->start_time, &io->stats_aux); 532 } 533 534 static void dm_start_io_acct(struct dm_io *io, struct bio *clone) 535 { 536 /* Must account IO to DM device in terms of orig_bio */ 537 struct bio *bio = io->orig_bio; 538 539 /* 540 * Ensure IO accounting is only ever started once. 541 * Expect no possibility for race unless is_duplicate_bio. 
542 */ 543 if (!clone || likely(!clone_to_tio(clone)->is_duplicate_bio)) { 544 if (WARN_ON_ONCE(io->was_accounted)) 545 return; 546 io->was_accounted = 1; 547 } else if (xchg(&io->was_accounted, 1) == 1) 548 return; 549 550 __dm_start_io_acct(io, bio); 551 } 552 553 static void dm_end_io_acct(struct dm_io *io, struct bio *bio) 554 { 555 dm_io_acct(true, io->md, bio, io->start_time, &io->stats_aux); 556 } 557 558 static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) 559 { 560 struct dm_io *io; 561 struct dm_target_io *tio; 562 struct bio *clone; 563 564 clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs); 565 566 tio = clone_to_tio(clone); 567 tio->inside_dm_io = true; 568 tio->io = NULL; 569 570 io = container_of(tio, struct dm_io, tio); 571 io->magic = DM_IO_MAGIC; 572 io->status = 0; 573 atomic_set(&io->io_count, 1); 574 this_cpu_inc(*md->pending_io); 575 io->orig_bio = NULL; 576 io->md = md; 577 io->map_task = current; 578 spin_lock_init(&io->endio_lock); 579 580 io->start_time = jiffies; 581 io->start_io_acct = false; 582 io->was_accounted = 0; 583 584 dm_stats_record_start(&md->stats, &io->stats_aux); 585 586 return io; 587 } 588 589 static void free_io(struct dm_io *io) 590 { 591 bio_put(&io->tio.clone); 592 } 593 594 static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, 595 unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask) 596 { 597 struct dm_target_io *tio; 598 struct bio *clone; 599 600 if (!ci->io->tio.io) { 601 /* the dm_target_io embedded in ci->io is available */ 602 tio = &ci->io->tio; 603 /* alloc_io() already initialized embedded clone */ 604 clone = &tio->clone; 605 } else { 606 clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio, 607 gfp_mask, &ci->io->md->bs); 608 if (!clone) 609 return NULL; 610 611 /* REQ_DM_POLL_LIST shouldn't be inherited */ 612 clone->bi_opf &= ~REQ_DM_POLL_LIST; 613 614 tio = clone_to_tio(clone); 615 tio->inside_dm_io = false; 616 } 617 618 tio->magic = DM_TIO_MAGIC; 619 tio->io = ci->io; 620 tio->ti = ti; 621 tio->target_bio_nr = target_bio_nr; 622 tio->is_duplicate_bio = false; 623 tio->len_ptr = len; 624 tio->old_sector = 0; 625 626 if (len) { 627 clone->bi_iter.bi_size = to_bytes(*len); 628 if (bio_integrity(clone)) 629 bio_integrity_trim(clone); 630 } 631 632 return clone; 633 } 634 635 static void free_tio(struct bio *clone) 636 { 637 if (clone_to_tio(clone)->inside_dm_io) 638 return; 639 bio_put(clone); 640 } 641 642 /* 643 * Add the bio to the list of deferred io. 644 */ 645 static void queue_io(struct mapped_device *md, struct bio *bio) 646 { 647 unsigned long flags; 648 649 spin_lock_irqsave(&md->deferred_lock, flags); 650 bio_list_add(&md->deferred, bio); 651 spin_unlock_irqrestore(&md->deferred_lock, flags); 652 queue_work(md->wq, &md->work); 653 } 654 655 /* 656 * Everyone (including functions in this file), should use this 657 * function to access the md->map field, and make sure they call 658 * dm_put_live_table() when finished. 
659 */ 660 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) 661 { 662 *srcu_idx = srcu_read_lock(&md->io_barrier); 663 664 return srcu_dereference(md->map, &md->io_barrier); 665 } 666 667 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) 668 { 669 srcu_read_unlock(&md->io_barrier, srcu_idx); 670 } 671 672 void dm_sync_table(struct mapped_device *md) 673 { 674 synchronize_srcu(&md->io_barrier); 675 synchronize_rcu_expedited(); 676 } 677 678 /* 679 * A fast alternative to dm_get_live_table/dm_put_live_table. 680 * The caller must not block between these two functions. 681 */ 682 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) 683 { 684 rcu_read_lock(); 685 return rcu_dereference(md->map); 686 } 687 688 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) 689 { 690 rcu_read_unlock(); 691 } 692 693 static char *_dm_claim_ptr = "I belong to device-mapper"; 694 695 /* 696 * Open a table device so we can use it as a map destination. 697 */ 698 static int open_table_device(struct table_device *td, dev_t dev, 699 struct mapped_device *md) 700 { 701 struct block_device *bdev; 702 u64 part_off; 703 int r; 704 705 BUG_ON(td->dm_dev.bdev); 706 707 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr); 708 if (IS_ERR(bdev)) 709 return PTR_ERR(bdev); 710 711 r = bd_link_disk_holder(bdev, dm_disk(md)); 712 if (r) { 713 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); 714 return r; 715 } 716 717 td->dm_dev.bdev = bdev; 718 td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off); 719 return 0; 720 } 721 722 /* 723 * Close a table device that we've been using. 724 */ 725 static void close_table_device(struct table_device *td, struct mapped_device *md) 726 { 727 if (!td->dm_dev.bdev) 728 return; 729 730 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); 731 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); 732 put_dax(td->dm_dev.dax_dev); 733 td->dm_dev.bdev = NULL; 734 td->dm_dev.dax_dev = NULL; 735 } 736 737 static struct table_device *find_table_device(struct list_head *l, dev_t dev, 738 fmode_t mode) 739 { 740 struct table_device *td; 741 742 list_for_each_entry(td, l, list) 743 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) 744 return td; 745 746 return NULL; 747 } 748 749 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, 750 struct dm_dev **result) 751 { 752 int r; 753 struct table_device *td; 754 755 mutex_lock(&md->table_devices_lock); 756 td = find_table_device(&md->table_devices, dev, mode); 757 if (!td) { 758 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); 759 if (!td) { 760 mutex_unlock(&md->table_devices_lock); 761 return -ENOMEM; 762 } 763 764 td->dm_dev.mode = mode; 765 td->dm_dev.bdev = NULL; 766 767 if ((r = open_table_device(td, dev, md))) { 768 mutex_unlock(&md->table_devices_lock); 769 kfree(td); 770 return r; 771 } 772 773 format_dev_t(td->dm_dev.name, dev); 774 775 refcount_set(&td->count, 1); 776 list_add(&td->list, &md->table_devices); 777 } else { 778 refcount_inc(&td->count); 779 } 780 mutex_unlock(&md->table_devices_lock); 781 782 *result = &td->dm_dev; 783 return 0; 784 } 785 786 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) 787 { 788 struct table_device *td = container_of(d, struct table_device, dm_dev); 789 790 mutex_lock(&md->table_devices_lock); 791 if (refcount_dec_and_test(&td->count)) { 792 
close_table_device(td, md); 793 list_del(&td->list); 794 kfree(td); 795 } 796 mutex_unlock(&md->table_devices_lock); 797 } 798 799 static void free_table_devices(struct list_head *devices) 800 { 801 struct list_head *tmp, *next; 802 803 list_for_each_safe(tmp, next, devices) { 804 struct table_device *td = list_entry(tmp, struct table_device, list); 805 806 DMWARN("dm_destroy: %s still exists with %d references", 807 td->dm_dev.name, refcount_read(&td->count)); 808 kfree(td); 809 } 810 } 811 812 /* 813 * Get the geometry associated with a dm device 814 */ 815 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 816 { 817 *geo = md->geometry; 818 819 return 0; 820 } 821 822 /* 823 * Set the geometry of a device. 824 */ 825 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 826 { 827 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 828 829 if (geo->start > sz) { 830 DMWARN("Start sector is beyond the geometry limits."); 831 return -EINVAL; 832 } 833 834 md->geometry = *geo; 835 836 return 0; 837 } 838 839 static int __noflush_suspending(struct mapped_device *md) 840 { 841 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 842 } 843 844 static void dm_io_complete(struct dm_io *io) 845 { 846 blk_status_t io_error; 847 struct mapped_device *md = io->md; 848 struct bio *bio = io->orig_bio; 849 850 if (io->status == BLK_STS_DM_REQUEUE) { 851 unsigned long flags; 852 /* 853 * Target requested pushing back the I/O. 854 */ 855 spin_lock_irqsave(&md->deferred_lock, flags); 856 if (__noflush_suspending(md) && 857 !WARN_ON_ONCE(dm_is_zone_write(md, bio))) { 858 /* NOTE early return due to BLK_STS_DM_REQUEUE below */ 859 bio_list_add_head(&md->deferred, bio); 860 } else { 861 /* 862 * noflush suspend was interrupted or this is 863 * a write to a zoned target. 864 */ 865 io->status = BLK_STS_IOERR; 866 } 867 spin_unlock_irqrestore(&md->deferred_lock, flags); 868 } 869 870 io_error = io->status; 871 if (io->was_accounted) 872 dm_end_io_acct(io, bio); 873 else if (!io_error) { 874 /* 875 * Must handle target that DM_MAPIO_SUBMITTED only to 876 * then bio_endio() rather than dm_submit_bio_remap() 877 */ 878 __dm_start_io_acct(io, bio); 879 dm_end_io_acct(io, bio); 880 } 881 free_io(io); 882 smp_wmb(); 883 this_cpu_dec(*md->pending_io); 884 885 /* nudge anyone waiting on suspend queue */ 886 if (unlikely(wq_has_sleeper(&md->wait))) 887 wake_up(&md->wait); 888 889 if (io_error == BLK_STS_DM_REQUEUE) { 890 /* 891 * Upper layer won't help us poll split bio, io->orig_bio 892 * may only reflect a subset of the pre-split original, 893 * so clear REQ_POLLED in case of requeue 894 */ 895 bio->bi_opf &= ~REQ_POLLED; 896 return; 897 } 898 899 if (bio_is_flush_with_data(bio)) { 900 /* 901 * Preflush done for flush with data, reissue 902 * without REQ_PREFLUSH. 903 */ 904 bio->bi_opf &= ~REQ_PREFLUSH; 905 queue_io(md, bio); 906 } else { 907 /* done with normal IO or empty flush */ 908 if (io_error) 909 bio->bi_status = io_error; 910 bio_endio(bio); 911 } 912 } 913 914 /* 915 * Decrements the number of outstanding ios that a bio has been 916 * cloned into, completing the original io if necc. 
917 */ 918 void dm_io_dec_pending(struct dm_io *io, blk_status_t error) 919 { 920 /* Push-back supersedes any I/O errors */ 921 if (unlikely(error)) { 922 unsigned long flags; 923 spin_lock_irqsave(&io->endio_lock, flags); 924 if (!(io->status == BLK_STS_DM_REQUEUE && 925 __noflush_suspending(io->md))) 926 io->status = error; 927 spin_unlock_irqrestore(&io->endio_lock, flags); 928 } 929 930 if (atomic_dec_and_test(&io->io_count)) 931 dm_io_complete(io); 932 } 933 934 void disable_discard(struct mapped_device *md) 935 { 936 struct queue_limits *limits = dm_get_queue_limits(md); 937 938 /* device doesn't really support DISCARD, disable it */ 939 limits->max_discard_sectors = 0; 940 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue); 941 } 942 943 void disable_write_same(struct mapped_device *md) 944 { 945 struct queue_limits *limits = dm_get_queue_limits(md); 946 947 /* device doesn't really support WRITE SAME, disable it */ 948 limits->max_write_same_sectors = 0; 949 } 950 951 void disable_write_zeroes(struct mapped_device *md) 952 { 953 struct queue_limits *limits = dm_get_queue_limits(md); 954 955 /* device doesn't really support WRITE ZEROES, disable it */ 956 limits->max_write_zeroes_sectors = 0; 957 } 958 959 static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) 960 { 961 return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios); 962 } 963 964 static void clone_endio(struct bio *bio) 965 { 966 blk_status_t error = bio->bi_status; 967 struct dm_target_io *tio = clone_to_tio(bio); 968 struct dm_io *io = tio->io; 969 struct mapped_device *md = tio->io->md; 970 dm_endio_fn endio = tio->ti->type->end_io; 971 struct request_queue *q = bio->bi_bdev->bd_disk->queue; 972 973 if (unlikely(error == BLK_STS_TARGET)) { 974 if (bio_op(bio) == REQ_OP_DISCARD && 975 !q->limits.max_discard_sectors) 976 disable_discard(md); 977 else if (bio_op(bio) == REQ_OP_WRITE_SAME && 978 !q->limits.max_write_same_sectors) 979 disable_write_same(md); 980 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && 981 !q->limits.max_write_zeroes_sectors) 982 disable_write_zeroes(md); 983 } 984 985 if (blk_queue_is_zoned(q)) 986 dm_zone_endio(io, bio); 987 988 if (endio) { 989 int r = endio(tio->ti, bio, &error); 990 switch (r) { 991 case DM_ENDIO_REQUEUE: 992 /* 993 * Requeuing writes to a sequential zone of a zoned 994 * target will break the sequential write pattern: 995 * fail such IO. 996 */ 997 if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) 998 error = BLK_STS_IOERR; 999 else 1000 error = BLK_STS_DM_REQUEUE; 1001 fallthrough; 1002 case DM_ENDIO_DONE: 1003 break; 1004 case DM_ENDIO_INCOMPLETE: 1005 /* The target will handle the io */ 1006 return; 1007 default: 1008 DMWARN("unimplemented target endio return value: %d", r); 1009 BUG(); 1010 } 1011 } 1012 1013 if (unlikely(swap_bios_limit(tio->ti, bio))) { 1014 struct mapped_device *md = io->md; 1015 up(&md->swap_bios_semaphore); 1016 } 1017 1018 free_tio(bio); 1019 dm_io_dec_pending(io, error); 1020 } 1021 1022 /* 1023 * Return maximum size of I/O possible at the supplied sector up to the current 1024 * target boundary. 
1025 */ 1026 static inline sector_t max_io_len_target_boundary(struct dm_target *ti, 1027 sector_t target_offset) 1028 { 1029 return ti->len - target_offset; 1030 } 1031 1032 static sector_t max_io_len(struct dm_target *ti, sector_t sector) 1033 { 1034 sector_t target_offset = dm_target_offset(ti, sector); 1035 sector_t len = max_io_len_target_boundary(ti, target_offset); 1036 sector_t max_len; 1037 1038 /* 1039 * Does the target need to split IO even further? 1040 * - varied (per target) IO splitting is a tenet of DM; this 1041 * explains why stacked chunk_sectors based splitting via 1042 * blk_max_size_offset() isn't possible here. So pass in 1043 * ti->max_io_len to override stacked chunk_sectors. 1044 */ 1045 if (ti->max_io_len) { 1046 max_len = blk_max_size_offset(ti->table->md->queue, 1047 target_offset, ti->max_io_len); 1048 if (len > max_len) 1049 len = max_len; 1050 } 1051 1052 return len; 1053 } 1054 1055 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) 1056 { 1057 if (len > UINT_MAX) { 1058 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", 1059 (unsigned long long)len, UINT_MAX); 1060 ti->error = "Maximum size of target IO is too large"; 1061 return -EINVAL; 1062 } 1063 1064 ti->max_io_len = (uint32_t) len; 1065 1066 return 0; 1067 } 1068 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); 1069 1070 static struct dm_target *dm_dax_get_live_target(struct mapped_device *md, 1071 sector_t sector, int *srcu_idx) 1072 __acquires(md->io_barrier) 1073 { 1074 struct dm_table *map; 1075 struct dm_target *ti; 1076 1077 map = dm_get_live_table(md, srcu_idx); 1078 if (!map) 1079 return NULL; 1080 1081 ti = dm_table_find_target(map, sector); 1082 if (!ti) 1083 return NULL; 1084 1085 return ti; 1086 } 1087 1088 static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, 1089 long nr_pages, void **kaddr, pfn_t *pfn) 1090 { 1091 struct mapped_device *md = dax_get_private(dax_dev); 1092 sector_t sector = pgoff * PAGE_SECTORS; 1093 struct dm_target *ti; 1094 long len, ret = -EIO; 1095 int srcu_idx; 1096 1097 ti = dm_dax_get_live_target(md, sector, &srcu_idx); 1098 1099 if (!ti) 1100 goto out; 1101 if (!ti->type->direct_access) 1102 goto out; 1103 len = max_io_len(ti, sector) / PAGE_SECTORS; 1104 if (len < 1) 1105 goto out; 1106 nr_pages = min(len, nr_pages); 1107 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn); 1108 1109 out: 1110 dm_put_live_table(md, srcu_idx); 1111 1112 return ret; 1113 } 1114 1115 static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, 1116 size_t nr_pages) 1117 { 1118 struct mapped_device *md = dax_get_private(dax_dev); 1119 sector_t sector = pgoff * PAGE_SECTORS; 1120 struct dm_target *ti; 1121 int ret = -EIO; 1122 int srcu_idx; 1123 1124 ti = dm_dax_get_live_target(md, sector, &srcu_idx); 1125 1126 if (!ti) 1127 goto out; 1128 if (WARN_ON(!ti->type->dax_zero_page_range)) { 1129 /* 1130 * ->zero_page_range() is mandatory dax operation. If we are 1131 * here, something is wrong. 1132 */ 1133 goto out; 1134 } 1135 ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages); 1136 out: 1137 dm_put_live_table(md, srcu_idx); 1138 1139 return ret; 1140 } 1141 1142 /* 1143 * A target may call dm_accept_partial_bio only from the map routine. It is 1144 * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management 1145 * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by 1146 * __send_duplicate_bios(). 
1147 * 1148 * dm_accept_partial_bio informs the dm that the target only wants to process 1149 * additional n_sectors sectors of the bio and the rest of the data should be 1150 * sent in a next bio. 1151 * 1152 * A diagram that explains the arithmetics: 1153 * +--------------------+---------------+-------+ 1154 * | 1 | 2 | 3 | 1155 * +--------------------+---------------+-------+ 1156 * 1157 * <-------------- *tio->len_ptr ---------------> 1158 * <------- bi_size -------> 1159 * <-- n_sectors --> 1160 * 1161 * Region 1 was already iterated over with bio_advance or similar function. 1162 * (it may be empty if the target doesn't use bio_advance) 1163 * Region 2 is the remaining bio size that the target wants to process. 1164 * (it may be empty if region 1 is non-empty, although there is no reason 1165 * to make it empty) 1166 * The target requires that region 3 is to be sent in the next bio. 1167 * 1168 * If the target wants to receive multiple copies of the bio (via num_*bios, etc), 1169 * the partially processed part (the sum of regions 1+2) must be the same for all 1170 * copies of the bio. 1171 */ 1172 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) 1173 { 1174 struct dm_target_io *tio = clone_to_tio(bio); 1175 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; 1176 1177 BUG_ON(tio->is_duplicate_bio); 1178 BUG_ON(op_is_zone_mgmt(bio_op(bio))); 1179 BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND); 1180 BUG_ON(bi_size > *tio->len_ptr); 1181 BUG_ON(n_sectors > bi_size); 1182 1183 *tio->len_ptr -= bi_size - n_sectors; 1184 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; 1185 } 1186 EXPORT_SYMBOL_GPL(dm_accept_partial_bio); 1187 1188 static inline void __dm_submit_bio_remap(struct bio *clone, 1189 dev_t dev, sector_t old_sector) 1190 { 1191 trace_block_bio_remap(clone, dev, old_sector); 1192 submit_bio_noacct(clone); 1193 } 1194 1195 /* 1196 * @clone: clone bio that DM core passed to target's .map function 1197 * @tgt_clone: clone of @clone bio that target needs submitted 1198 * 1199 * Targets should use this interface to submit bios they take 1200 * ownership of when returning DM_MAPIO_SUBMITTED. 1201 * 1202 * Target should also enable ti->accounts_remapped_io 1203 */ 1204 void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone) 1205 { 1206 struct dm_target_io *tio = clone_to_tio(clone); 1207 struct dm_io *io = tio->io; 1208 1209 WARN_ON_ONCE(!tio->ti->accounts_remapped_io); 1210 1211 /* establish bio that will get submitted */ 1212 if (!tgt_clone) 1213 tgt_clone = clone; 1214 1215 /* 1216 * Account io->origin_bio to DM dev on behalf of target 1217 * that took ownership of IO with DM_MAPIO_SUBMITTED. 
1218 */ 1219 if (io->map_task == current) { 1220 /* Still in target's map function */ 1221 io->start_io_acct = true; 1222 } else { 1223 /* 1224 * Called by another thread, managed by DM target, 1225 * wait for dm_split_and_process_bio() to store 1226 * io->orig_bio 1227 */ 1228 while (unlikely(!smp_load_acquire(&io->orig_bio))) 1229 msleep(1); 1230 dm_start_io_acct(io, clone); 1231 } 1232 1233 __dm_submit_bio_remap(tgt_clone, disk_devt(io->md->disk), 1234 tio->old_sector); 1235 } 1236 EXPORT_SYMBOL_GPL(dm_submit_bio_remap); 1237 1238 static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) 1239 { 1240 mutex_lock(&md->swap_bios_lock); 1241 while (latch < md->swap_bios) { 1242 cond_resched(); 1243 down(&md->swap_bios_semaphore); 1244 md->swap_bios--; 1245 } 1246 while (latch > md->swap_bios) { 1247 cond_resched(); 1248 up(&md->swap_bios_semaphore); 1249 md->swap_bios++; 1250 } 1251 mutex_unlock(&md->swap_bios_lock); 1252 } 1253 1254 static void __map_bio(struct bio *clone) 1255 { 1256 struct dm_target_io *tio = clone_to_tio(clone); 1257 int r; 1258 struct dm_io *io = tio->io; 1259 struct dm_target *ti = tio->ti; 1260 1261 clone->bi_end_io = clone_endio; 1262 1263 /* 1264 * Map the clone. 1265 */ 1266 dm_io_inc_pending(io); 1267 tio->old_sector = clone->bi_iter.bi_sector; 1268 1269 if (unlikely(swap_bios_limit(ti, clone))) { 1270 struct mapped_device *md = io->md; 1271 int latch = get_swap_bios(); 1272 if (unlikely(latch != md->swap_bios)) 1273 __set_swap_bios_limit(md, latch); 1274 down(&md->swap_bios_semaphore); 1275 } 1276 1277 /* 1278 * Check if the IO needs a special mapping due to zone append emulation 1279 * on zoned target. In this case, dm_zone_map_bio() calls the target 1280 * map operation. 1281 */ 1282 if (dm_emulate_zone_append(io->md)) 1283 r = dm_zone_map_bio(tio); 1284 else 1285 r = ti->type->map(ti, clone); 1286 1287 switch (r) { 1288 case DM_MAPIO_SUBMITTED: 1289 /* target has assumed ownership of this io */ 1290 if (!ti->accounts_remapped_io) 1291 io->start_io_acct = true; 1292 break; 1293 case DM_MAPIO_REMAPPED: 1294 /* 1295 * the bio has been remapped so dispatch it, but defer 1296 * dm_start_io_acct() until after possible bio_split(). 1297 */ 1298 __dm_submit_bio_remap(clone, disk_devt(io->md->disk), 1299 tio->old_sector); 1300 io->start_io_acct = true; 1301 break; 1302 case DM_MAPIO_KILL: 1303 case DM_MAPIO_REQUEUE: 1304 if (unlikely(swap_bios_limit(ti, clone))) 1305 up(&io->md->swap_bios_semaphore); 1306 free_tio(clone); 1307 if (r == DM_MAPIO_KILL) 1308 dm_io_dec_pending(io, BLK_STS_IOERR); 1309 else 1310 dm_io_dec_pending(io, BLK_STS_DM_REQUEUE); 1311 break; 1312 default: 1313 DMWARN("unimplemented target map return value: %d", r); 1314 BUG(); 1315 } 1316 } 1317 1318 static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, 1319 struct dm_target *ti, unsigned num_bios, 1320 unsigned *len) 1321 { 1322 struct bio *bio; 1323 int try; 1324 1325 for (try = 0; try < 2; try++) { 1326 int bio_nr; 1327 1328 if (try) 1329 mutex_lock(&ci->io->md->table_devices_lock); 1330 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) { 1331 bio = alloc_tio(ci, ti, bio_nr, len, 1332 try ? 
GFP_NOIO : GFP_NOWAIT); 1333 if (!bio) 1334 break; 1335 1336 bio_list_add(blist, bio); 1337 } 1338 if (try) 1339 mutex_unlock(&ci->io->md->table_devices_lock); 1340 if (bio_nr == num_bios) 1341 return; 1342 1343 while ((bio = bio_list_pop(blist))) 1344 free_tio(bio); 1345 } 1346 } 1347 1348 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, 1349 unsigned num_bios, unsigned *len) 1350 { 1351 struct bio_list blist = BIO_EMPTY_LIST; 1352 struct bio *clone; 1353 1354 switch (num_bios) { 1355 case 0: 1356 break; 1357 case 1: 1358 clone = alloc_tio(ci, ti, 0, len, GFP_NOIO); 1359 clone_to_tio(clone)->is_duplicate_bio = true; 1360 __map_bio(clone); 1361 break; 1362 default: 1363 alloc_multiple_bios(&blist, ci, ti, num_bios, len); 1364 while ((clone = bio_list_pop(&blist))) { 1365 clone_to_tio(clone)->is_duplicate_bio = true; 1366 __map_bio(clone); 1367 } 1368 break; 1369 } 1370 } 1371 1372 static int __send_empty_flush(struct clone_info *ci) 1373 { 1374 unsigned target_nr = 0; 1375 struct dm_target *ti; 1376 struct bio flush_bio; 1377 1378 /* 1379 * Use an on-stack bio for this, it's safe since we don't 1380 * need to reference it after submit. It's just used as 1381 * the basis for the clone(s). 1382 */ 1383 bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, 1384 REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC); 1385 1386 ci->bio = &flush_bio; 1387 ci->sector_count = 0; 1388 1389 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1390 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); 1391 1392 bio_uninit(ci->bio); 1393 return 0; 1394 } 1395 1396 static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, 1397 unsigned num_bios) 1398 { 1399 unsigned len; 1400 1401 len = min_t(sector_t, ci->sector_count, 1402 max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector))); 1403 1404 /* 1405 * dm_accept_partial_bio cannot be used with duplicate bios, 1406 * so update clone_info cursor before __send_duplicate_bios(). 1407 */ 1408 ci->sector += len; 1409 ci->sector_count -= len; 1410 1411 __send_duplicate_bios(ci, ti, num_bios, &len); 1412 } 1413 1414 static bool is_abnormal_io(struct bio *bio) 1415 { 1416 bool r = false; 1417 1418 switch (bio_op(bio)) { 1419 case REQ_OP_DISCARD: 1420 case REQ_OP_SECURE_ERASE: 1421 case REQ_OP_WRITE_SAME: 1422 case REQ_OP_WRITE_ZEROES: 1423 r = true; 1424 break; 1425 } 1426 1427 return r; 1428 } 1429 1430 static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, 1431 int *result) 1432 { 1433 unsigned num_bios = 0; 1434 1435 switch (bio_op(ci->bio)) { 1436 case REQ_OP_DISCARD: 1437 num_bios = ti->num_discard_bios; 1438 break; 1439 case REQ_OP_SECURE_ERASE: 1440 num_bios = ti->num_secure_erase_bios; 1441 break; 1442 case REQ_OP_WRITE_SAME: 1443 num_bios = ti->num_write_same_bios; 1444 break; 1445 case REQ_OP_WRITE_ZEROES: 1446 num_bios = ti->num_write_zeroes_bios; 1447 break; 1448 default: 1449 return false; 1450 } 1451 1452 /* 1453 * Even though the device advertised support for this type of 1454 * request, that does not mean every target supports it, and 1455 * reconfiguration might also have changed that since the 1456 * check was performed. 
1457 */ 1458 if (!num_bios) 1459 *result = -EOPNOTSUPP; 1460 else { 1461 __send_changing_extent_only(ci, ti, num_bios); 1462 *result = 0; 1463 } 1464 return true; 1465 } 1466 1467 /* 1468 * Reuse ->bi_private as hlist head for storing all dm_io instances 1469 * associated with this bio, and this bio's bi_private needs to be 1470 * stored in dm_io->data before the reuse. 1471 * 1472 * bio->bi_private is owned by fs or upper layer, so block layer won't 1473 * touch it after splitting. Meantime it won't be changed by anyone after 1474 * bio is submitted. So this reuse is safe. 1475 */ 1476 static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio) 1477 { 1478 return (struct hlist_head *)&bio->bi_private; 1479 } 1480 1481 static void dm_queue_poll_io(struct bio *bio, struct dm_io *io) 1482 { 1483 struct hlist_head *head = dm_get_bio_hlist_head(bio); 1484 1485 if (!(bio->bi_opf & REQ_DM_POLL_LIST)) { 1486 bio->bi_opf |= REQ_DM_POLL_LIST; 1487 /* 1488 * Save .bi_private into dm_io, so that we can reuse 1489 * .bi_private as hlist head for storing dm_io list 1490 */ 1491 io->data = bio->bi_private; 1492 1493 INIT_HLIST_HEAD(head); 1494 1495 /* tell block layer to poll for completion */ 1496 bio->bi_cookie = ~BLK_QC_T_NONE; 1497 } else { 1498 /* 1499 * bio recursed due to split, reuse original poll list, 1500 * and save bio->bi_private too. 1501 */ 1502 io->data = hlist_entry(head->first, struct dm_io, node)->data; 1503 } 1504 1505 hlist_add_head(&io->node, head); 1506 } 1507 1508 /* 1509 * Select the correct strategy for processing a non-flush bio. 1510 */ 1511 static int __split_and_process_bio(struct clone_info *ci) 1512 { 1513 struct bio *clone; 1514 struct dm_target *ti; 1515 unsigned len; 1516 int r; 1517 1518 ti = dm_table_find_target(ci->map, ci->sector); 1519 if (!ti) 1520 return -EIO; 1521 1522 if (__process_abnormal_io(ci, ti, &r)) 1523 return r; 1524 1525 /* 1526 * Only support bio polling for normal IO, and the target io is 1527 * exactly inside the dm_io instance (verified in dm_poll_dm_io) 1528 */ 1529 ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED; 1530 1531 len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); 1532 clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO); 1533 __map_bio(clone); 1534 1535 ci->sector += len; 1536 ci->sector_count -= len; 1537 1538 return 0; 1539 } 1540 1541 static void init_clone_info(struct clone_info *ci, struct mapped_device *md, 1542 struct dm_table *map, struct bio *bio) 1543 { 1544 ci->map = map; 1545 ci->io = alloc_io(md, bio); 1546 ci->bio = bio; 1547 ci->submit_as_polled = false; 1548 ci->sector = bio->bi_iter.bi_sector; 1549 ci->sector_count = bio_sectors(bio); 1550 1551 /* Shouldn't happen but sector_count was being set to 0 so... */ 1552 if (WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count)) 1553 ci->sector_count = 0; 1554 } 1555 1556 /* 1557 * Entry point to split a bio into clones and submit them to the targets. 
1558 */ 1559 static void dm_split_and_process_bio(struct mapped_device *md, 1560 struct dm_table *map, struct bio *bio) 1561 { 1562 struct clone_info ci; 1563 struct bio *orig_bio = NULL; 1564 int error = 0; 1565 1566 init_clone_info(&ci, md, map, bio); 1567 1568 if (bio->bi_opf & REQ_PREFLUSH) { 1569 error = __send_empty_flush(&ci); 1570 /* dm_io_complete submits any data associated with flush */ 1571 goto out; 1572 } 1573 1574 error = __split_and_process_bio(&ci); 1575 ci.io->map_task = NULL; 1576 if (error || !ci.sector_count) 1577 goto out; 1578 1579 /* 1580 * Remainder must be passed to submit_bio_noacct() so it gets handled 1581 * *after* bios already submitted have been completely processed. 1582 * We take a clone of the original to store in ci.io->orig_bio to be 1583 * used by dm_end_io_acct() and for dm_io_complete() to use for 1584 * completion handling. 1585 */ 1586 orig_bio = bio_split(bio, bio_sectors(bio) - ci.sector_count, 1587 GFP_NOIO, &md->queue->bio_split); 1588 bio_chain(orig_bio, bio); 1589 trace_block_split(orig_bio, bio->bi_iter.bi_sector); 1590 submit_bio_noacct(bio); 1591 out: 1592 if (!orig_bio) 1593 orig_bio = bio; 1594 smp_store_release(&ci.io->orig_bio, orig_bio); 1595 if (ci.io->start_io_acct) 1596 dm_start_io_acct(ci.io, NULL); 1597 1598 /* 1599 * Drop the extra reference count for non-POLLED bio, and hold one 1600 * reference for POLLED bio, which will be released in dm_poll_bio 1601 * 1602 * Add every dm_io instance into the hlist_head which is stored in 1603 * bio->bi_private, so that dm_poll_bio can poll them all. 1604 */ 1605 if (error || !ci.submit_as_polled) 1606 dm_io_dec_pending(ci.io, errno_to_blk_status(error)); 1607 else 1608 dm_queue_poll_io(bio, ci.io); 1609 } 1610 1611 static void dm_submit_bio(struct bio *bio) 1612 { 1613 struct mapped_device *md = bio->bi_bdev->bd_disk->private_data; 1614 int srcu_idx; 1615 struct dm_table *map; 1616 1617 map = dm_get_live_table(md, &srcu_idx); 1618 1619 /* If suspended, or map not yet available, queue this IO for later */ 1620 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) || 1621 unlikely(!map)) { 1622 if (bio->bi_opf & REQ_NOWAIT) 1623 bio_wouldblock_error(bio); 1624 else if (bio->bi_opf & REQ_RAHEAD) 1625 bio_io_error(bio); 1626 else 1627 queue_io(md, bio); 1628 goto out; 1629 } 1630 1631 /* 1632 * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc) 1633 * otherwise associated queue_limits won't be imposed. 
1634 */ 1635 if (is_abnormal_io(bio)) 1636 blk_queue_split(&bio); 1637 1638 dm_split_and_process_bio(md, map, bio); 1639 out: 1640 dm_put_live_table(md, srcu_idx); 1641 } 1642 1643 static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob, 1644 unsigned int flags) 1645 { 1646 WARN_ON_ONCE(!io->tio.inside_dm_io); 1647 1648 /* don't poll if the mapped io is done */ 1649 if (atomic_read(&io->io_count) > 1) 1650 bio_poll(&io->tio.clone, iob, flags); 1651 1652 /* bio_poll holds the last reference */ 1653 return atomic_read(&io->io_count) == 1; 1654 } 1655 1656 static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob, 1657 unsigned int flags) 1658 { 1659 struct hlist_head *head = dm_get_bio_hlist_head(bio); 1660 struct hlist_head tmp = HLIST_HEAD_INIT; 1661 struct hlist_node *next; 1662 struct dm_io *io; 1663 1664 /* Only poll normal bio which was marked as REQ_DM_POLL_LIST */ 1665 if (!(bio->bi_opf & REQ_DM_POLL_LIST)) 1666 return 0; 1667 1668 WARN_ON_ONCE(hlist_empty(head)); 1669 1670 hlist_move_list(head, &tmp); 1671 1672 /* 1673 * Restore .bi_private before possibly completing dm_io. 1674 * 1675 * bio_poll() is only possible once @bio has been completely 1676 * submitted via submit_bio_noacct()'s depth-first submission. 1677 * So there is no dm_queue_poll_io() race associated with 1678 * clearing REQ_DM_POLL_LIST here. 1679 */ 1680 bio->bi_opf &= ~REQ_DM_POLL_LIST; 1681 bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data; 1682 1683 hlist_for_each_entry_safe(io, next, &tmp, node) { 1684 if (dm_poll_dm_io(io, iob, flags)) { 1685 hlist_del_init(&io->node); 1686 /* 1687 * clone_endio() has already occurred, so passing 1688 * error as 0 here doesn't override io->status 1689 */ 1690 dm_io_dec_pending(io, 0); 1691 } 1692 } 1693 1694 /* Not done? */ 1695 if (!hlist_empty(&tmp)) { 1696 bio->bi_opf |= REQ_DM_POLL_LIST; 1697 /* Reset bio->bi_private to dm_io list head */ 1698 hlist_move_list(&tmp, head); 1699 return 0; 1700 } 1701 return 1; 1702 } 1703 1704 /*----------------------------------------------------------------- 1705 * An IDR is used to keep track of allocated minor numbers. 1706 *---------------------------------------------------------------*/ 1707 static void free_minor(int minor) 1708 { 1709 spin_lock(&_minor_lock); 1710 idr_remove(&_minor_idr, minor); 1711 spin_unlock(&_minor_lock); 1712 } 1713 1714 /* 1715 * See if the device with a specific minor # is free. 1716 */ 1717 static int specific_minor(int minor) 1718 { 1719 int r; 1720 1721 if (minor >= (1 << MINORBITS)) 1722 return -EINVAL; 1723 1724 idr_preload(GFP_KERNEL); 1725 spin_lock(&_minor_lock); 1726 1727 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); 1728 1729 spin_unlock(&_minor_lock); 1730 idr_preload_end(); 1731 if (r < 0) 1732 return r == -ENOSPC ? 
-EBUSY : r; 1733 return 0; 1734 } 1735 1736 static int next_free_minor(int *minor) 1737 { 1738 int r; 1739 1740 idr_preload(GFP_KERNEL); 1741 spin_lock(&_minor_lock); 1742 1743 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); 1744 1745 spin_unlock(&_minor_lock); 1746 idr_preload_end(); 1747 if (r < 0) 1748 return r; 1749 *minor = r; 1750 return 0; 1751 } 1752 1753 static const struct block_device_operations dm_blk_dops; 1754 static const struct block_device_operations dm_rq_blk_dops; 1755 static const struct dax_operations dm_dax_ops; 1756 1757 static void dm_wq_work(struct work_struct *work); 1758 1759 #ifdef CONFIG_BLK_INLINE_ENCRYPTION 1760 static void dm_queue_destroy_crypto_profile(struct request_queue *q) 1761 { 1762 dm_destroy_crypto_profile(q->crypto_profile); 1763 } 1764 1765 #else /* CONFIG_BLK_INLINE_ENCRYPTION */ 1766 1767 static inline void dm_queue_destroy_crypto_profile(struct request_queue *q) 1768 { 1769 } 1770 #endif /* !CONFIG_BLK_INLINE_ENCRYPTION */ 1771 1772 static void cleanup_mapped_device(struct mapped_device *md) 1773 { 1774 if (md->wq) 1775 destroy_workqueue(md->wq); 1776 bioset_exit(&md->bs); 1777 bioset_exit(&md->io_bs); 1778 1779 if (md->dax_dev) { 1780 dax_remove_host(md->disk); 1781 kill_dax(md->dax_dev); 1782 put_dax(md->dax_dev); 1783 md->dax_dev = NULL; 1784 } 1785 1786 dm_cleanup_zoned_dev(md); 1787 if (md->disk) { 1788 spin_lock(&_minor_lock); 1789 md->disk->private_data = NULL; 1790 spin_unlock(&_minor_lock); 1791 if (dm_get_md_type(md) != DM_TYPE_NONE) { 1792 dm_sysfs_exit(md); 1793 del_gendisk(md->disk); 1794 } 1795 dm_queue_destroy_crypto_profile(md->queue); 1796 blk_cleanup_disk(md->disk); 1797 } 1798 1799 if (md->pending_io) { 1800 free_percpu(md->pending_io); 1801 md->pending_io = NULL; 1802 } 1803 1804 cleanup_srcu_struct(&md->io_barrier); 1805 1806 mutex_destroy(&md->suspend_lock); 1807 mutex_destroy(&md->type_lock); 1808 mutex_destroy(&md->table_devices_lock); 1809 mutex_destroy(&md->swap_bios_lock); 1810 1811 dm_mq_cleanup_mapped_device(md); 1812 } 1813 1814 /* 1815 * Allocate and initialise a blank device with a given minor. 1816 */ 1817 static struct mapped_device *alloc_dev(int minor) 1818 { 1819 int r, numa_node_id = dm_get_numa_node(); 1820 struct mapped_device *md; 1821 void *old_md; 1822 1823 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); 1824 if (!md) { 1825 DMWARN("unable to allocate device, out of memory."); 1826 return NULL; 1827 } 1828 1829 if (!try_module_get(THIS_MODULE)) 1830 goto bad_module_get; 1831 1832 /* get a minor number for the dev */ 1833 if (minor == DM_ANY_MINOR) 1834 r = next_free_minor(&minor); 1835 else 1836 r = specific_minor(minor); 1837 if (r < 0) 1838 goto bad_minor; 1839 1840 r = init_srcu_struct(&md->io_barrier); 1841 if (r < 0) 1842 goto bad_io_barrier; 1843 1844 md->numa_node_id = numa_node_id; 1845 md->init_tio_pdu = false; 1846 md->type = DM_TYPE_NONE; 1847 mutex_init(&md->suspend_lock); 1848 mutex_init(&md->type_lock); 1849 mutex_init(&md->table_devices_lock); 1850 spin_lock_init(&md->deferred_lock); 1851 atomic_set(&md->holders, 1); 1852 atomic_set(&md->open_count, 0); 1853 atomic_set(&md->event_nr, 0); 1854 atomic_set(&md->uevent_seq, 0); 1855 INIT_LIST_HEAD(&md->uevent_list); 1856 INIT_LIST_HEAD(&md->table_devices); 1857 spin_lock_init(&md->uevent_lock); 1858 1859 /* 1860 * default to bio-based until DM table is loaded and md->type 1861 * established. If request-based table is loaded: blk-mq will 1862 * override accordingly. 
1863 */ 1864 md->disk = blk_alloc_disk(md->numa_node_id); 1865 if (!md->disk) 1866 goto bad; 1867 md->queue = md->disk->queue; 1868 1869 init_waitqueue_head(&md->wait); 1870 INIT_WORK(&md->work, dm_wq_work); 1871 init_waitqueue_head(&md->eventq); 1872 init_completion(&md->kobj_holder.completion); 1873 1874 md->swap_bios = get_swap_bios(); 1875 sema_init(&md->swap_bios_semaphore, md->swap_bios); 1876 mutex_init(&md->swap_bios_lock); 1877 1878 md->disk->major = _major; 1879 md->disk->first_minor = minor; 1880 md->disk->minors = 1; 1881 md->disk->flags |= GENHD_FL_NO_PART; 1882 md->disk->fops = &dm_blk_dops; 1883 md->disk->queue = md->queue; 1884 md->disk->private_data = md; 1885 sprintf(md->disk->disk_name, "dm-%d", minor); 1886 1887 if (IS_ENABLED(CONFIG_FS_DAX)) { 1888 md->dax_dev = alloc_dax(md, &dm_dax_ops); 1889 if (IS_ERR(md->dax_dev)) { 1890 md->dax_dev = NULL; 1891 goto bad; 1892 } 1893 set_dax_nocache(md->dax_dev); 1894 set_dax_nomc(md->dax_dev); 1895 if (dax_add_host(md->dax_dev, md->disk)) 1896 goto bad; 1897 } 1898 1899 format_dev_t(md->name, MKDEV(_major, minor)); 1900 1901 md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name); 1902 if (!md->wq) 1903 goto bad; 1904 1905 md->pending_io = alloc_percpu(unsigned long); 1906 if (!md->pending_io) 1907 goto bad; 1908 1909 dm_stats_init(&md->stats); 1910 1911 /* Populate the mapping, nobody knows we exist yet */ 1912 spin_lock(&_minor_lock); 1913 old_md = idr_replace(&_minor_idr, md, minor); 1914 spin_unlock(&_minor_lock); 1915 1916 BUG_ON(old_md != MINOR_ALLOCED); 1917 1918 return md; 1919 1920 bad: 1921 cleanup_mapped_device(md); 1922 bad_io_barrier: 1923 free_minor(minor); 1924 bad_minor: 1925 module_put(THIS_MODULE); 1926 bad_module_get: 1927 kvfree(md); 1928 return NULL; 1929 } 1930 1931 static void unlock_fs(struct mapped_device *md); 1932 1933 static void free_dev(struct mapped_device *md) 1934 { 1935 int minor = MINOR(disk_devt(md->disk)); 1936 1937 unlock_fs(md); 1938 1939 cleanup_mapped_device(md); 1940 1941 free_table_devices(&md->table_devices); 1942 dm_stats_cleanup(&md->stats); 1943 free_minor(minor); 1944 1945 module_put(THIS_MODULE); 1946 kvfree(md); 1947 } 1948 1949 static int __bind_mempools(struct mapped_device *md, struct dm_table *t) 1950 { 1951 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 1952 int ret = 0; 1953 1954 if (dm_table_bio_based(t)) { 1955 /* 1956 * The md may already have mempools that need changing. 1957 * If so, reload bioset because front_pad may have changed 1958 * because a different table was loaded. 1959 */ 1960 bioset_exit(&md->bs); 1961 bioset_exit(&md->io_bs); 1962 1963 } else if (bioset_initialized(&md->bs)) { 1964 /* 1965 * There's no need to reload with request-based dm 1966 * because the size of front_pad doesn't change. 1967 * Note for future: If you are to reload bioset, 1968 * prep-ed requests in the queue may refer 1969 * to bio from the old bioset, so you must walk 1970 * through the queue to unprep. 1971 */ 1972 goto out; 1973 } 1974 1975 BUG_ON(!p || 1976 bioset_initialized(&md->bs) || 1977 bioset_initialized(&md->io_bs)); 1978 1979 ret = bioset_init_from_src(&md->bs, &p->bs); 1980 if (ret) 1981 goto out; 1982 ret = bioset_init_from_src(&md->io_bs, &p->io_bs); 1983 if (ret) 1984 bioset_exit(&md->bs); 1985 out: 1986 /* mempool bind completed, no longer need any mempools in the table */ 1987 dm_table_free_md_mempools(t); 1988 return ret; 1989 } 1990 1991 /* 1992 * Bind a table to the device. 
1993 */ 1994 static void event_callback(void *context) 1995 { 1996 unsigned long flags; 1997 LIST_HEAD(uevents); 1998 struct mapped_device *md = (struct mapped_device *) context; 1999 2000 spin_lock_irqsave(&md->uevent_lock, flags); 2001 list_splice_init(&md->uevent_list, &uevents); 2002 spin_unlock_irqrestore(&md->uevent_lock, flags); 2003 2004 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 2005 2006 atomic_inc(&md->event_nr); 2007 wake_up(&md->eventq); 2008 dm_issue_global_event(); 2009 } 2010 2011 /* 2012 * Returns old map, which caller must destroy. 2013 */ 2014 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2015 struct queue_limits *limits) 2016 { 2017 struct dm_table *old_map; 2018 sector_t size; 2019 int ret; 2020 2021 lockdep_assert_held(&md->suspend_lock); 2022 2023 size = dm_table_get_size(t); 2024 2025 /* 2026 * Wipe any geometry if the size of the table changed. 2027 */ 2028 if (size != dm_get_size(md)) 2029 memset(&md->geometry, 0, sizeof(md->geometry)); 2030 2031 if (!get_capacity(md->disk)) 2032 set_capacity(md->disk, size); 2033 else 2034 set_capacity_and_notify(md->disk, size); 2035 2036 dm_table_event_callback(t, event_callback, md); 2037 2038 if (dm_table_request_based(t)) { 2039 /* 2040 * Leverage the fact that request-based DM targets are 2041 * immutable singletons - used to optimize dm_mq_queue_rq. 2042 */ 2043 md->immutable_target = dm_table_get_immutable_target(t); 2044 } 2045 2046 ret = __bind_mempools(md, t); 2047 if (ret) { 2048 old_map = ERR_PTR(ret); 2049 goto out; 2050 } 2051 2052 ret = dm_table_set_restrictions(t, md->queue, limits); 2053 if (ret) { 2054 old_map = ERR_PTR(ret); 2055 goto out; 2056 } 2057 2058 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2059 rcu_assign_pointer(md->map, (void *)t); 2060 md->immutable_target_type = dm_table_get_immutable_target_type(t); 2061 2062 if (old_map) 2063 dm_sync_table(md); 2064 out: 2065 return old_map; 2066 } 2067 2068 /* 2069 * Returns unbound table for the caller to free. 2070 */ 2071 static struct dm_table *__unbind(struct mapped_device *md) 2072 { 2073 struct dm_table *map = rcu_dereference_protected(md->map, 1); 2074 2075 if (!map) 2076 return NULL; 2077 2078 dm_table_event_callback(map, NULL, NULL); 2079 RCU_INIT_POINTER(md->map, NULL); 2080 dm_sync_table(md); 2081 2082 return map; 2083 } 2084 2085 /* 2086 * Constructor for a new device. 2087 */ 2088 int dm_create(int minor, struct mapped_device **result) 2089 { 2090 struct mapped_device *md; 2091 2092 md = alloc_dev(minor); 2093 if (!md) 2094 return -ENXIO; 2095 2096 dm_ima_reset_data(md); 2097 2098 *result = md; 2099 return 0; 2100 } 2101 2102 /* 2103 * Functions to manage md->type. 2104 * All are required to hold md->type_lock. 2105 */ 2106 void dm_lock_md_type(struct mapped_device *md) 2107 { 2108 mutex_lock(&md->type_lock); 2109 } 2110 2111 void dm_unlock_md_type(struct mapped_device *md) 2112 { 2113 mutex_unlock(&md->type_lock); 2114 } 2115 2116 void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type) 2117 { 2118 BUG_ON(!mutex_is_locked(&md->type_lock)); 2119 md->type = type; 2120 } 2121 2122 enum dm_queue_mode dm_get_md_type(struct mapped_device *md) 2123 { 2124 return md->type; 2125 } 2126 2127 struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2128 { 2129 return md->immutable_target_type; 2130 } 2131 2132 /* 2133 * The queue_limits are only valid as long as you have a reference 2134 * count on 'md'. 
2135 */ 2136 struct queue_limits *dm_get_queue_limits(struct mapped_device *md) 2137 { 2138 BUG_ON(!atomic_read(&md->holders)); 2139 return &md->queue->limits; 2140 } 2141 EXPORT_SYMBOL_GPL(dm_get_queue_limits); 2142 2143 /* 2144 * Setup the DM device's queue based on md's type 2145 */ 2146 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) 2147 { 2148 enum dm_queue_mode type = dm_table_get_type(t); 2149 struct queue_limits limits; 2150 int r; 2151 2152 switch (type) { 2153 case DM_TYPE_REQUEST_BASED: 2154 md->disk->fops = &dm_rq_blk_dops; 2155 r = dm_mq_init_request_queue(md, t); 2156 if (r) { 2157 DMERR("Cannot initialize queue for request-based dm mapped device"); 2158 return r; 2159 } 2160 break; 2161 case DM_TYPE_BIO_BASED: 2162 case DM_TYPE_DAX_BIO_BASED: 2163 break; 2164 case DM_TYPE_NONE: 2165 WARN_ON_ONCE(true); 2166 break; 2167 } 2168 2169 r = dm_calculate_queue_limits(t, &limits); 2170 if (r) { 2171 DMERR("Cannot calculate initial queue limits"); 2172 return r; 2173 } 2174 r = dm_table_set_restrictions(t, md->queue, &limits); 2175 if (r) 2176 return r; 2177 2178 r = add_disk(md->disk); 2179 if (r) 2180 return r; 2181 2182 r = dm_sysfs_init(md); 2183 if (r) { 2184 del_gendisk(md->disk); 2185 return r; 2186 } 2187 md->type = type; 2188 return 0; 2189 } 2190 2191 struct mapped_device *dm_get_md(dev_t dev) 2192 { 2193 struct mapped_device *md; 2194 unsigned minor = MINOR(dev); 2195 2196 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2197 return NULL; 2198 2199 spin_lock(&_minor_lock); 2200 2201 md = idr_find(&_minor_idr, minor); 2202 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) || 2203 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { 2204 md = NULL; 2205 goto out; 2206 } 2207 dm_get(md); 2208 out: 2209 spin_unlock(&_minor_lock); 2210 2211 return md; 2212 } 2213 EXPORT_SYMBOL_GPL(dm_get_md); 2214 2215 void *dm_get_mdptr(struct mapped_device *md) 2216 { 2217 return md->interface_ptr; 2218 } 2219 2220 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2221 { 2222 md->interface_ptr = ptr; 2223 } 2224 2225 void dm_get(struct mapped_device *md) 2226 { 2227 atomic_inc(&md->holders); 2228 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2229 } 2230 2231 int dm_hold(struct mapped_device *md) 2232 { 2233 spin_lock(&_minor_lock); 2234 if (test_bit(DMF_FREEING, &md->flags)) { 2235 spin_unlock(&_minor_lock); 2236 return -EBUSY; 2237 } 2238 dm_get(md); 2239 spin_unlock(&_minor_lock); 2240 return 0; 2241 } 2242 EXPORT_SYMBOL_GPL(dm_hold); 2243 2244 const char *dm_device_name(struct mapped_device *md) 2245 { 2246 return md->name; 2247 } 2248 EXPORT_SYMBOL_GPL(dm_device_name); 2249 2250 static void __dm_destroy(struct mapped_device *md, bool wait) 2251 { 2252 struct dm_table *map; 2253 int srcu_idx; 2254 2255 might_sleep(); 2256 2257 spin_lock(&_minor_lock); 2258 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2259 set_bit(DMF_FREEING, &md->flags); 2260 spin_unlock(&_minor_lock); 2261 2262 blk_set_queue_dying(md->queue); 2263 2264 /* 2265 * Take suspend_lock so that presuspend and postsuspend methods 2266 * do not race with internal suspend. 
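 * (__dm_internal_suspend() asserts md->suspend_lock via lockdep and its
 * callers take the same mutex, so holding it here serializes the
 * presuspend/postsuspend pair below against any internal suspend.)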
2267 */ 2268 mutex_lock(&md->suspend_lock); 2269 map = dm_get_live_table(md, &srcu_idx); 2270 if (!dm_suspended_md(md)) { 2271 dm_table_presuspend_targets(map); 2272 set_bit(DMF_SUSPENDED, &md->flags); 2273 set_bit(DMF_POST_SUSPENDING, &md->flags); 2274 dm_table_postsuspend_targets(map); 2275 } 2276 /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ 2277 dm_put_live_table(md, srcu_idx); 2278 mutex_unlock(&md->suspend_lock); 2279 2280 /* 2281 * Rare, but there may be I/O requests still going to complete, 2282 * for example. Wait for all references to disappear. 2283 * No one should increment the reference count of the mapped_device, 2284 * after the mapped_device state becomes DMF_FREEING. 2285 */ 2286 if (wait) 2287 while (atomic_read(&md->holders)) 2288 msleep(1); 2289 else if (atomic_read(&md->holders)) 2290 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", 2291 dm_device_name(md), atomic_read(&md->holders)); 2292 2293 dm_table_destroy(__unbind(md)); 2294 free_dev(md); 2295 } 2296 2297 void dm_destroy(struct mapped_device *md) 2298 { 2299 __dm_destroy(md, true); 2300 } 2301 2302 void dm_destroy_immediate(struct mapped_device *md) 2303 { 2304 __dm_destroy(md, false); 2305 } 2306 2307 void dm_put(struct mapped_device *md) 2308 { 2309 atomic_dec(&md->holders); 2310 } 2311 EXPORT_SYMBOL_GPL(dm_put); 2312 2313 static bool dm_in_flight_bios(struct mapped_device *md) 2314 { 2315 int cpu; 2316 unsigned long sum = 0; 2317 2318 for_each_possible_cpu(cpu) 2319 sum += *per_cpu_ptr(md->pending_io, cpu); 2320 2321 return sum != 0; 2322 } 2323 2324 static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state) 2325 { 2326 int r = 0; 2327 DEFINE_WAIT(wait); 2328 2329 while (true) { 2330 prepare_to_wait(&md->wait, &wait, task_state); 2331 2332 if (!dm_in_flight_bios(md)) 2333 break; 2334 2335 if (signal_pending_state(task_state, current)) { 2336 r = -EINTR; 2337 break; 2338 } 2339 2340 io_schedule(); 2341 } 2342 finish_wait(&md->wait, &wait); 2343 2344 smp_rmb(); 2345 2346 return r; 2347 } 2348 2349 static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state) 2350 { 2351 int r = 0; 2352 2353 if (!queue_is_mq(md->queue)) 2354 return dm_wait_for_bios_completion(md, task_state); 2355 2356 while (true) { 2357 if (!blk_mq_queue_inflight(md->queue)) 2358 break; 2359 2360 if (signal_pending_state(task_state, current)) { 2361 r = -EINTR; 2362 break; 2363 } 2364 2365 msleep(5); 2366 } 2367 2368 return r; 2369 } 2370 2371 /* 2372 * Process the deferred bios 2373 */ 2374 static void dm_wq_work(struct work_struct *work) 2375 { 2376 struct mapped_device *md = container_of(work, struct mapped_device, work); 2377 struct bio *bio; 2378 2379 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2380 spin_lock_irq(&md->deferred_lock); 2381 bio = bio_list_pop(&md->deferred); 2382 spin_unlock_irq(&md->deferred_lock); 2383 2384 if (!bio) 2385 break; 2386 2387 submit_bio_noacct(bio); 2388 } 2389 } 2390 2391 static void dm_queue_flush(struct mapped_device *md) 2392 { 2393 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2394 smp_mb__after_atomic(); 2395 queue_work(md->wq, &md->work); 2396 } 2397 2398 /* 2399 * Swap in a new table, returning the old one for the caller to destroy. 
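 *
 * Illustrative calling sequence (a sketch only -- in practice this is driven
 * from the DM ioctl path, and error handling is elided):
 *
 *	dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 *	old_map = dm_swap_table(md, new_table);
 *	if (!IS_ERR_OR_NULL(old_map))
 *		dm_table_destroy(old_map);
 *	dm_resume(md);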
2400 */ 2401 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2402 { 2403 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); 2404 struct queue_limits limits; 2405 int r; 2406 2407 mutex_lock(&md->suspend_lock); 2408 2409 /* device must be suspended */ 2410 if (!dm_suspended_md(md)) 2411 goto out; 2412 2413 /* 2414 * If the new table has no data devices, retain the existing limits. 2415 * This helps multipath with queue_if_no_path if all paths disappear, 2416 * then new I/O is queued based on these limits, and then some paths 2417 * reappear. 2418 */ 2419 if (dm_table_has_no_data_devices(table)) { 2420 live_map = dm_get_live_table_fast(md); 2421 if (live_map) 2422 limits = md->queue->limits; 2423 dm_put_live_table_fast(md); 2424 } 2425 2426 if (!live_map) { 2427 r = dm_calculate_queue_limits(table, &limits); 2428 if (r) { 2429 map = ERR_PTR(r); 2430 goto out; 2431 } 2432 } 2433 2434 map = __bind(md, table, &limits); 2435 dm_issue_global_event(); 2436 2437 out: 2438 mutex_unlock(&md->suspend_lock); 2439 return map; 2440 } 2441 2442 /* 2443 * Functions to lock and unlock any filesystem running on the 2444 * device. 2445 */ 2446 static int lock_fs(struct mapped_device *md) 2447 { 2448 int r; 2449 2450 WARN_ON(test_bit(DMF_FROZEN, &md->flags)); 2451 2452 r = freeze_bdev(md->disk->part0); 2453 if (!r) 2454 set_bit(DMF_FROZEN, &md->flags); 2455 return r; 2456 } 2457 2458 static void unlock_fs(struct mapped_device *md) 2459 { 2460 if (!test_bit(DMF_FROZEN, &md->flags)) 2461 return; 2462 thaw_bdev(md->disk->part0); 2463 clear_bit(DMF_FROZEN, &md->flags); 2464 } 2465 2466 /* 2467 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG 2468 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE 2469 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY 2470 * 2471 * If __dm_suspend returns 0, the device is completely quiescent 2472 * now. There is no request-processing activity. All new requests 2473 * are being added to md->deferred list. 2474 */ 2475 static int __dm_suspend(struct mapped_device *md, struct dm_table *map, 2476 unsigned suspend_flags, unsigned int task_state, 2477 int dmf_suspended_flag) 2478 { 2479 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; 2480 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; 2481 int r; 2482 2483 lockdep_assert_held(&md->suspend_lock); 2484 2485 /* 2486 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2487 * This flag is cleared before dm_suspend returns. 2488 */ 2489 if (noflush) 2490 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2491 else 2492 DMDEBUG("%s: suspending with flush", dm_device_name(md)); 2493 2494 /* 2495 * This gets reverted if there's an error later and the targets 2496 * provide the .presuspend_undo hook. 2497 */ 2498 dm_table_presuspend_targets(map); 2499 2500 /* 2501 * Flush I/O to the device. 2502 * Any I/O submitted after lock_fs() may not be flushed. 2503 * noflush takes precedence over do_lockfs. 2504 * (lock_fs() flushes I/Os and waits for them to complete.) 2505 */ 2506 if (!noflush && do_lockfs) { 2507 r = lock_fs(md); 2508 if (r) { 2509 dm_table_presuspend_undo_targets(map); 2510 return r; 2511 } 2512 } 2513 2514 /* 2515 * Here we must make sure that no processes are submitting requests 2516 * to target drivers i.e. no one may be executing 2517 * dm_split_and_process_bio from dm_submit_bio. 2518 * 2519 * To get all processes out of dm_split_and_process_bio in dm_submit_bio, 2520 * we take the write lock. 
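 * (The "write lock" here is implemented by setting DMF_BLOCK_IO_FOR_SUSPEND
 * and then calling synchronize_srcu(&md->io_barrier) below, which waits for
 * in-flight dm_get_live_table() readers to drain.)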
To prevent any process from reentering 2521 * dm_split_and_process_bio from dm_submit_bio and quiesce the thread 2522 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call 2523 * flush_workqueue(md->wq). 2524 */ 2525 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2526 if (map) 2527 synchronize_srcu(&md->io_barrier); 2528 2529 /* 2530 * Stop md->queue before flushing md->wq in case request-based 2531 * dm defers requests to md->wq from md->queue. 2532 */ 2533 if (dm_request_based(md)) 2534 dm_stop_queue(md->queue); 2535 2536 flush_workqueue(md->wq); 2537 2538 /* 2539 * At this point no more requests are entering target request routines. 2540 * We call dm_wait_for_completion to wait for all existing requests 2541 * to finish. 2542 */ 2543 r = dm_wait_for_completion(md, task_state); 2544 if (!r) 2545 set_bit(dmf_suspended_flag, &md->flags); 2546 2547 if (noflush) 2548 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2549 if (map) 2550 synchronize_srcu(&md->io_barrier); 2551 2552 /* were we interrupted ? */ 2553 if (r < 0) { 2554 dm_queue_flush(md); 2555 2556 if (dm_request_based(md)) 2557 dm_start_queue(md->queue); 2558 2559 unlock_fs(md); 2560 dm_table_presuspend_undo_targets(map); 2561 /* pushback list is already flushed, so skip flush */ 2562 } 2563 2564 return r; 2565 } 2566 2567 /* 2568 * We need to be able to change a mapping table under a mounted 2569 * filesystem. For example we might want to move some data in 2570 * the background. Before the table can be swapped with 2571 * dm_bind_table, dm_suspend must be called to flush any in 2572 * flight bios and ensure that any further io gets deferred. 2573 */ 2574 /* 2575 * Suspend mechanism in request-based dm. 2576 * 2577 * 1. Flush all I/Os by lock_fs() if needed. 2578 * 2. Stop dispatching any I/O by stopping the request_queue. 2579 * 3. Wait for all in-flight I/Os to be completed or requeued. 2580 * 2581 * To abort suspend, start the request_queue. 2582 */ 2583 int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2584 { 2585 struct dm_table *map = NULL; 2586 int r = 0; 2587 2588 retry: 2589 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 2590 2591 if (dm_suspended_md(md)) { 2592 r = -EINVAL; 2593 goto out_unlock; 2594 } 2595 2596 if (dm_suspended_internally_md(md)) { 2597 /* already internally suspended, wait for internal resume */ 2598 mutex_unlock(&md->suspend_lock); 2599 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 2600 if (r) 2601 return r; 2602 goto retry; 2603 } 2604 2605 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2606 2607 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED); 2608 if (r) 2609 goto out_unlock; 2610 2611 set_bit(DMF_POST_SUSPENDING, &md->flags); 2612 dm_table_postsuspend_targets(map); 2613 clear_bit(DMF_POST_SUSPENDING, &md->flags); 2614 2615 out_unlock: 2616 mutex_unlock(&md->suspend_lock); 2617 return r; 2618 } 2619 2620 static int __dm_resume(struct mapped_device *md, struct dm_table *map) 2621 { 2622 if (map) { 2623 int r = dm_table_resume_targets(map); 2624 if (r) 2625 return r; 2626 } 2627 2628 dm_queue_flush(md); 2629 2630 /* 2631 * Flushing deferred I/Os must be done after targets are resumed 2632 * so that mapping of targets can work correctly. 2633 * Request-based dm is queueing the deferred I/Os in its request_queue. 
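 * For bio-based dm the deferred bios sit on md->deferred and are resubmitted
 * by dm_wq_work() once dm_queue_flush() above has cleared
 * DMF_BLOCK_IO_FOR_SUSPEND.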
2634 */ 2635 if (dm_request_based(md)) 2636 dm_start_queue(md->queue); 2637 2638 unlock_fs(md); 2639 2640 return 0; 2641 } 2642 2643 int dm_resume(struct mapped_device *md) 2644 { 2645 int r; 2646 struct dm_table *map = NULL; 2647 2648 retry: 2649 r = -EINVAL; 2650 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 2651 2652 if (!dm_suspended_md(md)) 2653 goto out; 2654 2655 if (dm_suspended_internally_md(md)) { 2656 /* already internally suspended, wait for internal resume */ 2657 mutex_unlock(&md->suspend_lock); 2658 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 2659 if (r) 2660 return r; 2661 goto retry; 2662 } 2663 2664 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2665 if (!map || !dm_table_get_size(map)) 2666 goto out; 2667 2668 r = __dm_resume(md, map); 2669 if (r) 2670 goto out; 2671 2672 clear_bit(DMF_SUSPENDED, &md->flags); 2673 out: 2674 mutex_unlock(&md->suspend_lock); 2675 2676 return r; 2677 } 2678 2679 /* 2680 * Internal suspend/resume works like userspace-driven suspend. It waits 2681 * until all bios finish and prevents issuing new bios to the target drivers. 2682 * It may be used only from the kernel. 2683 */ 2684 2685 static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) 2686 { 2687 struct dm_table *map = NULL; 2688 2689 lockdep_assert_held(&md->suspend_lock); 2690 2691 if (md->internal_suspend_count++) 2692 return; /* nested internal suspend */ 2693 2694 if (dm_suspended_md(md)) { 2695 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2696 return; /* nest suspend */ 2697 } 2698 2699 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2700 2701 /* 2702 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is 2703 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend 2704 * would require changing .presuspend to return an error -- avoid this 2705 * until there is a need for more elaborate variants of internal suspend. 
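 *
 * The exported pairing built on top of this is used by kernel callers that
 * need to quiesce a device they manage (sketch only):
 *
 *	dm_internal_suspend_noflush(md);
 *	...			// no bios reach the targets here
 *	dm_internal_resume(md);
 *
 * Calls may nest; md->internal_suspend_count keeps them balanced.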
2706 */ 2707 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE, 2708 DMF_SUSPENDED_INTERNALLY); 2709 2710 set_bit(DMF_POST_SUSPENDING, &md->flags); 2711 dm_table_postsuspend_targets(map); 2712 clear_bit(DMF_POST_SUSPENDING, &md->flags); 2713 } 2714 2715 static void __dm_internal_resume(struct mapped_device *md) 2716 { 2717 BUG_ON(!md->internal_suspend_count); 2718 2719 if (--md->internal_suspend_count) 2720 return; /* resume from nested internal suspend */ 2721 2722 if (dm_suspended_md(md)) 2723 goto done; /* resume from nested suspend */ 2724 2725 /* 2726 * NOTE: existing callers don't need to call dm_table_resume_targets 2727 * (which may fail -- so best to avoid it for now by passing NULL map) 2728 */ 2729 (void) __dm_resume(md, NULL); 2730 2731 done: 2732 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2733 smp_mb__after_atomic(); 2734 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); 2735 } 2736 2737 void dm_internal_suspend_noflush(struct mapped_device *md) 2738 { 2739 mutex_lock(&md->suspend_lock); 2740 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); 2741 mutex_unlock(&md->suspend_lock); 2742 } 2743 EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); 2744 2745 void dm_internal_resume(struct mapped_device *md) 2746 { 2747 mutex_lock(&md->suspend_lock); 2748 __dm_internal_resume(md); 2749 mutex_unlock(&md->suspend_lock); 2750 } 2751 EXPORT_SYMBOL_GPL(dm_internal_resume); 2752 2753 /* 2754 * Fast variants of internal suspend/resume hold md->suspend_lock, 2755 * which prevents interaction with userspace-driven suspend. 2756 */ 2757 2758 void dm_internal_suspend_fast(struct mapped_device *md) 2759 { 2760 mutex_lock(&md->suspend_lock); 2761 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 2762 return; 2763 2764 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2765 synchronize_srcu(&md->io_barrier); 2766 flush_workqueue(md->wq); 2767 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2768 } 2769 EXPORT_SYMBOL_GPL(dm_internal_suspend_fast); 2770 2771 void dm_internal_resume_fast(struct mapped_device *md) 2772 { 2773 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 2774 goto done; 2775 2776 dm_queue_flush(md); 2777 2778 done: 2779 mutex_unlock(&md->suspend_lock); 2780 } 2781 EXPORT_SYMBOL_GPL(dm_internal_resume_fast); 2782 2783 /*----------------------------------------------------------------- 2784 * Event notification. 
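 *
 * Typical consumer pattern (an illustrative sketch, not a fixed API rule):
 * snapshot the event counter, do whatever may generate a table event, then
 * block until the counter moves on:
 *
 *	uint32_t ev = dm_get_event_nr(md);
 *	...
 *	r = dm_wait_event(md, ev);	// r < 0 if interrupted by a signal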
2785 *---------------------------------------------------------------*/ 2786 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 2787 unsigned cookie) 2788 { 2789 int r; 2790 unsigned noio_flag; 2791 char udev_cookie[DM_COOKIE_LENGTH]; 2792 char *envp[] = { udev_cookie, NULL }; 2793 2794 noio_flag = memalloc_noio_save(); 2795 2796 if (!cookie) 2797 r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 2798 else { 2799 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 2800 DM_COOKIE_ENV_VAR_NAME, cookie); 2801 r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, 2802 action, envp); 2803 } 2804 2805 memalloc_noio_restore(noio_flag); 2806 2807 return r; 2808 } 2809 2810 uint32_t dm_next_uevent_seq(struct mapped_device *md) 2811 { 2812 return atomic_add_return(1, &md->uevent_seq); 2813 } 2814 2815 uint32_t dm_get_event_nr(struct mapped_device *md) 2816 { 2817 return atomic_read(&md->event_nr); 2818 } 2819 2820 int dm_wait_event(struct mapped_device *md, int event_nr) 2821 { 2822 return wait_event_interruptible(md->eventq, 2823 (event_nr != atomic_read(&md->event_nr))); 2824 } 2825 2826 void dm_uevent_add(struct mapped_device *md, struct list_head *elist) 2827 { 2828 unsigned long flags; 2829 2830 spin_lock_irqsave(&md->uevent_lock, flags); 2831 list_add(elist, &md->uevent_list); 2832 spin_unlock_irqrestore(&md->uevent_lock, flags); 2833 } 2834 2835 /* 2836 * The gendisk is only valid as long as you have a reference 2837 * count on 'md'. 2838 */ 2839 struct gendisk *dm_disk(struct mapped_device *md) 2840 { 2841 return md->disk; 2842 } 2843 EXPORT_SYMBOL_GPL(dm_disk); 2844 2845 struct kobject *dm_kobject(struct mapped_device *md) 2846 { 2847 return &md->kobj_holder.kobj; 2848 } 2849 2850 struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 2851 { 2852 struct mapped_device *md; 2853 2854 md = container_of(kobj, struct mapped_device, kobj_holder.kobj); 2855 2856 spin_lock(&_minor_lock); 2857 if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { 2858 md = NULL; 2859 goto out; 2860 } 2861 dm_get(md); 2862 out: 2863 spin_unlock(&_minor_lock); 2864 2865 return md; 2866 } 2867 2868 int dm_suspended_md(struct mapped_device *md) 2869 { 2870 return test_bit(DMF_SUSPENDED, &md->flags); 2871 } 2872 2873 static int dm_post_suspending_md(struct mapped_device *md) 2874 { 2875 return test_bit(DMF_POST_SUSPENDING, &md->flags); 2876 } 2877 2878 int dm_suspended_internally_md(struct mapped_device *md) 2879 { 2880 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2881 } 2882 2883 int dm_test_deferred_remove_flag(struct mapped_device *md) 2884 { 2885 return test_bit(DMF_DEFERRED_REMOVE, &md->flags); 2886 } 2887 2888 int dm_suspended(struct dm_target *ti) 2889 { 2890 return dm_suspended_md(ti->table->md); 2891 } 2892 EXPORT_SYMBOL_GPL(dm_suspended); 2893 2894 int dm_post_suspending(struct dm_target *ti) 2895 { 2896 return dm_post_suspending_md(ti->table->md); 2897 } 2898 EXPORT_SYMBOL_GPL(dm_post_suspending); 2899 2900 int dm_noflush_suspending(struct dm_target *ti) 2901 { 2902 return __noflush_suspending(ti->table->md); 2903 } 2904 EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2905 2906 struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, 2907 unsigned integrity, unsigned per_io_data_size, 2908 unsigned min_pool_size) 2909 { 2910 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); 2911 unsigned int pool_size = 0; 2912 unsigned int front_pad, io_front_pad; 2913 int ret; 2914 2915 if 
(!pools) 2916 return NULL; 2917 2918 switch (type) { 2919 case DM_TYPE_BIO_BASED: 2920 case DM_TYPE_DAX_BIO_BASED: 2921 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); 2922 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; 2923 io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; 2924 ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0); 2925 if (ret) 2926 goto out; 2927 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size)) 2928 goto out; 2929 break; 2930 case DM_TYPE_REQUEST_BASED: 2931 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size); 2932 front_pad = offsetof(struct dm_rq_clone_bio_info, clone); 2933 /* per_io_data_size is used for blk-mq pdu at queue allocation */ 2934 break; 2935 default: 2936 BUG(); 2937 } 2938 2939 ret = bioset_init(&pools->bs, pool_size, front_pad, 0); 2940 if (ret) 2941 goto out; 2942 2943 if (integrity && bioset_integrity_create(&pools->bs, pool_size)) 2944 goto out; 2945 2946 return pools; 2947 2948 out: 2949 dm_free_md_mempools(pools); 2950 2951 return NULL; 2952 } 2953 2954 void dm_free_md_mempools(struct dm_md_mempools *pools) 2955 { 2956 if (!pools) 2957 return; 2958 2959 bioset_exit(&pools->bs); 2960 bioset_exit(&pools->io_bs); 2961 2962 kfree(pools); 2963 } 2964 2965 struct dm_pr { 2966 u64 old_key; 2967 u64 new_key; 2968 u32 flags; 2969 bool fail_early; 2970 }; 2971 2972 static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn, 2973 void *data) 2974 { 2975 struct mapped_device *md = bdev->bd_disk->private_data; 2976 struct dm_table *table; 2977 struct dm_target *ti; 2978 int ret = -ENOTTY, srcu_idx; 2979 2980 table = dm_get_live_table(md, &srcu_idx); 2981 if (!table || !dm_table_get_size(table)) 2982 goto out; 2983 2984 /* We only support devices that have a single target */ 2985 if (dm_table_get_num_targets(table) != 1) 2986 goto out; 2987 ti = dm_table_get_target(table, 0); 2988 2989 ret = -EINVAL; 2990 if (!ti->type->iterate_devices) 2991 goto out; 2992 2993 ret = ti->type->iterate_devices(ti, fn, data); 2994 out: 2995 dm_put_live_table(md, srcu_idx); 2996 return ret; 2997 } 2998 2999 /* 3000 * For register / unregister we need to manually call out to every path. 
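 * dm_call_pr() above resolves the live table's single target and invokes the
 * callback once per underlying device via .iterate_devices, so
 * __dm_pr_register() below runs for every path.  If registering a new key
 * fails on any path, dm_pr_register() rolls back the paths that did succeed
 * by re-registering with the new key as old_key and a zero new_key, i.e. by
 * unregistering them.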
3001 */ 3002 static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev, 3003 sector_t start, sector_t len, void *data) 3004 { 3005 struct dm_pr *pr = data; 3006 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; 3007 3008 if (!ops || !ops->pr_register) 3009 return -EOPNOTSUPP; 3010 return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags); 3011 } 3012 3013 static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, 3014 u32 flags) 3015 { 3016 struct dm_pr pr = { 3017 .old_key = old_key, 3018 .new_key = new_key, 3019 .flags = flags, 3020 .fail_early = true, 3021 }; 3022 int ret; 3023 3024 ret = dm_call_pr(bdev, __dm_pr_register, &pr); 3025 if (ret && new_key) { 3026 /* unregister all paths if we failed to register any path */ 3027 pr.old_key = new_key; 3028 pr.new_key = 0; 3029 pr.flags = 0; 3030 pr.fail_early = false; 3031 dm_call_pr(bdev, __dm_pr_register, &pr); 3032 } 3033 3034 return ret; 3035 } 3036 3037 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, 3038 u32 flags) 3039 { 3040 struct mapped_device *md = bdev->bd_disk->private_data; 3041 const struct pr_ops *ops; 3042 int r, srcu_idx; 3043 3044 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3045 if (r < 0) 3046 goto out; 3047 3048 ops = bdev->bd_disk->fops->pr_ops; 3049 if (ops && ops->pr_reserve) 3050 r = ops->pr_reserve(bdev, key, type, flags); 3051 else 3052 r = -EOPNOTSUPP; 3053 out: 3054 dm_unprepare_ioctl(md, srcu_idx); 3055 return r; 3056 } 3057 3058 static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) 3059 { 3060 struct mapped_device *md = bdev->bd_disk->private_data; 3061 const struct pr_ops *ops; 3062 int r, srcu_idx; 3063 3064 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3065 if (r < 0) 3066 goto out; 3067 3068 ops = bdev->bd_disk->fops->pr_ops; 3069 if (ops && ops->pr_release) 3070 r = ops->pr_release(bdev, key, type); 3071 else 3072 r = -EOPNOTSUPP; 3073 out: 3074 dm_unprepare_ioctl(md, srcu_idx); 3075 return r; 3076 } 3077 3078 static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, 3079 enum pr_type type, bool abort) 3080 { 3081 struct mapped_device *md = bdev->bd_disk->private_data; 3082 const struct pr_ops *ops; 3083 int r, srcu_idx; 3084 3085 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3086 if (r < 0) 3087 goto out; 3088 3089 ops = bdev->bd_disk->fops->pr_ops; 3090 if (ops && ops->pr_preempt) 3091 r = ops->pr_preempt(bdev, old_key, new_key, type, abort); 3092 else 3093 r = -EOPNOTSUPP; 3094 out: 3095 dm_unprepare_ioctl(md, srcu_idx); 3096 return r; 3097 } 3098 3099 static int dm_pr_clear(struct block_device *bdev, u64 key) 3100 { 3101 struct mapped_device *md = bdev->bd_disk->private_data; 3102 const struct pr_ops *ops; 3103 int r, srcu_idx; 3104 3105 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3106 if (r < 0) 3107 goto out; 3108 3109 ops = bdev->bd_disk->fops->pr_ops; 3110 if (ops && ops->pr_clear) 3111 r = ops->pr_clear(bdev, key); 3112 else 3113 r = -EOPNOTSUPP; 3114 out: 3115 dm_unprepare_ioctl(md, srcu_idx); 3116 return r; 3117 } 3118 3119 static const struct pr_ops dm_pr_ops = { 3120 .pr_register = dm_pr_register, 3121 .pr_reserve = dm_pr_reserve, 3122 .pr_release = dm_pr_release, 3123 .pr_preempt = dm_pr_preempt, 3124 .pr_clear = dm_pr_clear, 3125 }; 3126 3127 static const struct block_device_operations dm_blk_dops = { 3128 .submit_bio = dm_submit_bio, 3129 .poll_bio = dm_poll_bio, 3130 .open = dm_blk_open, 3131 .release = dm_blk_close, 3132 .ioctl = dm_blk_ioctl, 3133 .getgeo = 
dm_blk_getgeo, 3134 .report_zones = dm_blk_report_zones, 3135 .pr_ops = &dm_pr_ops, 3136 .owner = THIS_MODULE 3137 }; 3138 3139 static const struct block_device_operations dm_rq_blk_dops = { 3140 .open = dm_blk_open, 3141 .release = dm_blk_close, 3142 .ioctl = dm_blk_ioctl, 3143 .getgeo = dm_blk_getgeo, 3144 .pr_ops = &dm_pr_ops, 3145 .owner = THIS_MODULE 3146 }; 3147 3148 static const struct dax_operations dm_dax_ops = { 3149 .direct_access = dm_dax_direct_access, 3150 .zero_page_range = dm_dax_zero_page_range, 3151 }; 3152 3153 /* 3154 * module hooks 3155 */ 3156 module_init(dm_init); 3157 module_exit(dm_exit); 3158 3159 module_param(major, uint, 0); 3160 MODULE_PARM_DESC(major, "The major number of the device mapper"); 3161 3162 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR); 3163 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); 3164 3165 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR); 3166 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations"); 3167 3168 module_param(swap_bios, int, S_IRUGO | S_IWUSR); 3169 MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs"); 3170 3171 MODULE_DESCRIPTION(DM_NAME " driver"); 3172 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 3173 MODULE_LICENSE("GPL"); 3174
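/*
 * Illustrative usage of the tunables above (values are arbitrary examples and
 * the paths assume device-mapper is built as the dm_mod module):
 *
 *	modprobe dm-mod major=0 reserved_bio_based_ios=32
 *
 * The S_IWUSR parameters may also be changed at runtime, e.g.:
 *
 *	echo 16 > /sys/module/dm_mod/parameters/swap_bios
 *	cat /sys/module/dm_mod/parameters/dm_numa_node
 */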