1 /* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm-core.h" 9 #include "dm-rq.h" 10 #include "dm-uevent.h" 11 #include "dm-ima.h" 12 13 #include <linux/init.h> 14 #include <linux/module.h> 15 #include <linux/mutex.h> 16 #include <linux/sched/mm.h> 17 #include <linux/sched/signal.h> 18 #include <linux/blkpg.h> 19 #include <linux/bio.h> 20 #include <linux/mempool.h> 21 #include <linux/dax.h> 22 #include <linux/slab.h> 23 #include <linux/idr.h> 24 #include <linux/uio.h> 25 #include <linux/hdreg.h> 26 #include <linux/delay.h> 27 #include <linux/wait.h> 28 #include <linux/pr.h> 29 #include <linux/refcount.h> 30 #include <linux/part_stat.h> 31 #include <linux/blk-crypto.h> 32 #include <linux/blk-crypto-profile.h> 33 34 #define DM_MSG_PREFIX "core" 35 36 /* 37 * Cookies are numeric values sent with CHANGE and REMOVE 38 * uevents while resuming, removing or renaming the device. 39 */ 40 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 41 #define DM_COOKIE_LENGTH 24 42 43 /* 44 * For REQ_POLLED fs bio, this flag is set if we link mapped underlying 45 * dm_io into one list, and reuse bio->bi_private as the list head. Before 46 * ending this fs bio, we will recover its ->bi_private. 47 */ 48 #define REQ_DM_POLL_LIST REQ_DRV 49 50 static const char *_name = DM_NAME; 51 52 static unsigned int major = 0; 53 static unsigned int _major = 0; 54 55 static DEFINE_IDR(_minor_idr); 56 57 static DEFINE_SPINLOCK(_minor_lock); 58 59 static void do_deferred_remove(struct work_struct *w); 60 61 static DECLARE_WORK(deferred_remove_work, do_deferred_remove); 62 63 static struct workqueue_struct *deferred_remove_workqueue; 64 65 atomic_t dm_global_event_nr = ATOMIC_INIT(0); 66 DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq); 67 68 void dm_issue_global_event(void) 69 { 70 atomic_inc(&dm_global_event_nr); 71 wake_up(&dm_global_eventq); 72 } 73 74 /* 75 * One of these is allocated (on-stack) per original bio. 
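 * It tracks the walk across the live table while that bio is split:
 * 'sector' and 'sector_count' advance as clones are issued to each
 * target, and 'io' points to the dm_io that accounts for the whole
 * original bio.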
76 */ 77 struct clone_info { 78 struct dm_table *map; 79 struct bio *bio; 80 struct dm_io *io; 81 sector_t sector; 82 unsigned sector_count; 83 bool submit_as_polled; 84 }; 85 86 #define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) 87 #define DM_IO_BIO_OFFSET \ 88 (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio)) 89 90 static inline struct dm_target_io *clone_to_tio(struct bio *clone) 91 { 92 return container_of(clone, struct dm_target_io, clone); 93 } 94 95 void *dm_per_bio_data(struct bio *bio, size_t data_size) 96 { 97 if (!dm_tio_flagged(clone_to_tio(bio), DM_TIO_INSIDE_DM_IO)) 98 return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size; 99 return (char *)bio - DM_IO_BIO_OFFSET - data_size; 100 } 101 EXPORT_SYMBOL_GPL(dm_per_bio_data); 102 103 struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size) 104 { 105 struct dm_io *io = (struct dm_io *)((char *)data + data_size); 106 if (io->magic == DM_IO_MAGIC) 107 return (struct bio *)((char *)io + DM_IO_BIO_OFFSET); 108 BUG_ON(io->magic != DM_TIO_MAGIC); 109 return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET); 110 } 111 EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data); 112 113 unsigned dm_bio_get_target_bio_nr(const struct bio *bio) 114 { 115 return container_of(bio, struct dm_target_io, clone)->target_bio_nr; 116 } 117 EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr); 118 119 #define MINOR_ALLOCED ((void *)-1) 120 121 #define DM_NUMA_NODE NUMA_NO_NODE 122 static int dm_numa_node = DM_NUMA_NODE; 123 124 #define DEFAULT_SWAP_BIOS (8 * 1048576 / PAGE_SIZE) 125 static int swap_bios = DEFAULT_SWAP_BIOS; 126 static int get_swap_bios(void) 127 { 128 int latch = READ_ONCE(swap_bios); 129 if (unlikely(latch <= 0)) 130 latch = DEFAULT_SWAP_BIOS; 131 return latch; 132 } 133 134 /* 135 * For mempools pre-allocation at the table loading time. 136 */ 137 struct dm_md_mempools { 138 struct bio_set bs; 139 struct bio_set io_bs; 140 }; 141 142 struct table_device { 143 struct list_head list; 144 refcount_t count; 145 struct dm_dev dm_dev; 146 }; 147 148 /* 149 * Bio-based DM's mempools' reserved IOs set by the user. 
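 * A value of 0 falls back to RESERVED_BIO_BASED_IOS and anything above
 * DM_RESERVED_MAX_IOS is clamped; see dm_get_reserved_bio_based_ios()
 * and __dm_get_module_param() below.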
150 */ 151 #define RESERVED_BIO_BASED_IOS 16 152 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; 153 154 static int __dm_get_module_param_int(int *module_param, int min, int max) 155 { 156 int param = READ_ONCE(*module_param); 157 int modified_param = 0; 158 bool modified = true; 159 160 if (param < min) 161 modified_param = min; 162 else if (param > max) 163 modified_param = max; 164 else 165 modified = false; 166 167 if (modified) { 168 (void)cmpxchg(module_param, param, modified_param); 169 param = modified_param; 170 } 171 172 return param; 173 } 174 175 unsigned __dm_get_module_param(unsigned *module_param, 176 unsigned def, unsigned max) 177 { 178 unsigned param = READ_ONCE(*module_param); 179 unsigned modified_param = 0; 180 181 if (!param) 182 modified_param = def; 183 else if (param > max) 184 modified_param = max; 185 186 if (modified_param) { 187 (void)cmpxchg(module_param, param, modified_param); 188 param = modified_param; 189 } 190 191 return param; 192 } 193 194 unsigned dm_get_reserved_bio_based_ios(void) 195 { 196 return __dm_get_module_param(&reserved_bio_based_ios, 197 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS); 198 } 199 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); 200 201 static unsigned dm_get_numa_node(void) 202 { 203 return __dm_get_module_param_int(&dm_numa_node, 204 DM_NUMA_NODE, num_online_nodes() - 1); 205 } 206 207 static int __init local_init(void) 208 { 209 int r; 210 211 r = dm_uevent_init(); 212 if (r) 213 return r; 214 215 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); 216 if (!deferred_remove_workqueue) { 217 r = -ENOMEM; 218 goto out_uevent_exit; 219 } 220 221 _major = major; 222 r = register_blkdev(_major, _name); 223 if (r < 0) 224 goto out_free_workqueue; 225 226 if (!_major) 227 _major = r; 228 229 return 0; 230 231 out_free_workqueue: 232 destroy_workqueue(deferred_remove_workqueue); 233 out_uevent_exit: 234 dm_uevent_exit(); 235 236 return r; 237 } 238 239 static void local_exit(void) 240 { 241 flush_scheduled_work(); 242 destroy_workqueue(deferred_remove_workqueue); 243 244 unregister_blkdev(_major, _name); 245 dm_uevent_exit(); 246 247 _major = 0; 248 249 DMINFO("cleaned up"); 250 } 251 252 static int (*_inits[])(void) __initdata = { 253 local_init, 254 dm_target_init, 255 dm_linear_init, 256 dm_stripe_init, 257 dm_io_init, 258 dm_kcopyd_init, 259 dm_interface_init, 260 dm_statistics_init, 261 }; 262 263 static void (*_exits[])(void) = { 264 local_exit, 265 dm_target_exit, 266 dm_linear_exit, 267 dm_stripe_exit, 268 dm_io_exit, 269 dm_kcopyd_exit, 270 dm_interface_exit, 271 dm_statistics_exit, 272 }; 273 274 static int __init dm_init(void) 275 { 276 const int count = ARRAY_SIZE(_inits); 277 int r, i; 278 279 #if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)) 280 DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled." 281 " Duplicate IMA measurements will not be recorded in the IMA log."); 282 #endif 283 284 for (i = 0; i < count; i++) { 285 r = _inits[i](); 286 if (r) 287 goto bad; 288 } 289 290 return 0; 291 bad: 292 while (i--) 293 _exits[i](); 294 295 return r; 296 } 297 298 static void __exit dm_exit(void) 299 { 300 int i = ARRAY_SIZE(_exits); 301 302 while (i--) 303 _exits[i](); 304 305 /* 306 * Should be empty by this point. 
307 */ 308 idr_destroy(&_minor_idr); 309 } 310 311 /* 312 * Block device functions 313 */ 314 int dm_deleting_md(struct mapped_device *md) 315 { 316 return test_bit(DMF_DELETING, &md->flags); 317 } 318 319 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 320 { 321 struct mapped_device *md; 322 323 spin_lock(&_minor_lock); 324 325 md = bdev->bd_disk->private_data; 326 if (!md) 327 goto out; 328 329 if (test_bit(DMF_FREEING, &md->flags) || 330 dm_deleting_md(md)) { 331 md = NULL; 332 goto out; 333 } 334 335 dm_get(md); 336 atomic_inc(&md->open_count); 337 out: 338 spin_unlock(&_minor_lock); 339 340 return md ? 0 : -ENXIO; 341 } 342 343 static void dm_blk_close(struct gendisk *disk, fmode_t mode) 344 { 345 struct mapped_device *md; 346 347 spin_lock(&_minor_lock); 348 349 md = disk->private_data; 350 if (WARN_ON(!md)) 351 goto out; 352 353 if (atomic_dec_and_test(&md->open_count) && 354 (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) 355 queue_work(deferred_remove_workqueue, &deferred_remove_work); 356 357 dm_put(md); 358 out: 359 spin_unlock(&_minor_lock); 360 } 361 362 int dm_open_count(struct mapped_device *md) 363 { 364 return atomic_read(&md->open_count); 365 } 366 367 /* 368 * Guarantees nothing is using the device before it's deleted. 369 */ 370 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) 371 { 372 int r = 0; 373 374 spin_lock(&_minor_lock); 375 376 if (dm_open_count(md)) { 377 r = -EBUSY; 378 if (mark_deferred) 379 set_bit(DMF_DEFERRED_REMOVE, &md->flags); 380 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) 381 r = -EEXIST; 382 else 383 set_bit(DMF_DELETING, &md->flags); 384 385 spin_unlock(&_minor_lock); 386 387 return r; 388 } 389 390 int dm_cancel_deferred_remove(struct mapped_device *md) 391 { 392 int r = 0; 393 394 spin_lock(&_minor_lock); 395 396 if (test_bit(DMF_DELETING, &md->flags)) 397 r = -EBUSY; 398 else 399 clear_bit(DMF_DEFERRED_REMOVE, &md->flags); 400 401 spin_unlock(&_minor_lock); 402 403 return r; 404 } 405 406 static void do_deferred_remove(struct work_struct *w) 407 { 408 dm_deferred_remove(); 409 } 410 411 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 412 { 413 struct mapped_device *md = bdev->bd_disk->private_data; 414 415 return dm_get_geometry(md, geo); 416 } 417 418 static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, 419 struct block_device **bdev) 420 { 421 struct dm_target *tgt; 422 struct dm_table *map; 423 int r; 424 425 retry: 426 r = -ENOTTY; 427 map = dm_get_live_table(md, srcu_idx); 428 if (!map || !dm_table_get_size(map)) 429 return r; 430 431 /* We only support devices that have a single target */ 432 if (dm_table_get_num_targets(map) != 1) 433 return r; 434 435 tgt = dm_table_get_target(map, 0); 436 if (!tgt->type->prepare_ioctl) 437 return r; 438 439 if (dm_suspended_md(md)) 440 return -EAGAIN; 441 442 r = tgt->type->prepare_ioctl(tgt, bdev); 443 if (r == -ENOTCONN && !fatal_signal_pending(current)) { 444 dm_put_live_table(md, *srcu_idx); 445 msleep(10); 446 goto retry; 447 } 448 449 return r; 450 } 451 452 static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx) 453 { 454 dm_put_live_table(md, srcu_idx); 455 } 456 457 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 458 unsigned int cmd, unsigned long arg) 459 { 460 struct mapped_device *md = bdev->bd_disk->private_data; 461 int r, srcu_idx; 462 463 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 464 if (r < 0) 465 goto out; 466 467 if (r > 0) 
{ 468 /* 469 * Target determined this ioctl is being issued against a 470 * subset of the parent bdev; require extra privileges. 471 */ 472 if (!capable(CAP_SYS_RAWIO)) { 473 DMDEBUG_LIMIT( 474 "%s: sending ioctl %x to DM device without required privilege.", 475 current->comm, cmd); 476 r = -ENOIOCTLCMD; 477 goto out; 478 } 479 } 480 481 if (!bdev->bd_disk->fops->ioctl) 482 r = -ENOTTY; 483 else 484 r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); 485 out: 486 dm_unprepare_ioctl(md, srcu_idx); 487 return r; 488 } 489 490 u64 dm_start_time_ns_from_clone(struct bio *bio) 491 { 492 return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time); 493 } 494 EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); 495 496 static bool bio_is_flush_with_data(struct bio *bio) 497 { 498 return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size); 499 } 500 501 static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio, 502 unsigned long start_time, struct dm_stats_aux *stats_aux) 503 { 504 bool is_flush_with_data; 505 unsigned int bi_size; 506 507 /* If REQ_PREFLUSH set save any payload but do not account it */ 508 is_flush_with_data = bio_is_flush_with_data(bio); 509 if (is_flush_with_data) { 510 bi_size = bio->bi_iter.bi_size; 511 bio->bi_iter.bi_size = 0; 512 } 513 514 if (!end) 515 bio_start_io_acct_time(bio, start_time); 516 else 517 bio_end_io_acct(bio, start_time); 518 519 if (unlikely(dm_stats_used(&md->stats))) 520 dm_stats_account_io(&md->stats, bio_data_dir(bio), 521 bio->bi_iter.bi_sector, bio_sectors(bio), 522 end, start_time, stats_aux); 523 524 /* Restore bio's payload so it does get accounted upon requeue */ 525 if (is_flush_with_data) 526 bio->bi_iter.bi_size = bi_size; 527 } 528 529 static void __dm_start_io_acct(struct dm_io *io, struct bio *bio) 530 { 531 dm_io_acct(false, io->md, bio, io->start_time, &io->stats_aux); 532 } 533 534 static void dm_start_io_acct(struct dm_io *io, struct bio *clone) 535 { 536 /* Must account IO to DM device in terms of orig_bio */ 537 struct bio *bio = io->orig_bio; 538 539 /* 540 * Ensure IO accounting is only ever started once. 541 * Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. 
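 * (Duplicate clones issued by __send_duplicate_bios() may each reach
 * this function, so that path takes io->startio_lock below; the common
 * single-clone case stays lock-free.)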
542 */ 543 if (!clone || 544 likely(!dm_tio_flagged(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO))) { 545 if (WARN_ON_ONCE(dm_io_flagged(io, DM_IO_ACCOUNTED))) 546 return; 547 dm_io_set_flag(io, DM_IO_ACCOUNTED); 548 } else { 549 unsigned long flags; 550 if (dm_io_flagged(io, DM_IO_ACCOUNTED)) 551 return; 552 /* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */ 553 spin_lock_irqsave(&io->startio_lock, flags); 554 dm_io_set_flag(io, DM_IO_ACCOUNTED); 555 spin_unlock_irqrestore(&io->startio_lock, flags); 556 } 557 558 __dm_start_io_acct(io, bio); 559 } 560 561 static void dm_end_io_acct(struct dm_io *io, struct bio *bio) 562 { 563 dm_io_acct(true, io->md, bio, io->start_time, &io->stats_aux); 564 } 565 566 static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) 567 { 568 struct dm_io *io; 569 struct dm_target_io *tio; 570 struct bio *clone; 571 572 clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs); 573 574 tio = clone_to_tio(clone); 575 tio->flags = 0; 576 dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO); 577 tio->io = NULL; 578 579 io = container_of(tio, struct dm_io, tio); 580 io->magic = DM_IO_MAGIC; 581 io->status = 0; 582 atomic_set(&io->io_count, 1); 583 this_cpu_inc(*md->pending_io); 584 io->orig_bio = NULL; 585 io->md = md; 586 io->map_task = current; 587 spin_lock_init(&io->startio_lock); 588 spin_lock_init(&io->endio_lock); 589 io->start_time = jiffies; 590 io->flags = 0; 591 592 dm_stats_record_start(&md->stats, &io->stats_aux); 593 594 return io; 595 } 596 597 static void free_io(struct dm_io *io) 598 { 599 bio_put(&io->tio.clone); 600 } 601 602 static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, 603 unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask) 604 { 605 struct dm_target_io *tio; 606 struct bio *clone; 607 608 if (!ci->io->tio.io) { 609 /* the dm_target_io embedded in ci->io is available */ 610 tio = &ci->io->tio; 611 /* alloc_io() already initialized embedded clone */ 612 clone = &tio->clone; 613 } else { 614 clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio, 615 gfp_mask, &ci->io->md->bs); 616 if (!clone) 617 return NULL; 618 619 /* REQ_DM_POLL_LIST shouldn't be inherited */ 620 clone->bi_opf &= ~REQ_DM_POLL_LIST; 621 622 tio = clone_to_tio(clone); 623 tio->flags = 0; /* also clears DM_TIO_INSIDE_DM_IO */ 624 } 625 626 tio->magic = DM_TIO_MAGIC; 627 tio->io = ci->io; 628 tio->ti = ti; 629 tio->target_bio_nr = target_bio_nr; 630 tio->len_ptr = len; 631 tio->old_sector = 0; 632 633 if (len) { 634 clone->bi_iter.bi_size = to_bytes(*len); 635 if (bio_integrity(clone)) 636 bio_integrity_trim(clone); 637 } 638 639 return clone; 640 } 641 642 static void free_tio(struct bio *clone) 643 { 644 if (dm_tio_flagged(clone_to_tio(clone), DM_TIO_INSIDE_DM_IO)) 645 return; 646 bio_put(clone); 647 } 648 649 /* 650 * Add the bio to the list of deferred io. 651 */ 652 static void queue_io(struct mapped_device *md, struct bio *bio) 653 { 654 unsigned long flags; 655 656 spin_lock_irqsave(&md->deferred_lock, flags); 657 bio_list_add(&md->deferred, bio); 658 spin_unlock_irqrestore(&md->deferred_lock, flags); 659 queue_work(md->wq, &md->work); 660 } 661 662 /* 663 * Everyone (including functions in this file), should use this 664 * function to access the md->map field, and make sure they call 665 * dm_put_live_table() when finished. 
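 *
 * Typical usage, following the pattern of dm_submit_bio() below (sketch):
 *
 *	int srcu_idx;
 *	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 *
 *	if (map) {
 *		... use the table ...
 *	}
 *	dm_put_live_table(md, srcu_idx);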
666 */ 667 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) 668 { 669 *srcu_idx = srcu_read_lock(&md->io_barrier); 670 671 return srcu_dereference(md->map, &md->io_barrier); 672 } 673 674 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) 675 { 676 srcu_read_unlock(&md->io_barrier, srcu_idx); 677 } 678 679 void dm_sync_table(struct mapped_device *md) 680 { 681 synchronize_srcu(&md->io_barrier); 682 synchronize_rcu_expedited(); 683 } 684 685 /* 686 * A fast alternative to dm_get_live_table/dm_put_live_table. 687 * The caller must not block between these two functions. 688 */ 689 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) 690 { 691 rcu_read_lock(); 692 return rcu_dereference(md->map); 693 } 694 695 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) 696 { 697 rcu_read_unlock(); 698 } 699 700 static char *_dm_claim_ptr = "I belong to device-mapper"; 701 702 /* 703 * Open a table device so we can use it as a map destination. 704 */ 705 static int open_table_device(struct table_device *td, dev_t dev, 706 struct mapped_device *md) 707 { 708 struct block_device *bdev; 709 u64 part_off; 710 int r; 711 712 BUG_ON(td->dm_dev.bdev); 713 714 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr); 715 if (IS_ERR(bdev)) 716 return PTR_ERR(bdev); 717 718 r = bd_link_disk_holder(bdev, dm_disk(md)); 719 if (r) { 720 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); 721 return r; 722 } 723 724 td->dm_dev.bdev = bdev; 725 td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off); 726 return 0; 727 } 728 729 /* 730 * Close a table device that we've been using. 731 */ 732 static void close_table_device(struct table_device *td, struct mapped_device *md) 733 { 734 if (!td->dm_dev.bdev) 735 return; 736 737 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); 738 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); 739 put_dax(td->dm_dev.dax_dev); 740 td->dm_dev.bdev = NULL; 741 td->dm_dev.dax_dev = NULL; 742 } 743 744 static struct table_device *find_table_device(struct list_head *l, dev_t dev, 745 fmode_t mode) 746 { 747 struct table_device *td; 748 749 list_for_each_entry(td, l, list) 750 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) 751 return td; 752 753 return NULL; 754 } 755 756 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, 757 struct dm_dev **result) 758 { 759 int r; 760 struct table_device *td; 761 762 mutex_lock(&md->table_devices_lock); 763 td = find_table_device(&md->table_devices, dev, mode); 764 if (!td) { 765 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); 766 if (!td) { 767 mutex_unlock(&md->table_devices_lock); 768 return -ENOMEM; 769 } 770 771 td->dm_dev.mode = mode; 772 td->dm_dev.bdev = NULL; 773 774 if ((r = open_table_device(td, dev, md))) { 775 mutex_unlock(&md->table_devices_lock); 776 kfree(td); 777 return r; 778 } 779 780 format_dev_t(td->dm_dev.name, dev); 781 782 refcount_set(&td->count, 1); 783 list_add(&td->list, &md->table_devices); 784 } else { 785 refcount_inc(&td->count); 786 } 787 mutex_unlock(&md->table_devices_lock); 788 789 *result = &td->dm_dev; 790 return 0; 791 } 792 793 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) 794 { 795 struct table_device *td = container_of(d, struct table_device, dm_dev); 796 797 mutex_lock(&md->table_devices_lock); 798 if (refcount_dec_and_test(&td->count)) { 799 
close_table_device(td, md); 800 list_del(&td->list); 801 kfree(td); 802 } 803 mutex_unlock(&md->table_devices_lock); 804 } 805 806 static void free_table_devices(struct list_head *devices) 807 { 808 struct list_head *tmp, *next; 809 810 list_for_each_safe(tmp, next, devices) { 811 struct table_device *td = list_entry(tmp, struct table_device, list); 812 813 DMWARN("dm_destroy: %s still exists with %d references", 814 td->dm_dev.name, refcount_read(&td->count)); 815 kfree(td); 816 } 817 } 818 819 /* 820 * Get the geometry associated with a dm device 821 */ 822 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 823 { 824 *geo = md->geometry; 825 826 return 0; 827 } 828 829 /* 830 * Set the geometry of a device. 831 */ 832 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 833 { 834 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 835 836 if (geo->start > sz) { 837 DMWARN("Start sector is beyond the geometry limits."); 838 return -EINVAL; 839 } 840 841 md->geometry = *geo; 842 843 return 0; 844 } 845 846 static int __noflush_suspending(struct mapped_device *md) 847 { 848 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 849 } 850 851 static void dm_io_complete(struct dm_io *io) 852 { 853 blk_status_t io_error; 854 struct mapped_device *md = io->md; 855 struct bio *bio = io->orig_bio; 856 857 if (io->status == BLK_STS_DM_REQUEUE) { 858 unsigned long flags; 859 /* 860 * Target requested pushing back the I/O. 861 */ 862 spin_lock_irqsave(&md->deferred_lock, flags); 863 if (__noflush_suspending(md) && 864 !WARN_ON_ONCE(dm_is_zone_write(md, bio))) { 865 /* NOTE early return due to BLK_STS_DM_REQUEUE below */ 866 bio_list_add_head(&md->deferred, bio); 867 } else { 868 /* 869 * noflush suspend was interrupted or this is 870 * a write to a zoned target. 871 */ 872 io->status = BLK_STS_IOERR; 873 } 874 spin_unlock_irqrestore(&md->deferred_lock, flags); 875 } 876 877 io_error = io->status; 878 if (dm_io_flagged(io, DM_IO_ACCOUNTED)) 879 dm_end_io_acct(io, bio); 880 else if (!io_error) { 881 /* 882 * Must handle target that DM_MAPIO_SUBMITTED only to 883 * then bio_endio() rather than dm_submit_bio_remap() 884 */ 885 __dm_start_io_acct(io, bio); 886 dm_end_io_acct(io, bio); 887 } 888 free_io(io); 889 smp_wmb(); 890 this_cpu_dec(*md->pending_io); 891 892 /* nudge anyone waiting on suspend queue */ 893 if (unlikely(wq_has_sleeper(&md->wait))) 894 wake_up(&md->wait); 895 896 if (io_error == BLK_STS_DM_REQUEUE) { 897 /* 898 * Upper layer won't help us poll split bio, io->orig_bio 899 * may only reflect a subset of the pre-split original, 900 * so clear REQ_POLLED in case of requeue 901 */ 902 bio->bi_opf &= ~REQ_POLLED; 903 return; 904 } 905 906 if (bio_is_flush_with_data(bio)) { 907 /* 908 * Preflush done for flush with data, reissue 909 * without REQ_PREFLUSH. 910 */ 911 bio->bi_opf &= ~REQ_PREFLUSH; 912 queue_io(md, bio); 913 } else { 914 /* done with normal IO or empty flush */ 915 if (io_error) 916 bio->bi_status = io_error; 917 bio_endio(bio); 918 } 919 } 920 921 static inline bool dm_tio_is_normal(struct dm_target_io *tio) 922 { 923 return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) && 924 !dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); 925 } 926 927 /* 928 * Decrements the number of outstanding ios that a bio has been 929 * cloned into, completing the original io if necc. 
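 *
 * Each clone mapped via __map_bio() takes a reference with
 * dm_io_inc_pending(); when io_count drops to zero here the original
 * bio is completed via dm_io_complete().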
930 */ 931 void dm_io_dec_pending(struct dm_io *io, blk_status_t error) 932 { 933 /* Push-back supersedes any I/O errors */ 934 if (unlikely(error)) { 935 unsigned long flags; 936 spin_lock_irqsave(&io->endio_lock, flags); 937 if (!(io->status == BLK_STS_DM_REQUEUE && 938 __noflush_suspending(io->md))) 939 io->status = error; 940 spin_unlock_irqrestore(&io->endio_lock, flags); 941 } 942 943 if (atomic_dec_and_test(&io->io_count)) 944 dm_io_complete(io); 945 } 946 947 void disable_discard(struct mapped_device *md) 948 { 949 struct queue_limits *limits = dm_get_queue_limits(md); 950 951 /* device doesn't really support DISCARD, disable it */ 952 limits->max_discard_sectors = 0; 953 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue); 954 } 955 956 void disable_write_same(struct mapped_device *md) 957 { 958 struct queue_limits *limits = dm_get_queue_limits(md); 959 960 /* device doesn't really support WRITE SAME, disable it */ 961 limits->max_write_same_sectors = 0; 962 } 963 964 void disable_write_zeroes(struct mapped_device *md) 965 { 966 struct queue_limits *limits = dm_get_queue_limits(md); 967 968 /* device doesn't really support WRITE ZEROES, disable it */ 969 limits->max_write_zeroes_sectors = 0; 970 } 971 972 static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) 973 { 974 return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios); 975 } 976 977 static void clone_endio(struct bio *bio) 978 { 979 blk_status_t error = bio->bi_status; 980 struct dm_target_io *tio = clone_to_tio(bio); 981 struct dm_io *io = tio->io; 982 struct mapped_device *md = tio->io->md; 983 dm_endio_fn endio = tio->ti->type->end_io; 984 struct request_queue *q = bio->bi_bdev->bd_disk->queue; 985 986 if (unlikely(error == BLK_STS_TARGET)) { 987 if (bio_op(bio) == REQ_OP_DISCARD && 988 !q->limits.max_discard_sectors) 989 disable_discard(md); 990 else if (bio_op(bio) == REQ_OP_WRITE_SAME && 991 !q->limits.max_write_same_sectors) 992 disable_write_same(md); 993 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && 994 !q->limits.max_write_zeroes_sectors) 995 disable_write_zeroes(md); 996 } 997 998 if (blk_queue_is_zoned(q)) 999 dm_zone_endio(io, bio); 1000 1001 if (endio) { 1002 int r = endio(tio->ti, bio, &error); 1003 switch (r) { 1004 case DM_ENDIO_REQUEUE: 1005 /* 1006 * Requeuing writes to a sequential zone of a zoned 1007 * target will break the sequential write pattern: 1008 * fail such IO. 1009 */ 1010 if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) 1011 error = BLK_STS_IOERR; 1012 else 1013 error = BLK_STS_DM_REQUEUE; 1014 fallthrough; 1015 case DM_ENDIO_DONE: 1016 break; 1017 case DM_ENDIO_INCOMPLETE: 1018 /* The target will handle the io */ 1019 return; 1020 default: 1021 DMWARN("unimplemented target endio return value: %d", r); 1022 BUG(); 1023 } 1024 } 1025 1026 if (unlikely(swap_bios_limit(tio->ti, bio))) { 1027 struct mapped_device *md = io->md; 1028 up(&md->swap_bios_semaphore); 1029 } 1030 1031 free_tio(bio); 1032 dm_io_dec_pending(io, error); 1033 } 1034 1035 /* 1036 * Return maximum size of I/O possible at the supplied sector up to the current 1037 * target boundary. 
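 *
 * For example, a target with ti->len == 100 sectors asked about
 * target_offset 40 can accept at most 60 more sectors before crossing
 * into the next target.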
1038 */ 1039 static inline sector_t max_io_len_target_boundary(struct dm_target *ti, 1040 sector_t target_offset) 1041 { 1042 return ti->len - target_offset; 1043 } 1044 1045 static sector_t max_io_len(struct dm_target *ti, sector_t sector) 1046 { 1047 sector_t target_offset = dm_target_offset(ti, sector); 1048 sector_t len = max_io_len_target_boundary(ti, target_offset); 1049 sector_t max_len; 1050 1051 /* 1052 * Does the target need to split IO even further? 1053 * - varied (per target) IO splitting is a tenet of DM; this 1054 * explains why stacked chunk_sectors based splitting via 1055 * blk_max_size_offset() isn't possible here. So pass in 1056 * ti->max_io_len to override stacked chunk_sectors. 1057 */ 1058 if (ti->max_io_len) { 1059 max_len = blk_max_size_offset(ti->table->md->queue, 1060 target_offset, ti->max_io_len); 1061 if (len > max_len) 1062 len = max_len; 1063 } 1064 1065 return len; 1066 } 1067 1068 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) 1069 { 1070 if (len > UINT_MAX) { 1071 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", 1072 (unsigned long long)len, UINT_MAX); 1073 ti->error = "Maximum size of target IO is too large"; 1074 return -EINVAL; 1075 } 1076 1077 ti->max_io_len = (uint32_t) len; 1078 1079 return 0; 1080 } 1081 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); 1082 1083 static struct dm_target *dm_dax_get_live_target(struct mapped_device *md, 1084 sector_t sector, int *srcu_idx) 1085 __acquires(md->io_barrier) 1086 { 1087 struct dm_table *map; 1088 struct dm_target *ti; 1089 1090 map = dm_get_live_table(md, srcu_idx); 1091 if (!map) 1092 return NULL; 1093 1094 ti = dm_table_find_target(map, sector); 1095 if (!ti) 1096 return NULL; 1097 1098 return ti; 1099 } 1100 1101 static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, 1102 long nr_pages, void **kaddr, pfn_t *pfn) 1103 { 1104 struct mapped_device *md = dax_get_private(dax_dev); 1105 sector_t sector = pgoff * PAGE_SECTORS; 1106 struct dm_target *ti; 1107 long len, ret = -EIO; 1108 int srcu_idx; 1109 1110 ti = dm_dax_get_live_target(md, sector, &srcu_idx); 1111 1112 if (!ti) 1113 goto out; 1114 if (!ti->type->direct_access) 1115 goto out; 1116 len = max_io_len(ti, sector) / PAGE_SECTORS; 1117 if (len < 1) 1118 goto out; 1119 nr_pages = min(len, nr_pages); 1120 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn); 1121 1122 out: 1123 dm_put_live_table(md, srcu_idx); 1124 1125 return ret; 1126 } 1127 1128 static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, 1129 size_t nr_pages) 1130 { 1131 struct mapped_device *md = dax_get_private(dax_dev); 1132 sector_t sector = pgoff * PAGE_SECTORS; 1133 struct dm_target *ti; 1134 int ret = -EIO; 1135 int srcu_idx; 1136 1137 ti = dm_dax_get_live_target(md, sector, &srcu_idx); 1138 1139 if (!ti) 1140 goto out; 1141 if (WARN_ON(!ti->type->dax_zero_page_range)) { 1142 /* 1143 * ->zero_page_range() is mandatory dax operation. If we are 1144 * here, something is wrong. 1145 */ 1146 goto out; 1147 } 1148 ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages); 1149 out: 1150 dm_put_live_table(md, srcu_idx); 1151 1152 return ret; 1153 } 1154 1155 /* 1156 * A target may call dm_accept_partial_bio only from the map routine. It is 1157 * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management 1158 * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by 1159 * __send_duplicate_bios(). 
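 *
 * For example, a hypothetical target ->map method that can only handle
 * 'max' sectors at a time might do (sketch; names are illustrative):
 *
 *	static int example_map(struct dm_target *ti, struct bio *bio)
 *	{
 *		unsigned max = example_limit(ti);	/* hypothetical helper */
 *
 *		if (bio_sectors(bio) > max)
 *			dm_accept_partial_bio(bio, max);
 *		bio_set_dev(bio, example_dev(ti)->bdev);
 *		return DM_MAPIO_REMAPPED;
 *	}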
1160 * 1161 * dm_accept_partial_bio informs the dm that the target only wants to process 1162 * additional n_sectors sectors of the bio and the rest of the data should be 1163 * sent in a next bio. 1164 * 1165 * A diagram that explains the arithmetics: 1166 * +--------------------+---------------+-------+ 1167 * | 1 | 2 | 3 | 1168 * +--------------------+---------------+-------+ 1169 * 1170 * <-------------- *tio->len_ptr ---------------> 1171 * <------- bi_size -------> 1172 * <-- n_sectors --> 1173 * 1174 * Region 1 was already iterated over with bio_advance or similar function. 1175 * (it may be empty if the target doesn't use bio_advance) 1176 * Region 2 is the remaining bio size that the target wants to process. 1177 * (it may be empty if region 1 is non-empty, although there is no reason 1178 * to make it empty) 1179 * The target requires that region 3 is to be sent in the next bio. 1180 * 1181 * If the target wants to receive multiple copies of the bio (via num_*bios, etc), 1182 * the partially processed part (the sum of regions 1+2) must be the same for all 1183 * copies of the bio. 1184 */ 1185 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) 1186 { 1187 struct dm_target_io *tio = clone_to_tio(bio); 1188 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; 1189 1190 BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); 1191 BUG_ON(op_is_zone_mgmt(bio_op(bio))); 1192 BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND); 1193 BUG_ON(bi_size > *tio->len_ptr); 1194 BUG_ON(n_sectors > bi_size); 1195 1196 *tio->len_ptr -= bi_size - n_sectors; 1197 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; 1198 } 1199 EXPORT_SYMBOL_GPL(dm_accept_partial_bio); 1200 1201 static inline void __dm_submit_bio_remap(struct bio *clone, 1202 dev_t dev, sector_t old_sector) 1203 { 1204 trace_block_bio_remap(clone, dev, old_sector); 1205 submit_bio_noacct(clone); 1206 } 1207 1208 /* 1209 * @clone: clone bio that DM core passed to target's .map function 1210 * @tgt_clone: clone of @clone bio that target needs submitted 1211 * 1212 * Targets should use this interface to submit bios they take 1213 * ownership of when returning DM_MAPIO_SUBMITTED. 1214 * 1215 * Target should also enable ti->accounts_remapped_io 1216 */ 1217 void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone) 1218 { 1219 struct dm_target_io *tio = clone_to_tio(clone); 1220 struct dm_io *io = tio->io; 1221 1222 WARN_ON_ONCE(!tio->ti->accounts_remapped_io); 1223 1224 /* establish bio that will get submitted */ 1225 if (!tgt_clone) 1226 tgt_clone = clone; 1227 1228 /* 1229 * Account io->origin_bio to DM dev on behalf of target 1230 * that took ownership of IO with DM_MAPIO_SUBMITTED. 
1231 */ 1232 if (io->map_task == current) { 1233 /* Still in target's map function */ 1234 dm_io_set_flag(io, DM_IO_START_ACCT); 1235 } else { 1236 /* 1237 * Called by another thread, managed by DM target, 1238 * wait for dm_split_and_process_bio() to store 1239 * io->orig_bio 1240 */ 1241 while (unlikely(!smp_load_acquire(&io->orig_bio))) 1242 msleep(1); 1243 dm_start_io_acct(io, clone); 1244 } 1245 1246 __dm_submit_bio_remap(tgt_clone, disk_devt(io->md->disk), 1247 tio->old_sector); 1248 } 1249 EXPORT_SYMBOL_GPL(dm_submit_bio_remap); 1250 1251 static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) 1252 { 1253 mutex_lock(&md->swap_bios_lock); 1254 while (latch < md->swap_bios) { 1255 cond_resched(); 1256 down(&md->swap_bios_semaphore); 1257 md->swap_bios--; 1258 } 1259 while (latch > md->swap_bios) { 1260 cond_resched(); 1261 up(&md->swap_bios_semaphore); 1262 md->swap_bios++; 1263 } 1264 mutex_unlock(&md->swap_bios_lock); 1265 } 1266 1267 static void __map_bio(struct bio *clone) 1268 { 1269 struct dm_target_io *tio = clone_to_tio(clone); 1270 int r; 1271 struct dm_io *io = tio->io; 1272 struct dm_target *ti = tio->ti; 1273 1274 clone->bi_end_io = clone_endio; 1275 1276 /* 1277 * Map the clone. 1278 */ 1279 dm_io_inc_pending(io); 1280 tio->old_sector = clone->bi_iter.bi_sector; 1281 1282 if (unlikely(swap_bios_limit(ti, clone))) { 1283 struct mapped_device *md = io->md; 1284 int latch = get_swap_bios(); 1285 if (unlikely(latch != md->swap_bios)) 1286 __set_swap_bios_limit(md, latch); 1287 down(&md->swap_bios_semaphore); 1288 } 1289 1290 /* 1291 * Check if the IO needs a special mapping due to zone append emulation 1292 * on zoned target. In this case, dm_zone_map_bio() calls the target 1293 * map operation. 1294 */ 1295 if (dm_emulate_zone_append(io->md)) 1296 r = dm_zone_map_bio(tio); 1297 else 1298 r = ti->type->map(ti, clone); 1299 1300 switch (r) { 1301 case DM_MAPIO_SUBMITTED: 1302 /* target has assumed ownership of this io */ 1303 if (!ti->accounts_remapped_io) 1304 dm_io_set_flag(io, DM_IO_START_ACCT); 1305 break; 1306 case DM_MAPIO_REMAPPED: 1307 /* 1308 * the bio has been remapped so dispatch it, but defer 1309 * dm_start_io_acct() until after possible bio_split(). 1310 */ 1311 __dm_submit_bio_remap(clone, disk_devt(io->md->disk), 1312 tio->old_sector); 1313 dm_io_set_flag(io, DM_IO_START_ACCT); 1314 break; 1315 case DM_MAPIO_KILL: 1316 case DM_MAPIO_REQUEUE: 1317 if (unlikely(swap_bios_limit(ti, clone))) 1318 up(&io->md->swap_bios_semaphore); 1319 free_tio(clone); 1320 if (r == DM_MAPIO_KILL) 1321 dm_io_dec_pending(io, BLK_STS_IOERR); 1322 else 1323 dm_io_dec_pending(io, BLK_STS_DM_REQUEUE); 1324 break; 1325 default: 1326 DMWARN("unimplemented target map return value: %d", r); 1327 BUG(); 1328 } 1329 } 1330 1331 static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, 1332 struct dm_target *ti, unsigned num_bios, 1333 unsigned *len) 1334 { 1335 struct bio *bio; 1336 int try; 1337 1338 for (try = 0; try < 2; try++) { 1339 int bio_nr; 1340 1341 if (try) 1342 mutex_lock(&ci->io->md->table_devices_lock); 1343 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) { 1344 bio = alloc_tio(ci, ti, bio_nr, len, 1345 try ? 
GFP_NOIO : GFP_NOWAIT); 1346 if (!bio) 1347 break; 1348 1349 bio_list_add(blist, bio); 1350 } 1351 if (try) 1352 mutex_unlock(&ci->io->md->table_devices_lock); 1353 if (bio_nr == num_bios) 1354 return; 1355 1356 while ((bio = bio_list_pop(blist))) 1357 free_tio(bio); 1358 } 1359 } 1360 1361 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, 1362 unsigned num_bios, unsigned *len) 1363 { 1364 struct bio_list blist = BIO_EMPTY_LIST; 1365 struct bio *clone; 1366 1367 switch (num_bios) { 1368 case 0: 1369 break; 1370 case 1: 1371 clone = alloc_tio(ci, ti, 0, len, GFP_NOIO); 1372 dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO); 1373 __map_bio(clone); 1374 break; 1375 default: 1376 alloc_multiple_bios(&blist, ci, ti, num_bios, len); 1377 while ((clone = bio_list_pop(&blist))) { 1378 dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO); 1379 __map_bio(clone); 1380 } 1381 break; 1382 } 1383 } 1384 1385 static void __send_empty_flush(struct clone_info *ci) 1386 { 1387 unsigned target_nr = 0; 1388 struct dm_target *ti; 1389 struct bio flush_bio; 1390 1391 /* 1392 * Use an on-stack bio for this, it's safe since we don't 1393 * need to reference it after submit. It's just used as 1394 * the basis for the clone(s). 1395 */ 1396 bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, 1397 REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC); 1398 1399 ci->bio = &flush_bio; 1400 ci->sector_count = 0; 1401 1402 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1403 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); 1404 1405 bio_uninit(ci->bio); 1406 } 1407 1408 static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, 1409 unsigned num_bios) 1410 { 1411 unsigned len; 1412 1413 len = min_t(sector_t, ci->sector_count, 1414 max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector))); 1415 1416 /* 1417 * dm_accept_partial_bio cannot be used with duplicate bios, 1418 * so update clone_info cursor before __send_duplicate_bios(). 1419 */ 1420 ci->sector += len; 1421 ci->sector_count -= len; 1422 1423 __send_duplicate_bios(ci, ti, num_bios, &len); 1424 } 1425 1426 static bool is_abnormal_io(struct bio *bio) 1427 { 1428 bool r = false; 1429 1430 switch (bio_op(bio)) { 1431 case REQ_OP_DISCARD: 1432 case REQ_OP_SECURE_ERASE: 1433 case REQ_OP_WRITE_SAME: 1434 case REQ_OP_WRITE_ZEROES: 1435 r = true; 1436 break; 1437 } 1438 1439 return r; 1440 } 1441 1442 static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, 1443 int *result) 1444 { 1445 unsigned num_bios = 0; 1446 1447 switch (bio_op(ci->bio)) { 1448 case REQ_OP_DISCARD: 1449 num_bios = ti->num_discard_bios; 1450 break; 1451 case REQ_OP_SECURE_ERASE: 1452 num_bios = ti->num_secure_erase_bios; 1453 break; 1454 case REQ_OP_WRITE_SAME: 1455 num_bios = ti->num_write_same_bios; 1456 break; 1457 case REQ_OP_WRITE_ZEROES: 1458 num_bios = ti->num_write_zeroes_bios; 1459 break; 1460 default: 1461 return false; 1462 } 1463 1464 /* 1465 * Even though the device advertised support for this type of 1466 * request, that does not mean every target supports it, and 1467 * reconfiguration might also have changed that since the 1468 * check was performed. 
1469 */ 1470 if (!num_bios) 1471 *result = -EOPNOTSUPP; 1472 else { 1473 __send_changing_extent_only(ci, ti, num_bios); 1474 *result = 0; 1475 } 1476 return true; 1477 } 1478 1479 /* 1480 * Reuse ->bi_private as hlist head for storing all dm_io instances 1481 * associated with this bio, and this bio's bi_private needs to be 1482 * stored in dm_io->data before the reuse. 1483 * 1484 * bio->bi_private is owned by fs or upper layer, so block layer won't 1485 * touch it after splitting. Meantime it won't be changed by anyone after 1486 * bio is submitted. So this reuse is safe. 1487 */ 1488 static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio) 1489 { 1490 return (struct hlist_head *)&bio->bi_private; 1491 } 1492 1493 static void dm_queue_poll_io(struct bio *bio, struct dm_io *io) 1494 { 1495 struct hlist_head *head = dm_get_bio_hlist_head(bio); 1496 1497 if (!(bio->bi_opf & REQ_DM_POLL_LIST)) { 1498 bio->bi_opf |= REQ_DM_POLL_LIST; 1499 /* 1500 * Save .bi_private into dm_io, so that we can reuse 1501 * .bi_private as hlist head for storing dm_io list 1502 */ 1503 io->data = bio->bi_private; 1504 1505 INIT_HLIST_HEAD(head); 1506 1507 /* tell block layer to poll for completion */ 1508 bio->bi_cookie = ~BLK_QC_T_NONE; 1509 } else { 1510 /* 1511 * bio recursed due to split, reuse original poll list, 1512 * and save bio->bi_private too. 1513 */ 1514 io->data = hlist_entry(head->first, struct dm_io, node)->data; 1515 } 1516 1517 hlist_add_head(&io->node, head); 1518 } 1519 1520 /* 1521 * Select the correct strategy for processing a non-flush bio. 1522 */ 1523 static int __split_and_process_bio(struct clone_info *ci) 1524 { 1525 struct bio *clone; 1526 struct dm_target *ti; 1527 unsigned len; 1528 int r; 1529 1530 ti = dm_table_find_target(ci->map, ci->sector); 1531 if (!ti) 1532 return -EIO; 1533 1534 if (__process_abnormal_io(ci, ti, &r)) 1535 return r; 1536 1537 /* 1538 * Only support bio polling for normal IO, and the target io is 1539 * exactly inside the dm_io instance (verified in dm_poll_dm_io) 1540 */ 1541 ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED; 1542 1543 len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); 1544 clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO); 1545 __map_bio(clone); 1546 1547 ci->sector += len; 1548 ci->sector_count -= len; 1549 1550 return 0; 1551 } 1552 1553 static void init_clone_info(struct clone_info *ci, struct mapped_device *md, 1554 struct dm_table *map, struct bio *bio) 1555 { 1556 ci->map = map; 1557 ci->io = alloc_io(md, bio); 1558 ci->bio = bio; 1559 ci->submit_as_polled = false; 1560 ci->sector = bio->bi_iter.bi_sector; 1561 ci->sector_count = bio_sectors(bio); 1562 1563 /* Shouldn't happen but sector_count was being set to 0 so... */ 1564 if (WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count)) 1565 ci->sector_count = 0; 1566 } 1567 1568 /* 1569 * Entry point to split a bio into clones and submit them to the targets. 
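 *
 * Rough flow, as implemented below: set up an on-stack clone_info plus a
 * dm_io; a REQ_PREFLUSH bio is fanned out as empty flushes to every
 * target, otherwise the bio is mapped to the target it starts in and any
 * unprocessed remainder is split off and resubmitted with
 * submit_bio_noacct() so it is handled after the bios already issued.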
1570 */ 1571 static void dm_split_and_process_bio(struct mapped_device *md, 1572 struct dm_table *map, struct bio *bio) 1573 { 1574 struct clone_info ci; 1575 struct bio *orig_bio = NULL; 1576 int error = 0; 1577 1578 init_clone_info(&ci, md, map, bio); 1579 1580 if (bio->bi_opf & REQ_PREFLUSH) { 1581 __send_empty_flush(&ci); 1582 /* dm_io_complete submits any data associated with flush */ 1583 goto out; 1584 } 1585 1586 error = __split_and_process_bio(&ci); 1587 ci.io->map_task = NULL; 1588 if (error || !ci.sector_count) 1589 goto out; 1590 1591 /* 1592 * Remainder must be passed to submit_bio_noacct() so it gets handled 1593 * *after* bios already submitted have been completely processed. 1594 * We take a clone of the original to store in ci.io->orig_bio to be 1595 * used by dm_end_io_acct() and for dm_io_complete() to use for 1596 * completion handling. 1597 */ 1598 orig_bio = bio_split(bio, bio_sectors(bio) - ci.sector_count, 1599 GFP_NOIO, &md->queue->bio_split); 1600 bio_chain(orig_bio, bio); 1601 trace_block_split(orig_bio, bio->bi_iter.bi_sector); 1602 submit_bio_noacct(bio); 1603 out: 1604 if (!orig_bio) 1605 orig_bio = bio; 1606 smp_store_release(&ci.io->orig_bio, orig_bio); 1607 if (dm_io_flagged(ci.io, DM_IO_START_ACCT)) 1608 dm_start_io_acct(ci.io, NULL); 1609 1610 /* 1611 * Drop the extra reference count for non-POLLED bio, and hold one 1612 * reference for POLLED bio, which will be released in dm_poll_bio 1613 * 1614 * Add every dm_io instance into the hlist_head which is stored in 1615 * bio->bi_private, so that dm_poll_bio can poll them all. 1616 */ 1617 if (error || !ci.submit_as_polled) 1618 dm_io_dec_pending(ci.io, errno_to_blk_status(error)); 1619 else 1620 dm_queue_poll_io(bio, ci.io); 1621 } 1622 1623 static void dm_submit_bio(struct bio *bio) 1624 { 1625 struct mapped_device *md = bio->bi_bdev->bd_disk->private_data; 1626 int srcu_idx; 1627 struct dm_table *map; 1628 1629 map = dm_get_live_table(md, &srcu_idx); 1630 1631 /* If suspended, or map not yet available, queue this IO for later */ 1632 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) || 1633 unlikely(!map)) { 1634 if (bio->bi_opf & REQ_NOWAIT) 1635 bio_wouldblock_error(bio); 1636 else if (bio->bi_opf & REQ_RAHEAD) 1637 bio_io_error(bio); 1638 else 1639 queue_io(md, bio); 1640 goto out; 1641 } 1642 1643 /* 1644 * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc) 1645 * otherwise associated queue_limits won't be imposed. 
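 * (Normal read/write bios are instead split against target boundaries
 * in dm_split_and_process_bio() above.)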
1646 */ 1647 if (is_abnormal_io(bio)) 1648 blk_queue_split(&bio); 1649 1650 dm_split_and_process_bio(md, map, bio); 1651 out: 1652 dm_put_live_table(md, srcu_idx); 1653 } 1654 1655 static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob, 1656 unsigned int flags) 1657 { 1658 WARN_ON_ONCE(!dm_tio_is_normal(&io->tio)); 1659 1660 /* don't poll if the mapped io is done */ 1661 if (atomic_read(&io->io_count) > 1) 1662 bio_poll(&io->tio.clone, iob, flags); 1663 1664 /* bio_poll holds the last reference */ 1665 return atomic_read(&io->io_count) == 1; 1666 } 1667 1668 static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob, 1669 unsigned int flags) 1670 { 1671 struct hlist_head *head = dm_get_bio_hlist_head(bio); 1672 struct hlist_head tmp = HLIST_HEAD_INIT; 1673 struct hlist_node *next; 1674 struct dm_io *io; 1675 1676 /* Only poll normal bio which was marked as REQ_DM_POLL_LIST */ 1677 if (!(bio->bi_opf & REQ_DM_POLL_LIST)) 1678 return 0; 1679 1680 WARN_ON_ONCE(hlist_empty(head)); 1681 1682 hlist_move_list(head, &tmp); 1683 1684 /* 1685 * Restore .bi_private before possibly completing dm_io. 1686 * 1687 * bio_poll() is only possible once @bio has been completely 1688 * submitted via submit_bio_noacct()'s depth-first submission. 1689 * So there is no dm_queue_poll_io() race associated with 1690 * clearing REQ_DM_POLL_LIST here. 1691 */ 1692 bio->bi_opf &= ~REQ_DM_POLL_LIST; 1693 bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data; 1694 1695 hlist_for_each_entry_safe(io, next, &tmp, node) { 1696 if (dm_poll_dm_io(io, iob, flags)) { 1697 hlist_del_init(&io->node); 1698 /* 1699 * clone_endio() has already occurred, so passing 1700 * error as 0 here doesn't override io->status 1701 */ 1702 dm_io_dec_pending(io, 0); 1703 } 1704 } 1705 1706 /* Not done? */ 1707 if (!hlist_empty(&tmp)) { 1708 bio->bi_opf |= REQ_DM_POLL_LIST; 1709 /* Reset bio->bi_private to dm_io list head */ 1710 hlist_move_list(&tmp, head); 1711 return 0; 1712 } 1713 return 1; 1714 } 1715 1716 /*----------------------------------------------------------------- 1717 * An IDR is used to keep track of allocated minor numbers. 1718 *---------------------------------------------------------------*/ 1719 static void free_minor(int minor) 1720 { 1721 spin_lock(&_minor_lock); 1722 idr_remove(&_minor_idr, minor); 1723 spin_unlock(&_minor_lock); 1724 } 1725 1726 /* 1727 * See if the device with a specific minor # is free. 1728 */ 1729 static int specific_minor(int minor) 1730 { 1731 int r; 1732 1733 if (minor >= (1 << MINORBITS)) 1734 return -EINVAL; 1735 1736 idr_preload(GFP_KERNEL); 1737 spin_lock(&_minor_lock); 1738 1739 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); 1740 1741 spin_unlock(&_minor_lock); 1742 idr_preload_end(); 1743 if (r < 0) 1744 return r == -ENOSPC ? 
-EBUSY : r; 1745 return 0; 1746 } 1747 1748 static int next_free_minor(int *minor) 1749 { 1750 int r; 1751 1752 idr_preload(GFP_KERNEL); 1753 spin_lock(&_minor_lock); 1754 1755 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); 1756 1757 spin_unlock(&_minor_lock); 1758 idr_preload_end(); 1759 if (r < 0) 1760 return r; 1761 *minor = r; 1762 return 0; 1763 } 1764 1765 static const struct block_device_operations dm_blk_dops; 1766 static const struct block_device_operations dm_rq_blk_dops; 1767 static const struct dax_operations dm_dax_ops; 1768 1769 static void dm_wq_work(struct work_struct *work); 1770 1771 #ifdef CONFIG_BLK_INLINE_ENCRYPTION 1772 static void dm_queue_destroy_crypto_profile(struct request_queue *q) 1773 { 1774 dm_destroy_crypto_profile(q->crypto_profile); 1775 } 1776 1777 #else /* CONFIG_BLK_INLINE_ENCRYPTION */ 1778 1779 static inline void dm_queue_destroy_crypto_profile(struct request_queue *q) 1780 { 1781 } 1782 #endif /* !CONFIG_BLK_INLINE_ENCRYPTION */ 1783 1784 static void cleanup_mapped_device(struct mapped_device *md) 1785 { 1786 if (md->wq) 1787 destroy_workqueue(md->wq); 1788 bioset_exit(&md->bs); 1789 bioset_exit(&md->io_bs); 1790 1791 if (md->dax_dev) { 1792 dax_remove_host(md->disk); 1793 kill_dax(md->dax_dev); 1794 put_dax(md->dax_dev); 1795 md->dax_dev = NULL; 1796 } 1797 1798 dm_cleanup_zoned_dev(md); 1799 if (md->disk) { 1800 spin_lock(&_minor_lock); 1801 md->disk->private_data = NULL; 1802 spin_unlock(&_minor_lock); 1803 if (dm_get_md_type(md) != DM_TYPE_NONE) { 1804 dm_sysfs_exit(md); 1805 del_gendisk(md->disk); 1806 } 1807 dm_queue_destroy_crypto_profile(md->queue); 1808 blk_cleanup_disk(md->disk); 1809 } 1810 1811 if (md->pending_io) { 1812 free_percpu(md->pending_io); 1813 md->pending_io = NULL; 1814 } 1815 1816 cleanup_srcu_struct(&md->io_barrier); 1817 1818 mutex_destroy(&md->suspend_lock); 1819 mutex_destroy(&md->type_lock); 1820 mutex_destroy(&md->table_devices_lock); 1821 mutex_destroy(&md->swap_bios_lock); 1822 1823 dm_mq_cleanup_mapped_device(md); 1824 } 1825 1826 /* 1827 * Allocate and initialise a blank device with a given minor. 1828 */ 1829 static struct mapped_device *alloc_dev(int minor) 1830 { 1831 int r, numa_node_id = dm_get_numa_node(); 1832 struct mapped_device *md; 1833 void *old_md; 1834 1835 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); 1836 if (!md) { 1837 DMWARN("unable to allocate device, out of memory."); 1838 return NULL; 1839 } 1840 1841 if (!try_module_get(THIS_MODULE)) 1842 goto bad_module_get; 1843 1844 /* get a minor number for the dev */ 1845 if (minor == DM_ANY_MINOR) 1846 r = next_free_minor(&minor); 1847 else 1848 r = specific_minor(minor); 1849 if (r < 0) 1850 goto bad_minor; 1851 1852 r = init_srcu_struct(&md->io_barrier); 1853 if (r < 0) 1854 goto bad_io_barrier; 1855 1856 md->numa_node_id = numa_node_id; 1857 md->init_tio_pdu = false; 1858 md->type = DM_TYPE_NONE; 1859 mutex_init(&md->suspend_lock); 1860 mutex_init(&md->type_lock); 1861 mutex_init(&md->table_devices_lock); 1862 spin_lock_init(&md->deferred_lock); 1863 atomic_set(&md->holders, 1); 1864 atomic_set(&md->open_count, 0); 1865 atomic_set(&md->event_nr, 0); 1866 atomic_set(&md->uevent_seq, 0); 1867 INIT_LIST_HEAD(&md->uevent_list); 1868 INIT_LIST_HEAD(&md->table_devices); 1869 spin_lock_init(&md->uevent_lock); 1870 1871 /* 1872 * default to bio-based until DM table is loaded and md->type 1873 * established. If request-based table is loaded: blk-mq will 1874 * override accordingly. 
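 * (For a request-based table, dm_setup_md_queue() below switches
 * ->fops to dm_rq_blk_dops and initializes the blk-mq queue.)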
1875 */ 1876 md->disk = blk_alloc_disk(md->numa_node_id); 1877 if (!md->disk) 1878 goto bad; 1879 md->queue = md->disk->queue; 1880 1881 init_waitqueue_head(&md->wait); 1882 INIT_WORK(&md->work, dm_wq_work); 1883 init_waitqueue_head(&md->eventq); 1884 init_completion(&md->kobj_holder.completion); 1885 1886 md->swap_bios = get_swap_bios(); 1887 sema_init(&md->swap_bios_semaphore, md->swap_bios); 1888 mutex_init(&md->swap_bios_lock); 1889 1890 md->disk->major = _major; 1891 md->disk->first_minor = minor; 1892 md->disk->minors = 1; 1893 md->disk->flags |= GENHD_FL_NO_PART; 1894 md->disk->fops = &dm_blk_dops; 1895 md->disk->queue = md->queue; 1896 md->disk->private_data = md; 1897 sprintf(md->disk->disk_name, "dm-%d", minor); 1898 1899 if (IS_ENABLED(CONFIG_FS_DAX)) { 1900 md->dax_dev = alloc_dax(md, &dm_dax_ops); 1901 if (IS_ERR(md->dax_dev)) { 1902 md->dax_dev = NULL; 1903 goto bad; 1904 } 1905 set_dax_nocache(md->dax_dev); 1906 set_dax_nomc(md->dax_dev); 1907 if (dax_add_host(md->dax_dev, md->disk)) 1908 goto bad; 1909 } 1910 1911 format_dev_t(md->name, MKDEV(_major, minor)); 1912 1913 md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name); 1914 if (!md->wq) 1915 goto bad; 1916 1917 md->pending_io = alloc_percpu(unsigned long); 1918 if (!md->pending_io) 1919 goto bad; 1920 1921 dm_stats_init(&md->stats); 1922 1923 /* Populate the mapping, nobody knows we exist yet */ 1924 spin_lock(&_minor_lock); 1925 old_md = idr_replace(&_minor_idr, md, minor); 1926 spin_unlock(&_minor_lock); 1927 1928 BUG_ON(old_md != MINOR_ALLOCED); 1929 1930 return md; 1931 1932 bad: 1933 cleanup_mapped_device(md); 1934 bad_io_barrier: 1935 free_minor(minor); 1936 bad_minor: 1937 module_put(THIS_MODULE); 1938 bad_module_get: 1939 kvfree(md); 1940 return NULL; 1941 } 1942 1943 static void unlock_fs(struct mapped_device *md); 1944 1945 static void free_dev(struct mapped_device *md) 1946 { 1947 int minor = MINOR(disk_devt(md->disk)); 1948 1949 unlock_fs(md); 1950 1951 cleanup_mapped_device(md); 1952 1953 free_table_devices(&md->table_devices); 1954 dm_stats_cleanup(&md->stats); 1955 free_minor(minor); 1956 1957 module_put(THIS_MODULE); 1958 kvfree(md); 1959 } 1960 1961 static int __bind_mempools(struct mapped_device *md, struct dm_table *t) 1962 { 1963 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 1964 int ret = 0; 1965 1966 if (dm_table_bio_based(t)) { 1967 /* 1968 * The md may already have mempools that need changing. 1969 * If so, reload bioset because front_pad may have changed 1970 * because a different table was loaded. 1971 */ 1972 bioset_exit(&md->bs); 1973 bioset_exit(&md->io_bs); 1974 1975 } else if (bioset_initialized(&md->bs)) { 1976 /* 1977 * There's no need to reload with request-based dm 1978 * because the size of front_pad doesn't change. 1979 * Note for future: If you are to reload bioset, 1980 * prep-ed requests in the queue may refer 1981 * to bio from the old bioset, so you must walk 1982 * through the queue to unprep. 1983 */ 1984 goto out; 1985 } 1986 1987 BUG_ON(!p || 1988 bioset_initialized(&md->bs) || 1989 bioset_initialized(&md->io_bs)); 1990 1991 ret = bioset_init_from_src(&md->bs, &p->bs); 1992 if (ret) 1993 goto out; 1994 ret = bioset_init_from_src(&md->io_bs, &p->io_bs); 1995 if (ret) 1996 bioset_exit(&md->bs); 1997 out: 1998 /* mempool bind completed, no longer need any mempools in the table */ 1999 dm_table_free_md_mempools(t); 2000 return ret; 2001 } 2002 2003 /* 2004 * Bind a table to the device. 
2005 */ 2006 static void event_callback(void *context) 2007 { 2008 unsigned long flags; 2009 LIST_HEAD(uevents); 2010 struct mapped_device *md = (struct mapped_device *) context; 2011 2012 spin_lock_irqsave(&md->uevent_lock, flags); 2013 list_splice_init(&md->uevent_list, &uevents); 2014 spin_unlock_irqrestore(&md->uevent_lock, flags); 2015 2016 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 2017 2018 atomic_inc(&md->event_nr); 2019 wake_up(&md->eventq); 2020 dm_issue_global_event(); 2021 } 2022 2023 /* 2024 * Returns old map, which caller must destroy. 2025 */ 2026 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2027 struct queue_limits *limits) 2028 { 2029 struct dm_table *old_map; 2030 sector_t size; 2031 int ret; 2032 2033 lockdep_assert_held(&md->suspend_lock); 2034 2035 size = dm_table_get_size(t); 2036 2037 /* 2038 * Wipe any geometry if the size of the table changed. 2039 */ 2040 if (size != dm_get_size(md)) 2041 memset(&md->geometry, 0, sizeof(md->geometry)); 2042 2043 if (!get_capacity(md->disk)) 2044 set_capacity(md->disk, size); 2045 else 2046 set_capacity_and_notify(md->disk, size); 2047 2048 dm_table_event_callback(t, event_callback, md); 2049 2050 if (dm_table_request_based(t)) { 2051 /* 2052 * Leverage the fact that request-based DM targets are 2053 * immutable singletons - used to optimize dm_mq_queue_rq. 2054 */ 2055 md->immutable_target = dm_table_get_immutable_target(t); 2056 } 2057 2058 ret = __bind_mempools(md, t); 2059 if (ret) { 2060 old_map = ERR_PTR(ret); 2061 goto out; 2062 } 2063 2064 ret = dm_table_set_restrictions(t, md->queue, limits); 2065 if (ret) { 2066 old_map = ERR_PTR(ret); 2067 goto out; 2068 } 2069 2070 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2071 rcu_assign_pointer(md->map, (void *)t); 2072 md->immutable_target_type = dm_table_get_immutable_target_type(t); 2073 2074 if (old_map) 2075 dm_sync_table(md); 2076 out: 2077 return old_map; 2078 } 2079 2080 /* 2081 * Returns unbound table for the caller to free. 2082 */ 2083 static struct dm_table *__unbind(struct mapped_device *md) 2084 { 2085 struct dm_table *map = rcu_dereference_protected(md->map, 1); 2086 2087 if (!map) 2088 return NULL; 2089 2090 dm_table_event_callback(map, NULL, NULL); 2091 RCU_INIT_POINTER(md->map, NULL); 2092 dm_sync_table(md); 2093 2094 return map; 2095 } 2096 2097 /* 2098 * Constructor for a new device. 2099 */ 2100 int dm_create(int minor, struct mapped_device **result) 2101 { 2102 struct mapped_device *md; 2103 2104 md = alloc_dev(minor); 2105 if (!md) 2106 return -ENXIO; 2107 2108 dm_ima_reset_data(md); 2109 2110 *result = md; 2111 return 0; 2112 } 2113 2114 /* 2115 * Functions to manage md->type. 2116 * All are required to hold md->type_lock. 2117 */ 2118 void dm_lock_md_type(struct mapped_device *md) 2119 { 2120 mutex_lock(&md->type_lock); 2121 } 2122 2123 void dm_unlock_md_type(struct mapped_device *md) 2124 { 2125 mutex_unlock(&md->type_lock); 2126 } 2127 2128 void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type) 2129 { 2130 BUG_ON(!mutex_is_locked(&md->type_lock)); 2131 md->type = type; 2132 } 2133 2134 enum dm_queue_mode dm_get_md_type(struct mapped_device *md) 2135 { 2136 return md->type; 2137 } 2138 2139 struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2140 { 2141 return md->immutable_target_type; 2142 } 2143 2144 /* 2145 * The queue_limits are only valid as long as you have a reference 2146 * count on 'md'. 
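 *
 * Callers such as disable_discard() above use this to adjust limits in
 * place, e.g.:
 *
 *	struct queue_limits *limits = dm_get_queue_limits(md);
 *
 *	limits->max_discard_sectors = 0;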
2147 */ 2148 struct queue_limits *dm_get_queue_limits(struct mapped_device *md) 2149 { 2150 BUG_ON(!atomic_read(&md->holders)); 2151 return &md->queue->limits; 2152 } 2153 EXPORT_SYMBOL_GPL(dm_get_queue_limits); 2154 2155 /* 2156 * Setup the DM device's queue based on md's type 2157 */ 2158 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) 2159 { 2160 enum dm_queue_mode type = dm_table_get_type(t); 2161 struct queue_limits limits; 2162 int r; 2163 2164 switch (type) { 2165 case DM_TYPE_REQUEST_BASED: 2166 md->disk->fops = &dm_rq_blk_dops; 2167 r = dm_mq_init_request_queue(md, t); 2168 if (r) { 2169 DMERR("Cannot initialize queue for request-based dm mapped device"); 2170 return r; 2171 } 2172 break; 2173 case DM_TYPE_BIO_BASED: 2174 case DM_TYPE_DAX_BIO_BASED: 2175 break; 2176 case DM_TYPE_NONE: 2177 WARN_ON_ONCE(true); 2178 break; 2179 } 2180 2181 r = dm_calculate_queue_limits(t, &limits); 2182 if (r) { 2183 DMERR("Cannot calculate initial queue limits"); 2184 return r; 2185 } 2186 r = dm_table_set_restrictions(t, md->queue, &limits); 2187 if (r) 2188 return r; 2189 2190 r = add_disk(md->disk); 2191 if (r) 2192 return r; 2193 2194 r = dm_sysfs_init(md); 2195 if (r) { 2196 del_gendisk(md->disk); 2197 return r; 2198 } 2199 md->type = type; 2200 return 0; 2201 } 2202 2203 struct mapped_device *dm_get_md(dev_t dev) 2204 { 2205 struct mapped_device *md; 2206 unsigned minor = MINOR(dev); 2207 2208 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2209 return NULL; 2210 2211 spin_lock(&_minor_lock); 2212 2213 md = idr_find(&_minor_idr, minor); 2214 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) || 2215 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { 2216 md = NULL; 2217 goto out; 2218 } 2219 dm_get(md); 2220 out: 2221 spin_unlock(&_minor_lock); 2222 2223 return md; 2224 } 2225 EXPORT_SYMBOL_GPL(dm_get_md); 2226 2227 void *dm_get_mdptr(struct mapped_device *md) 2228 { 2229 return md->interface_ptr; 2230 } 2231 2232 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2233 { 2234 md->interface_ptr = ptr; 2235 } 2236 2237 void dm_get(struct mapped_device *md) 2238 { 2239 atomic_inc(&md->holders); 2240 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2241 } 2242 2243 int dm_hold(struct mapped_device *md) 2244 { 2245 spin_lock(&_minor_lock); 2246 if (test_bit(DMF_FREEING, &md->flags)) { 2247 spin_unlock(&_minor_lock); 2248 return -EBUSY; 2249 } 2250 dm_get(md); 2251 spin_unlock(&_minor_lock); 2252 return 0; 2253 } 2254 EXPORT_SYMBOL_GPL(dm_hold); 2255 2256 const char *dm_device_name(struct mapped_device *md) 2257 { 2258 return md->name; 2259 } 2260 EXPORT_SYMBOL_GPL(dm_device_name); 2261 2262 static void __dm_destroy(struct mapped_device *md, bool wait) 2263 { 2264 struct dm_table *map; 2265 int srcu_idx; 2266 2267 might_sleep(); 2268 2269 spin_lock(&_minor_lock); 2270 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2271 set_bit(DMF_FREEING, &md->flags); 2272 spin_unlock(&_minor_lock); 2273 2274 blk_set_queue_dying(md->queue); 2275 2276 /* 2277 * Take suspend_lock so that presuspend and postsuspend methods 2278 * do not race with internal suspend. 
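 *
 * (A sketch of the interleaving this prevents with, say, a concurrent
 * dm_internal_suspend_noflush():
 *
 *	__dm_destroy():                      internal suspend:
 *	  mutex_lock(&md->suspend_lock)        mutex_lock(&md->suspend_lock) -- blocks
 *	  dm_table_presuspend_targets(map)
 *	  dm_table_postsuspend_targets(map)
 *	  mutex_unlock(&md->suspend_lock)      -- only now proceeds
 *
 * so a target's suspend hooks are never entered by both paths at once.)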
2279 */ 2280 mutex_lock(&md->suspend_lock); 2281 map = dm_get_live_table(md, &srcu_idx); 2282 if (!dm_suspended_md(md)) { 2283 dm_table_presuspend_targets(map); 2284 set_bit(DMF_SUSPENDED, &md->flags); 2285 set_bit(DMF_POST_SUSPENDING, &md->flags); 2286 dm_table_postsuspend_targets(map); 2287 } 2288 /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ 2289 dm_put_live_table(md, srcu_idx); 2290 mutex_unlock(&md->suspend_lock); 2291 2292 /* 2293 * Rare, but some I/O requests may still be in flight and need to 2294 * complete. Wait for all references to disappear; no one may take a 2295 * new reference on the mapped_device once it has entered the 2296 * DMF_FREEING state. 2297 */ 2298 if (wait) 2299 while (atomic_read(&md->holders)) 2300 msleep(1); 2301 else if (atomic_read(&md->holders)) 2302 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", 2303 dm_device_name(md), atomic_read(&md->holders)); 2304 2305 dm_table_destroy(__unbind(md)); 2306 free_dev(md); 2307 } 2308 2309 void dm_destroy(struct mapped_device *md) 2310 { 2311 __dm_destroy(md, true); 2312 } 2313 2314 void dm_destroy_immediate(struct mapped_device *md) 2315 { 2316 __dm_destroy(md, false); 2317 } 2318 2319 void dm_put(struct mapped_device *md) 2320 { 2321 atomic_dec(&md->holders); 2322 } 2323 EXPORT_SYMBOL_GPL(dm_put); 2324 2325 static bool dm_in_flight_bios(struct mapped_device *md) 2326 { 2327 int cpu; 2328 unsigned long sum = 0; 2329 2330 for_each_possible_cpu(cpu) 2331 sum += *per_cpu_ptr(md->pending_io, cpu); 2332 2333 return sum != 0; 2334 } 2335 2336 static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state) 2337 { 2338 int r = 0; 2339 DEFINE_WAIT(wait); 2340 2341 while (true) { 2342 prepare_to_wait(&md->wait, &wait, task_state); 2343 2344 if (!dm_in_flight_bios(md)) 2345 break; 2346 2347 if (signal_pending_state(task_state, current)) { 2348 r = -EINTR; 2349 break; 2350 } 2351 2352 io_schedule(); 2353 } 2354 finish_wait(&md->wait, &wait); 2355 2356 smp_rmb(); 2357 2358 return r; 2359 } 2360 2361 static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state) 2362 { 2363 int r = 0; 2364 2365 if (!queue_is_mq(md->queue)) 2366 return dm_wait_for_bios_completion(md, task_state); 2367 2368 while (true) { 2369 if (!blk_mq_queue_inflight(md->queue)) 2370 break; 2371 2372 if (signal_pending_state(task_state, current)) { 2373 r = -EINTR; 2374 break; 2375 } 2376 2377 msleep(5); 2378 } 2379 2380 return r; 2381 } 2382 2383 /* 2384 * Process the deferred bios 2385 */ 2386 static void dm_wq_work(struct work_struct *work) 2387 { 2388 struct mapped_device *md = container_of(work, struct mapped_device, work); 2389 struct bio *bio; 2390 2391 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2392 spin_lock_irq(&md->deferred_lock); 2393 bio = bio_list_pop(&md->deferred); 2394 spin_unlock_irq(&md->deferred_lock); 2395 2396 if (!bio) 2397 break; 2398 2399 submit_bio_noacct(bio); 2400 } 2401 } 2402 2403 static void dm_queue_flush(struct mapped_device *md) 2404 { 2405 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2406 smp_mb__after_atomic(); 2407 queue_work(md->wq, &md->work); 2408 } 2409 2410 /* 2411 * Swap in a new table, returning the old one for the caller to destroy.
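 *
 * Typical calling pattern (an illustrative sketch; in-tree this is driven
 * from the ioctl resume path, and error handling is elided):
 *
 *	dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 *	old_map = dm_swap_table(md, new_table);
 *	if (!IS_ERR_OR_NULL(old_map))
 *		dm_table_destroy(old_map);
 *	dm_resume(md);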
2412 */ 2413 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2414 { 2415 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); 2416 struct queue_limits limits; 2417 int r; 2418 2419 mutex_lock(&md->suspend_lock); 2420 2421 /* device must be suspended */ 2422 if (!dm_suspended_md(md)) 2423 goto out; 2424 2425 /* 2426 * If the new table has no data devices, retain the existing limits. 2427 * This helps multipath with queue_if_no_path if all paths disappear, 2428 * then new I/O is queued based on these limits, and then some paths 2429 * reappear. 2430 */ 2431 if (dm_table_has_no_data_devices(table)) { 2432 live_map = dm_get_live_table_fast(md); 2433 if (live_map) 2434 limits = md->queue->limits; 2435 dm_put_live_table_fast(md); 2436 } 2437 2438 if (!live_map) { 2439 r = dm_calculate_queue_limits(table, &limits); 2440 if (r) { 2441 map = ERR_PTR(r); 2442 goto out; 2443 } 2444 } 2445 2446 map = __bind(md, table, &limits); 2447 dm_issue_global_event(); 2448 2449 out: 2450 mutex_unlock(&md->suspend_lock); 2451 return map; 2452 } 2453 2454 /* 2455 * Functions to lock and unlock any filesystem running on the 2456 * device. 2457 */ 2458 static int lock_fs(struct mapped_device *md) 2459 { 2460 int r; 2461 2462 WARN_ON(test_bit(DMF_FROZEN, &md->flags)); 2463 2464 r = freeze_bdev(md->disk->part0); 2465 if (!r) 2466 set_bit(DMF_FROZEN, &md->flags); 2467 return r; 2468 } 2469 2470 static void unlock_fs(struct mapped_device *md) 2471 { 2472 if (!test_bit(DMF_FROZEN, &md->flags)) 2473 return; 2474 thaw_bdev(md->disk->part0); 2475 clear_bit(DMF_FROZEN, &md->flags); 2476 } 2477 2478 /* 2479 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG 2480 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE 2481 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY 2482 * 2483 * If __dm_suspend returns 0, the device is completely quiescent 2484 * now. There is no request-processing activity. All new requests 2485 * are being added to md->deferred list. 2486 */ 2487 static int __dm_suspend(struct mapped_device *md, struct dm_table *map, 2488 unsigned suspend_flags, unsigned int task_state, 2489 int dmf_suspended_flag) 2490 { 2491 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; 2492 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; 2493 int r; 2494 2495 lockdep_assert_held(&md->suspend_lock); 2496 2497 /* 2498 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2499 * This flag is cleared before dm_suspend returns. 2500 */ 2501 if (noflush) 2502 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2503 else 2504 DMDEBUG("%s: suspending with flush", dm_device_name(md)); 2505 2506 /* 2507 * This gets reverted if there's an error later and the targets 2508 * provide the .presuspend_undo hook. 2509 */ 2510 dm_table_presuspend_targets(map); 2511 2512 /* 2513 * Flush I/O to the device. 2514 * Any I/O submitted after lock_fs() may not be flushed. 2515 * noflush takes precedence over do_lockfs. 2516 * (lock_fs() flushes I/Os and waits for them to complete.) 2517 */ 2518 if (!noflush && do_lockfs) { 2519 r = lock_fs(md); 2520 if (r) { 2521 dm_table_presuspend_undo_targets(map); 2522 return r; 2523 } 2524 } 2525 2526 /* 2527 * Here we must make sure that no processes are submitting requests 2528 * to target drivers i.e. no one may be executing 2529 * dm_split_and_process_bio from dm_submit_bio. 2530 * 2531 * To get all processes out of dm_split_and_process_bio in dm_submit_bio, 2532 * we take the write lock. 
To prevent any process from reentering 2533 * dm_split_and_process_bio from dm_submit_bio and quiesce the thread 2534 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call 2535 * flush_workqueue(md->wq). 2536 */ 2537 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2538 if (map) 2539 synchronize_srcu(&md->io_barrier); 2540 2541 /* 2542 * Stop md->queue before flushing md->wq in case request-based 2543 * dm defers requests to md->wq from md->queue. 2544 */ 2545 if (dm_request_based(md)) 2546 dm_stop_queue(md->queue); 2547 2548 flush_workqueue(md->wq); 2549 2550 /* 2551 * At this point no more requests are entering target request routines. 2552 * We call dm_wait_for_completion to wait for all existing requests 2553 * to finish. 2554 */ 2555 r = dm_wait_for_completion(md, task_state); 2556 if (!r) 2557 set_bit(dmf_suspended_flag, &md->flags); 2558 2559 if (noflush) 2560 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2561 if (map) 2562 synchronize_srcu(&md->io_barrier); 2563 2564 /* were we interrupted ? */ 2565 if (r < 0) { 2566 dm_queue_flush(md); 2567 2568 if (dm_request_based(md)) 2569 dm_start_queue(md->queue); 2570 2571 unlock_fs(md); 2572 dm_table_presuspend_undo_targets(map); 2573 /* pushback list is already flushed, so skip flush */ 2574 } 2575 2576 return r; 2577 } 2578 2579 /* 2580 * We need to be able to change a mapping table under a mounted 2581 * filesystem. For example we might want to move some data in 2582 * the background. Before the table can be swapped with 2583 * dm_bind_table, dm_suspend must be called to flush any in 2584 * flight bios and ensure that any further io gets deferred. 2585 */ 2586 /* 2587 * Suspend mechanism in request-based dm. 2588 * 2589 * 1. Flush all I/Os by lock_fs() if needed. 2590 * 2. Stop dispatching any I/O by stopping the request_queue. 2591 * 3. Wait for all in-flight I/Os to be completed or requeued. 2592 * 2593 * To abort suspend, start the request_queue. 2594 */ 2595 int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2596 { 2597 struct dm_table *map = NULL; 2598 int r = 0; 2599 2600 retry: 2601 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 2602 2603 if (dm_suspended_md(md)) { 2604 r = -EINVAL; 2605 goto out_unlock; 2606 } 2607 2608 if (dm_suspended_internally_md(md)) { 2609 /* already internally suspended, wait for internal resume */ 2610 mutex_unlock(&md->suspend_lock); 2611 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 2612 if (r) 2613 return r; 2614 goto retry; 2615 } 2616 2617 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2618 2619 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED); 2620 if (r) 2621 goto out_unlock; 2622 2623 set_bit(DMF_POST_SUSPENDING, &md->flags); 2624 dm_table_postsuspend_targets(map); 2625 clear_bit(DMF_POST_SUSPENDING, &md->flags); 2626 2627 out_unlock: 2628 mutex_unlock(&md->suspend_lock); 2629 return r; 2630 } 2631 2632 static int __dm_resume(struct mapped_device *md, struct dm_table *map) 2633 { 2634 if (map) { 2635 int r = dm_table_resume_targets(map); 2636 if (r) 2637 return r; 2638 } 2639 2640 dm_queue_flush(md); 2641 2642 /* 2643 * Flushing deferred I/Os must be done after targets are resumed 2644 * so that mapping of targets can work correctly. 2645 * Request-based dm is queueing the deferred I/Os in its request_queue. 
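 * Bio-based dm replays them here instead: the dm_queue_flush() above clears
 * DMF_BLOCK_IO_FOR_SUSPEND and kicks dm_wq_work(), which pops each deferred
 * bio off md->deferred and resubmits it via submit_bio_noacct().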
2646 */ 2647 if (dm_request_based(md)) 2648 dm_start_queue(md->queue); 2649 2650 unlock_fs(md); 2651 2652 return 0; 2653 } 2654 2655 int dm_resume(struct mapped_device *md) 2656 { 2657 int r; 2658 struct dm_table *map = NULL; 2659 2660 retry: 2661 r = -EINVAL; 2662 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 2663 2664 if (!dm_suspended_md(md)) 2665 goto out; 2666 2667 if (dm_suspended_internally_md(md)) { 2668 /* already internally suspended, wait for internal resume */ 2669 mutex_unlock(&md->suspend_lock); 2670 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 2671 if (r) 2672 return r; 2673 goto retry; 2674 } 2675 2676 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2677 if (!map || !dm_table_get_size(map)) 2678 goto out; 2679 2680 r = __dm_resume(md, map); 2681 if (r) 2682 goto out; 2683 2684 clear_bit(DMF_SUSPENDED, &md->flags); 2685 out: 2686 mutex_unlock(&md->suspend_lock); 2687 2688 return r; 2689 } 2690 2691 /* 2692 * Internal suspend/resume works like userspace-driven suspend. It waits 2693 * until all bios finish and prevents issuing new bios to the target drivers. 2694 * It may be used only from the kernel. 2695 */ 2696 2697 static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) 2698 { 2699 struct dm_table *map = NULL; 2700 2701 lockdep_assert_held(&md->suspend_lock); 2702 2703 if (md->internal_suspend_count++) 2704 return; /* nested internal suspend */ 2705 2706 if (dm_suspended_md(md)) { 2707 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2708 return; /* nest suspend */ 2709 } 2710 2711 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2712 2713 /* 2714 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is 2715 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend 2716 * would require changing .presuspend to return an error -- avoid this 2717 * until there is a need for more elaborate variants of internal suspend. 
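 *
 * (For reference, internal suspend nests; only the outermost pair does the
 * real work.  A sketch of the counting, assuming no concurrent user-driven
 * suspend:
 *
 *	dm_internal_suspend_noflush(md);	suspends, count 0 -> 1
 *	dm_internal_suspend_noflush(md);	no-op,    count 1 -> 2
 *	dm_internal_resume(md);			no-op,    count 2 -> 1
 *	dm_internal_resume(md);			resumes,  count 1 -> 0
 * )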
2718 */ 2719 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE, 2720 DMF_SUSPENDED_INTERNALLY); 2721 2722 set_bit(DMF_POST_SUSPENDING, &md->flags); 2723 dm_table_postsuspend_targets(map); 2724 clear_bit(DMF_POST_SUSPENDING, &md->flags); 2725 } 2726 2727 static void __dm_internal_resume(struct mapped_device *md) 2728 { 2729 BUG_ON(!md->internal_suspend_count); 2730 2731 if (--md->internal_suspend_count) 2732 return; /* resume from nested internal suspend */ 2733 2734 if (dm_suspended_md(md)) 2735 goto done; /* resume from nested suspend */ 2736 2737 /* 2738 * NOTE: existing callers don't need to call dm_table_resume_targets 2739 * (which may fail -- so best to avoid it for now by passing NULL map) 2740 */ 2741 (void) __dm_resume(md, NULL); 2742 2743 done: 2744 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2745 smp_mb__after_atomic(); 2746 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); 2747 } 2748 2749 void dm_internal_suspend_noflush(struct mapped_device *md) 2750 { 2751 mutex_lock(&md->suspend_lock); 2752 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); 2753 mutex_unlock(&md->suspend_lock); 2754 } 2755 EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); 2756 2757 void dm_internal_resume(struct mapped_device *md) 2758 { 2759 mutex_lock(&md->suspend_lock); 2760 __dm_internal_resume(md); 2761 mutex_unlock(&md->suspend_lock); 2762 } 2763 EXPORT_SYMBOL_GPL(dm_internal_resume); 2764 2765 /* 2766 * Fast variants of internal suspend/resume hold md->suspend_lock, 2767 * which prevents interaction with userspace-driven suspend. 2768 */ 2769 2770 void dm_internal_suspend_fast(struct mapped_device *md) 2771 { 2772 mutex_lock(&md->suspend_lock); 2773 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 2774 return; 2775 2776 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2777 synchronize_srcu(&md->io_barrier); 2778 flush_workqueue(md->wq); 2779 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2780 } 2781 EXPORT_SYMBOL_GPL(dm_internal_suspend_fast); 2782 2783 void dm_internal_resume_fast(struct mapped_device *md) 2784 { 2785 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 2786 goto done; 2787 2788 dm_queue_flush(md); 2789 2790 done: 2791 mutex_unlock(&md->suspend_lock); 2792 } 2793 EXPORT_SYMBOL_GPL(dm_internal_resume_fast); 2794 2795 /*----------------------------------------------------------------- 2796 * Event notification. 
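 *
 * Sketch of how the event counter is meant to be consumed (roughly what the
 * DM_DEV_WAIT ioctl path does on behalf of userspace):
 *
 *	uint32_t nr = dm_get_event_nr(md);
 *	... hand 'nr' back to the waiter ...
 *	if (dm_wait_event(md, nr))
 *		... interrupted by a signal before a new event arrived ...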
2797 *---------------------------------------------------------------*/ 2798 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 2799 unsigned cookie) 2800 { 2801 int r; 2802 unsigned noio_flag; 2803 char udev_cookie[DM_COOKIE_LENGTH]; 2804 char *envp[] = { udev_cookie, NULL }; 2805 2806 noio_flag = memalloc_noio_save(); 2807 2808 if (!cookie) 2809 r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 2810 else { 2811 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 2812 DM_COOKIE_ENV_VAR_NAME, cookie); 2813 r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, 2814 action, envp); 2815 } 2816 2817 memalloc_noio_restore(noio_flag); 2818 2819 return r; 2820 } 2821 2822 uint32_t dm_next_uevent_seq(struct mapped_device *md) 2823 { 2824 return atomic_add_return(1, &md->uevent_seq); 2825 } 2826 2827 uint32_t dm_get_event_nr(struct mapped_device *md) 2828 { 2829 return atomic_read(&md->event_nr); 2830 } 2831 2832 int dm_wait_event(struct mapped_device *md, int event_nr) 2833 { 2834 return wait_event_interruptible(md->eventq, 2835 (event_nr != atomic_read(&md->event_nr))); 2836 } 2837 2838 void dm_uevent_add(struct mapped_device *md, struct list_head *elist) 2839 { 2840 unsigned long flags; 2841 2842 spin_lock_irqsave(&md->uevent_lock, flags); 2843 list_add(elist, &md->uevent_list); 2844 spin_unlock_irqrestore(&md->uevent_lock, flags); 2845 } 2846 2847 /* 2848 * The gendisk is only valid as long as you have a reference 2849 * count on 'md'. 2850 */ 2851 struct gendisk *dm_disk(struct mapped_device *md) 2852 { 2853 return md->disk; 2854 } 2855 EXPORT_SYMBOL_GPL(dm_disk); 2856 2857 struct kobject *dm_kobject(struct mapped_device *md) 2858 { 2859 return &md->kobj_holder.kobj; 2860 } 2861 2862 struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 2863 { 2864 struct mapped_device *md; 2865 2866 md = container_of(kobj, struct mapped_device, kobj_holder.kobj); 2867 2868 spin_lock(&_minor_lock); 2869 if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { 2870 md = NULL; 2871 goto out; 2872 } 2873 dm_get(md); 2874 out: 2875 spin_unlock(&_minor_lock); 2876 2877 return md; 2878 } 2879 2880 int dm_suspended_md(struct mapped_device *md) 2881 { 2882 return test_bit(DMF_SUSPENDED, &md->flags); 2883 } 2884 2885 static int dm_post_suspending_md(struct mapped_device *md) 2886 { 2887 return test_bit(DMF_POST_SUSPENDING, &md->flags); 2888 } 2889 2890 int dm_suspended_internally_md(struct mapped_device *md) 2891 { 2892 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2893 } 2894 2895 int dm_test_deferred_remove_flag(struct mapped_device *md) 2896 { 2897 return test_bit(DMF_DEFERRED_REMOVE, &md->flags); 2898 } 2899 2900 int dm_suspended(struct dm_target *ti) 2901 { 2902 return dm_suspended_md(ti->table->md); 2903 } 2904 EXPORT_SYMBOL_GPL(dm_suspended); 2905 2906 int dm_post_suspending(struct dm_target *ti) 2907 { 2908 return dm_post_suspending_md(ti->table->md); 2909 } 2910 EXPORT_SYMBOL_GPL(dm_post_suspending); 2911 2912 int dm_noflush_suspending(struct dm_target *ti) 2913 { 2914 return __noflush_suspending(ti->table->md); 2915 } 2916 EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2917 2918 struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, 2919 unsigned integrity, unsigned per_io_data_size, 2920 unsigned min_pool_size) 2921 { 2922 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); 2923 unsigned int pool_size = 0; 2924 unsigned int front_pad, io_front_pad; 2925 int ret; 2926 2927 if 
(!pools) 2928 return NULL; 2929 2930 switch (type) { 2931 case DM_TYPE_BIO_BASED: 2932 case DM_TYPE_DAX_BIO_BASED: 2933 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); 2934 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; 2935 io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; 2936 ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0); 2937 if (ret) 2938 goto out; 2939 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size)) 2940 goto out; 2941 break; 2942 case DM_TYPE_REQUEST_BASED: 2943 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size); 2944 front_pad = offsetof(struct dm_rq_clone_bio_info, clone); 2945 /* per_io_data_size is used for blk-mq pdu at queue allocation */ 2946 break; 2947 default: 2948 BUG(); 2949 } 2950 2951 ret = bioset_init(&pools->bs, pool_size, front_pad, 0); 2952 if (ret) 2953 goto out; 2954 2955 if (integrity && bioset_integrity_create(&pools->bs, pool_size)) 2956 goto out; 2957 2958 return pools; 2959 2960 out: 2961 dm_free_md_mempools(pools); 2962 2963 return NULL; 2964 } 2965 2966 void dm_free_md_mempools(struct dm_md_mempools *pools) 2967 { 2968 if (!pools) 2969 return; 2970 2971 bioset_exit(&pools->bs); 2972 bioset_exit(&pools->io_bs); 2973 2974 kfree(pools); 2975 } 2976 2977 struct dm_pr { 2978 u64 old_key; 2979 u64 new_key; 2980 u32 flags; 2981 bool fail_early; 2982 }; 2983 2984 static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn, 2985 void *data) 2986 { 2987 struct mapped_device *md = bdev->bd_disk->private_data; 2988 struct dm_table *table; 2989 struct dm_target *ti; 2990 int ret = -ENOTTY, srcu_idx; 2991 2992 table = dm_get_live_table(md, &srcu_idx); 2993 if (!table || !dm_table_get_size(table)) 2994 goto out; 2995 2996 /* We only support devices that have a single target */ 2997 if (dm_table_get_num_targets(table) != 1) 2998 goto out; 2999 ti = dm_table_get_target(table, 0); 3000 3001 ret = -EINVAL; 3002 if (!ti->type->iterate_devices) 3003 goto out; 3004 3005 ret = ti->type->iterate_devices(ti, fn, data); 3006 out: 3007 dm_put_live_table(md, srcu_idx); 3008 return ret; 3009 } 3010 3011 /* 3012 * For register / unregister we need to manually call out to every path. 
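 *
 * dm_call_pr() above resolves the table's single target and lets its
 * ->iterate_devices() method invoke the callback once per underlying
 * device, so for a target stacked on two devices the effect is roughly:
 *
 *	__dm_pr_register(ti, dev0, start0, len0, &pr);
 *	__dm_pr_register(ti, dev1, start1, len1, &pr);
 *
 * (dev0/dev1 and the ranges are illustrative.)  If any of these calls
 * fails, dm_pr_register() below attempts to unregister the new key from
 * every path again.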
3013 */ 3014 static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev, 3015 sector_t start, sector_t len, void *data) 3016 { 3017 struct dm_pr *pr = data; 3018 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; 3019 3020 if (!ops || !ops->pr_register) 3021 return -EOPNOTSUPP; 3022 return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags); 3023 } 3024 3025 static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, 3026 u32 flags) 3027 { 3028 struct dm_pr pr = { 3029 .old_key = old_key, 3030 .new_key = new_key, 3031 .flags = flags, 3032 .fail_early = true, 3033 }; 3034 int ret; 3035 3036 ret = dm_call_pr(bdev, __dm_pr_register, &pr); 3037 if (ret && new_key) { 3038 /* unregister all paths if we failed to register any path */ 3039 pr.old_key = new_key; 3040 pr.new_key = 0; 3041 pr.flags = 0; 3042 pr.fail_early = false; 3043 dm_call_pr(bdev, __dm_pr_register, &pr); 3044 } 3045 3046 return ret; 3047 } 3048 3049 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, 3050 u32 flags) 3051 { 3052 struct mapped_device *md = bdev->bd_disk->private_data; 3053 const struct pr_ops *ops; 3054 int r, srcu_idx; 3055 3056 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3057 if (r < 0) 3058 goto out; 3059 3060 ops = bdev->bd_disk->fops->pr_ops; 3061 if (ops && ops->pr_reserve) 3062 r = ops->pr_reserve(bdev, key, type, flags); 3063 else 3064 r = -EOPNOTSUPP; 3065 out: 3066 dm_unprepare_ioctl(md, srcu_idx); 3067 return r; 3068 } 3069 3070 static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) 3071 { 3072 struct mapped_device *md = bdev->bd_disk->private_data; 3073 const struct pr_ops *ops; 3074 int r, srcu_idx; 3075 3076 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3077 if (r < 0) 3078 goto out; 3079 3080 ops = bdev->bd_disk->fops->pr_ops; 3081 if (ops && ops->pr_release) 3082 r = ops->pr_release(bdev, key, type); 3083 else 3084 r = -EOPNOTSUPP; 3085 out: 3086 dm_unprepare_ioctl(md, srcu_idx); 3087 return r; 3088 } 3089 3090 static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, 3091 enum pr_type type, bool abort) 3092 { 3093 struct mapped_device *md = bdev->bd_disk->private_data; 3094 const struct pr_ops *ops; 3095 int r, srcu_idx; 3096 3097 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3098 if (r < 0) 3099 goto out; 3100 3101 ops = bdev->bd_disk->fops->pr_ops; 3102 if (ops && ops->pr_preempt) 3103 r = ops->pr_preempt(bdev, old_key, new_key, type, abort); 3104 else 3105 r = -EOPNOTSUPP; 3106 out: 3107 dm_unprepare_ioctl(md, srcu_idx); 3108 return r; 3109 } 3110 3111 static int dm_pr_clear(struct block_device *bdev, u64 key) 3112 { 3113 struct mapped_device *md = bdev->bd_disk->private_data; 3114 const struct pr_ops *ops; 3115 int r, srcu_idx; 3116 3117 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3118 if (r < 0) 3119 goto out; 3120 3121 ops = bdev->bd_disk->fops->pr_ops; 3122 if (ops && ops->pr_clear) 3123 r = ops->pr_clear(bdev, key); 3124 else 3125 r = -EOPNOTSUPP; 3126 out: 3127 dm_unprepare_ioctl(md, srcu_idx); 3128 return r; 3129 } 3130 3131 static const struct pr_ops dm_pr_ops = { 3132 .pr_register = dm_pr_register, 3133 .pr_reserve = dm_pr_reserve, 3134 .pr_release = dm_pr_release, 3135 .pr_preempt = dm_pr_preempt, 3136 .pr_clear = dm_pr_clear, 3137 }; 3138 3139 static const struct block_device_operations dm_blk_dops = { 3140 .submit_bio = dm_submit_bio, 3141 .poll_bio = dm_poll_bio, 3142 .open = dm_blk_open, 3143 .release = dm_blk_close, 3144 .ioctl = dm_blk_ioctl, 3145 .getgeo = 
dm_blk_getgeo, 3146 .report_zones = dm_blk_report_zones, 3147 .pr_ops = &dm_pr_ops, 3148 .owner = THIS_MODULE 3149 }; 3150 3151 static const struct block_device_operations dm_rq_blk_dops = { 3152 .open = dm_blk_open, 3153 .release = dm_blk_close, 3154 .ioctl = dm_blk_ioctl, 3155 .getgeo = dm_blk_getgeo, 3156 .pr_ops = &dm_pr_ops, 3157 .owner = THIS_MODULE 3158 }; 3159 3160 static const struct dax_operations dm_dax_ops = { 3161 .direct_access = dm_dax_direct_access, 3162 .zero_page_range = dm_dax_zero_page_range, 3163 }; 3164 3165 /* 3166 * module hooks 3167 */ 3168 module_init(dm_init); 3169 module_exit(dm_exit); 3170 3171 module_param(major, uint, 0); 3172 MODULE_PARM_DESC(major, "The major number of the device mapper"); 3173 3174 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR); 3175 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); 3176 3177 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR); 3178 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations"); 3179 3180 module_param(swap_bios, int, S_IRUGO | S_IWUSR); 3181 MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs"); 3182 3183 MODULE_DESCRIPTION(DM_NAME " driver"); 3184 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 3185 MODULE_LICENSE("GPL"); 3186