1 /* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm-core.h" 9 #include "dm-rq.h" 10 #include "dm-uevent.h" 11 #include "dm-ima.h" 12 13 #include <linux/init.h> 14 #include <linux/module.h> 15 #include <linux/mutex.h> 16 #include <linux/sched/mm.h> 17 #include <linux/sched/signal.h> 18 #include <linux/blkpg.h> 19 #include <linux/bio.h> 20 #include <linux/mempool.h> 21 #include <linux/dax.h> 22 #include <linux/slab.h> 23 #include <linux/idr.h> 24 #include <linux/uio.h> 25 #include <linux/hdreg.h> 26 #include <linux/delay.h> 27 #include <linux/wait.h> 28 #include <linux/pr.h> 29 #include <linux/refcount.h> 30 #include <linux/part_stat.h> 31 #include <linux/blk-crypto.h> 32 #include <linux/blk-crypto-profile.h> 33 34 #define DM_MSG_PREFIX "core" 35 36 /* 37 * Cookies are numeric values sent with CHANGE and REMOVE 38 * uevents while resuming, removing or renaming the device. 39 */ 40 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 41 #define DM_COOKIE_LENGTH 24 42 43 /* 44 * For REQ_POLLED fs bio, this flag is set if we link mapped underlying 45 * dm_io into one list, and reuse bio->bi_private as the list head. Before 46 * ending this fs bio, we will recover its ->bi_private. 47 */ 48 #define REQ_DM_POLL_LIST REQ_DRV 49 50 static const char *_name = DM_NAME; 51 52 static unsigned int major = 0; 53 static unsigned int _major = 0; 54 55 static DEFINE_IDR(_minor_idr); 56 57 static DEFINE_SPINLOCK(_minor_lock); 58 59 static void do_deferred_remove(struct work_struct *w); 60 61 static DECLARE_WORK(deferred_remove_work, do_deferred_remove); 62 63 static struct workqueue_struct *deferred_remove_workqueue; 64 65 atomic_t dm_global_event_nr = ATOMIC_INIT(0); 66 DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq); 67 68 void dm_issue_global_event(void) 69 { 70 atomic_inc(&dm_global_event_nr); 71 wake_up(&dm_global_eventq); 72 } 73 74 /* 75 * One of these is allocated (on-stack) per original bio. 
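 *
 * Illustrative sketch only (not part of this driver): a hypothetical bio-based
 * target whose ctr set ti->per_io_data_size = sizeof(struct my_per_bio) can
 * recover its per-bio state from the clone passed to ->map via the
 * dm_per_bio_data()/dm_bio_from_per_bio_data() helpers defined below:
 *
 *	struct my_per_bio {			// hypothetical target state
 *		sector_t orig_sector;
 *	};
 *
 *	static int my_map(struct dm_target *ti, struct bio *bio)
 *	{
 *		struct my_per_bio *pb = dm_per_bio_data(bio, sizeof(*pb));
 *
 *		pb->orig_sector = bio->bi_iter.bi_sector;
 *		// dm_bio_from_per_bio_data(pb, sizeof(*pb)) yields 'bio' again
 *		return DM_MAPIO_REMAPPED;	// after redirecting bio->bi_bdev (not shown)
 *	}
 *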
76 */ 77 struct clone_info { 78 struct dm_table *map; 79 struct bio *bio; 80 struct dm_io *io; 81 sector_t sector; 82 unsigned sector_count; 83 bool submit_as_polled; 84 }; 85 86 #define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) 87 #define DM_IO_BIO_OFFSET \ 88 (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio)) 89 90 static inline struct dm_target_io *clone_to_tio(struct bio *clone) 91 { 92 return container_of(clone, struct dm_target_io, clone); 93 } 94 95 void *dm_per_bio_data(struct bio *bio, size_t data_size) 96 { 97 if (!dm_tio_flagged(clone_to_tio(bio), DM_TIO_INSIDE_DM_IO)) 98 return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size; 99 return (char *)bio - DM_IO_BIO_OFFSET - data_size; 100 } 101 EXPORT_SYMBOL_GPL(dm_per_bio_data); 102 103 struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size) 104 { 105 struct dm_io *io = (struct dm_io *)((char *)data + data_size); 106 if (io->magic == DM_IO_MAGIC) 107 return (struct bio *)((char *)io + DM_IO_BIO_OFFSET); 108 BUG_ON(io->magic != DM_TIO_MAGIC); 109 return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET); 110 } 111 EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data); 112 113 unsigned dm_bio_get_target_bio_nr(const struct bio *bio) 114 { 115 return container_of(bio, struct dm_target_io, clone)->target_bio_nr; 116 } 117 EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr); 118 119 #define MINOR_ALLOCED ((void *)-1) 120 121 #define DM_NUMA_NODE NUMA_NO_NODE 122 static int dm_numa_node = DM_NUMA_NODE; 123 124 #define DEFAULT_SWAP_BIOS (8 * 1048576 / PAGE_SIZE) 125 static int swap_bios = DEFAULT_SWAP_BIOS; 126 static int get_swap_bios(void) 127 { 128 int latch = READ_ONCE(swap_bios); 129 if (unlikely(latch <= 0)) 130 latch = DEFAULT_SWAP_BIOS; 131 return latch; 132 } 133 134 /* 135 * For mempools pre-allocation at the table loading time. 136 */ 137 struct dm_md_mempools { 138 struct bio_set bs; 139 struct bio_set io_bs; 140 }; 141 142 struct table_device { 143 struct list_head list; 144 refcount_t count; 145 struct dm_dev dm_dev; 146 }; 147 148 /* 149 * Bio-based DM's mempools' reserved IOs set by the user. 
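 *
 * The getter below clamps whatever the user wrote, e.g. (illustrative values):
 *
 *	reserved_bio_based_ios = 0;		// treated as "use the default"
 *	dm_get_reserved_bio_based_ios();	// returns RESERVED_BIO_BASED_IOS (16)
 *
 *	reserved_bio_based_ios = UINT_MAX;	// absurdly large
 *	dm_get_reserved_bio_based_ios();	// clamped to DM_RESERVED_MAX_IOS
 *
 * __dm_get_module_param() also writes the clamped value back with cmpxchg().
 *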
150 */ 151 #define RESERVED_BIO_BASED_IOS 16 152 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; 153 154 static int __dm_get_module_param_int(int *module_param, int min, int max) 155 { 156 int param = READ_ONCE(*module_param); 157 int modified_param = 0; 158 bool modified = true; 159 160 if (param < min) 161 modified_param = min; 162 else if (param > max) 163 modified_param = max; 164 else 165 modified = false; 166 167 if (modified) { 168 (void)cmpxchg(module_param, param, modified_param); 169 param = modified_param; 170 } 171 172 return param; 173 } 174 175 unsigned __dm_get_module_param(unsigned *module_param, 176 unsigned def, unsigned max) 177 { 178 unsigned param = READ_ONCE(*module_param); 179 unsigned modified_param = 0; 180 181 if (!param) 182 modified_param = def; 183 else if (param > max) 184 modified_param = max; 185 186 if (modified_param) { 187 (void)cmpxchg(module_param, param, modified_param); 188 param = modified_param; 189 } 190 191 return param; 192 } 193 194 unsigned dm_get_reserved_bio_based_ios(void) 195 { 196 return __dm_get_module_param(&reserved_bio_based_ios, 197 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS); 198 } 199 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); 200 201 static unsigned dm_get_numa_node(void) 202 { 203 return __dm_get_module_param_int(&dm_numa_node, 204 DM_NUMA_NODE, num_online_nodes() - 1); 205 } 206 207 static int __init local_init(void) 208 { 209 int r; 210 211 r = dm_uevent_init(); 212 if (r) 213 return r; 214 215 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); 216 if (!deferred_remove_workqueue) { 217 r = -ENOMEM; 218 goto out_uevent_exit; 219 } 220 221 _major = major; 222 r = register_blkdev(_major, _name); 223 if (r < 0) 224 goto out_free_workqueue; 225 226 if (!_major) 227 _major = r; 228 229 return 0; 230 231 out_free_workqueue: 232 destroy_workqueue(deferred_remove_workqueue); 233 out_uevent_exit: 234 dm_uevent_exit(); 235 236 return r; 237 } 238 239 static void local_exit(void) 240 { 241 flush_scheduled_work(); 242 destroy_workqueue(deferred_remove_workqueue); 243 244 unregister_blkdev(_major, _name); 245 dm_uevent_exit(); 246 247 _major = 0; 248 249 DMINFO("cleaned up"); 250 } 251 252 static int (*_inits[])(void) __initdata = { 253 local_init, 254 dm_target_init, 255 dm_linear_init, 256 dm_stripe_init, 257 dm_io_init, 258 dm_kcopyd_init, 259 dm_interface_init, 260 dm_statistics_init, 261 }; 262 263 static void (*_exits[])(void) = { 264 local_exit, 265 dm_target_exit, 266 dm_linear_exit, 267 dm_stripe_exit, 268 dm_io_exit, 269 dm_kcopyd_exit, 270 dm_interface_exit, 271 dm_statistics_exit, 272 }; 273 274 static int __init dm_init(void) 275 { 276 const int count = ARRAY_SIZE(_inits); 277 int r, i; 278 279 #if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)) 280 DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled." 281 " Duplicate IMA measurements will not be recorded in the IMA log."); 282 #endif 283 284 for (i = 0; i < count; i++) { 285 r = _inits[i](); 286 if (r) 287 goto bad; 288 } 289 290 return 0; 291 bad: 292 while (i--) 293 _exits[i](); 294 295 return r; 296 } 297 298 static void __exit dm_exit(void) 299 { 300 int i = ARRAY_SIZE(_exits); 301 302 while (i--) 303 _exits[i](); 304 305 /* 306 * Should be empty by this point. 
307 */ 308 idr_destroy(&_minor_idr); 309 } 310 311 /* 312 * Block device functions 313 */ 314 int dm_deleting_md(struct mapped_device *md) 315 { 316 return test_bit(DMF_DELETING, &md->flags); 317 } 318 319 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 320 { 321 struct mapped_device *md; 322 323 spin_lock(&_minor_lock); 324 325 md = bdev->bd_disk->private_data; 326 if (!md) 327 goto out; 328 329 if (test_bit(DMF_FREEING, &md->flags) || 330 dm_deleting_md(md)) { 331 md = NULL; 332 goto out; 333 } 334 335 dm_get(md); 336 atomic_inc(&md->open_count); 337 out: 338 spin_unlock(&_minor_lock); 339 340 return md ? 0 : -ENXIO; 341 } 342 343 static void dm_blk_close(struct gendisk *disk, fmode_t mode) 344 { 345 struct mapped_device *md; 346 347 spin_lock(&_minor_lock); 348 349 md = disk->private_data; 350 if (WARN_ON(!md)) 351 goto out; 352 353 if (atomic_dec_and_test(&md->open_count) && 354 (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) 355 queue_work(deferred_remove_workqueue, &deferred_remove_work); 356 357 dm_put(md); 358 out: 359 spin_unlock(&_minor_lock); 360 } 361 362 int dm_open_count(struct mapped_device *md) 363 { 364 return atomic_read(&md->open_count); 365 } 366 367 /* 368 * Guarantees nothing is using the device before it's deleted. 369 */ 370 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) 371 { 372 int r = 0; 373 374 spin_lock(&_minor_lock); 375 376 if (dm_open_count(md)) { 377 r = -EBUSY; 378 if (mark_deferred) 379 set_bit(DMF_DEFERRED_REMOVE, &md->flags); 380 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) 381 r = -EEXIST; 382 else 383 set_bit(DMF_DELETING, &md->flags); 384 385 spin_unlock(&_minor_lock); 386 387 return r; 388 } 389 390 int dm_cancel_deferred_remove(struct mapped_device *md) 391 { 392 int r = 0; 393 394 spin_lock(&_minor_lock); 395 396 if (test_bit(DMF_DELETING, &md->flags)) 397 r = -EBUSY; 398 else 399 clear_bit(DMF_DEFERRED_REMOVE, &md->flags); 400 401 spin_unlock(&_minor_lock); 402 403 return r; 404 } 405 406 static void do_deferred_remove(struct work_struct *w) 407 { 408 dm_deferred_remove(); 409 } 410 411 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 412 { 413 struct mapped_device *md = bdev->bd_disk->private_data; 414 415 return dm_get_geometry(md, geo); 416 } 417 418 static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, 419 struct block_device **bdev) 420 { 421 struct dm_target *tgt; 422 struct dm_table *map; 423 int r; 424 425 retry: 426 r = -ENOTTY; 427 map = dm_get_live_table(md, srcu_idx); 428 if (!map || !dm_table_get_size(map)) 429 return r; 430 431 /* We only support devices that have a single target */ 432 if (dm_table_get_num_targets(map) != 1) 433 return r; 434 435 tgt = dm_table_get_target(map, 0); 436 if (!tgt->type->prepare_ioctl) 437 return r; 438 439 if (dm_suspended_md(md)) 440 return -EAGAIN; 441 442 r = tgt->type->prepare_ioctl(tgt, bdev); 443 if (r == -ENOTCONN && !fatal_signal_pending(current)) { 444 dm_put_live_table(md, *srcu_idx); 445 msleep(10); 446 goto retry; 447 } 448 449 return r; 450 } 451 452 static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx) 453 { 454 dm_put_live_table(md, srcu_idx); 455 } 456 457 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 458 unsigned int cmd, unsigned long arg) 459 { 460 struct mapped_device *md = bdev->bd_disk->private_data; 461 int r, srcu_idx; 462 463 r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 464 if (r < 0) 465 goto out; 466 467 if (r > 0) 
{ 468 /* 469 * Target determined this ioctl is being issued against a 470 * subset of the parent bdev; require extra privileges. 471 */ 472 if (!capable(CAP_SYS_RAWIO)) { 473 DMDEBUG_LIMIT( 474 "%s: sending ioctl %x to DM device without required privilege.", 475 current->comm, cmd); 476 r = -ENOIOCTLCMD; 477 goto out; 478 } 479 } 480 481 if (!bdev->bd_disk->fops->ioctl) 482 r = -ENOTTY; 483 else 484 r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); 485 out: 486 dm_unprepare_ioctl(md, srcu_idx); 487 return r; 488 } 489 490 u64 dm_start_time_ns_from_clone(struct bio *bio) 491 { 492 return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time); 493 } 494 EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); 495 496 static bool bio_is_flush_with_data(struct bio *bio) 497 { 498 return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size); 499 } 500 501 static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio, 502 unsigned long start_time, struct dm_stats_aux *stats_aux) 503 { 504 bool is_flush_with_data; 505 unsigned int bi_size; 506 507 /* If REQ_PREFLUSH set save any payload but do not account it */ 508 is_flush_with_data = bio_is_flush_with_data(bio); 509 if (is_flush_with_data) { 510 bi_size = bio->bi_iter.bi_size; 511 bio->bi_iter.bi_size = 0; 512 } 513 514 if (!end) 515 bio_start_io_acct_time(bio, start_time); 516 else 517 bio_end_io_acct(bio, start_time); 518 519 if (unlikely(dm_stats_used(&md->stats))) 520 dm_stats_account_io(&md->stats, bio_data_dir(bio), 521 bio->bi_iter.bi_sector, bio_sectors(bio), 522 end, start_time, stats_aux); 523 524 /* Restore bio's payload so it does get accounted upon requeue */ 525 if (is_flush_with_data) 526 bio->bi_iter.bi_size = bi_size; 527 } 528 529 static void __dm_start_io_acct(struct dm_io *io, struct bio *bio) 530 { 531 dm_io_acct(false, io->md, bio, io->start_time, &io->stats_aux); 532 } 533 534 static void dm_start_io_acct(struct dm_io *io, struct bio *clone) 535 { 536 /* Must account IO to DM device in terms of orig_bio */ 537 struct bio *bio = io->orig_bio; 538 539 /* 540 * Ensure IO accounting is only ever started once. 541 * Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. 
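	 *
	 * Aside (illustrative, not this file's code): dm_start_time_ns_from_clone()
	 * above is exported so a target can derive per-bio latency itself, e.g.
	 * from a hypothetical ->end_io:
	 *
	 *	u64 lat_ns = jiffies_to_nsecs(jiffies) -
	 *		     dm_start_time_ns_from_clone(bio);
	 *
	 * Both values share the jiffies time base, so the offset cancels.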
542 */ 543 if (!clone || 544 likely(!dm_tio_flagged(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO))) { 545 if (WARN_ON_ONCE(dm_io_flagged(io, DM_IO_ACCOUNTED))) 546 return; 547 dm_io_set_flag(io, DM_IO_ACCOUNTED); 548 } else { 549 unsigned long flags; 550 if (dm_io_flagged(io, DM_IO_ACCOUNTED)) 551 return; 552 /* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */ 553 spin_lock_irqsave(&io->lock, flags); 554 dm_io_set_flag(io, DM_IO_ACCOUNTED); 555 spin_unlock_irqrestore(&io->lock, flags); 556 } 557 558 __dm_start_io_acct(io, bio); 559 } 560 561 static void dm_end_io_acct(struct dm_io *io, struct bio *bio) 562 { 563 dm_io_acct(true, io->md, bio, io->start_time, &io->stats_aux); 564 } 565 566 static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) 567 { 568 struct dm_io *io; 569 struct dm_target_io *tio; 570 struct bio *clone; 571 572 clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs); 573 574 tio = clone_to_tio(clone); 575 tio->flags = 0; 576 dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO); 577 tio->io = NULL; 578 579 io = container_of(tio, struct dm_io, tio); 580 io->magic = DM_IO_MAGIC; 581 io->status = 0; 582 atomic_set(&io->io_count, 1); 583 this_cpu_inc(*md->pending_io); 584 io->orig_bio = NULL; 585 io->md = md; 586 io->map_task = current; 587 spin_lock_init(&io->lock); 588 io->start_time = jiffies; 589 io->flags = 0; 590 591 dm_stats_record_start(&md->stats, &io->stats_aux); 592 593 return io; 594 } 595 596 static void free_io(struct dm_io *io) 597 { 598 bio_put(&io->tio.clone); 599 } 600 601 static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, 602 unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask) 603 { 604 struct dm_target_io *tio; 605 struct bio *clone; 606 607 if (!ci->io->tio.io) { 608 /* the dm_target_io embedded in ci->io is available */ 609 tio = &ci->io->tio; 610 /* alloc_io() already initialized embedded clone */ 611 clone = &tio->clone; 612 } else { 613 clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio, 614 gfp_mask, &ci->io->md->bs); 615 if (!clone) 616 return NULL; 617 618 /* REQ_DM_POLL_LIST shouldn't be inherited */ 619 clone->bi_opf &= ~REQ_DM_POLL_LIST; 620 621 tio = clone_to_tio(clone); 622 tio->flags = 0; /* also clears DM_TIO_INSIDE_DM_IO */ 623 } 624 625 tio->magic = DM_TIO_MAGIC; 626 tio->io = ci->io; 627 tio->ti = ti; 628 tio->target_bio_nr = target_bio_nr; 629 tio->len_ptr = len; 630 tio->old_sector = 0; 631 632 if (len) { 633 clone->bi_iter.bi_size = to_bytes(*len); 634 if (bio_integrity(clone)) 635 bio_integrity_trim(clone); 636 } 637 638 return clone; 639 } 640 641 static void free_tio(struct bio *clone) 642 { 643 if (dm_tio_flagged(clone_to_tio(clone), DM_TIO_INSIDE_DM_IO)) 644 return; 645 bio_put(clone); 646 } 647 648 /* 649 * Add the bio to the list of deferred io. 650 */ 651 static void queue_io(struct mapped_device *md, struct bio *bio) 652 { 653 unsigned long flags; 654 655 spin_lock_irqsave(&md->deferred_lock, flags); 656 bio_list_add(&md->deferred, bio); 657 spin_unlock_irqrestore(&md->deferred_lock, flags); 658 queue_work(md->wq, &md->work); 659 } 660 661 /* 662 * Everyone (including functions in this file), should use this 663 * function to access the md->map field, and make sure they call 664 * dm_put_live_table() when finished. 
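 *
 * A minimal sketch of the expected pattern:
 *
 *	int srcu_idx;
 *	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 *
 *	if (map) {
 *		// ... inspect the live table; do not cache the pointer ...
 *	}
 *	dm_put_live_table(md, srcu_idx);	// always pair, even when map is NULL
 *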
665 */ 666 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) 667 { 668 *srcu_idx = srcu_read_lock(&md->io_barrier); 669 670 return srcu_dereference(md->map, &md->io_barrier); 671 } 672 673 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) 674 { 675 srcu_read_unlock(&md->io_barrier, srcu_idx); 676 } 677 678 void dm_sync_table(struct mapped_device *md) 679 { 680 synchronize_srcu(&md->io_barrier); 681 synchronize_rcu_expedited(); 682 } 683 684 /* 685 * A fast alternative to dm_get_live_table/dm_put_live_table. 686 * The caller must not block between these two functions. 687 */ 688 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) 689 { 690 rcu_read_lock(); 691 return rcu_dereference(md->map); 692 } 693 694 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) 695 { 696 rcu_read_unlock(); 697 } 698 699 static char *_dm_claim_ptr = "I belong to device-mapper"; 700 701 /* 702 * Open a table device so we can use it as a map destination. 703 */ 704 static int open_table_device(struct table_device *td, dev_t dev, 705 struct mapped_device *md) 706 { 707 struct block_device *bdev; 708 u64 part_off; 709 int r; 710 711 BUG_ON(td->dm_dev.bdev); 712 713 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr); 714 if (IS_ERR(bdev)) 715 return PTR_ERR(bdev); 716 717 r = bd_link_disk_holder(bdev, dm_disk(md)); 718 if (r) { 719 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); 720 return r; 721 } 722 723 td->dm_dev.bdev = bdev; 724 td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off); 725 return 0; 726 } 727 728 /* 729 * Close a table device that we've been using. 730 */ 731 static void close_table_device(struct table_device *td, struct mapped_device *md) 732 { 733 if (!td->dm_dev.bdev) 734 return; 735 736 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); 737 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); 738 put_dax(td->dm_dev.dax_dev); 739 td->dm_dev.bdev = NULL; 740 td->dm_dev.dax_dev = NULL; 741 } 742 743 static struct table_device *find_table_device(struct list_head *l, dev_t dev, 744 fmode_t mode) 745 { 746 struct table_device *td; 747 748 list_for_each_entry(td, l, list) 749 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) 750 return td; 751 752 return NULL; 753 } 754 755 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, 756 struct dm_dev **result) 757 { 758 int r; 759 struct table_device *td; 760 761 mutex_lock(&md->table_devices_lock); 762 td = find_table_device(&md->table_devices, dev, mode); 763 if (!td) { 764 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); 765 if (!td) { 766 mutex_unlock(&md->table_devices_lock); 767 return -ENOMEM; 768 } 769 770 td->dm_dev.mode = mode; 771 td->dm_dev.bdev = NULL; 772 773 if ((r = open_table_device(td, dev, md))) { 774 mutex_unlock(&md->table_devices_lock); 775 kfree(td); 776 return r; 777 } 778 779 format_dev_t(td->dm_dev.name, dev); 780 781 refcount_set(&td->count, 1); 782 list_add(&td->list, &md->table_devices); 783 } else { 784 refcount_inc(&td->count); 785 } 786 mutex_unlock(&md->table_devices_lock); 787 788 *result = &td->dm_dev; 789 return 0; 790 } 791 792 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) 793 { 794 struct table_device *td = container_of(d, struct table_device, dm_dev); 795 796 mutex_lock(&md->table_devices_lock); 797 if (refcount_dec_and_test(&td->count)) { 798 
close_table_device(td, md); 799 list_del(&td->list); 800 kfree(td); 801 } 802 mutex_unlock(&md->table_devices_lock); 803 } 804 805 static void free_table_devices(struct list_head *devices) 806 { 807 struct list_head *tmp, *next; 808 809 list_for_each_safe(tmp, next, devices) { 810 struct table_device *td = list_entry(tmp, struct table_device, list); 811 812 DMWARN("dm_destroy: %s still exists with %d references", 813 td->dm_dev.name, refcount_read(&td->count)); 814 kfree(td); 815 } 816 } 817 818 /* 819 * Get the geometry associated with a dm device 820 */ 821 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 822 { 823 *geo = md->geometry; 824 825 return 0; 826 } 827 828 /* 829 * Set the geometry of a device. 830 */ 831 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 832 { 833 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 834 835 if (geo->start > sz) { 836 DMWARN("Start sector is beyond the geometry limits."); 837 return -EINVAL; 838 } 839 840 md->geometry = *geo; 841 842 return 0; 843 } 844 845 static int __noflush_suspending(struct mapped_device *md) 846 { 847 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 848 } 849 850 static void dm_io_complete(struct dm_io *io) 851 { 852 blk_status_t io_error; 853 struct mapped_device *md = io->md; 854 struct bio *bio = io->orig_bio; 855 856 if (io->status == BLK_STS_DM_REQUEUE) { 857 unsigned long flags; 858 /* 859 * Target requested pushing back the I/O. 860 */ 861 spin_lock_irqsave(&md->deferred_lock, flags); 862 if (__noflush_suspending(md) && 863 !WARN_ON_ONCE(dm_is_zone_write(md, bio))) { 864 /* NOTE early return due to BLK_STS_DM_REQUEUE below */ 865 bio_list_add_head(&md->deferred, bio); 866 } else { 867 /* 868 * noflush suspend was interrupted or this is 869 * a write to a zoned target. 870 */ 871 io->status = BLK_STS_IOERR; 872 } 873 spin_unlock_irqrestore(&md->deferred_lock, flags); 874 } 875 876 io_error = io->status; 877 if (dm_io_flagged(io, DM_IO_ACCOUNTED)) 878 dm_end_io_acct(io, bio); 879 else if (!io_error) { 880 /* 881 * Must handle target that DM_MAPIO_SUBMITTED only to 882 * then bio_endio() rather than dm_submit_bio_remap() 883 */ 884 __dm_start_io_acct(io, bio); 885 dm_end_io_acct(io, bio); 886 } 887 free_io(io); 888 smp_wmb(); 889 this_cpu_dec(*md->pending_io); 890 891 /* nudge anyone waiting on suspend queue */ 892 if (unlikely(wq_has_sleeper(&md->wait))) 893 wake_up(&md->wait); 894 895 if (io_error == BLK_STS_DM_REQUEUE || io_error == BLK_STS_AGAIN) { 896 if (bio->bi_opf & REQ_POLLED) { 897 /* 898 * Upper layer won't help us poll split bio (io->orig_bio 899 * may only reflect a subset of the pre-split original) 900 * so clear REQ_POLLED in case of requeue. 901 */ 902 bio->bi_opf &= ~REQ_POLLED; 903 if (io_error == BLK_STS_AGAIN) { 904 /* io_uring doesn't handle BLK_STS_AGAIN (yet) */ 905 queue_io(md, bio); 906 } 907 } 908 return; 909 } 910 911 if (bio_is_flush_with_data(bio)) { 912 /* 913 * Preflush done for flush with data, reissue 914 * without REQ_PREFLUSH. 
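		 *
		 * Concretely (illustrative numbers): a bio with
		 * bi_opf = REQ_OP_WRITE | REQ_PREFLUSH and bi_size = 8192 is
		 * first handled as an empty flush by __send_empty_flush()
		 * (the payload is saved but not accounted); once that
		 * completes we land here, clear REQ_PREFLUSH and requeue the
		 * bio so the 8192 bytes are then split and mapped as an
		 * ordinary write.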
915 */ 916 bio->bi_opf &= ~REQ_PREFLUSH; 917 queue_io(md, bio); 918 } else { 919 /* done with normal IO or empty flush */ 920 if (io_error) 921 bio->bi_status = io_error; 922 bio_endio(bio); 923 } 924 } 925 926 static inline bool dm_tio_is_normal(struct dm_target_io *tio) 927 { 928 return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) && 929 !dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); 930 } 931 932 /* 933 * Decrements the number of outstanding ios that a bio has been 934 * cloned into, completing the original io if necc. 935 */ 936 void dm_io_dec_pending(struct dm_io *io, blk_status_t error) 937 { 938 /* Push-back supersedes any I/O errors */ 939 if (unlikely(error)) { 940 unsigned long flags; 941 spin_lock_irqsave(&io->lock, flags); 942 if (!(io->status == BLK_STS_DM_REQUEUE && 943 __noflush_suspending(io->md))) 944 io->status = error; 945 spin_unlock_irqrestore(&io->lock, flags); 946 } 947 948 if (atomic_dec_and_test(&io->io_count)) 949 dm_io_complete(io); 950 } 951 952 void disable_discard(struct mapped_device *md) 953 { 954 struct queue_limits *limits = dm_get_queue_limits(md); 955 956 /* device doesn't really support DISCARD, disable it */ 957 limits->max_discard_sectors = 0; 958 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue); 959 } 960 961 void disable_write_zeroes(struct mapped_device *md) 962 { 963 struct queue_limits *limits = dm_get_queue_limits(md); 964 965 /* device doesn't really support WRITE ZEROES, disable it */ 966 limits->max_write_zeroes_sectors = 0; 967 } 968 969 static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) 970 { 971 return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios); 972 } 973 974 static void clone_endio(struct bio *bio) 975 { 976 blk_status_t error = bio->bi_status; 977 struct dm_target_io *tio = clone_to_tio(bio); 978 struct dm_io *io = tio->io; 979 struct mapped_device *md = tio->io->md; 980 dm_endio_fn endio = tio->ti->type->end_io; 981 struct request_queue *q = bio->bi_bdev->bd_disk->queue; 982 983 if (unlikely(error == BLK_STS_TARGET)) { 984 if (bio_op(bio) == REQ_OP_DISCARD && 985 !q->limits.max_discard_sectors) 986 disable_discard(md); 987 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && 988 !q->limits.max_write_zeroes_sectors) 989 disable_write_zeroes(md); 990 } 991 992 if (blk_queue_is_zoned(q)) 993 dm_zone_endio(io, bio); 994 995 if (endio) { 996 int r = endio(tio->ti, bio, &error); 997 switch (r) { 998 case DM_ENDIO_REQUEUE: 999 /* 1000 * Requeuing writes to a sequential zone of a zoned 1001 * target will break the sequential write pattern: 1002 * fail such IO. 1003 */ 1004 if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) 1005 error = BLK_STS_IOERR; 1006 else 1007 error = BLK_STS_DM_REQUEUE; 1008 fallthrough; 1009 case DM_ENDIO_DONE: 1010 break; 1011 case DM_ENDIO_INCOMPLETE: 1012 /* The target will handle the io */ 1013 return; 1014 default: 1015 DMWARN("unimplemented target endio return value: %d", r); 1016 BUG(); 1017 } 1018 } 1019 1020 if (unlikely(swap_bios_limit(tio->ti, bio))) { 1021 struct mapped_device *md = io->md; 1022 up(&md->swap_bios_semaphore); 1023 } 1024 1025 free_tio(bio); 1026 dm_io_dec_pending(io, error); 1027 } 1028 1029 /* 1030 * Return maximum size of I/O possible at the supplied sector up to the current 1031 * target boundary. 
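 *
 * Targets with an internal chunk/stripe size bound this further via
 * dm_set_target_max_io_len() (defined below), typically from their ctr;
 * a sketch modelled on dm-stripe:
 *
 *	// 'chunk_size' is in sectors
 *	r = dm_set_target_max_io_len(ti, chunk_size);
 *	if (r)
 *		return r;	// ti->error has already been set
 *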
1032 */ 1033 static inline sector_t max_io_len_target_boundary(struct dm_target *ti, 1034 sector_t target_offset) 1035 { 1036 return ti->len - target_offset; 1037 } 1038 1039 static sector_t max_io_len(struct dm_target *ti, sector_t sector) 1040 { 1041 sector_t target_offset = dm_target_offset(ti, sector); 1042 sector_t len = max_io_len_target_boundary(ti, target_offset); 1043 sector_t max_len; 1044 1045 /* 1046 * Does the target need to split IO even further? 1047 * - varied (per target) IO splitting is a tenet of DM; this 1048 * explains why stacked chunk_sectors based splitting via 1049 * blk_max_size_offset() isn't possible here. So pass in 1050 * ti->max_io_len to override stacked chunk_sectors. 1051 */ 1052 if (ti->max_io_len) { 1053 max_len = blk_max_size_offset(ti->table->md->queue, 1054 target_offset, ti->max_io_len); 1055 if (len > max_len) 1056 len = max_len; 1057 } 1058 1059 return len; 1060 } 1061 1062 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) 1063 { 1064 if (len > UINT_MAX) { 1065 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", 1066 (unsigned long long)len, UINT_MAX); 1067 ti->error = "Maximum size of target IO is too large"; 1068 return -EINVAL; 1069 } 1070 1071 ti->max_io_len = (uint32_t) len; 1072 1073 return 0; 1074 } 1075 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); 1076 1077 static struct dm_target *dm_dax_get_live_target(struct mapped_device *md, 1078 sector_t sector, int *srcu_idx) 1079 __acquires(md->io_barrier) 1080 { 1081 struct dm_table *map; 1082 struct dm_target *ti; 1083 1084 map = dm_get_live_table(md, srcu_idx); 1085 if (!map) 1086 return NULL; 1087 1088 ti = dm_table_find_target(map, sector); 1089 if (!ti) 1090 return NULL; 1091 1092 return ti; 1093 } 1094 1095 static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, 1096 long nr_pages, enum dax_access_mode mode, void **kaddr, 1097 pfn_t *pfn) 1098 { 1099 struct mapped_device *md = dax_get_private(dax_dev); 1100 sector_t sector = pgoff * PAGE_SECTORS; 1101 struct dm_target *ti; 1102 long len, ret = -EIO; 1103 int srcu_idx; 1104 1105 ti = dm_dax_get_live_target(md, sector, &srcu_idx); 1106 1107 if (!ti) 1108 goto out; 1109 if (!ti->type->direct_access) 1110 goto out; 1111 len = max_io_len(ti, sector) / PAGE_SECTORS; 1112 if (len < 1) 1113 goto out; 1114 nr_pages = min(len, nr_pages); 1115 ret = ti->type->direct_access(ti, pgoff, nr_pages, mode, kaddr, pfn); 1116 1117 out: 1118 dm_put_live_table(md, srcu_idx); 1119 1120 return ret; 1121 } 1122 1123 static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, 1124 size_t nr_pages) 1125 { 1126 struct mapped_device *md = dax_get_private(dax_dev); 1127 sector_t sector = pgoff * PAGE_SECTORS; 1128 struct dm_target *ti; 1129 int ret = -EIO; 1130 int srcu_idx; 1131 1132 ti = dm_dax_get_live_target(md, sector, &srcu_idx); 1133 1134 if (!ti) 1135 goto out; 1136 if (WARN_ON(!ti->type->dax_zero_page_range)) { 1137 /* 1138 * ->zero_page_range() is mandatory dax operation. If we are 1139 * here, something is wrong. 
1140 */ 1141 goto out; 1142 } 1143 ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages); 1144 out: 1145 dm_put_live_table(md, srcu_idx); 1146 1147 return ret; 1148 } 1149 1150 static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, 1151 void *addr, size_t bytes, struct iov_iter *i) 1152 { 1153 struct mapped_device *md = dax_get_private(dax_dev); 1154 sector_t sector = pgoff * PAGE_SECTORS; 1155 struct dm_target *ti; 1156 int srcu_idx; 1157 long ret = 0; 1158 1159 ti = dm_dax_get_live_target(md, sector, &srcu_idx); 1160 if (!ti || !ti->type->dax_recovery_write) 1161 goto out; 1162 1163 ret = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i); 1164 out: 1165 dm_put_live_table(md, srcu_idx); 1166 return ret; 1167 } 1168 1169 /* 1170 * A target may call dm_accept_partial_bio only from the map routine. It is 1171 * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management 1172 * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by 1173 * __send_duplicate_bios(). 1174 * 1175 * dm_accept_partial_bio informs the dm that the target only wants to process 1176 * additional n_sectors sectors of the bio and the rest of the data should be 1177 * sent in a next bio. 1178 * 1179 * A diagram that explains the arithmetics: 1180 * +--------------------+---------------+-------+ 1181 * | 1 | 2 | 3 | 1182 * +--------------------+---------------+-------+ 1183 * 1184 * <-------------- *tio->len_ptr ---------------> 1185 * <------- bi_size -------> 1186 * <-- n_sectors --> 1187 * 1188 * Region 1 was already iterated over with bio_advance or similar function. 1189 * (it may be empty if the target doesn't use bio_advance) 1190 * Region 2 is the remaining bio size that the target wants to process. 1191 * (it may be empty if region 1 is non-empty, although there is no reason 1192 * to make it empty) 1193 * The target requires that region 3 is to be sent in the next bio. 1194 * 1195 * If the target wants to receive multiple copies of the bio (via num_*bios, etc), 1196 * the partially processed part (the sum of regions 1+2) must be the same for all 1197 * copies of the bio. 1198 */ 1199 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) 1200 { 1201 struct dm_target_io *tio = clone_to_tio(bio); 1202 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; 1203 1204 BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); 1205 BUG_ON(op_is_zone_mgmt(bio_op(bio))); 1206 BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND); 1207 BUG_ON(bi_size > *tio->len_ptr); 1208 BUG_ON(n_sectors > bi_size); 1209 1210 *tio->len_ptr -= bi_size - n_sectors; 1211 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; 1212 } 1213 EXPORT_SYMBOL_GPL(dm_accept_partial_bio); 1214 1215 static inline void __dm_submit_bio_remap(struct bio *clone, 1216 dev_t dev, sector_t old_sector) 1217 { 1218 trace_block_bio_remap(clone, dev, old_sector); 1219 submit_bio_noacct(clone); 1220 } 1221 1222 /* 1223 * @clone: clone bio that DM core passed to target's .map function 1224 * @tgt_clone: clone of @clone bio that target needs submitted 1225 * 1226 * Targets should use this interface to submit bios they take 1227 * ownership of when returning DM_MAPIO_SUBMITTED. 
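 *
 * Illustrative flow for a hypothetical target that finishes preparing the IO
 * in a worker (all names below are made up):
 *
 *	// ->map: take ownership of 'clone', kick the worker, then
 *	return DM_MAPIO_SUBMITTED;
 *
 *	// worker: once ready, point the clone at the underlying device
 *	bio_set_dev(clone, my_dev->bdev);
 *	dm_submit_bio_remap(clone, NULL);	// NULL: submit 'clone' itself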
1228 * 1229 * Target should also enable ti->accounts_remapped_io 1230 */ 1231 void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone) 1232 { 1233 struct dm_target_io *tio = clone_to_tio(clone); 1234 struct dm_io *io = tio->io; 1235 1236 WARN_ON_ONCE(!tio->ti->accounts_remapped_io); 1237 1238 /* establish bio that will get submitted */ 1239 if (!tgt_clone) 1240 tgt_clone = clone; 1241 1242 /* 1243 * Account io->origin_bio to DM dev on behalf of target 1244 * that took ownership of IO with DM_MAPIO_SUBMITTED. 1245 */ 1246 if (io->map_task == current) { 1247 /* Still in target's map function */ 1248 dm_io_set_flag(io, DM_IO_START_ACCT); 1249 } else { 1250 /* 1251 * Called by another thread, managed by DM target, 1252 * wait for dm_split_and_process_bio() to store 1253 * io->orig_bio 1254 */ 1255 while (unlikely(!smp_load_acquire(&io->orig_bio))) 1256 msleep(1); 1257 dm_start_io_acct(io, clone); 1258 } 1259 1260 __dm_submit_bio_remap(tgt_clone, disk_devt(io->md->disk), 1261 tio->old_sector); 1262 } 1263 EXPORT_SYMBOL_GPL(dm_submit_bio_remap); 1264 1265 static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) 1266 { 1267 mutex_lock(&md->swap_bios_lock); 1268 while (latch < md->swap_bios) { 1269 cond_resched(); 1270 down(&md->swap_bios_semaphore); 1271 md->swap_bios--; 1272 } 1273 while (latch > md->swap_bios) { 1274 cond_resched(); 1275 up(&md->swap_bios_semaphore); 1276 md->swap_bios++; 1277 } 1278 mutex_unlock(&md->swap_bios_lock); 1279 } 1280 1281 static void __map_bio(struct bio *clone) 1282 { 1283 struct dm_target_io *tio = clone_to_tio(clone); 1284 int r; 1285 struct dm_io *io = tio->io; 1286 struct dm_target *ti = tio->ti; 1287 1288 clone->bi_end_io = clone_endio; 1289 1290 /* 1291 * Map the clone. 1292 */ 1293 dm_io_inc_pending(io); 1294 tio->old_sector = clone->bi_iter.bi_sector; 1295 1296 if (unlikely(swap_bios_limit(ti, clone))) { 1297 struct mapped_device *md = io->md; 1298 int latch = get_swap_bios(); 1299 if (unlikely(latch != md->swap_bios)) 1300 __set_swap_bios_limit(md, latch); 1301 down(&md->swap_bios_semaphore); 1302 } 1303 1304 /* 1305 * Check if the IO needs a special mapping due to zone append emulation 1306 * on zoned target. In this case, dm_zone_map_bio() calls the target 1307 * map operation. 1308 */ 1309 if (dm_emulate_zone_append(io->md)) 1310 r = dm_zone_map_bio(tio); 1311 else 1312 r = ti->type->map(ti, clone); 1313 1314 switch (r) { 1315 case DM_MAPIO_SUBMITTED: 1316 /* target has assumed ownership of this io */ 1317 if (!ti->accounts_remapped_io) 1318 dm_io_set_flag(io, DM_IO_START_ACCT); 1319 break; 1320 case DM_MAPIO_REMAPPED: 1321 /* 1322 * the bio has been remapped so dispatch it, but defer 1323 * dm_start_io_acct() until after possible bio_split(). 
1324 */ 1325 __dm_submit_bio_remap(clone, disk_devt(io->md->disk), 1326 tio->old_sector); 1327 dm_io_set_flag(io, DM_IO_START_ACCT); 1328 break; 1329 case DM_MAPIO_KILL: 1330 case DM_MAPIO_REQUEUE: 1331 if (unlikely(swap_bios_limit(ti, clone))) 1332 up(&io->md->swap_bios_semaphore); 1333 free_tio(clone); 1334 if (r == DM_MAPIO_KILL) 1335 dm_io_dec_pending(io, BLK_STS_IOERR); 1336 else 1337 dm_io_dec_pending(io, BLK_STS_DM_REQUEUE); 1338 break; 1339 default: 1340 DMWARN("unimplemented target map return value: %d", r); 1341 BUG(); 1342 } 1343 } 1344 1345 static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, 1346 struct dm_target *ti, unsigned num_bios) 1347 { 1348 struct bio *bio; 1349 int try; 1350 1351 for (try = 0; try < 2; try++) { 1352 int bio_nr; 1353 1354 if (try) 1355 mutex_lock(&ci->io->md->table_devices_lock); 1356 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) { 1357 bio = alloc_tio(ci, ti, bio_nr, NULL, 1358 try ? GFP_NOIO : GFP_NOWAIT); 1359 if (!bio) 1360 break; 1361 1362 bio_list_add(blist, bio); 1363 } 1364 if (try) 1365 mutex_unlock(&ci->io->md->table_devices_lock); 1366 if (bio_nr == num_bios) 1367 return; 1368 1369 while ((bio = bio_list_pop(blist))) 1370 free_tio(bio); 1371 } 1372 } 1373 1374 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, 1375 unsigned num_bios, unsigned *len) 1376 { 1377 struct bio_list blist = BIO_EMPTY_LIST; 1378 struct bio *clone; 1379 1380 switch (num_bios) { 1381 case 0: 1382 break; 1383 case 1: 1384 clone = alloc_tio(ci, ti, 0, len, GFP_NOIO); 1385 __map_bio(clone); 1386 break; 1387 default: 1388 /* dm_accept_partial_bio() is not supported with shared tio->len_ptr */ 1389 alloc_multiple_bios(&blist, ci, ti, num_bios); 1390 while ((clone = bio_list_pop(&blist))) { 1391 dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO); 1392 __map_bio(clone); 1393 } 1394 break; 1395 } 1396 } 1397 1398 static void __send_empty_flush(struct clone_info *ci) 1399 { 1400 unsigned target_nr = 0; 1401 struct dm_target *ti; 1402 struct bio flush_bio; 1403 1404 /* 1405 * Use an on-stack bio for this, it's safe since we don't 1406 * need to reference it after submit. It's just used as 1407 * the basis for the clone(s). 
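	 *
	 * How many clones each target receives is the target's choice: its
	 * ctr sets ti->num_flush_bios (and ti->num_discard_bios etc. for the
	 * abnormal-IO path below). A simple pass-through target such as
	 * dm-linear does (illustrative):
	 *
	 *	ti->num_flush_bios = 1;
	 *	ti->num_discard_bios = 1;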
1408 */ 1409 bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, 1410 REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC); 1411 1412 ci->bio = &flush_bio; 1413 ci->sector_count = 0; 1414 ci->io->tio.clone.bi_iter.bi_size = 0; 1415 1416 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1417 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); 1418 1419 bio_uninit(ci->bio); 1420 } 1421 1422 static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, 1423 unsigned num_bios) 1424 { 1425 unsigned len; 1426 1427 len = min_t(sector_t, ci->sector_count, 1428 max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector))); 1429 1430 __send_duplicate_bios(ci, ti, num_bios, &len); 1431 1432 ci->sector += len; 1433 ci->sector_count -= len; 1434 } 1435 1436 static bool is_abnormal_io(struct bio *bio) 1437 { 1438 bool r = false; 1439 1440 switch (bio_op(bio)) { 1441 case REQ_OP_DISCARD: 1442 case REQ_OP_SECURE_ERASE: 1443 case REQ_OP_WRITE_ZEROES: 1444 r = true; 1445 break; 1446 } 1447 1448 return r; 1449 } 1450 1451 static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, 1452 int *result) 1453 { 1454 unsigned num_bios = 0; 1455 1456 switch (bio_op(ci->bio)) { 1457 case REQ_OP_DISCARD: 1458 num_bios = ti->num_discard_bios; 1459 break; 1460 case REQ_OP_SECURE_ERASE: 1461 num_bios = ti->num_secure_erase_bios; 1462 break; 1463 case REQ_OP_WRITE_ZEROES: 1464 num_bios = ti->num_write_zeroes_bios; 1465 break; 1466 default: 1467 return false; 1468 } 1469 1470 /* 1471 * Even though the device advertised support for this type of 1472 * request, that does not mean every target supports it, and 1473 * reconfiguration might also have changed that since the 1474 * check was performed. 1475 */ 1476 if (!num_bios) 1477 *result = -EOPNOTSUPP; 1478 else { 1479 __send_changing_extent_only(ci, ti, num_bios); 1480 *result = 0; 1481 } 1482 return true; 1483 } 1484 1485 /* 1486 * Reuse ->bi_private as hlist head for storing all dm_io instances 1487 * associated with this bio, and this bio's bi_private needs to be 1488 * stored in dm_io->data before the reuse. 1489 * 1490 * bio->bi_private is owned by fs or upper layer, so block layer won't 1491 * touch it after splitting. Meantime it won't be changed by anyone after 1492 * bio is submitted. So this reuse is safe. 1493 */ 1494 static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio) 1495 { 1496 return (struct hlist_head *)&bio->bi_private; 1497 } 1498 1499 static void dm_queue_poll_io(struct bio *bio, struct dm_io *io) 1500 { 1501 struct hlist_head *head = dm_get_bio_hlist_head(bio); 1502 1503 if (!(bio->bi_opf & REQ_DM_POLL_LIST)) { 1504 bio->bi_opf |= REQ_DM_POLL_LIST; 1505 /* 1506 * Save .bi_private into dm_io, so that we can reuse 1507 * .bi_private as hlist head for storing dm_io list 1508 */ 1509 io->data = bio->bi_private; 1510 1511 INIT_HLIST_HEAD(head); 1512 1513 /* tell block layer to poll for completion */ 1514 bio->bi_cookie = ~BLK_QC_T_NONE; 1515 } else { 1516 /* 1517 * bio recursed due to split, reuse original poll list, 1518 * and save bio->bi_private too. 1519 */ 1520 io->data = hlist_entry(head->first, struct dm_io, node)->data; 1521 } 1522 1523 hlist_add_head(&io->node, head); 1524 } 1525 1526 /* 1527 * Select the correct strategy for processing a non-flush bio. 
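 *
 * For reference, a hypothetical ->map that cannot cross an internal boundary
 * may accept only part of the clone; the remainder is resubmitted by the
 * splitting logic below (see also dm_accept_partial_bio() above):
 *
 *	static int my_boundary_map(struct dm_target *ti, struct bio *bio)
 *	{
 *		unsigned n = my_sectors_to_boundary(ti, bio);	// hypothetical
 *
 *		if (n < bio_sectors(bio))
 *			dm_accept_partial_bio(bio, n);
 *		bio_set_dev(bio, my_dev->bdev);			// hypothetical bdev
 *		return DM_MAPIO_REMAPPED;
 *	}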
1528 */ 1529 static int __split_and_process_bio(struct clone_info *ci) 1530 { 1531 struct bio *clone; 1532 struct dm_target *ti; 1533 unsigned len; 1534 int r; 1535 1536 ti = dm_table_find_target(ci->map, ci->sector); 1537 if (!ti) 1538 return -EIO; 1539 1540 if (__process_abnormal_io(ci, ti, &r)) 1541 return r; 1542 1543 /* 1544 * Only support bio polling for normal IO, and the target io is 1545 * exactly inside the dm_io instance (verified in dm_poll_dm_io) 1546 */ 1547 ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED; 1548 1549 len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); 1550 clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO); 1551 __map_bio(clone); 1552 1553 ci->sector += len; 1554 ci->sector_count -= len; 1555 1556 return 0; 1557 } 1558 1559 static void init_clone_info(struct clone_info *ci, struct mapped_device *md, 1560 struct dm_table *map, struct bio *bio) 1561 { 1562 ci->map = map; 1563 ci->io = alloc_io(md, bio); 1564 ci->bio = bio; 1565 ci->submit_as_polled = false; 1566 ci->sector = bio->bi_iter.bi_sector; 1567 ci->sector_count = bio_sectors(bio); 1568 1569 /* Shouldn't happen but sector_count was being set to 0 so... */ 1570 if (WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count)) 1571 ci->sector_count = 0; 1572 } 1573 1574 /* 1575 * Entry point to split a bio into clones and submit them to the targets. 1576 */ 1577 static void dm_split_and_process_bio(struct mapped_device *md, 1578 struct dm_table *map, struct bio *bio) 1579 { 1580 struct clone_info ci; 1581 struct bio *orig_bio = NULL; 1582 int error = 0; 1583 1584 init_clone_info(&ci, md, map, bio); 1585 1586 if (bio->bi_opf & REQ_PREFLUSH) { 1587 __send_empty_flush(&ci); 1588 /* dm_io_complete submits any data associated with flush */ 1589 goto out; 1590 } 1591 1592 error = __split_and_process_bio(&ci); 1593 ci.io->map_task = NULL; 1594 if (error || !ci.sector_count) 1595 goto out; 1596 1597 /* 1598 * Remainder must be passed to submit_bio_noacct() so it gets handled 1599 * *after* bios already submitted have been completely processed. 1600 * We take a clone of the original to store in ci.io->orig_bio to be 1601 * used by dm_end_io_acct() and for dm_io_complete() to use for 1602 * completion handling. 1603 */ 1604 orig_bio = bio_split(bio, bio_sectors(bio) - ci.sector_count, 1605 GFP_NOIO, &md->queue->bio_split); 1606 bio_chain(orig_bio, bio); 1607 trace_block_split(orig_bio, bio->bi_iter.bi_sector); 1608 submit_bio_noacct(bio); 1609 out: 1610 if (!orig_bio) 1611 orig_bio = bio; 1612 smp_store_release(&ci.io->orig_bio, orig_bio); 1613 if (dm_io_flagged(ci.io, DM_IO_START_ACCT)) 1614 dm_start_io_acct(ci.io, NULL); 1615 1616 /* 1617 * Drop the extra reference count for non-POLLED bio, and hold one 1618 * reference for POLLED bio, which will be released in dm_poll_bio 1619 * 1620 * Add every dm_io instance into the hlist_head which is stored in 1621 * bio->bi_private, so that dm_poll_bio can poll them all. 
1622 */ 1623 if (error || !ci.submit_as_polled) 1624 dm_io_dec_pending(ci.io, errno_to_blk_status(error)); 1625 else 1626 dm_queue_poll_io(bio, ci.io); 1627 } 1628 1629 static void dm_submit_bio(struct bio *bio) 1630 { 1631 struct mapped_device *md = bio->bi_bdev->bd_disk->private_data; 1632 int srcu_idx; 1633 struct dm_table *map; 1634 1635 map = dm_get_live_table(md, &srcu_idx); 1636 1637 /* If suspended, or map not yet available, queue this IO for later */ 1638 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) || 1639 unlikely(!map)) { 1640 if (bio->bi_opf & REQ_NOWAIT) 1641 bio_wouldblock_error(bio); 1642 else if (bio->bi_opf & REQ_RAHEAD) 1643 bio_io_error(bio); 1644 else 1645 queue_io(md, bio); 1646 goto out; 1647 } 1648 1649 /* 1650 * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc) 1651 * otherwise associated queue_limits won't be imposed. 1652 */ 1653 if (is_abnormal_io(bio)) 1654 blk_queue_split(&bio); 1655 1656 dm_split_and_process_bio(md, map, bio); 1657 out: 1658 dm_put_live_table(md, srcu_idx); 1659 } 1660 1661 static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob, 1662 unsigned int flags) 1663 { 1664 WARN_ON_ONCE(!dm_tio_is_normal(&io->tio)); 1665 1666 /* don't poll if the mapped io is done */ 1667 if (atomic_read(&io->io_count) > 1) 1668 bio_poll(&io->tio.clone, iob, flags); 1669 1670 /* bio_poll holds the last reference */ 1671 return atomic_read(&io->io_count) == 1; 1672 } 1673 1674 static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob, 1675 unsigned int flags) 1676 { 1677 struct hlist_head *head = dm_get_bio_hlist_head(bio); 1678 struct hlist_head tmp = HLIST_HEAD_INIT; 1679 struct hlist_node *next; 1680 struct dm_io *io; 1681 1682 /* Only poll normal bio which was marked as REQ_DM_POLL_LIST */ 1683 if (!(bio->bi_opf & REQ_DM_POLL_LIST)) 1684 return 0; 1685 1686 WARN_ON_ONCE(hlist_empty(head)); 1687 1688 hlist_move_list(head, &tmp); 1689 1690 /* 1691 * Restore .bi_private before possibly completing dm_io. 1692 * 1693 * bio_poll() is only possible once @bio has been completely 1694 * submitted via submit_bio_noacct()'s depth-first submission. 1695 * So there is no dm_queue_poll_io() race associated with 1696 * clearing REQ_DM_POLL_LIST here. 1697 */ 1698 bio->bi_opf &= ~REQ_DM_POLL_LIST; 1699 bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data; 1700 1701 hlist_for_each_entry_safe(io, next, &tmp, node) { 1702 if (dm_poll_dm_io(io, iob, flags)) { 1703 hlist_del_init(&io->node); 1704 /* 1705 * clone_endio() has already occurred, so passing 1706 * error as 0 here doesn't override io->status 1707 */ 1708 dm_io_dec_pending(io, 0); 1709 } 1710 } 1711 1712 /* Not done? */ 1713 if (!hlist_empty(&tmp)) { 1714 bio->bi_opf |= REQ_DM_POLL_LIST; 1715 /* Reset bio->bi_private to dm_io list head */ 1716 hlist_move_list(&tmp, head); 1717 return 0; 1718 } 1719 return 1; 1720 } 1721 1722 /*----------------------------------------------------------------- 1723 * An IDR is used to keep track of allocated minor numbers. 1724 *---------------------------------------------------------------*/ 1725 static void free_minor(int minor) 1726 { 1727 spin_lock(&_minor_lock); 1728 idr_remove(&_minor_idr, minor); 1729 spin_unlock(&_minor_lock); 1730 } 1731 1732 /* 1733 * See if the device with a specific minor # is free. 
1734 */ 1735 static int specific_minor(int minor) 1736 { 1737 int r; 1738 1739 if (minor >= (1 << MINORBITS)) 1740 return -EINVAL; 1741 1742 idr_preload(GFP_KERNEL); 1743 spin_lock(&_minor_lock); 1744 1745 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); 1746 1747 spin_unlock(&_minor_lock); 1748 idr_preload_end(); 1749 if (r < 0) 1750 return r == -ENOSPC ? -EBUSY : r; 1751 return 0; 1752 } 1753 1754 static int next_free_minor(int *minor) 1755 { 1756 int r; 1757 1758 idr_preload(GFP_KERNEL); 1759 spin_lock(&_minor_lock); 1760 1761 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); 1762 1763 spin_unlock(&_minor_lock); 1764 idr_preload_end(); 1765 if (r < 0) 1766 return r; 1767 *minor = r; 1768 return 0; 1769 } 1770 1771 static const struct block_device_operations dm_blk_dops; 1772 static const struct block_device_operations dm_rq_blk_dops; 1773 static const struct dax_operations dm_dax_ops; 1774 1775 static void dm_wq_work(struct work_struct *work); 1776 1777 #ifdef CONFIG_BLK_INLINE_ENCRYPTION 1778 static void dm_queue_destroy_crypto_profile(struct request_queue *q) 1779 { 1780 dm_destroy_crypto_profile(q->crypto_profile); 1781 } 1782 1783 #else /* CONFIG_BLK_INLINE_ENCRYPTION */ 1784 1785 static inline void dm_queue_destroy_crypto_profile(struct request_queue *q) 1786 { 1787 } 1788 #endif /* !CONFIG_BLK_INLINE_ENCRYPTION */ 1789 1790 static void cleanup_mapped_device(struct mapped_device *md) 1791 { 1792 if (md->wq) 1793 destroy_workqueue(md->wq); 1794 bioset_exit(&md->bs); 1795 bioset_exit(&md->io_bs); 1796 1797 if (md->dax_dev) { 1798 dax_remove_host(md->disk); 1799 kill_dax(md->dax_dev); 1800 put_dax(md->dax_dev); 1801 md->dax_dev = NULL; 1802 } 1803 1804 dm_cleanup_zoned_dev(md); 1805 if (md->disk) { 1806 spin_lock(&_minor_lock); 1807 md->disk->private_data = NULL; 1808 spin_unlock(&_minor_lock); 1809 if (dm_get_md_type(md) != DM_TYPE_NONE) { 1810 dm_sysfs_exit(md); 1811 del_gendisk(md->disk); 1812 } 1813 dm_queue_destroy_crypto_profile(md->queue); 1814 blk_cleanup_disk(md->disk); 1815 } 1816 1817 if (md->pending_io) { 1818 free_percpu(md->pending_io); 1819 md->pending_io = NULL; 1820 } 1821 1822 cleanup_srcu_struct(&md->io_barrier); 1823 1824 mutex_destroy(&md->suspend_lock); 1825 mutex_destroy(&md->type_lock); 1826 mutex_destroy(&md->table_devices_lock); 1827 mutex_destroy(&md->swap_bios_lock); 1828 1829 dm_mq_cleanup_mapped_device(md); 1830 } 1831 1832 /* 1833 * Allocate and initialise a blank device with a given minor. 
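 *
 * alloc_dev() is reached through dm_create() further below; a sketch of the
 * constructor call as issued by the ioctl layer (dm-ioctl.c):
 *
 *	struct mapped_device *md;
 *	int r = dm_create(DM_ANY_MINOR, &md);	// or a specific minor
 *
 *	if (r)
 *		return r;
 *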
1834 */ 1835 static struct mapped_device *alloc_dev(int minor) 1836 { 1837 int r, numa_node_id = dm_get_numa_node(); 1838 struct mapped_device *md; 1839 void *old_md; 1840 1841 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); 1842 if (!md) { 1843 DMWARN("unable to allocate device, out of memory."); 1844 return NULL; 1845 } 1846 1847 if (!try_module_get(THIS_MODULE)) 1848 goto bad_module_get; 1849 1850 /* get a minor number for the dev */ 1851 if (minor == DM_ANY_MINOR) 1852 r = next_free_minor(&minor); 1853 else 1854 r = specific_minor(minor); 1855 if (r < 0) 1856 goto bad_minor; 1857 1858 r = init_srcu_struct(&md->io_barrier); 1859 if (r < 0) 1860 goto bad_io_barrier; 1861 1862 md->numa_node_id = numa_node_id; 1863 md->init_tio_pdu = false; 1864 md->type = DM_TYPE_NONE; 1865 mutex_init(&md->suspend_lock); 1866 mutex_init(&md->type_lock); 1867 mutex_init(&md->table_devices_lock); 1868 spin_lock_init(&md->deferred_lock); 1869 atomic_set(&md->holders, 1); 1870 atomic_set(&md->open_count, 0); 1871 atomic_set(&md->event_nr, 0); 1872 atomic_set(&md->uevent_seq, 0); 1873 INIT_LIST_HEAD(&md->uevent_list); 1874 INIT_LIST_HEAD(&md->table_devices); 1875 spin_lock_init(&md->uevent_lock); 1876 1877 /* 1878 * default to bio-based until DM table is loaded and md->type 1879 * established. If request-based table is loaded: blk-mq will 1880 * override accordingly. 1881 */ 1882 md->disk = blk_alloc_disk(md->numa_node_id); 1883 if (!md->disk) 1884 goto bad; 1885 md->queue = md->disk->queue; 1886 1887 init_waitqueue_head(&md->wait); 1888 INIT_WORK(&md->work, dm_wq_work); 1889 init_waitqueue_head(&md->eventq); 1890 init_completion(&md->kobj_holder.completion); 1891 1892 md->swap_bios = get_swap_bios(); 1893 sema_init(&md->swap_bios_semaphore, md->swap_bios); 1894 mutex_init(&md->swap_bios_lock); 1895 1896 md->disk->major = _major; 1897 md->disk->first_minor = minor; 1898 md->disk->minors = 1; 1899 md->disk->flags |= GENHD_FL_NO_PART; 1900 md->disk->fops = &dm_blk_dops; 1901 md->disk->queue = md->queue; 1902 md->disk->private_data = md; 1903 sprintf(md->disk->disk_name, "dm-%d", minor); 1904 1905 if (IS_ENABLED(CONFIG_FS_DAX)) { 1906 md->dax_dev = alloc_dax(md, &dm_dax_ops); 1907 if (IS_ERR(md->dax_dev)) { 1908 md->dax_dev = NULL; 1909 goto bad; 1910 } 1911 set_dax_nocache(md->dax_dev); 1912 set_dax_nomc(md->dax_dev); 1913 if (dax_add_host(md->dax_dev, md->disk)) 1914 goto bad; 1915 } 1916 1917 format_dev_t(md->name, MKDEV(_major, minor)); 1918 1919 md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name); 1920 if (!md->wq) 1921 goto bad; 1922 1923 md->pending_io = alloc_percpu(unsigned long); 1924 if (!md->pending_io) 1925 goto bad; 1926 1927 dm_stats_init(&md->stats); 1928 1929 /* Populate the mapping, nobody knows we exist yet */ 1930 spin_lock(&_minor_lock); 1931 old_md = idr_replace(&_minor_idr, md, minor); 1932 spin_unlock(&_minor_lock); 1933 1934 BUG_ON(old_md != MINOR_ALLOCED); 1935 1936 return md; 1937 1938 bad: 1939 cleanup_mapped_device(md); 1940 bad_io_barrier: 1941 free_minor(minor); 1942 bad_minor: 1943 module_put(THIS_MODULE); 1944 bad_module_get: 1945 kvfree(md); 1946 return NULL; 1947 } 1948 1949 static void unlock_fs(struct mapped_device *md); 1950 1951 static void free_dev(struct mapped_device *md) 1952 { 1953 int minor = MINOR(disk_devt(md->disk)); 1954 1955 unlock_fs(md); 1956 1957 cleanup_mapped_device(md); 1958 1959 free_table_devices(&md->table_devices); 1960 dm_stats_cleanup(&md->stats); 1961 free_minor(minor); 1962 1963 module_put(THIS_MODULE); 1964 kvfree(md); 
1965 } 1966 1967 static int __bind_mempools(struct mapped_device *md, struct dm_table *t) 1968 { 1969 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 1970 int ret = 0; 1971 1972 if (dm_table_bio_based(t)) { 1973 /* 1974 * The md may already have mempools that need changing. 1975 * If so, reload bioset because front_pad may have changed 1976 * because a different table was loaded. 1977 */ 1978 bioset_exit(&md->bs); 1979 bioset_exit(&md->io_bs); 1980 1981 } else if (bioset_initialized(&md->bs)) { 1982 /* 1983 * There's no need to reload with request-based dm 1984 * because the size of front_pad doesn't change. 1985 * Note for future: If you are to reload bioset, 1986 * prep-ed requests in the queue may refer 1987 * to bio from the old bioset, so you must walk 1988 * through the queue to unprep. 1989 */ 1990 goto out; 1991 } 1992 1993 BUG_ON(!p || 1994 bioset_initialized(&md->bs) || 1995 bioset_initialized(&md->io_bs)); 1996 1997 ret = bioset_init_from_src(&md->bs, &p->bs); 1998 if (ret) 1999 goto out; 2000 ret = bioset_init_from_src(&md->io_bs, &p->io_bs); 2001 if (ret) 2002 bioset_exit(&md->bs); 2003 out: 2004 /* mempool bind completed, no longer need any mempools in the table */ 2005 dm_table_free_md_mempools(t); 2006 return ret; 2007 } 2008 2009 /* 2010 * Bind a table to the device. 2011 */ 2012 static void event_callback(void *context) 2013 { 2014 unsigned long flags; 2015 LIST_HEAD(uevents); 2016 struct mapped_device *md = (struct mapped_device *) context; 2017 2018 spin_lock_irqsave(&md->uevent_lock, flags); 2019 list_splice_init(&md->uevent_list, &uevents); 2020 spin_unlock_irqrestore(&md->uevent_lock, flags); 2021 2022 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 2023 2024 atomic_inc(&md->event_nr); 2025 wake_up(&md->eventq); 2026 dm_issue_global_event(); 2027 } 2028 2029 /* 2030 * Returns old map, which caller must destroy. 2031 */ 2032 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2033 struct queue_limits *limits) 2034 { 2035 struct dm_table *old_map; 2036 sector_t size; 2037 int ret; 2038 2039 lockdep_assert_held(&md->suspend_lock); 2040 2041 size = dm_table_get_size(t); 2042 2043 /* 2044 * Wipe any geometry if the size of the table changed. 2045 */ 2046 if (size != dm_get_size(md)) 2047 memset(&md->geometry, 0, sizeof(md->geometry)); 2048 2049 if (!get_capacity(md->disk)) 2050 set_capacity(md->disk, size); 2051 else 2052 set_capacity_and_notify(md->disk, size); 2053 2054 dm_table_event_callback(t, event_callback, md); 2055 2056 if (dm_table_request_based(t)) { 2057 /* 2058 * Leverage the fact that request-based DM targets are 2059 * immutable singletons - used to optimize dm_mq_queue_rq. 2060 */ 2061 md->immutable_target = dm_table_get_immutable_target(t); 2062 } 2063 2064 ret = __bind_mempools(md, t); 2065 if (ret) { 2066 old_map = ERR_PTR(ret); 2067 goto out; 2068 } 2069 2070 ret = dm_table_set_restrictions(t, md->queue, limits); 2071 if (ret) { 2072 old_map = ERR_PTR(ret); 2073 goto out; 2074 } 2075 2076 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2077 rcu_assign_pointer(md->map, (void *)t); 2078 md->immutable_target_type = dm_table_get_immutable_target_type(t); 2079 2080 if (old_map) 2081 dm_sync_table(md); 2082 out: 2083 return old_map; 2084 } 2085 2086 /* 2087 * Returns unbound table for the caller to free. 
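 *
 * A sketch of the caller pattern used in this file (cf. __dm_destroy() and
 * dm_swap_table() below):
 *
 *	struct dm_table *old_map = __unbind(md);
 *
 *	dm_table_destroy(old_map);	// tolerates a NULL map
 *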
2088 */ 2089 static struct dm_table *__unbind(struct mapped_device *md) 2090 { 2091 struct dm_table *map = rcu_dereference_protected(md->map, 1); 2092 2093 if (!map) 2094 return NULL; 2095 2096 dm_table_event_callback(map, NULL, NULL); 2097 RCU_INIT_POINTER(md->map, NULL); 2098 dm_sync_table(md); 2099 2100 return map; 2101 } 2102 2103 /* 2104 * Constructor for a new device. 2105 */ 2106 int dm_create(int minor, struct mapped_device **result) 2107 { 2108 struct mapped_device *md; 2109 2110 md = alloc_dev(minor); 2111 if (!md) 2112 return -ENXIO; 2113 2114 dm_ima_reset_data(md); 2115 2116 *result = md; 2117 return 0; 2118 } 2119 2120 /* 2121 * Functions to manage md->type. 2122 * All are required to hold md->type_lock. 2123 */ 2124 void dm_lock_md_type(struct mapped_device *md) 2125 { 2126 mutex_lock(&md->type_lock); 2127 } 2128 2129 void dm_unlock_md_type(struct mapped_device *md) 2130 { 2131 mutex_unlock(&md->type_lock); 2132 } 2133 2134 void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type) 2135 { 2136 BUG_ON(!mutex_is_locked(&md->type_lock)); 2137 md->type = type; 2138 } 2139 2140 enum dm_queue_mode dm_get_md_type(struct mapped_device *md) 2141 { 2142 return md->type; 2143 } 2144 2145 struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2146 { 2147 return md->immutable_target_type; 2148 } 2149 2150 /* 2151 * The queue_limits are only valid as long as you have a reference 2152 * count on 'md'. 2153 */ 2154 struct queue_limits *dm_get_queue_limits(struct mapped_device *md) 2155 { 2156 BUG_ON(!atomic_read(&md->holders)); 2157 return &md->queue->limits; 2158 } 2159 EXPORT_SYMBOL_GPL(dm_get_queue_limits); 2160 2161 /* 2162 * Setup the DM device's queue based on md's type 2163 */ 2164 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) 2165 { 2166 enum dm_queue_mode type = dm_table_get_type(t); 2167 struct queue_limits limits; 2168 int r; 2169 2170 switch (type) { 2171 case DM_TYPE_REQUEST_BASED: 2172 md->disk->fops = &dm_rq_blk_dops; 2173 r = dm_mq_init_request_queue(md, t); 2174 if (r) { 2175 DMERR("Cannot initialize queue for request-based dm mapped device"); 2176 return r; 2177 } 2178 break; 2179 case DM_TYPE_BIO_BASED: 2180 case DM_TYPE_DAX_BIO_BASED: 2181 break; 2182 case DM_TYPE_NONE: 2183 WARN_ON_ONCE(true); 2184 break; 2185 } 2186 2187 r = dm_calculate_queue_limits(t, &limits); 2188 if (r) { 2189 DMERR("Cannot calculate initial queue limits"); 2190 return r; 2191 } 2192 r = dm_table_set_restrictions(t, md->queue, &limits); 2193 if (r) 2194 return r; 2195 2196 r = add_disk(md->disk); 2197 if (r) 2198 return r; 2199 2200 r = dm_sysfs_init(md); 2201 if (r) { 2202 del_gendisk(md->disk); 2203 return r; 2204 } 2205 md->type = type; 2206 return 0; 2207 } 2208 2209 struct mapped_device *dm_get_md(dev_t dev) 2210 { 2211 struct mapped_device *md; 2212 unsigned minor = MINOR(dev); 2213 2214 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2215 return NULL; 2216 2217 spin_lock(&_minor_lock); 2218 2219 md = idr_find(&_minor_idr, minor); 2220 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) || 2221 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { 2222 md = NULL; 2223 goto out; 2224 } 2225 dm_get(md); 2226 out: 2227 spin_unlock(&_minor_lock); 2228 2229 return md; 2230 } 2231 EXPORT_SYMBOL_GPL(dm_get_md); 2232 2233 void *dm_get_mdptr(struct mapped_device *md) 2234 { 2235 return md->interface_ptr; 2236 } 2237 2238 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2239 { 2240 md->interface_ptr = ptr; 
void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
	BUG_ON(test_bit(DMF_FREEING, &md->flags));
}

int dm_hold(struct mapped_device *md)
{
	spin_lock(&_minor_lock);
	if (test_bit(DMF_FREEING, &md->flags)) {
		spin_unlock(&_minor_lock);
		return -EBUSY;
	}
	dm_get(md);
	spin_unlock(&_minor_lock);
	return 0;
}
EXPORT_SYMBOL_GPL(dm_hold);

const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);

static void __dm_destroy(struct mapped_device *md, bool wait)
{
	struct dm_table *map;
	int srcu_idx;

	might_sleep();

	spin_lock(&_minor_lock);
	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
	set_bit(DMF_FREEING, &md->flags);
	spin_unlock(&_minor_lock);

	blk_mark_disk_dead(md->disk);

	/*
	 * Take suspend_lock so that presuspend and postsuspend methods
	 * do not race with internal suspend.
	 */
	mutex_lock(&md->suspend_lock);
	map = dm_get_live_table(md, &srcu_idx);
	if (!dm_suspended_md(md)) {
		dm_table_presuspend_targets(map);
		set_bit(DMF_SUSPENDED, &md->flags);
		set_bit(DMF_POST_SUSPENDING, &md->flags);
		dm_table_postsuspend_targets(map);
	}
	/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
	dm_put_live_table(md, srcu_idx);
	mutex_unlock(&md->suspend_lock);

	/*
	 * Rare, but there may still be I/O requests in flight that have yet
	 * to complete.  Wait for all references to disappear.
	 * No one should increment the reference count of the mapped_device
	 * once its state becomes DMF_FREEING.
	 */
	if (wait)
		while (atomic_read(&md->holders))
			msleep(1);
	else if (atomic_read(&md->holders))
		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
		       dm_device_name(md), atomic_read(&md->holders));
(%d users)", 2309 dm_device_name(md), atomic_read(&md->holders)); 2310 2311 dm_table_destroy(__unbind(md)); 2312 free_dev(md); 2313 } 2314 2315 void dm_destroy(struct mapped_device *md) 2316 { 2317 __dm_destroy(md, true); 2318 } 2319 2320 void dm_destroy_immediate(struct mapped_device *md) 2321 { 2322 __dm_destroy(md, false); 2323 } 2324 2325 void dm_put(struct mapped_device *md) 2326 { 2327 atomic_dec(&md->holders); 2328 } 2329 EXPORT_SYMBOL_GPL(dm_put); 2330 2331 static bool dm_in_flight_bios(struct mapped_device *md) 2332 { 2333 int cpu; 2334 unsigned long sum = 0; 2335 2336 for_each_possible_cpu(cpu) 2337 sum += *per_cpu_ptr(md->pending_io, cpu); 2338 2339 return sum != 0; 2340 } 2341 2342 static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state) 2343 { 2344 int r = 0; 2345 DEFINE_WAIT(wait); 2346 2347 while (true) { 2348 prepare_to_wait(&md->wait, &wait, task_state); 2349 2350 if (!dm_in_flight_bios(md)) 2351 break; 2352 2353 if (signal_pending_state(task_state, current)) { 2354 r = -EINTR; 2355 break; 2356 } 2357 2358 io_schedule(); 2359 } 2360 finish_wait(&md->wait, &wait); 2361 2362 smp_rmb(); 2363 2364 return r; 2365 } 2366 2367 static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state) 2368 { 2369 int r = 0; 2370 2371 if (!queue_is_mq(md->queue)) 2372 return dm_wait_for_bios_completion(md, task_state); 2373 2374 while (true) { 2375 if (!blk_mq_queue_inflight(md->queue)) 2376 break; 2377 2378 if (signal_pending_state(task_state, current)) { 2379 r = -EINTR; 2380 break; 2381 } 2382 2383 msleep(5); 2384 } 2385 2386 return r; 2387 } 2388 2389 /* 2390 * Process the deferred bios 2391 */ 2392 static void dm_wq_work(struct work_struct *work) 2393 { 2394 struct mapped_device *md = container_of(work, struct mapped_device, work); 2395 struct bio *bio; 2396 2397 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2398 spin_lock_irq(&md->deferred_lock); 2399 bio = bio_list_pop(&md->deferred); 2400 spin_unlock_irq(&md->deferred_lock); 2401 2402 if (!bio) 2403 break; 2404 2405 submit_bio_noacct(bio); 2406 } 2407 } 2408 2409 static void dm_queue_flush(struct mapped_device *md) 2410 { 2411 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2412 smp_mb__after_atomic(); 2413 queue_work(md->wq, &md->work); 2414 } 2415 2416 /* 2417 * Swap in a new table, returning the old one for the caller to destroy. 2418 */ 2419 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2420 { 2421 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); 2422 struct queue_limits limits; 2423 int r; 2424 2425 mutex_lock(&md->suspend_lock); 2426 2427 /* device must be suspended */ 2428 if (!dm_suspended_md(md)) 2429 goto out; 2430 2431 /* 2432 * If the new table has no data devices, retain the existing limits. 2433 * This helps multipath with queue_if_no_path if all paths disappear, 2434 * then new I/O is queued based on these limits, and then some paths 2435 * reappear. 

/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
	struct queue_limits limits;
	int r;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended_md(md))
		goto out;

	/*
	 * If the new table has no data devices, retain the existing limits.
	 * This helps multipath with queue_if_no_path when all paths disappear,
	 * new I/O is queued based on these limits, and then some paths
	 * reappear.
	 */
	if (dm_table_has_no_data_devices(table)) {
		live_map = dm_get_live_table_fast(md);
		if (live_map)
			limits = md->queue->limits;
		dm_put_live_table_fast(md);
	}

	if (!live_map) {
		r = dm_calculate_queue_limits(table, &limits);
		if (r) {
			map = ERR_PTR(r);
			goto out;
		}
	}

	map = __bind(md, table, &limits);
	dm_issue_global_event();

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(test_bit(DMF_FROZEN, &md->flags));

	r = freeze_bdev(md->disk->part0);
	if (!r)
		set_bit(DMF_FROZEN, &md->flags);
	return r;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;
	thaw_bdev(md->disk->part0);
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
 *
 * If __dm_suspend returns 0, the device is completely quiescent:
 * there is no request-processing activity and all new requests
 * are added to the md->deferred list.
 */
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
			unsigned suspend_flags, unsigned int task_state,
			int dmf_suspended_flag)
{
	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
	int r;

	lockdep_assert_held(&md->suspend_lock);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	else
		DMDEBUG("%s: suspending with flush", dm_device_name(md));

	/*
	 * This gets reverted if there's an error later and the targets
	 * provide the .presuspend_undo hook.
	 */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r) {
			dm_table_presuspend_undo_targets(map);
			return r;
		}
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * dm_split_and_process_bio from dm_submit_bio.
	 *
	 * To get all processes out of dm_split_and_process_bio in dm_submit_bio,
	 * we take the write lock. To prevent any process from reentering
	 * dm_split_and_process_bio from dm_submit_bio and quiesce the thread
	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
	 * flush_workqueue(md->wq).
	 */
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md))
		dm_stop_queue(md->queue);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, task_state);
	if (!r)
		set_bit(dmf_suspended_flag, &md->flags);

	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/* were we interrupted ? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			dm_start_queue(md->queue);

		unlock_fs(md);
		dm_table_presuspend_undo_targets(map);
		/* pushback list is already flushed, so skip flush */
	}

	return r;
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem. For example we might want to move some data in
 * the background. Before the table can be swapped via
 * dm_swap_table()/__bind(), dm_suspend() must be called to flush any
 * in-flight bios and ensure that any further I/O gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;

retry:
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
	if (r)
		goto out_unlock;

	set_bit(DMF_POST_SUSPENDING, &md->flags);
	dm_table_postsuspend_targets(map);
	clear_bit(DMF_POST_SUSPENDING, &md->flags);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}

static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
	if (map) {
		int r = dm_table_resume_targets(map);
		if (r)
			return r;
	}

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		dm_start_queue(md->queue);

	unlock_fs(md);

	return 0;
}

int dm_resume(struct mapped_device *md)
{
	int r;
	struct dm_table *map = NULL;

retry:
	r = -EINVAL;
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (!dm_suspended_md(md))
		goto out;

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	if (!map || !dm_table_get_size(map))
		goto out;

	r = __dm_resume(md, map);
	if (r)
		goto out;

	clear_bit(DMF_SUSPENDED, &md->flags);
out:
	mutex_unlock(&md->suspend_lock);

	return r;
}
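
/*
 * Illustrative sketch (hypothetical helper, not used here): the usual
 * pairing of dm_suspend() and dm_resume() by an in-kernel caller.  I/O is
 * flushed (or queued, with DM_SUSPEND_NOFLUSH_FLAG) while suspended, some
 * maintenance is performed, and dm_resume() restarts the queue and
 * resubmits anything that was deferred.
 */
static inline int dm_example_quiesce(struct mapped_device *md)
{
	int r;

	r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
	if (r)
		return r;

	/* ... device is quiescent: inspect or modify state here ... */

	return dm_resume(md);
}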

/*
 * Internal suspend/resume works like userspace-driven suspend. It waits
 * until all bios finish and prevents issuing new bios to the target drivers.
 * It may be used only from the kernel.
 */

static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;

	lockdep_assert_held(&md->suspend_lock);

	if (md->internal_suspend_count++)
		return; /* nested internal suspend */

	if (dm_suspended_md(md)) {
		set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
		return; /* nest suspend */
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	/*
	 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
	 * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
	 * would require changing .presuspend to return an error -- avoid this
	 * until there is a need for more elaborate variants of internal suspend.
	 */
	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
			    DMF_SUSPENDED_INTERNALLY);

	set_bit(DMF_POST_SUSPENDING, &md->flags);
	dm_table_postsuspend_targets(map);
	clear_bit(DMF_POST_SUSPENDING, &md->flags);
}

static void __dm_internal_resume(struct mapped_device *md)
{
	BUG_ON(!md->internal_suspend_count);

	if (--md->internal_suspend_count)
		return; /* resume from nested internal suspend */

	if (dm_suspended_md(md))
		goto done; /* resume from nested suspend */

	/*
	 * NOTE: existing callers don't need to call dm_table_resume_targets
	 * (which may fail -- so best to avoid it for now by passing NULL map)
	 */
	(void) __dm_resume(md, NULL);

done:
	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
	smp_mb__after_atomic();
	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
}

void dm_internal_suspend_noflush(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);

void dm_internal_resume(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_resume(md);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume);

/*
 * Fast variants of internal suspend/resume hold md->suspend_lock,
 * which prevents interaction with userspace-driven suspend.
 */

void dm_internal_suspend_fast(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		return;

	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	synchronize_srcu(&md->io_barrier);
	flush_workqueue(md->wq);
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);

void dm_internal_resume_fast(struct mapped_device *md)
{
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		goto done;

	dm_queue_flush(md);

done:
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
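
/*
 * Illustrative sketch (hypothetical helper): internal suspend must always be
 * paired with the matching resume.  Note the asymmetry of the _fast variants:
 * dm_internal_suspend_fast() returns with md->suspend_lock held and
 * dm_internal_resume_fast() releases it, so the two calls bracket a critical
 * section during which no bios reach the targets.
 */
static inline void dm_example_internal_quiesce(struct mapped_device *md)
{
	dm_internal_suspend_fast(md);

	/* ... no bios are in flight here; md->suspend_lock is held ... */

	dm_internal_resume_fast(md);
}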
2803 *---------------------------------------------------------------*/ 2804 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 2805 unsigned cookie) 2806 { 2807 int r; 2808 unsigned noio_flag; 2809 char udev_cookie[DM_COOKIE_LENGTH]; 2810 char *envp[] = { udev_cookie, NULL }; 2811 2812 noio_flag = memalloc_noio_save(); 2813 2814 if (!cookie) 2815 r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 2816 else { 2817 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 2818 DM_COOKIE_ENV_VAR_NAME, cookie); 2819 r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, 2820 action, envp); 2821 } 2822 2823 memalloc_noio_restore(noio_flag); 2824 2825 return r; 2826 } 2827 2828 uint32_t dm_next_uevent_seq(struct mapped_device *md) 2829 { 2830 return atomic_add_return(1, &md->uevent_seq); 2831 } 2832 2833 uint32_t dm_get_event_nr(struct mapped_device *md) 2834 { 2835 return atomic_read(&md->event_nr); 2836 } 2837 2838 int dm_wait_event(struct mapped_device *md, int event_nr) 2839 { 2840 return wait_event_interruptible(md->eventq, 2841 (event_nr != atomic_read(&md->event_nr))); 2842 } 2843 2844 void dm_uevent_add(struct mapped_device *md, struct list_head *elist) 2845 { 2846 unsigned long flags; 2847 2848 spin_lock_irqsave(&md->uevent_lock, flags); 2849 list_add(elist, &md->uevent_list); 2850 spin_unlock_irqrestore(&md->uevent_lock, flags); 2851 } 2852 2853 /* 2854 * The gendisk is only valid as long as you have a reference 2855 * count on 'md'. 2856 */ 2857 struct gendisk *dm_disk(struct mapped_device *md) 2858 { 2859 return md->disk; 2860 } 2861 EXPORT_SYMBOL_GPL(dm_disk); 2862 2863 struct kobject *dm_kobject(struct mapped_device *md) 2864 { 2865 return &md->kobj_holder.kobj; 2866 } 2867 2868 struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 2869 { 2870 struct mapped_device *md; 2871 2872 md = container_of(kobj, struct mapped_device, kobj_holder.kobj); 2873 2874 spin_lock(&_minor_lock); 2875 if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { 2876 md = NULL; 2877 goto out; 2878 } 2879 dm_get(md); 2880 out: 2881 spin_unlock(&_minor_lock); 2882 2883 return md; 2884 } 2885 2886 int dm_suspended_md(struct mapped_device *md) 2887 { 2888 return test_bit(DMF_SUSPENDED, &md->flags); 2889 } 2890 2891 static int dm_post_suspending_md(struct mapped_device *md) 2892 { 2893 return test_bit(DMF_POST_SUSPENDING, &md->flags); 2894 } 2895 2896 int dm_suspended_internally_md(struct mapped_device *md) 2897 { 2898 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2899 } 2900 2901 int dm_test_deferred_remove_flag(struct mapped_device *md) 2902 { 2903 return test_bit(DMF_DEFERRED_REMOVE, &md->flags); 2904 } 2905 2906 int dm_suspended(struct dm_target *ti) 2907 { 2908 return dm_suspended_md(ti->table->md); 2909 } 2910 EXPORT_SYMBOL_GPL(dm_suspended); 2911 2912 int dm_post_suspending(struct dm_target *ti) 2913 { 2914 return dm_post_suspending_md(ti->table->md); 2915 } 2916 EXPORT_SYMBOL_GPL(dm_post_suspending); 2917 2918 int dm_noflush_suspending(struct dm_target *ti) 2919 { 2920 return __noflush_suspending(ti->table->md); 2921 } 2922 EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2923 2924 struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, 2925 unsigned integrity, unsigned per_io_data_size, 2926 unsigned min_pool_size) 2927 { 2928 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); 2929 unsigned int pool_size = 0; 2930 unsigned int front_pad, io_front_pad; 2931 int ret; 2932 2933 if 

struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
					    unsigned integrity, unsigned per_io_data_size,
					    unsigned min_pool_size)
{
	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
	unsigned int pool_size = 0;
	unsigned int front_pad, io_front_pad;
	int ret;

	if (!pools)
		return NULL;

	switch (type) {
	case DM_TYPE_BIO_BASED:
	case DM_TYPE_DAX_BIO_BASED:
		pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET;
		io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
		ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
		if (ret)
			goto out;
		if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
			goto out;
		break;
	case DM_TYPE_REQUEST_BASED:
		pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
		/* per_io_data_size is used for blk-mq pdu at queue allocation */
		break;
	default:
		BUG();
	}

	ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
	if (ret)
		goto out;

	if (integrity && bioset_integrity_create(&pools->bs, pool_size))
		goto out;

	return pools;

out:
	dm_free_md_mempools(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	bioset_exit(&pools->bs);
	bioset_exit(&pools->io_bs);

	kfree(pools);
}

struct dm_pr {
	u64	old_key;
	u64	new_key;
	u32	flags;
	bool	fail_early;
};

static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
		      void *data)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *table;
	struct dm_target *ti;
	int ret = -ENOTTY, srcu_idx;

	table = dm_get_live_table(md, &srcu_idx);
	if (!table || !dm_table_get_size(table))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(table) != 1)
		goto out;
	ti = dm_table_get_target(table, 0);

	ret = -EINVAL;
	if (!ti->type->iterate_devices)
		goto out;

	ret = ti->type->iterate_devices(ti, fn, data);
out:
	dm_put_live_table(md, srcu_idx);
	return ret;
}
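
/*
 * Illustrative sketch (hypothetical callback): dm_call_pr() fans a callout
 * over the single target's underlying devices via the target's
 * iterate_devices method.  Each path is passed to the callback as
 * (ti, dev, start, len, data), and for the common iterate_devices
 * implementations a non-zero return ends the walk and is propagated back to
 * the dm_call_pr() caller, as __dm_pr_register() below relies on.  For
 * example, dm_call_pr(bdev, dm_example_check_pr_ops, NULL) would return 0
 * only if every path exposes persistent reservation ops.
 */
static inline int dm_example_check_pr_ops(struct dm_target *ti, struct dm_dev *dev,
					  sector_t start, sector_t len, void *data)
{
	/* Fail the whole callout if any path lacks pr_ops. */
	if (!dev->bdev->bd_disk->fops->pr_ops)
		return -EOPNOTSUPP;
	return 0;
}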

/*
 * For register / unregister we need to manually call out to every path.
 */
static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
			    sector_t start, sector_t len, void *data)
{
	struct dm_pr *pr = data;
	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;

	if (!ops || !ops->pr_register)
		return -EOPNOTSUPP;
	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
}

static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
			  u32 flags)
{
	struct dm_pr pr = {
		.old_key	= old_key,
		.new_key	= new_key,
		.flags		= flags,
		.fail_early	= true,
	};
	int ret;

	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
	if (ret && new_key) {
		/* unregister all paths if we failed to register any path */
		pr.old_key = new_key;
		pr.new_key = 0;
		pr.flags = 0;
		pr.fail_early = false;
		dm_call_pr(bdev, __dm_pr_register, &pr);
	}

	return ret;
}

static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
			 u32 flags)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_reserve)
		r = ops->pr_reserve(bdev, key, type, flags);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_release)
		r = ops->pr_release(bdev, key, type);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
			 enum pr_type type, bool abort)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_preempt)
		r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_clear(struct block_device *bdev, u64 key)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_clear)
		r = ops->pr_clear(bdev, key);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static const struct pr_ops dm_pr_ops = {
	.pr_register	= dm_pr_register,
	.pr_reserve	= dm_pr_reserve,
	.pr_release	= dm_pr_release,
	.pr_preempt	= dm_pr_preempt,
	.pr_clear	= dm_pr_clear,
};

static const struct block_device_operations dm_blk_dops = {
	.submit_bio = dm_submit_bio,
	.poll_bio = dm_poll_bio,
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.report_zones = dm_blk_report_zones,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};

static const struct block_device_operations dm_rq_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};

static const struct dax_operations dm_dax_ops = {
	.direct_access = dm_dax_direct_access,
	.zero_page_range = dm_dax_zero_page_range,
	.recovery_write = dm_dax_recovery_write,
};

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");

module_param(swap_bios, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
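
/*
 * Usage note (paths below assume the conventional dm_mod module name and the
 * standard module-parameter sysfs layout): 'major' has permission 0, so it
 * can only be set at load time, e.g. "modprobe dm_mod major=240", while the
 * S_IRUGO | S_IWUSR parameters above are also tunable at runtime via
 * /sys/module/dm_mod/parameters/{reserved_bio_based_ios,dm_numa_node,swap_bios}.
 */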