1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2001 Sistina Software (UK) Limited. 4 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 5 * 6 * This file is released under the GPL. 7 */ 8 9 #include "dm-core.h" 10 #include "dm-rq.h" 11 12 #include <linux/module.h> 13 #include <linux/vmalloc.h> 14 #include <linux/blkdev.h> 15 #include <linux/blk-integrity.h> 16 #include <linux/namei.h> 17 #include <linux/ctype.h> 18 #include <linux/string.h> 19 #include <linux/slab.h> 20 #include <linux/interrupt.h> 21 #include <linux/mutex.h> 22 #include <linux/delay.h> 23 #include <linux/atomic.h> 24 #include <linux/blk-mq.h> 25 #include <linux/mount.h> 26 #include <linux/dax.h> 27 28 #define DM_MSG_PREFIX "table" 29 30 #define NODE_SIZE L1_CACHE_BYTES 31 #define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) 32 #define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) 33 34 /* 35 * Similar to ceiling(log_size(n)) 36 */ 37 static unsigned int int_log(unsigned int n, unsigned int base) 38 { 39 int result = 0; 40 41 while (n > 1) { 42 n = dm_div_up(n, base); 43 result++; 44 } 45 46 return result; 47 } 48 49 /* 50 * Calculate the index of the child node of the n'th node k'th key. 51 */ 52 static inline unsigned int get_child(unsigned int n, unsigned int k) 53 { 54 return (n * CHILDREN_PER_NODE) + k; 55 } 56 57 /* 58 * Return the n'th node of level l from table t. 59 */ 60 static inline sector_t *get_node(struct dm_table *t, 61 unsigned int l, unsigned int n) 62 { 63 return t->index[l] + (n * KEYS_PER_NODE); 64 } 65 66 /* 67 * Return the highest key that you could lookup from the n'th 68 * node on level l of the btree. 69 */ 70 static sector_t high(struct dm_table *t, unsigned int l, unsigned int n) 71 { 72 for (; l < t->depth - 1; l++) 73 n = get_child(n, CHILDREN_PER_NODE - 1); 74 75 if (n >= t->counts[l]) 76 return (sector_t) -1; 77 78 return get_node(t, l, n)[KEYS_PER_NODE - 1]; 79 } 80 81 /* 82 * Fills in a level of the btree based on the highs of the level 83 * below it. 84 */ 85 static int setup_btree_index(unsigned int l, struct dm_table *t) 86 { 87 unsigned int n, k; 88 sector_t *node; 89 90 for (n = 0U; n < t->counts[l]; n++) { 91 node = get_node(t, l, n); 92 93 for (k = 0U; k < KEYS_PER_NODE; k++) 94 node[k] = high(t, l + 1, get_child(n, k)); 95 } 96 97 return 0; 98 } 99 100 /* 101 * highs, and targets are managed as dynamic arrays during a 102 * table load. 103 */ 104 static int alloc_targets(struct dm_table *t, unsigned int num) 105 { 106 sector_t *n_highs; 107 struct dm_target *n_targets; 108 109 /* 110 * Allocate both the target array and offset array at once. 111 */ 112 n_highs = kvcalloc(num, sizeof(struct dm_target) + sizeof(sector_t), 113 GFP_KERNEL); 114 if (!n_highs) 115 return -ENOMEM; 116 117 n_targets = (struct dm_target *) (n_highs + num); 118 119 memset(n_highs, -1, sizeof(*n_highs) * num); 120 kvfree(t->highs); 121 122 t->num_allocated = num; 123 t->highs = n_highs; 124 t->targets = n_targets; 125 126 return 0; 127 } 128 129 int dm_table_create(struct dm_table **result, fmode_t mode, 130 unsigned int num_targets, struct mapped_device *md) 131 { 132 struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL); 133 134 if (!t) 135 return -ENOMEM; 136 137 INIT_LIST_HEAD(&t->devices); 138 139 if (!num_targets) 140 num_targets = KEYS_PER_NODE; 141 142 num_targets = dm_round_up(num_targets, KEYS_PER_NODE); 143 144 if (!num_targets) { 145 kfree(t); 146 return -ENOMEM; 147 } 148 149 if (alloc_targets(t, num_targets)) { 150 kfree(t); 151 return -ENOMEM; 152 } 153 154 t->type = DM_TYPE_NONE; 155 t->mode = mode; 156 t->md = md; 157 *result = t; 158 return 0; 159 } 160 161 static void free_devices(struct list_head *devices, struct mapped_device *md) 162 { 163 struct list_head *tmp, *next; 164 165 list_for_each_safe(tmp, next, devices) { 166 struct dm_dev_internal *dd = 167 list_entry(tmp, struct dm_dev_internal, list); 168 DMWARN("%s: dm_table_destroy: dm_put_device call missing for %s", 169 dm_device_name(md), dd->dm_dev->name); 170 dm_put_table_device(md, dd->dm_dev); 171 kfree(dd); 172 } 173 } 174 175 static void dm_table_destroy_crypto_profile(struct dm_table *t); 176 177 void dm_table_destroy(struct dm_table *t) 178 { 179 if (!t) 180 return; 181 182 /* free the indexes */ 183 if (t->depth >= 2) 184 kvfree(t->index[t->depth - 2]); 185 186 /* free the targets */ 187 for (unsigned int i = 0; i < t->num_targets; i++) { 188 struct dm_target *ti = dm_table_get_target(t, i); 189 190 if (ti->type->dtr) 191 ti->type->dtr(ti); 192 193 dm_put_target_type(ti->type); 194 } 195 196 kvfree(t->highs); 197 198 /* free the device list */ 199 free_devices(&t->devices, t->md); 200 201 dm_free_md_mempools(t->mempools); 202 203 dm_table_destroy_crypto_profile(t); 204 205 kfree(t); 206 } 207 208 /* 209 * See if we've already got a device in the list. 210 */ 211 static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev) 212 { 213 struct dm_dev_internal *dd; 214 215 list_for_each_entry(dd, l, list) 216 if (dd->dm_dev->bdev->bd_dev == dev) 217 return dd; 218 219 return NULL; 220 } 221 222 /* 223 * If possible, this checks an area of a destination device is invalid. 224 */ 225 static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, 226 sector_t start, sector_t len, void *data) 227 { 228 struct queue_limits *limits = data; 229 struct block_device *bdev = dev->bdev; 230 sector_t dev_size = bdev_nr_sectors(bdev); 231 unsigned short logical_block_size_sectors = 232 limits->logical_block_size >> SECTOR_SHIFT; 233 234 if (!dev_size) 235 return 0; 236 237 if ((start >= dev_size) || (start + len > dev_size)) { 238 DMERR("%s: %pg too small for target: " 239 "start=%llu, len=%llu, dev_size=%llu", 240 dm_device_name(ti->table->md), bdev, 241 (unsigned long long)start, 242 (unsigned long long)len, 243 (unsigned long long)dev_size); 244 return 1; 245 } 246 247 /* 248 * If the target is mapped to zoned block device(s), check 249 * that the zones are not partially mapped. 250 */ 251 if (bdev_is_zoned(bdev)) { 252 unsigned int zone_sectors = bdev_zone_sectors(bdev); 253 254 if (start & (zone_sectors - 1)) { 255 DMERR("%s: start=%llu not aligned to h/w zone size %u of %pg", 256 dm_device_name(ti->table->md), 257 (unsigned long long)start, 258 zone_sectors, bdev); 259 return 1; 260 } 261 262 /* 263 * Note: The last zone of a zoned block device may be smaller 264 * than other zones. So for a target mapping the end of a 265 * zoned block device with such a zone, len would not be zone 266 * aligned. We do not allow such last smaller zone to be part 267 * of the mapping here to ensure that mappings with multiple 268 * devices do not end up with a smaller zone in the middle of 269 * the sector range. 270 */ 271 if (len & (zone_sectors - 1)) { 272 DMERR("%s: len=%llu not aligned to h/w zone size %u of %pg", 273 dm_device_name(ti->table->md), 274 (unsigned long long)len, 275 zone_sectors, bdev); 276 return 1; 277 } 278 } 279 280 if (logical_block_size_sectors <= 1) 281 return 0; 282 283 if (start & (logical_block_size_sectors - 1)) { 284 DMERR("%s: start=%llu not aligned to h/w " 285 "logical block size %u of %pg", 286 dm_device_name(ti->table->md), 287 (unsigned long long)start, 288 limits->logical_block_size, bdev); 289 return 1; 290 } 291 292 if (len & (logical_block_size_sectors - 1)) { 293 DMERR("%s: len=%llu not aligned to h/w " 294 "logical block size %u of %pg", 295 dm_device_name(ti->table->md), 296 (unsigned long long)len, 297 limits->logical_block_size, bdev); 298 return 1; 299 } 300 301 return 0; 302 } 303 304 /* 305 * This upgrades the mode on an already open dm_dev, being 306 * careful to leave things as they were if we fail to reopen the 307 * device and not to touch the existing bdev field in case 308 * it is accessed concurrently. 309 */ 310 static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, 311 struct mapped_device *md) 312 { 313 int r; 314 struct dm_dev *old_dev, *new_dev; 315 316 old_dev = dd->dm_dev; 317 318 r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev, 319 dd->dm_dev->mode | new_mode, &new_dev); 320 if (r) 321 return r; 322 323 dd->dm_dev = new_dev; 324 dm_put_table_device(md, old_dev); 325 326 return 0; 327 } 328 329 /* 330 * Convert the path to a device 331 */ 332 dev_t dm_get_dev_t(const char *path) 333 { 334 dev_t dev; 335 336 if (lookup_bdev(path, &dev)) 337 dev = name_to_dev_t(path); 338 return dev; 339 } 340 EXPORT_SYMBOL_GPL(dm_get_dev_t); 341 342 /* 343 * Add a device to the list, or just increment the usage count if 344 * it's already present. 345 */ 346 int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, 347 struct dm_dev **result) 348 { 349 int r; 350 dev_t dev; 351 unsigned int major, minor; 352 char dummy; 353 struct dm_dev_internal *dd; 354 struct dm_table *t = ti->table; 355 356 BUG_ON(!t); 357 358 if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { 359 /* Extract the major/minor numbers */ 360 dev = MKDEV(major, minor); 361 if (MAJOR(dev) != major || MINOR(dev) != minor) 362 return -EOVERFLOW; 363 } else { 364 dev = dm_get_dev_t(path); 365 if (!dev) 366 return -ENODEV; 367 } 368 if (dev == disk_devt(t->md->disk)) 369 return -EINVAL; 370 371 dd = find_device(&t->devices, dev); 372 if (!dd) { 373 dd = kmalloc(sizeof(*dd), GFP_KERNEL); 374 if (!dd) 375 return -ENOMEM; 376 377 r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev); 378 if (r) { 379 kfree(dd); 380 return r; 381 } 382 383 refcount_set(&dd->count, 1); 384 list_add(&dd->list, &t->devices); 385 goto out; 386 387 } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) { 388 r = upgrade_mode(dd, mode, t->md); 389 if (r) 390 return r; 391 } 392 refcount_inc(&dd->count); 393 out: 394 *result = dd->dm_dev; 395 return 0; 396 } 397 EXPORT_SYMBOL(dm_get_device); 398 399 static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, 400 sector_t start, sector_t len, void *data) 401 { 402 struct queue_limits *limits = data; 403 struct block_device *bdev = dev->bdev; 404 struct request_queue *q = bdev_get_queue(bdev); 405 406 if (unlikely(!q)) { 407 DMWARN("%s: Cannot set limits for nonexistent device %pg", 408 dm_device_name(ti->table->md), bdev); 409 return 0; 410 } 411 412 if (blk_stack_limits(limits, &q->limits, 413 get_start_sect(bdev) + start) < 0) 414 DMWARN("%s: adding target device %pg caused an alignment inconsistency: " 415 "physical_block_size=%u, logical_block_size=%u, " 416 "alignment_offset=%u, start=%llu", 417 dm_device_name(ti->table->md), bdev, 418 q->limits.physical_block_size, 419 q->limits.logical_block_size, 420 q->limits.alignment_offset, 421 (unsigned long long) start << SECTOR_SHIFT); 422 return 0; 423 } 424 425 /* 426 * Decrement a device's use count and remove it if necessary. 427 */ 428 void dm_put_device(struct dm_target *ti, struct dm_dev *d) 429 { 430 int found = 0; 431 struct list_head *devices = &ti->table->devices; 432 struct dm_dev_internal *dd; 433 434 list_for_each_entry(dd, devices, list) { 435 if (dd->dm_dev == d) { 436 found = 1; 437 break; 438 } 439 } 440 if (!found) { 441 DMERR("%s: device %s not in table devices list", 442 dm_device_name(ti->table->md), d->name); 443 return; 444 } 445 if (refcount_dec_and_test(&dd->count)) { 446 dm_put_table_device(ti->table->md, d); 447 list_del(&dd->list); 448 kfree(dd); 449 } 450 } 451 EXPORT_SYMBOL(dm_put_device); 452 453 /* 454 * Checks to see if the target joins onto the end of the table. 455 */ 456 static int adjoin(struct dm_table *t, struct dm_target *ti) 457 { 458 struct dm_target *prev; 459 460 if (!t->num_targets) 461 return !ti->begin; 462 463 prev = &t->targets[t->num_targets - 1]; 464 return (ti->begin == (prev->begin + prev->len)); 465 } 466 467 /* 468 * Used to dynamically allocate the arg array. 469 * 470 * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must 471 * process messages even if some device is suspended. These messages have a 472 * small fixed number of arguments. 473 * 474 * On the other hand, dm-switch needs to process bulk data using messages and 475 * excessive use of GFP_NOIO could cause trouble. 476 */ 477 static char **realloc_argv(unsigned int *size, char **old_argv) 478 { 479 char **argv; 480 unsigned int new_size; 481 gfp_t gfp; 482 483 if (*size) { 484 new_size = *size * 2; 485 gfp = GFP_KERNEL; 486 } else { 487 new_size = 8; 488 gfp = GFP_NOIO; 489 } 490 argv = kmalloc_array(new_size, sizeof(*argv), gfp); 491 if (argv && old_argv) { 492 memcpy(argv, old_argv, *size * sizeof(*argv)); 493 *size = new_size; 494 } 495 496 kfree(old_argv); 497 return argv; 498 } 499 500 /* 501 * Destructively splits up the argument list to pass to ctr. 502 */ 503 int dm_split_args(int *argc, char ***argvp, char *input) 504 { 505 char *start, *end = input, *out, **argv = NULL; 506 unsigned int array_size = 0; 507 508 *argc = 0; 509 510 if (!input) { 511 *argvp = NULL; 512 return 0; 513 } 514 515 argv = realloc_argv(&array_size, argv); 516 if (!argv) 517 return -ENOMEM; 518 519 while (1) { 520 /* Skip whitespace */ 521 start = skip_spaces(end); 522 523 if (!*start) 524 break; /* success, we hit the end */ 525 526 /* 'out' is used to remove any back-quotes */ 527 end = out = start; 528 while (*end) { 529 /* Everything apart from '\0' can be quoted */ 530 if (*end == '\\' && *(end + 1)) { 531 *out++ = *(end + 1); 532 end += 2; 533 continue; 534 } 535 536 if (isspace(*end)) 537 break; /* end of token */ 538 539 *out++ = *end++; 540 } 541 542 /* have we already filled the array ? */ 543 if ((*argc + 1) > array_size) { 544 argv = realloc_argv(&array_size, argv); 545 if (!argv) 546 return -ENOMEM; 547 } 548 549 /* we know this is whitespace */ 550 if (*end) 551 end++; 552 553 /* terminate the string and put it in the array */ 554 *out = '\0'; 555 argv[*argc] = start; 556 (*argc)++; 557 } 558 559 *argvp = argv; 560 return 0; 561 } 562 563 /* 564 * Impose necessary and sufficient conditions on a devices's table such 565 * that any incoming bio which respects its logical_block_size can be 566 * processed successfully. If it falls across the boundary between 567 * two or more targets, the size of each piece it gets split into must 568 * be compatible with the logical_block_size of the target processing it. 569 */ 570 static int validate_hardware_logical_block_alignment(struct dm_table *t, 571 struct queue_limits *limits) 572 { 573 /* 574 * This function uses arithmetic modulo the logical_block_size 575 * (in units of 512-byte sectors). 576 */ 577 unsigned short device_logical_block_size_sects = 578 limits->logical_block_size >> SECTOR_SHIFT; 579 580 /* 581 * Offset of the start of the next table entry, mod logical_block_size. 582 */ 583 unsigned short next_target_start = 0; 584 585 /* 586 * Given an aligned bio that extends beyond the end of a 587 * target, how many sectors must the next target handle? 588 */ 589 unsigned short remaining = 0; 590 591 struct dm_target *ti; 592 struct queue_limits ti_limits; 593 unsigned int i; 594 595 /* 596 * Check each entry in the table in turn. 597 */ 598 for (i = 0; i < t->num_targets; i++) { 599 ti = dm_table_get_target(t, i); 600 601 blk_set_stacking_limits(&ti_limits); 602 603 /* combine all target devices' limits */ 604 if (ti->type->iterate_devices) 605 ti->type->iterate_devices(ti, dm_set_device_limits, 606 &ti_limits); 607 608 /* 609 * If the remaining sectors fall entirely within this 610 * table entry are they compatible with its logical_block_size? 611 */ 612 if (remaining < ti->len && 613 remaining & ((ti_limits.logical_block_size >> 614 SECTOR_SHIFT) - 1)) 615 break; /* Error */ 616 617 next_target_start = 618 (unsigned short) ((next_target_start + ti->len) & 619 (device_logical_block_size_sects - 1)); 620 remaining = next_target_start ? 621 device_logical_block_size_sects - next_target_start : 0; 622 } 623 624 if (remaining) { 625 DMERR("%s: table line %u (start sect %llu len %llu) " 626 "not aligned to h/w logical block size %u", 627 dm_device_name(t->md), i, 628 (unsigned long long) ti->begin, 629 (unsigned long long) ti->len, 630 limits->logical_block_size); 631 return -EINVAL; 632 } 633 634 return 0; 635 } 636 637 int dm_table_add_target(struct dm_table *t, const char *type, 638 sector_t start, sector_t len, char *params) 639 { 640 int r = -EINVAL, argc; 641 char **argv; 642 struct dm_target *ti; 643 644 if (t->singleton) { 645 DMERR("%s: target type %s must appear alone in table", 646 dm_device_name(t->md), t->targets->type->name); 647 return -EINVAL; 648 } 649 650 BUG_ON(t->num_targets >= t->num_allocated); 651 652 ti = t->targets + t->num_targets; 653 memset(ti, 0, sizeof(*ti)); 654 655 if (!len) { 656 DMERR("%s: zero-length target", dm_device_name(t->md)); 657 return -EINVAL; 658 } 659 660 ti->type = dm_get_target_type(type); 661 if (!ti->type) { 662 DMERR("%s: %s: unknown target type", dm_device_name(t->md), type); 663 return -EINVAL; 664 } 665 666 if (dm_target_needs_singleton(ti->type)) { 667 if (t->num_targets) { 668 ti->error = "singleton target type must appear alone in table"; 669 goto bad; 670 } 671 t->singleton = true; 672 } 673 674 if (dm_target_always_writeable(ti->type) && !(t->mode & FMODE_WRITE)) { 675 ti->error = "target type may not be included in a read-only table"; 676 goto bad; 677 } 678 679 if (t->immutable_target_type) { 680 if (t->immutable_target_type != ti->type) { 681 ti->error = "immutable target type cannot be mixed with other target types"; 682 goto bad; 683 } 684 } else if (dm_target_is_immutable(ti->type)) { 685 if (t->num_targets) { 686 ti->error = "immutable target type cannot be mixed with other target types"; 687 goto bad; 688 } 689 t->immutable_target_type = ti->type; 690 } 691 692 if (dm_target_has_integrity(ti->type)) 693 t->integrity_added = 1; 694 695 ti->table = t; 696 ti->begin = start; 697 ti->len = len; 698 ti->error = "Unknown error"; 699 700 /* 701 * Does this target adjoin the previous one ? 702 */ 703 if (!adjoin(t, ti)) { 704 ti->error = "Gap in table"; 705 goto bad; 706 } 707 708 r = dm_split_args(&argc, &argv, params); 709 if (r) { 710 ti->error = "couldn't split parameters"; 711 goto bad; 712 } 713 714 r = ti->type->ctr(ti, argc, argv); 715 kfree(argv); 716 if (r) 717 goto bad; 718 719 t->highs[t->num_targets++] = ti->begin + ti->len - 1; 720 721 if (!ti->num_discard_bios && ti->discards_supported) 722 DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.", 723 dm_device_name(t->md), type); 724 725 if (ti->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key)) 726 static_branch_enable(&swap_bios_enabled); 727 728 return 0; 729 730 bad: 731 DMERR("%s: %s: %s (%pe)", dm_device_name(t->md), type, ti->error, ERR_PTR(r)); 732 dm_put_target_type(ti->type); 733 return r; 734 } 735 736 /* 737 * Target argument parsing helpers. 738 */ 739 static int validate_next_arg(const struct dm_arg *arg, struct dm_arg_set *arg_set, 740 unsigned int *value, char **error, unsigned int grouped) 741 { 742 const char *arg_str = dm_shift_arg(arg_set); 743 char dummy; 744 745 if (!arg_str || 746 (sscanf(arg_str, "%u%c", value, &dummy) != 1) || 747 (*value < arg->min) || 748 (*value > arg->max) || 749 (grouped && arg_set->argc < *value)) { 750 *error = arg->error; 751 return -EINVAL; 752 } 753 754 return 0; 755 } 756 757 int dm_read_arg(const struct dm_arg *arg, struct dm_arg_set *arg_set, 758 unsigned int *value, char **error) 759 { 760 return validate_next_arg(arg, arg_set, value, error, 0); 761 } 762 EXPORT_SYMBOL(dm_read_arg); 763 764 int dm_read_arg_group(const struct dm_arg *arg, struct dm_arg_set *arg_set, 765 unsigned int *value, char **error) 766 { 767 return validate_next_arg(arg, arg_set, value, error, 1); 768 } 769 EXPORT_SYMBOL(dm_read_arg_group); 770 771 const char *dm_shift_arg(struct dm_arg_set *as) 772 { 773 char *r; 774 775 if (as->argc) { 776 as->argc--; 777 r = *as->argv; 778 as->argv++; 779 return r; 780 } 781 782 return NULL; 783 } 784 EXPORT_SYMBOL(dm_shift_arg); 785 786 void dm_consume_args(struct dm_arg_set *as, unsigned int num_args) 787 { 788 BUG_ON(as->argc < num_args); 789 as->argc -= num_args; 790 as->argv += num_args; 791 } 792 EXPORT_SYMBOL(dm_consume_args); 793 794 static bool __table_type_bio_based(enum dm_queue_mode table_type) 795 { 796 return (table_type == DM_TYPE_BIO_BASED || 797 table_type == DM_TYPE_DAX_BIO_BASED); 798 } 799 800 static bool __table_type_request_based(enum dm_queue_mode table_type) 801 { 802 return table_type == DM_TYPE_REQUEST_BASED; 803 } 804 805 void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type) 806 { 807 t->type = type; 808 } 809 EXPORT_SYMBOL_GPL(dm_table_set_type); 810 811 /* validate the dax capability of the target device span */ 812 static int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev, 813 sector_t start, sector_t len, void *data) 814 { 815 if (dev->dax_dev) 816 return false; 817 818 DMDEBUG("%pg: error: dax unsupported by block device", dev->bdev); 819 return true; 820 } 821 822 /* Check devices support synchronous DAX */ 823 static int device_not_dax_synchronous_capable(struct dm_target *ti, struct dm_dev *dev, 824 sector_t start, sector_t len, void *data) 825 { 826 return !dev->dax_dev || !dax_synchronous(dev->dax_dev); 827 } 828 829 static bool dm_table_supports_dax(struct dm_table *t, 830 iterate_devices_callout_fn iterate_fn) 831 { 832 /* Ensure that all targets support DAX. */ 833 for (unsigned int i = 0; i < t->num_targets; i++) { 834 struct dm_target *ti = dm_table_get_target(t, i); 835 836 if (!ti->type->direct_access) 837 return false; 838 839 if (!ti->type->iterate_devices || 840 ti->type->iterate_devices(ti, iterate_fn, NULL)) 841 return false; 842 } 843 844 return true; 845 } 846 847 static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev, 848 sector_t start, sector_t len, void *data) 849 { 850 struct block_device *bdev = dev->bdev; 851 struct request_queue *q = bdev_get_queue(bdev); 852 853 /* request-based cannot stack on partitions! */ 854 if (bdev_is_partition(bdev)) 855 return false; 856 857 return queue_is_mq(q); 858 } 859 860 static int dm_table_determine_type(struct dm_table *t) 861 { 862 unsigned int bio_based = 0, request_based = 0, hybrid = 0; 863 struct dm_target *ti; 864 struct list_head *devices = dm_table_get_devices(t); 865 enum dm_queue_mode live_md_type = dm_get_md_type(t->md); 866 867 if (t->type != DM_TYPE_NONE) { 868 /* target already set the table's type */ 869 if (t->type == DM_TYPE_BIO_BASED) { 870 /* possibly upgrade to a variant of bio-based */ 871 goto verify_bio_based; 872 } 873 BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED); 874 goto verify_rq_based; 875 } 876 877 for (unsigned int i = 0; i < t->num_targets; i++) { 878 ti = dm_table_get_target(t, i); 879 if (dm_target_hybrid(ti)) 880 hybrid = 1; 881 else if (dm_target_request_based(ti)) 882 request_based = 1; 883 else 884 bio_based = 1; 885 886 if (bio_based && request_based) { 887 DMERR("Inconsistent table: different target types" 888 " can't be mixed up"); 889 return -EINVAL; 890 } 891 } 892 893 if (hybrid && !bio_based && !request_based) { 894 /* 895 * The targets can work either way. 896 * Determine the type from the live device. 897 * Default to bio-based if device is new. 898 */ 899 if (__table_type_request_based(live_md_type)) 900 request_based = 1; 901 else 902 bio_based = 1; 903 } 904 905 if (bio_based) { 906 verify_bio_based: 907 /* We must use this table as bio-based */ 908 t->type = DM_TYPE_BIO_BASED; 909 if (dm_table_supports_dax(t, device_not_dax_capable) || 910 (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) { 911 t->type = DM_TYPE_DAX_BIO_BASED; 912 } 913 return 0; 914 } 915 916 BUG_ON(!request_based); /* No targets in this table */ 917 918 t->type = DM_TYPE_REQUEST_BASED; 919 920 verify_rq_based: 921 /* 922 * Request-based dm supports only tables that have a single target now. 923 * To support multiple targets, request splitting support is needed, 924 * and that needs lots of changes in the block-layer. 925 * (e.g. request completion process for partial completion.) 926 */ 927 if (t->num_targets > 1) { 928 DMERR("request-based DM doesn't support multiple targets"); 929 return -EINVAL; 930 } 931 932 if (list_empty(devices)) { 933 int srcu_idx; 934 struct dm_table *live_table = dm_get_live_table(t->md, &srcu_idx); 935 936 /* inherit live table's type */ 937 if (live_table) 938 t->type = live_table->type; 939 dm_put_live_table(t->md, srcu_idx); 940 return 0; 941 } 942 943 ti = dm_table_get_immutable_target(t); 944 if (!ti) { 945 DMERR("table load rejected: immutable target is required"); 946 return -EINVAL; 947 } else if (ti->max_io_len) { 948 DMERR("table load rejected: immutable target that splits IO is not supported"); 949 return -EINVAL; 950 } 951 952 /* Non-request-stackable devices can't be used for request-based dm */ 953 if (!ti->type->iterate_devices || 954 !ti->type->iterate_devices(ti, device_is_rq_stackable, NULL)) { 955 DMERR("table load rejected: including non-request-stackable devices"); 956 return -EINVAL; 957 } 958 959 return 0; 960 } 961 962 enum dm_queue_mode dm_table_get_type(struct dm_table *t) 963 { 964 return t->type; 965 } 966 967 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t) 968 { 969 return t->immutable_target_type; 970 } 971 972 struct dm_target *dm_table_get_immutable_target(struct dm_table *t) 973 { 974 /* Immutable target is implicitly a singleton */ 975 if (t->num_targets > 1 || 976 !dm_target_is_immutable(t->targets[0].type)) 977 return NULL; 978 979 return t->targets; 980 } 981 982 struct dm_target *dm_table_get_wildcard_target(struct dm_table *t) 983 { 984 for (unsigned int i = 0; i < t->num_targets; i++) { 985 struct dm_target *ti = dm_table_get_target(t, i); 986 987 if (dm_target_is_wildcard(ti->type)) 988 return ti; 989 } 990 991 return NULL; 992 } 993 994 bool dm_table_bio_based(struct dm_table *t) 995 { 996 return __table_type_bio_based(dm_table_get_type(t)); 997 } 998 999 bool dm_table_request_based(struct dm_table *t) 1000 { 1001 return __table_type_request_based(dm_table_get_type(t)); 1002 } 1003 1004 static bool dm_table_supports_poll(struct dm_table *t); 1005 1006 static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) 1007 { 1008 enum dm_queue_mode type = dm_table_get_type(t); 1009 unsigned int per_io_data_size = 0, front_pad, io_front_pad; 1010 unsigned int min_pool_size = 0, pool_size; 1011 struct dm_md_mempools *pools; 1012 1013 if (unlikely(type == DM_TYPE_NONE)) { 1014 DMERR("no table type is set, can't allocate mempools"); 1015 return -EINVAL; 1016 } 1017 1018 pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); 1019 if (!pools) 1020 return -ENOMEM; 1021 1022 if (type == DM_TYPE_REQUEST_BASED) { 1023 pool_size = dm_get_reserved_rq_based_ios(); 1024 front_pad = offsetof(struct dm_rq_clone_bio_info, clone); 1025 goto init_bs; 1026 } 1027 1028 for (unsigned int i = 0; i < t->num_targets; i++) { 1029 struct dm_target *ti = dm_table_get_target(t, i); 1030 1031 per_io_data_size = max(per_io_data_size, ti->per_io_data_size); 1032 min_pool_size = max(min_pool_size, ti->num_flush_bios); 1033 } 1034 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); 1035 front_pad = roundup(per_io_data_size, 1036 __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; 1037 1038 io_front_pad = roundup(per_io_data_size, 1039 __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; 1040 if (bioset_init(&pools->io_bs, pool_size, io_front_pad, 1041 dm_table_supports_poll(t) ? BIOSET_PERCPU_CACHE : 0)) 1042 goto out_free_pools; 1043 if (t->integrity_supported && 1044 bioset_integrity_create(&pools->io_bs, pool_size)) 1045 goto out_free_pools; 1046 init_bs: 1047 if (bioset_init(&pools->bs, pool_size, front_pad, 0)) 1048 goto out_free_pools; 1049 if (t->integrity_supported && 1050 bioset_integrity_create(&pools->bs, pool_size)) 1051 goto out_free_pools; 1052 1053 t->mempools = pools; 1054 return 0; 1055 1056 out_free_pools: 1057 dm_free_md_mempools(pools); 1058 return -ENOMEM; 1059 } 1060 1061 static int setup_indexes(struct dm_table *t) 1062 { 1063 int i; 1064 unsigned int total = 0; 1065 sector_t *indexes; 1066 1067 /* allocate the space for *all* the indexes */ 1068 for (i = t->depth - 2; i >= 0; i--) { 1069 t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE); 1070 total += t->counts[i]; 1071 } 1072 1073 indexes = kvcalloc(total, NODE_SIZE, GFP_KERNEL); 1074 if (!indexes) 1075 return -ENOMEM; 1076 1077 /* set up internal nodes, bottom-up */ 1078 for (i = t->depth - 2; i >= 0; i--) { 1079 t->index[i] = indexes; 1080 indexes += (KEYS_PER_NODE * t->counts[i]); 1081 setup_btree_index(i, t); 1082 } 1083 1084 return 0; 1085 } 1086 1087 /* 1088 * Builds the btree to index the map. 1089 */ 1090 static int dm_table_build_index(struct dm_table *t) 1091 { 1092 int r = 0; 1093 unsigned int leaf_nodes; 1094 1095 /* how many indexes will the btree have ? */ 1096 leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); 1097 t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); 1098 1099 /* leaf layer has already been set up */ 1100 t->counts[t->depth - 1] = leaf_nodes; 1101 t->index[t->depth - 1] = t->highs; 1102 1103 if (t->depth >= 2) 1104 r = setup_indexes(t); 1105 1106 return r; 1107 } 1108 1109 static bool integrity_profile_exists(struct gendisk *disk) 1110 { 1111 return !!blk_get_integrity(disk); 1112 } 1113 1114 /* 1115 * Get a disk whose integrity profile reflects the table's profile. 1116 * Returns NULL if integrity support was inconsistent or unavailable. 1117 */ 1118 static struct gendisk *dm_table_get_integrity_disk(struct dm_table *t) 1119 { 1120 struct list_head *devices = dm_table_get_devices(t); 1121 struct dm_dev_internal *dd = NULL; 1122 struct gendisk *prev_disk = NULL, *template_disk = NULL; 1123 1124 for (unsigned int i = 0; i < t->num_targets; i++) { 1125 struct dm_target *ti = dm_table_get_target(t, i); 1126 1127 if (!dm_target_passes_integrity(ti->type)) 1128 goto no_integrity; 1129 } 1130 1131 list_for_each_entry(dd, devices, list) { 1132 template_disk = dd->dm_dev->bdev->bd_disk; 1133 if (!integrity_profile_exists(template_disk)) 1134 goto no_integrity; 1135 else if (prev_disk && 1136 blk_integrity_compare(prev_disk, template_disk) < 0) 1137 goto no_integrity; 1138 prev_disk = template_disk; 1139 } 1140 1141 return template_disk; 1142 1143 no_integrity: 1144 if (prev_disk) 1145 DMWARN("%s: integrity not set: %s and %s profile mismatch", 1146 dm_device_name(t->md), 1147 prev_disk->disk_name, 1148 template_disk->disk_name); 1149 return NULL; 1150 } 1151 1152 /* 1153 * Register the mapped device for blk_integrity support if the 1154 * underlying devices have an integrity profile. But all devices may 1155 * not have matching profiles (checking all devices isn't reliable 1156 * during table load because this table may use other DM device(s) which 1157 * must be resumed before they will have an initialized integity 1158 * profile). Consequently, stacked DM devices force a 2 stage integrity 1159 * profile validation: First pass during table load, final pass during 1160 * resume. 1161 */ 1162 static int dm_table_register_integrity(struct dm_table *t) 1163 { 1164 struct mapped_device *md = t->md; 1165 struct gendisk *template_disk = NULL; 1166 1167 /* If target handles integrity itself do not register it here. */ 1168 if (t->integrity_added) 1169 return 0; 1170 1171 template_disk = dm_table_get_integrity_disk(t); 1172 if (!template_disk) 1173 return 0; 1174 1175 if (!integrity_profile_exists(dm_disk(md))) { 1176 t->integrity_supported = true; 1177 /* 1178 * Register integrity profile during table load; we can do 1179 * this because the final profile must match during resume. 1180 */ 1181 blk_integrity_register(dm_disk(md), 1182 blk_get_integrity(template_disk)); 1183 return 0; 1184 } 1185 1186 /* 1187 * If DM device already has an initialized integrity 1188 * profile the new profile should not conflict. 1189 */ 1190 if (blk_integrity_compare(dm_disk(md), template_disk) < 0) { 1191 DMERR("%s: conflict with existing integrity profile: " 1192 "%s profile mismatch", 1193 dm_device_name(t->md), 1194 template_disk->disk_name); 1195 return 1; 1196 } 1197 1198 /* Preserve existing integrity profile */ 1199 t->integrity_supported = true; 1200 return 0; 1201 } 1202 1203 #ifdef CONFIG_BLK_INLINE_ENCRYPTION 1204 1205 struct dm_crypto_profile { 1206 struct blk_crypto_profile profile; 1207 struct mapped_device *md; 1208 }; 1209 1210 struct dm_keyslot_evict_args { 1211 const struct blk_crypto_key *key; 1212 int err; 1213 }; 1214 1215 static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev, 1216 sector_t start, sector_t len, void *data) 1217 { 1218 struct dm_keyslot_evict_args *args = data; 1219 int err; 1220 1221 err = blk_crypto_evict_key(dev->bdev, args->key); 1222 if (!args->err) 1223 args->err = err; 1224 /* Always try to evict the key from all devices. */ 1225 return 0; 1226 } 1227 1228 /* 1229 * When an inline encryption key is evicted from a device-mapper device, evict 1230 * it from all the underlying devices. 1231 */ 1232 static int dm_keyslot_evict(struct blk_crypto_profile *profile, 1233 const struct blk_crypto_key *key, unsigned int slot) 1234 { 1235 struct mapped_device *md = 1236 container_of(profile, struct dm_crypto_profile, profile)->md; 1237 struct dm_keyslot_evict_args args = { key }; 1238 struct dm_table *t; 1239 int srcu_idx; 1240 1241 t = dm_get_live_table(md, &srcu_idx); 1242 if (!t) 1243 return 0; 1244 1245 for (unsigned int i = 0; i < t->num_targets; i++) { 1246 struct dm_target *ti = dm_table_get_target(t, i); 1247 1248 if (!ti->type->iterate_devices) 1249 continue; 1250 ti->type->iterate_devices(ti, dm_keyslot_evict_callback, &args); 1251 } 1252 1253 dm_put_live_table(md, srcu_idx); 1254 return args.err; 1255 } 1256 1257 static int 1258 device_intersect_crypto_capabilities(struct dm_target *ti, struct dm_dev *dev, 1259 sector_t start, sector_t len, void *data) 1260 { 1261 struct blk_crypto_profile *parent = data; 1262 struct blk_crypto_profile *child = 1263 bdev_get_queue(dev->bdev)->crypto_profile; 1264 1265 blk_crypto_intersect_capabilities(parent, child); 1266 return 0; 1267 } 1268 1269 void dm_destroy_crypto_profile(struct blk_crypto_profile *profile) 1270 { 1271 struct dm_crypto_profile *dmcp = container_of(profile, 1272 struct dm_crypto_profile, 1273 profile); 1274 1275 if (!profile) 1276 return; 1277 1278 blk_crypto_profile_destroy(profile); 1279 kfree(dmcp); 1280 } 1281 1282 static void dm_table_destroy_crypto_profile(struct dm_table *t) 1283 { 1284 dm_destroy_crypto_profile(t->crypto_profile); 1285 t->crypto_profile = NULL; 1286 } 1287 1288 /* 1289 * Constructs and initializes t->crypto_profile with a crypto profile that 1290 * represents the common set of crypto capabilities of the devices described by 1291 * the dm_table. However, if the constructed crypto profile doesn't support all 1292 * crypto capabilities that are supported by the current mapped_device, it 1293 * returns an error instead, since we don't support removing crypto capabilities 1294 * on table changes. Finally, if the constructed crypto profile is "empty" (has 1295 * no crypto capabilities at all), it just sets t->crypto_profile to NULL. 1296 */ 1297 static int dm_table_construct_crypto_profile(struct dm_table *t) 1298 { 1299 struct dm_crypto_profile *dmcp; 1300 struct blk_crypto_profile *profile; 1301 unsigned int i; 1302 bool empty_profile = true; 1303 1304 dmcp = kmalloc(sizeof(*dmcp), GFP_KERNEL); 1305 if (!dmcp) 1306 return -ENOMEM; 1307 dmcp->md = t->md; 1308 1309 profile = &dmcp->profile; 1310 blk_crypto_profile_init(profile, 0); 1311 profile->ll_ops.keyslot_evict = dm_keyslot_evict; 1312 profile->max_dun_bytes_supported = UINT_MAX; 1313 memset(profile->modes_supported, 0xFF, 1314 sizeof(profile->modes_supported)); 1315 1316 for (i = 0; i < t->num_targets; i++) { 1317 struct dm_target *ti = dm_table_get_target(t, i); 1318 1319 if (!dm_target_passes_crypto(ti->type)) { 1320 blk_crypto_intersect_capabilities(profile, NULL); 1321 break; 1322 } 1323 if (!ti->type->iterate_devices) 1324 continue; 1325 ti->type->iterate_devices(ti, 1326 device_intersect_crypto_capabilities, 1327 profile); 1328 } 1329 1330 if (t->md->queue && 1331 !blk_crypto_has_capabilities(profile, 1332 t->md->queue->crypto_profile)) { 1333 DMERR("Inline encryption capabilities of new DM table were more restrictive than the old table's. This is not supported!"); 1334 dm_destroy_crypto_profile(profile); 1335 return -EINVAL; 1336 } 1337 1338 /* 1339 * If the new profile doesn't actually support any crypto capabilities, 1340 * we may as well represent it with a NULL profile. 1341 */ 1342 for (i = 0; i < ARRAY_SIZE(profile->modes_supported); i++) { 1343 if (profile->modes_supported[i]) { 1344 empty_profile = false; 1345 break; 1346 } 1347 } 1348 1349 if (empty_profile) { 1350 dm_destroy_crypto_profile(profile); 1351 profile = NULL; 1352 } 1353 1354 /* 1355 * t->crypto_profile is only set temporarily while the table is being 1356 * set up, and it gets set to NULL after the profile has been 1357 * transferred to the request_queue. 1358 */ 1359 t->crypto_profile = profile; 1360 1361 return 0; 1362 } 1363 1364 static void dm_update_crypto_profile(struct request_queue *q, 1365 struct dm_table *t) 1366 { 1367 if (!t->crypto_profile) 1368 return; 1369 1370 /* Make the crypto profile less restrictive. */ 1371 if (!q->crypto_profile) { 1372 blk_crypto_register(t->crypto_profile, q); 1373 } else { 1374 blk_crypto_update_capabilities(q->crypto_profile, 1375 t->crypto_profile); 1376 dm_destroy_crypto_profile(t->crypto_profile); 1377 } 1378 t->crypto_profile = NULL; 1379 } 1380 1381 #else /* CONFIG_BLK_INLINE_ENCRYPTION */ 1382 1383 static int dm_table_construct_crypto_profile(struct dm_table *t) 1384 { 1385 return 0; 1386 } 1387 1388 void dm_destroy_crypto_profile(struct blk_crypto_profile *profile) 1389 { 1390 } 1391 1392 static void dm_table_destroy_crypto_profile(struct dm_table *t) 1393 { 1394 } 1395 1396 static void dm_update_crypto_profile(struct request_queue *q, 1397 struct dm_table *t) 1398 { 1399 } 1400 1401 #endif /* !CONFIG_BLK_INLINE_ENCRYPTION */ 1402 1403 /* 1404 * Prepares the table for use by building the indices, 1405 * setting the type, and allocating mempools. 1406 */ 1407 int dm_table_complete(struct dm_table *t) 1408 { 1409 int r; 1410 1411 r = dm_table_determine_type(t); 1412 if (r) { 1413 DMERR("unable to determine table type"); 1414 return r; 1415 } 1416 1417 r = dm_table_build_index(t); 1418 if (r) { 1419 DMERR("unable to build btrees"); 1420 return r; 1421 } 1422 1423 r = dm_table_register_integrity(t); 1424 if (r) { 1425 DMERR("could not register integrity profile."); 1426 return r; 1427 } 1428 1429 r = dm_table_construct_crypto_profile(t); 1430 if (r) { 1431 DMERR("could not construct crypto profile."); 1432 return r; 1433 } 1434 1435 r = dm_table_alloc_md_mempools(t, t->md); 1436 if (r) 1437 DMERR("unable to allocate mempools"); 1438 1439 return r; 1440 } 1441 1442 static DEFINE_MUTEX(_event_lock); 1443 void dm_table_event_callback(struct dm_table *t, 1444 void (*fn)(void *), void *context) 1445 { 1446 mutex_lock(&_event_lock); 1447 t->event_fn = fn; 1448 t->event_context = context; 1449 mutex_unlock(&_event_lock); 1450 } 1451 1452 void dm_table_event(struct dm_table *t) 1453 { 1454 mutex_lock(&_event_lock); 1455 if (t->event_fn) 1456 t->event_fn(t->event_context); 1457 mutex_unlock(&_event_lock); 1458 } 1459 EXPORT_SYMBOL(dm_table_event); 1460 1461 inline sector_t dm_table_get_size(struct dm_table *t) 1462 { 1463 return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; 1464 } 1465 EXPORT_SYMBOL(dm_table_get_size); 1466 1467 /* 1468 * Search the btree for the correct target. 1469 * 1470 * Caller should check returned pointer for NULL 1471 * to trap I/O beyond end of device. 1472 */ 1473 struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) 1474 { 1475 unsigned int l, n = 0, k = 0; 1476 sector_t *node; 1477 1478 if (unlikely(sector >= dm_table_get_size(t))) 1479 return NULL; 1480 1481 for (l = 0; l < t->depth; l++) { 1482 n = get_child(n, k); 1483 node = get_node(t, l, n); 1484 1485 for (k = 0; k < KEYS_PER_NODE; k++) 1486 if (node[k] >= sector) 1487 break; 1488 } 1489 1490 return &t->targets[(KEYS_PER_NODE * n) + k]; 1491 } 1492 1493 static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev, 1494 sector_t start, sector_t len, void *data) 1495 { 1496 struct request_queue *q = bdev_get_queue(dev->bdev); 1497 1498 return !test_bit(QUEUE_FLAG_POLL, &q->queue_flags); 1499 } 1500 1501 /* 1502 * type->iterate_devices() should be called when the sanity check needs to 1503 * iterate and check all underlying data devices. iterate_devices() will 1504 * iterate all underlying data devices until it encounters a non-zero return 1505 * code, returned by whether the input iterate_devices_callout_fn, or 1506 * iterate_devices() itself internally. 1507 * 1508 * For some target type (e.g. dm-stripe), one call of iterate_devices() may 1509 * iterate multiple underlying devices internally, in which case a non-zero 1510 * return code returned by iterate_devices_callout_fn will stop the iteration 1511 * in advance. 1512 * 1513 * Cases requiring _any_ underlying device supporting some kind of attribute, 1514 * should use the iteration structure like dm_table_any_dev_attr(), or call 1515 * it directly. @func should handle semantics of positive examples, e.g. 1516 * capable of something. 1517 * 1518 * Cases requiring _all_ underlying devices supporting some kind of attribute, 1519 * should use the iteration structure like dm_table_supports_nowait() or 1520 * dm_table_supports_discards(). Or introduce dm_table_all_devs_attr() that 1521 * uses an @anti_func that handle semantics of counter examples, e.g. not 1522 * capable of something. So: return !dm_table_any_dev_attr(t, anti_func, data); 1523 */ 1524 static bool dm_table_any_dev_attr(struct dm_table *t, 1525 iterate_devices_callout_fn func, void *data) 1526 { 1527 for (unsigned int i = 0; i < t->num_targets; i++) { 1528 struct dm_target *ti = dm_table_get_target(t, i); 1529 1530 if (ti->type->iterate_devices && 1531 ti->type->iterate_devices(ti, func, data)) 1532 return true; 1533 } 1534 1535 return false; 1536 } 1537 1538 static int count_device(struct dm_target *ti, struct dm_dev *dev, 1539 sector_t start, sector_t len, void *data) 1540 { 1541 unsigned int *num_devices = data; 1542 1543 (*num_devices)++; 1544 1545 return 0; 1546 } 1547 1548 static bool dm_table_supports_poll(struct dm_table *t) 1549 { 1550 for (unsigned int i = 0; i < t->num_targets; i++) { 1551 struct dm_target *ti = dm_table_get_target(t, i); 1552 1553 if (!ti->type->iterate_devices || 1554 ti->type->iterate_devices(ti, device_not_poll_capable, NULL)) 1555 return false; 1556 } 1557 1558 return true; 1559 } 1560 1561 /* 1562 * Check whether a table has no data devices attached using each 1563 * target's iterate_devices method. 1564 * Returns false if the result is unknown because a target doesn't 1565 * support iterate_devices. 1566 */ 1567 bool dm_table_has_no_data_devices(struct dm_table *t) 1568 { 1569 for (unsigned int i = 0; i < t->num_targets; i++) { 1570 struct dm_target *ti = dm_table_get_target(t, i); 1571 unsigned int num_devices = 0; 1572 1573 if (!ti->type->iterate_devices) 1574 return false; 1575 1576 ti->type->iterate_devices(ti, count_device, &num_devices); 1577 if (num_devices) 1578 return false; 1579 } 1580 1581 return true; 1582 } 1583 1584 static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev, 1585 sector_t start, sector_t len, void *data) 1586 { 1587 struct request_queue *q = bdev_get_queue(dev->bdev); 1588 enum blk_zoned_model *zoned_model = data; 1589 1590 return blk_queue_zoned_model(q) != *zoned_model; 1591 } 1592 1593 /* 1594 * Check the device zoned model based on the target feature flag. If the target 1595 * has the DM_TARGET_ZONED_HM feature flag set, host-managed zoned devices are 1596 * also accepted but all devices must have the same zoned model. If the target 1597 * has the DM_TARGET_MIXED_ZONED_MODEL feature set, the devices can have any 1598 * zoned model with all zoned devices having the same zone size. 1599 */ 1600 static bool dm_table_supports_zoned_model(struct dm_table *t, 1601 enum blk_zoned_model zoned_model) 1602 { 1603 for (unsigned int i = 0; i < t->num_targets; i++) { 1604 struct dm_target *ti = dm_table_get_target(t, i); 1605 1606 if (dm_target_supports_zoned_hm(ti->type)) { 1607 if (!ti->type->iterate_devices || 1608 ti->type->iterate_devices(ti, device_not_zoned_model, 1609 &zoned_model)) 1610 return false; 1611 } else if (!dm_target_supports_mixed_zoned_model(ti->type)) { 1612 if (zoned_model == BLK_ZONED_HM) 1613 return false; 1614 } 1615 } 1616 1617 return true; 1618 } 1619 1620 static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev, 1621 sector_t start, sector_t len, void *data) 1622 { 1623 unsigned int *zone_sectors = data; 1624 1625 if (!bdev_is_zoned(dev->bdev)) 1626 return 0; 1627 return bdev_zone_sectors(dev->bdev) != *zone_sectors; 1628 } 1629 1630 /* 1631 * Check consistency of zoned model and zone sectors across all targets. For 1632 * zone sectors, if the destination device is a zoned block device, it shall 1633 * have the specified zone_sectors. 1634 */ 1635 static int validate_hardware_zoned_model(struct dm_table *t, 1636 enum blk_zoned_model zoned_model, 1637 unsigned int zone_sectors) 1638 { 1639 if (zoned_model == BLK_ZONED_NONE) 1640 return 0; 1641 1642 if (!dm_table_supports_zoned_model(t, zoned_model)) { 1643 DMERR("%s: zoned model is not consistent across all devices", 1644 dm_device_name(t->md)); 1645 return -EINVAL; 1646 } 1647 1648 /* Check zone size validity and compatibility */ 1649 if (!zone_sectors || !is_power_of_2(zone_sectors)) 1650 return -EINVAL; 1651 1652 if (dm_table_any_dev_attr(t, device_not_matches_zone_sectors, &zone_sectors)) { 1653 DMERR("%s: zone sectors is not consistent across all zoned devices", 1654 dm_device_name(t->md)); 1655 return -EINVAL; 1656 } 1657 1658 return 0; 1659 } 1660 1661 /* 1662 * Establish the new table's queue_limits and validate them. 1663 */ 1664 int dm_calculate_queue_limits(struct dm_table *t, 1665 struct queue_limits *limits) 1666 { 1667 struct queue_limits ti_limits; 1668 enum blk_zoned_model zoned_model = BLK_ZONED_NONE; 1669 unsigned int zone_sectors = 0; 1670 1671 blk_set_stacking_limits(limits); 1672 1673 for (unsigned int i = 0; i < t->num_targets; i++) { 1674 struct dm_target *ti = dm_table_get_target(t, i); 1675 1676 blk_set_stacking_limits(&ti_limits); 1677 1678 if (!ti->type->iterate_devices) 1679 goto combine_limits; 1680 1681 /* 1682 * Combine queue limits of all the devices this target uses. 1683 */ 1684 ti->type->iterate_devices(ti, dm_set_device_limits, 1685 &ti_limits); 1686 1687 if (zoned_model == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) { 1688 /* 1689 * After stacking all limits, validate all devices 1690 * in table support this zoned model and zone sectors. 1691 */ 1692 zoned_model = ti_limits.zoned; 1693 zone_sectors = ti_limits.chunk_sectors; 1694 } 1695 1696 /* Set I/O hints portion of queue limits */ 1697 if (ti->type->io_hints) 1698 ti->type->io_hints(ti, &ti_limits); 1699 1700 /* 1701 * Check each device area is consistent with the target's 1702 * overall queue limits. 1703 */ 1704 if (ti->type->iterate_devices(ti, device_area_is_invalid, 1705 &ti_limits)) 1706 return -EINVAL; 1707 1708 combine_limits: 1709 /* 1710 * Merge this target's queue limits into the overall limits 1711 * for the table. 1712 */ 1713 if (blk_stack_limits(limits, &ti_limits, 0) < 0) 1714 DMWARN("%s: adding target device " 1715 "(start sect %llu len %llu) " 1716 "caused an alignment inconsistency", 1717 dm_device_name(t->md), 1718 (unsigned long long) ti->begin, 1719 (unsigned long long) ti->len); 1720 } 1721 1722 /* 1723 * Verify that the zoned model and zone sectors, as determined before 1724 * any .io_hints override, are the same across all devices in the table. 1725 * - this is especially relevant if .io_hints is emulating a disk-managed 1726 * zoned model (aka BLK_ZONED_NONE) on host-managed zoned block devices. 1727 * BUT... 1728 */ 1729 if (limits->zoned != BLK_ZONED_NONE) { 1730 /* 1731 * ...IF the above limits stacking determined a zoned model 1732 * validate that all of the table's devices conform to it. 1733 */ 1734 zoned_model = limits->zoned; 1735 zone_sectors = limits->chunk_sectors; 1736 } 1737 if (validate_hardware_zoned_model(t, zoned_model, zone_sectors)) 1738 return -EINVAL; 1739 1740 return validate_hardware_logical_block_alignment(t, limits); 1741 } 1742 1743 /* 1744 * Verify that all devices have an integrity profile that matches the 1745 * DM device's registered integrity profile. If the profiles don't 1746 * match then unregister the DM device's integrity profile. 1747 */ 1748 static void dm_table_verify_integrity(struct dm_table *t) 1749 { 1750 struct gendisk *template_disk = NULL; 1751 1752 if (t->integrity_added) 1753 return; 1754 1755 if (t->integrity_supported) { 1756 /* 1757 * Verify that the original integrity profile 1758 * matches all the devices in this table. 1759 */ 1760 template_disk = dm_table_get_integrity_disk(t); 1761 if (template_disk && 1762 blk_integrity_compare(dm_disk(t->md), template_disk) >= 0) 1763 return; 1764 } 1765 1766 if (integrity_profile_exists(dm_disk(t->md))) { 1767 DMWARN("%s: unable to establish an integrity profile", 1768 dm_device_name(t->md)); 1769 blk_integrity_unregister(dm_disk(t->md)); 1770 } 1771 } 1772 1773 static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, 1774 sector_t start, sector_t len, void *data) 1775 { 1776 unsigned long flush = (unsigned long) data; 1777 struct request_queue *q = bdev_get_queue(dev->bdev); 1778 1779 return (q->queue_flags & flush); 1780 } 1781 1782 static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) 1783 { 1784 /* 1785 * Require at least one underlying device to support flushes. 1786 * t->devices includes internal dm devices such as mirror logs 1787 * so we need to use iterate_devices here, which targets 1788 * supporting flushes must provide. 1789 */ 1790 for (unsigned int i = 0; i < t->num_targets; i++) { 1791 struct dm_target *ti = dm_table_get_target(t, i); 1792 1793 if (!ti->num_flush_bios) 1794 continue; 1795 1796 if (ti->flush_supported) 1797 return true; 1798 1799 if (ti->type->iterate_devices && 1800 ti->type->iterate_devices(ti, device_flush_capable, (void *) flush)) 1801 return true; 1802 } 1803 1804 return false; 1805 } 1806 1807 static int device_dax_write_cache_enabled(struct dm_target *ti, 1808 struct dm_dev *dev, sector_t start, 1809 sector_t len, void *data) 1810 { 1811 struct dax_device *dax_dev = dev->dax_dev; 1812 1813 if (!dax_dev) 1814 return false; 1815 1816 if (dax_write_cache_enabled(dax_dev)) 1817 return true; 1818 return false; 1819 } 1820 1821 static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev, 1822 sector_t start, sector_t len, void *data) 1823 { 1824 return !bdev_nonrot(dev->bdev); 1825 } 1826 1827 static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, 1828 sector_t start, sector_t len, void *data) 1829 { 1830 struct request_queue *q = bdev_get_queue(dev->bdev); 1831 1832 return !blk_queue_add_random(q); 1833 } 1834 1835 static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev, 1836 sector_t start, sector_t len, void *data) 1837 { 1838 struct request_queue *q = bdev_get_queue(dev->bdev); 1839 1840 return !q->limits.max_write_zeroes_sectors; 1841 } 1842 1843 static bool dm_table_supports_write_zeroes(struct dm_table *t) 1844 { 1845 for (unsigned int i = 0; i < t->num_targets; i++) { 1846 struct dm_target *ti = dm_table_get_target(t, i); 1847 1848 if (!ti->num_write_zeroes_bios) 1849 return false; 1850 1851 if (!ti->type->iterate_devices || 1852 ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL)) 1853 return false; 1854 } 1855 1856 return true; 1857 } 1858 1859 static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev, 1860 sector_t start, sector_t len, void *data) 1861 { 1862 return !bdev_nowait(dev->bdev); 1863 } 1864 1865 static bool dm_table_supports_nowait(struct dm_table *t) 1866 { 1867 for (unsigned int i = 0; i < t->num_targets; i++) { 1868 struct dm_target *ti = dm_table_get_target(t, i); 1869 1870 if (!dm_target_supports_nowait(ti->type)) 1871 return false; 1872 1873 if (!ti->type->iterate_devices || 1874 ti->type->iterate_devices(ti, device_not_nowait_capable, NULL)) 1875 return false; 1876 } 1877 1878 return true; 1879 } 1880 1881 static int device_not_discard_capable(struct dm_target *ti, struct dm_dev *dev, 1882 sector_t start, sector_t len, void *data) 1883 { 1884 return !bdev_max_discard_sectors(dev->bdev); 1885 } 1886 1887 static bool dm_table_supports_discards(struct dm_table *t) 1888 { 1889 for (unsigned int i = 0; i < t->num_targets; i++) { 1890 struct dm_target *ti = dm_table_get_target(t, i); 1891 1892 if (!ti->num_discard_bios) 1893 return false; 1894 1895 /* 1896 * Either the target provides discard support (as implied by setting 1897 * 'discards_supported') or it relies on _all_ data devices having 1898 * discard support. 1899 */ 1900 if (!ti->discards_supported && 1901 (!ti->type->iterate_devices || 1902 ti->type->iterate_devices(ti, device_not_discard_capable, NULL))) 1903 return false; 1904 } 1905 1906 return true; 1907 } 1908 1909 static int device_not_secure_erase_capable(struct dm_target *ti, 1910 struct dm_dev *dev, sector_t start, 1911 sector_t len, void *data) 1912 { 1913 return !bdev_max_secure_erase_sectors(dev->bdev); 1914 } 1915 1916 static bool dm_table_supports_secure_erase(struct dm_table *t) 1917 { 1918 for (unsigned int i = 0; i < t->num_targets; i++) { 1919 struct dm_target *ti = dm_table_get_target(t, i); 1920 1921 if (!ti->num_secure_erase_bios) 1922 return false; 1923 1924 if (!ti->type->iterate_devices || 1925 ti->type->iterate_devices(ti, device_not_secure_erase_capable, NULL)) 1926 return false; 1927 } 1928 1929 return true; 1930 } 1931 1932 static int device_requires_stable_pages(struct dm_target *ti, 1933 struct dm_dev *dev, sector_t start, 1934 sector_t len, void *data) 1935 { 1936 return bdev_stable_writes(dev->bdev); 1937 } 1938 1939 int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 1940 struct queue_limits *limits) 1941 { 1942 bool wc = false, fua = false; 1943 int r; 1944 1945 /* 1946 * Copy table's limits to the DM device's request_queue 1947 */ 1948 q->limits = *limits; 1949 1950 if (dm_table_supports_nowait(t)) 1951 blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); 1952 else 1953 blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q); 1954 1955 if (!dm_table_supports_discards(t)) { 1956 q->limits.max_discard_sectors = 0; 1957 q->limits.max_hw_discard_sectors = 0; 1958 q->limits.discard_granularity = 0; 1959 q->limits.discard_alignment = 0; 1960 q->limits.discard_misaligned = 0; 1961 } 1962 1963 if (!dm_table_supports_secure_erase(t)) 1964 q->limits.max_secure_erase_sectors = 0; 1965 1966 if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { 1967 wc = true; 1968 if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA))) 1969 fua = true; 1970 } 1971 blk_queue_write_cache(q, wc, fua); 1972 1973 if (dm_table_supports_dax(t, device_not_dax_capable)) { 1974 blk_queue_flag_set(QUEUE_FLAG_DAX, q); 1975 if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) 1976 set_dax_synchronous(t->md->dax_dev); 1977 } else 1978 blk_queue_flag_clear(QUEUE_FLAG_DAX, q); 1979 1980 if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) 1981 dax_write_cache(t->md->dax_dev, true); 1982 1983 /* Ensure that all underlying devices are non-rotational. */ 1984 if (dm_table_any_dev_attr(t, device_is_rotational, NULL)) 1985 blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); 1986 else 1987 blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 1988 1989 if (!dm_table_supports_write_zeroes(t)) 1990 q->limits.max_write_zeroes_sectors = 0; 1991 1992 dm_table_verify_integrity(t); 1993 1994 /* 1995 * Some devices don't use blk_integrity but still want stable pages 1996 * because they do their own checksumming. 1997 * If any underlying device requires stable pages, a table must require 1998 * them as well. Only targets that support iterate_devices are considered: 1999 * don't want error, zero, etc to require stable pages. 2000 */ 2001 if (dm_table_any_dev_attr(t, device_requires_stable_pages, NULL)) 2002 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); 2003 else 2004 blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); 2005 2006 /* 2007 * Determine whether or not this queue's I/O timings contribute 2008 * to the entropy pool, Only request-based targets use this. 2009 * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not 2010 * have it set. 2011 */ 2012 if (blk_queue_add_random(q) && 2013 dm_table_any_dev_attr(t, device_is_not_random, NULL)) 2014 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); 2015 2016 /* 2017 * For a zoned target, setup the zones related queue attributes 2018 * and resources necessary for zone append emulation if necessary. 2019 */ 2020 if (blk_queue_is_zoned(q)) { 2021 r = dm_set_zones_restrictions(t, q); 2022 if (r) 2023 return r; 2024 if (!static_key_enabled(&zoned_enabled.key)) 2025 static_branch_enable(&zoned_enabled); 2026 } 2027 2028 dm_update_crypto_profile(q, t); 2029 disk_update_readahead(t->md->disk); 2030 2031 /* 2032 * Check for request-based device is left to 2033 * dm_mq_init_request_queue()->blk_mq_init_allocated_queue(). 2034 * 2035 * For bio-based device, only set QUEUE_FLAG_POLL when all 2036 * underlying devices supporting polling. 2037 */ 2038 if (__table_type_bio_based(t->type)) { 2039 if (dm_table_supports_poll(t)) 2040 blk_queue_flag_set(QUEUE_FLAG_POLL, q); 2041 else 2042 blk_queue_flag_clear(QUEUE_FLAG_POLL, q); 2043 } 2044 2045 return 0; 2046 } 2047 2048 struct list_head *dm_table_get_devices(struct dm_table *t) 2049 { 2050 return &t->devices; 2051 } 2052 2053 fmode_t dm_table_get_mode(struct dm_table *t) 2054 { 2055 return t->mode; 2056 } 2057 EXPORT_SYMBOL(dm_table_get_mode); 2058 2059 enum suspend_mode { 2060 PRESUSPEND, 2061 PRESUSPEND_UNDO, 2062 POSTSUSPEND, 2063 }; 2064 2065 static void suspend_targets(struct dm_table *t, enum suspend_mode mode) 2066 { 2067 lockdep_assert_held(&t->md->suspend_lock); 2068 2069 for (unsigned int i = 0; i < t->num_targets; i++) { 2070 struct dm_target *ti = dm_table_get_target(t, i); 2071 2072 switch (mode) { 2073 case PRESUSPEND: 2074 if (ti->type->presuspend) 2075 ti->type->presuspend(ti); 2076 break; 2077 case PRESUSPEND_UNDO: 2078 if (ti->type->presuspend_undo) 2079 ti->type->presuspend_undo(ti); 2080 break; 2081 case POSTSUSPEND: 2082 if (ti->type->postsuspend) 2083 ti->type->postsuspend(ti); 2084 break; 2085 } 2086 } 2087 } 2088 2089 void dm_table_presuspend_targets(struct dm_table *t) 2090 { 2091 if (!t) 2092 return; 2093 2094 suspend_targets(t, PRESUSPEND); 2095 } 2096 2097 void dm_table_presuspend_undo_targets(struct dm_table *t) 2098 { 2099 if (!t) 2100 return; 2101 2102 suspend_targets(t, PRESUSPEND_UNDO); 2103 } 2104 2105 void dm_table_postsuspend_targets(struct dm_table *t) 2106 { 2107 if (!t) 2108 return; 2109 2110 suspend_targets(t, POSTSUSPEND); 2111 } 2112 2113 int dm_table_resume_targets(struct dm_table *t) 2114 { 2115 unsigned int i; 2116 int r = 0; 2117 2118 lockdep_assert_held(&t->md->suspend_lock); 2119 2120 for (i = 0; i < t->num_targets; i++) { 2121 struct dm_target *ti = dm_table_get_target(t, i); 2122 2123 if (!ti->type->preresume) 2124 continue; 2125 2126 r = ti->type->preresume(ti); 2127 if (r) { 2128 DMERR("%s: %s: preresume failed, error = %d", 2129 dm_device_name(t->md), ti->type->name, r); 2130 return r; 2131 } 2132 } 2133 2134 for (i = 0; i < t->num_targets; i++) { 2135 struct dm_target *ti = dm_table_get_target(t, i); 2136 2137 if (ti->type->resume) 2138 ti->type->resume(ti); 2139 } 2140 2141 return 0; 2142 } 2143 2144 struct mapped_device *dm_table_get_md(struct dm_table *t) 2145 { 2146 return t->md; 2147 } 2148 EXPORT_SYMBOL(dm_table_get_md); 2149 2150 const char *dm_table_device_name(struct dm_table *t) 2151 { 2152 return dm_device_name(t->md); 2153 } 2154 EXPORT_SYMBOL_GPL(dm_table_device_name); 2155 2156 void dm_table_run_md_queue_async(struct dm_table *t) 2157 { 2158 if (!dm_table_request_based(t)) 2159 return; 2160 2161 if (t->md->queue) 2162 blk_mq_run_hw_queues(t->md->queue, true); 2163 } 2164 EXPORT_SYMBOL(dm_table_run_md_queue_async); 2165 2166