1 /* 2 * Copyright (C) 2001 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm-core.h" 9 #include "dm-rq.h" 10 11 #include <linux/module.h> 12 #include <linux/vmalloc.h> 13 #include <linux/blkdev.h> 14 #include <linux/blk-integrity.h> 15 #include <linux/namei.h> 16 #include <linux/ctype.h> 17 #include <linux/string.h> 18 #include <linux/slab.h> 19 #include <linux/interrupt.h> 20 #include <linux/mutex.h> 21 #include <linux/delay.h> 22 #include <linux/atomic.h> 23 #include <linux/blk-mq.h> 24 #include <linux/mount.h> 25 #include <linux/dax.h> 26 27 #define DM_MSG_PREFIX "table" 28 29 #define NODE_SIZE L1_CACHE_BYTES 30 #define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) 31 #define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) 32 33 /* 34 * Similar to ceiling(log_size(n)) 35 */ 36 static unsigned int int_log(unsigned int n, unsigned int base) 37 { 38 int result = 0; 39 40 while (n > 1) { 41 n = dm_div_up(n, base); 42 result++; 43 } 44 45 return result; 46 } 47 48 /* 49 * Calculate the index of the child node of the n'th node k'th key. 50 */ 51 static inline unsigned int get_child(unsigned int n, unsigned int k) 52 { 53 return (n * CHILDREN_PER_NODE) + k; 54 } 55 56 /* 57 * Return the n'th node of level l from table t. 58 */ 59 static inline sector_t *get_node(struct dm_table *t, 60 unsigned int l, unsigned int n) 61 { 62 return t->index[l] + (n * KEYS_PER_NODE); 63 } 64 65 /* 66 * Return the highest key that you could lookup from the n'th 67 * node on level l of the btree. 68 */ 69 static sector_t high(struct dm_table *t, unsigned int l, unsigned int n) 70 { 71 for (; l < t->depth - 1; l++) 72 n = get_child(n, CHILDREN_PER_NODE - 1); 73 74 if (n >= t->counts[l]) 75 return (sector_t) - 1; 76 77 return get_node(t, l, n)[KEYS_PER_NODE - 1]; 78 } 79 80 /* 81 * Fills in a level of the btree based on the highs of the level 82 * below it. 83 */ 84 static int setup_btree_index(unsigned int l, struct dm_table *t) 85 { 86 unsigned int n, k; 87 sector_t *node; 88 89 for (n = 0U; n < t->counts[l]; n++) { 90 node = get_node(t, l, n); 91 92 for (k = 0U; k < KEYS_PER_NODE; k++) 93 node[k] = high(t, l + 1, get_child(n, k)); 94 } 95 96 return 0; 97 } 98 99 /* 100 * highs, and targets are managed as dynamic arrays during a 101 * table load. 102 */ 103 static int alloc_targets(struct dm_table *t, unsigned int num) 104 { 105 sector_t *n_highs; 106 struct dm_target *n_targets; 107 108 /* 109 * Allocate both the target array and offset array at once. 110 */ 111 n_highs = kvcalloc(num, sizeof(struct dm_target) + sizeof(sector_t), 112 GFP_KERNEL); 113 if (!n_highs) 114 return -ENOMEM; 115 116 n_targets = (struct dm_target *) (n_highs + num); 117 118 memset(n_highs, -1, sizeof(*n_highs) * num); 119 kvfree(t->highs); 120 121 t->num_allocated = num; 122 t->highs = n_highs; 123 t->targets = n_targets; 124 125 return 0; 126 } 127 128 int dm_table_create(struct dm_table **result, fmode_t mode, 129 unsigned num_targets, struct mapped_device *md) 130 { 131 struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL); 132 133 if (!t) 134 return -ENOMEM; 135 136 INIT_LIST_HEAD(&t->devices); 137 138 if (!num_targets) 139 num_targets = KEYS_PER_NODE; 140 141 num_targets = dm_round_up(num_targets, KEYS_PER_NODE); 142 143 if (!num_targets) { 144 kfree(t); 145 return -ENOMEM; 146 } 147 148 if (alloc_targets(t, num_targets)) { 149 kfree(t); 150 return -ENOMEM; 151 } 152 153 t->type = DM_TYPE_NONE; 154 t->mode = mode; 155 t->md = md; 156 *result = t; 157 return 0; 158 } 159 160 static void free_devices(struct list_head *devices, struct mapped_device *md) 161 { 162 struct list_head *tmp, *next; 163 164 list_for_each_safe(tmp, next, devices) { 165 struct dm_dev_internal *dd = 166 list_entry(tmp, struct dm_dev_internal, list); 167 DMWARN("%s: dm_table_destroy: dm_put_device call missing for %s", 168 dm_device_name(md), dd->dm_dev->name); 169 dm_put_table_device(md, dd->dm_dev); 170 kfree(dd); 171 } 172 } 173 174 static void dm_table_destroy_crypto_profile(struct dm_table *t); 175 176 void dm_table_destroy(struct dm_table *t) 177 { 178 if (!t) 179 return; 180 181 /* free the indexes */ 182 if (t->depth >= 2) 183 kvfree(t->index[t->depth - 2]); 184 185 /* free the targets */ 186 for (unsigned int i = 0; i < t->num_targets; i++) { 187 struct dm_target *ti = dm_table_get_target(t, i); 188 189 if (ti->type->dtr) 190 ti->type->dtr(ti); 191 192 dm_put_target_type(ti->type); 193 } 194 195 kvfree(t->highs); 196 197 /* free the device list */ 198 free_devices(&t->devices, t->md); 199 200 dm_free_md_mempools(t->mempools); 201 202 dm_table_destroy_crypto_profile(t); 203 204 kfree(t); 205 } 206 207 /* 208 * See if we've already got a device in the list. 209 */ 210 static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev) 211 { 212 struct dm_dev_internal *dd; 213 214 list_for_each_entry (dd, l, list) 215 if (dd->dm_dev->bdev->bd_dev == dev) 216 return dd; 217 218 return NULL; 219 } 220 221 /* 222 * If possible, this checks an area of a destination device is invalid. 223 */ 224 static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, 225 sector_t start, sector_t len, void *data) 226 { 227 struct queue_limits *limits = data; 228 struct block_device *bdev = dev->bdev; 229 sector_t dev_size = bdev_nr_sectors(bdev); 230 unsigned short logical_block_size_sectors = 231 limits->logical_block_size >> SECTOR_SHIFT; 232 233 if (!dev_size) 234 return 0; 235 236 if ((start >= dev_size) || (start + len > dev_size)) { 237 DMWARN("%s: %pg too small for target: " 238 "start=%llu, len=%llu, dev_size=%llu", 239 dm_device_name(ti->table->md), bdev, 240 (unsigned long long)start, 241 (unsigned long long)len, 242 (unsigned long long)dev_size); 243 return 1; 244 } 245 246 /* 247 * If the target is mapped to zoned block device(s), check 248 * that the zones are not partially mapped. 249 */ 250 if (bdev_is_zoned(bdev)) { 251 unsigned int zone_sectors = bdev_zone_sectors(bdev); 252 253 if (start & (zone_sectors - 1)) { 254 DMWARN("%s: start=%llu not aligned to h/w zone size %u of %pg", 255 dm_device_name(ti->table->md), 256 (unsigned long long)start, 257 zone_sectors, bdev); 258 return 1; 259 } 260 261 /* 262 * Note: The last zone of a zoned block device may be smaller 263 * than other zones. So for a target mapping the end of a 264 * zoned block device with such a zone, len would not be zone 265 * aligned. We do not allow such last smaller zone to be part 266 * of the mapping here to ensure that mappings with multiple 267 * devices do not end up with a smaller zone in the middle of 268 * the sector range. 269 */ 270 if (len & (zone_sectors - 1)) { 271 DMWARN("%s: len=%llu not aligned to h/w zone size %u of %pg", 272 dm_device_name(ti->table->md), 273 (unsigned long long)len, 274 zone_sectors, bdev); 275 return 1; 276 } 277 } 278 279 if (logical_block_size_sectors <= 1) 280 return 0; 281 282 if (start & (logical_block_size_sectors - 1)) { 283 DMWARN("%s: start=%llu not aligned to h/w " 284 "logical block size %u of %pg", 285 dm_device_name(ti->table->md), 286 (unsigned long long)start, 287 limits->logical_block_size, bdev); 288 return 1; 289 } 290 291 if (len & (logical_block_size_sectors - 1)) { 292 DMWARN("%s: len=%llu not aligned to h/w " 293 "logical block size %u of %pg", 294 dm_device_name(ti->table->md), 295 (unsigned long long)len, 296 limits->logical_block_size, bdev); 297 return 1; 298 } 299 300 return 0; 301 } 302 303 /* 304 * This upgrades the mode on an already open dm_dev, being 305 * careful to leave things as they were if we fail to reopen the 306 * device and not to touch the existing bdev field in case 307 * it is accessed concurrently. 308 */ 309 static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, 310 struct mapped_device *md) 311 { 312 int r; 313 struct dm_dev *old_dev, *new_dev; 314 315 old_dev = dd->dm_dev; 316 317 r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev, 318 dd->dm_dev->mode | new_mode, &new_dev); 319 if (r) 320 return r; 321 322 dd->dm_dev = new_dev; 323 dm_put_table_device(md, old_dev); 324 325 return 0; 326 } 327 328 /* 329 * Convert the path to a device 330 */ 331 dev_t dm_get_dev_t(const char *path) 332 { 333 dev_t dev; 334 335 if (lookup_bdev(path, &dev)) 336 dev = name_to_dev_t(path); 337 return dev; 338 } 339 EXPORT_SYMBOL_GPL(dm_get_dev_t); 340 341 /* 342 * Add a device to the list, or just increment the usage count if 343 * it's already present. 344 */ 345 int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, 346 struct dm_dev **result) 347 { 348 int r; 349 dev_t dev; 350 unsigned int major, minor; 351 char dummy; 352 struct dm_dev_internal *dd; 353 struct dm_table *t = ti->table; 354 355 BUG_ON(!t); 356 357 if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { 358 /* Extract the major/minor numbers */ 359 dev = MKDEV(major, minor); 360 if (MAJOR(dev) != major || MINOR(dev) != minor) 361 return -EOVERFLOW; 362 } else { 363 dev = dm_get_dev_t(path); 364 if (!dev) 365 return -ENODEV; 366 } 367 368 dd = find_device(&t->devices, dev); 369 if (!dd) { 370 dd = kmalloc(sizeof(*dd), GFP_KERNEL); 371 if (!dd) 372 return -ENOMEM; 373 374 if ((r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev))) { 375 kfree(dd); 376 return r; 377 } 378 379 refcount_set(&dd->count, 1); 380 list_add(&dd->list, &t->devices); 381 goto out; 382 383 } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) { 384 r = upgrade_mode(dd, mode, t->md); 385 if (r) 386 return r; 387 } 388 refcount_inc(&dd->count); 389 out: 390 *result = dd->dm_dev; 391 return 0; 392 } 393 EXPORT_SYMBOL(dm_get_device); 394 395 static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, 396 sector_t start, sector_t len, void *data) 397 { 398 struct queue_limits *limits = data; 399 struct block_device *bdev = dev->bdev; 400 struct request_queue *q = bdev_get_queue(bdev); 401 402 if (unlikely(!q)) { 403 DMWARN("%s: Cannot set limits for nonexistent device %pg", 404 dm_device_name(ti->table->md), bdev); 405 return 0; 406 } 407 408 if (blk_stack_limits(limits, &q->limits, 409 get_start_sect(bdev) + start) < 0) 410 DMWARN("%s: adding target device %pg caused an alignment inconsistency: " 411 "physical_block_size=%u, logical_block_size=%u, " 412 "alignment_offset=%u, start=%llu", 413 dm_device_name(ti->table->md), bdev, 414 q->limits.physical_block_size, 415 q->limits.logical_block_size, 416 q->limits.alignment_offset, 417 (unsigned long long) start << SECTOR_SHIFT); 418 return 0; 419 } 420 421 /* 422 * Decrement a device's use count and remove it if necessary. 423 */ 424 void dm_put_device(struct dm_target *ti, struct dm_dev *d) 425 { 426 int found = 0; 427 struct list_head *devices = &ti->table->devices; 428 struct dm_dev_internal *dd; 429 430 list_for_each_entry(dd, devices, list) { 431 if (dd->dm_dev == d) { 432 found = 1; 433 break; 434 } 435 } 436 if (!found) { 437 DMWARN("%s: device %s not in table devices list", 438 dm_device_name(ti->table->md), d->name); 439 return; 440 } 441 if (refcount_dec_and_test(&dd->count)) { 442 dm_put_table_device(ti->table->md, d); 443 list_del(&dd->list); 444 kfree(dd); 445 } 446 } 447 EXPORT_SYMBOL(dm_put_device); 448 449 /* 450 * Checks to see if the target joins onto the end of the table. 451 */ 452 static int adjoin(struct dm_table *t, struct dm_target *ti) 453 { 454 struct dm_target *prev; 455 456 if (!t->num_targets) 457 return !ti->begin; 458 459 prev = &t->targets[t->num_targets - 1]; 460 return (ti->begin == (prev->begin + prev->len)); 461 } 462 463 /* 464 * Used to dynamically allocate the arg array. 465 * 466 * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must 467 * process messages even if some device is suspended. These messages have a 468 * small fixed number of arguments. 469 * 470 * On the other hand, dm-switch needs to process bulk data using messages and 471 * excessive use of GFP_NOIO could cause trouble. 472 */ 473 static char **realloc_argv(unsigned *size, char **old_argv) 474 { 475 char **argv; 476 unsigned new_size; 477 gfp_t gfp; 478 479 if (*size) { 480 new_size = *size * 2; 481 gfp = GFP_KERNEL; 482 } else { 483 new_size = 8; 484 gfp = GFP_NOIO; 485 } 486 argv = kmalloc_array(new_size, sizeof(*argv), gfp); 487 if (argv && old_argv) { 488 memcpy(argv, old_argv, *size * sizeof(*argv)); 489 *size = new_size; 490 } 491 492 kfree(old_argv); 493 return argv; 494 } 495 496 /* 497 * Destructively splits up the argument list to pass to ctr. 498 */ 499 int dm_split_args(int *argc, char ***argvp, char *input) 500 { 501 char *start, *end = input, *out, **argv = NULL; 502 unsigned array_size = 0; 503 504 *argc = 0; 505 506 if (!input) { 507 *argvp = NULL; 508 return 0; 509 } 510 511 argv = realloc_argv(&array_size, argv); 512 if (!argv) 513 return -ENOMEM; 514 515 while (1) { 516 /* Skip whitespace */ 517 start = skip_spaces(end); 518 519 if (!*start) 520 break; /* success, we hit the end */ 521 522 /* 'out' is used to remove any back-quotes */ 523 end = out = start; 524 while (*end) { 525 /* Everything apart from '\0' can be quoted */ 526 if (*end == '\\' && *(end + 1)) { 527 *out++ = *(end + 1); 528 end += 2; 529 continue; 530 } 531 532 if (isspace(*end)) 533 break; /* end of token */ 534 535 *out++ = *end++; 536 } 537 538 /* have we already filled the array ? */ 539 if ((*argc + 1) > array_size) { 540 argv = realloc_argv(&array_size, argv); 541 if (!argv) 542 return -ENOMEM; 543 } 544 545 /* we know this is whitespace */ 546 if (*end) 547 end++; 548 549 /* terminate the string and put it in the array */ 550 *out = '\0'; 551 argv[*argc] = start; 552 (*argc)++; 553 } 554 555 *argvp = argv; 556 return 0; 557 } 558 559 /* 560 * Impose necessary and sufficient conditions on a devices's table such 561 * that any incoming bio which respects its logical_block_size can be 562 * processed successfully. If it falls across the boundary between 563 * two or more targets, the size of each piece it gets split into must 564 * be compatible with the logical_block_size of the target processing it. 565 */ 566 static int validate_hardware_logical_block_alignment(struct dm_table *t, 567 struct queue_limits *limits) 568 { 569 /* 570 * This function uses arithmetic modulo the logical_block_size 571 * (in units of 512-byte sectors). 572 */ 573 unsigned short device_logical_block_size_sects = 574 limits->logical_block_size >> SECTOR_SHIFT; 575 576 /* 577 * Offset of the start of the next table entry, mod logical_block_size. 578 */ 579 unsigned short next_target_start = 0; 580 581 /* 582 * Given an aligned bio that extends beyond the end of a 583 * target, how many sectors must the next target handle? 584 */ 585 unsigned short remaining = 0; 586 587 struct dm_target *ti; 588 struct queue_limits ti_limits; 589 unsigned int i; 590 591 /* 592 * Check each entry in the table in turn. 593 */ 594 for (i = 0; i < t->num_targets; i++) { 595 ti = dm_table_get_target(t, i); 596 597 blk_set_stacking_limits(&ti_limits); 598 599 /* combine all target devices' limits */ 600 if (ti->type->iterate_devices) 601 ti->type->iterate_devices(ti, dm_set_device_limits, 602 &ti_limits); 603 604 /* 605 * If the remaining sectors fall entirely within this 606 * table entry are they compatible with its logical_block_size? 607 */ 608 if (remaining < ti->len && 609 remaining & ((ti_limits.logical_block_size >> 610 SECTOR_SHIFT) - 1)) 611 break; /* Error */ 612 613 next_target_start = 614 (unsigned short) ((next_target_start + ti->len) & 615 (device_logical_block_size_sects - 1)); 616 remaining = next_target_start ? 617 device_logical_block_size_sects - next_target_start : 0; 618 } 619 620 if (remaining) { 621 DMWARN("%s: table line %u (start sect %llu len %llu) " 622 "not aligned to h/w logical block size %u", 623 dm_device_name(t->md), i, 624 (unsigned long long) ti->begin, 625 (unsigned long long) ti->len, 626 limits->logical_block_size); 627 return -EINVAL; 628 } 629 630 return 0; 631 } 632 633 int dm_table_add_target(struct dm_table *t, const char *type, 634 sector_t start, sector_t len, char *params) 635 { 636 int r = -EINVAL, argc; 637 char **argv; 638 struct dm_target *ti; 639 640 if (t->singleton) { 641 DMERR("%s: target type %s must appear alone in table", 642 dm_device_name(t->md), t->targets->type->name); 643 return -EINVAL; 644 } 645 646 BUG_ON(t->num_targets >= t->num_allocated); 647 648 ti = t->targets + t->num_targets; 649 memset(ti, 0, sizeof(*ti)); 650 651 if (!len) { 652 DMERR("%s: zero-length target", dm_device_name(t->md)); 653 return -EINVAL; 654 } 655 656 ti->type = dm_get_target_type(type); 657 if (!ti->type) { 658 DMERR("%s: %s: unknown target type", dm_device_name(t->md), type); 659 return -EINVAL; 660 } 661 662 if (dm_target_needs_singleton(ti->type)) { 663 if (t->num_targets) { 664 ti->error = "singleton target type must appear alone in table"; 665 goto bad; 666 } 667 t->singleton = true; 668 } 669 670 if (dm_target_always_writeable(ti->type) && !(t->mode & FMODE_WRITE)) { 671 ti->error = "target type may not be included in a read-only table"; 672 goto bad; 673 } 674 675 if (t->immutable_target_type) { 676 if (t->immutable_target_type != ti->type) { 677 ti->error = "immutable target type cannot be mixed with other target types"; 678 goto bad; 679 } 680 } else if (dm_target_is_immutable(ti->type)) { 681 if (t->num_targets) { 682 ti->error = "immutable target type cannot be mixed with other target types"; 683 goto bad; 684 } 685 t->immutable_target_type = ti->type; 686 } 687 688 if (dm_target_has_integrity(ti->type)) 689 t->integrity_added = 1; 690 691 ti->table = t; 692 ti->begin = start; 693 ti->len = len; 694 ti->error = "Unknown error"; 695 696 /* 697 * Does this target adjoin the previous one ? 698 */ 699 if (!adjoin(t, ti)) { 700 ti->error = "Gap in table"; 701 goto bad; 702 } 703 704 r = dm_split_args(&argc, &argv, params); 705 if (r) { 706 ti->error = "couldn't split parameters"; 707 goto bad; 708 } 709 710 r = ti->type->ctr(ti, argc, argv); 711 kfree(argv); 712 if (r) 713 goto bad; 714 715 t->highs[t->num_targets++] = ti->begin + ti->len - 1; 716 717 if (!ti->num_discard_bios && ti->discards_supported) 718 DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.", 719 dm_device_name(t->md), type); 720 721 if (ti->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key)) 722 static_branch_enable(&swap_bios_enabled); 723 724 return 0; 725 726 bad: 727 DMERR("%s: %s: %s (%pe)", dm_device_name(t->md), type, ti->error, ERR_PTR(r)); 728 dm_put_target_type(ti->type); 729 return r; 730 } 731 732 /* 733 * Target argument parsing helpers. 734 */ 735 static int validate_next_arg(const struct dm_arg *arg, 736 struct dm_arg_set *arg_set, 737 unsigned *value, char **error, unsigned grouped) 738 { 739 const char *arg_str = dm_shift_arg(arg_set); 740 char dummy; 741 742 if (!arg_str || 743 (sscanf(arg_str, "%u%c", value, &dummy) != 1) || 744 (*value < arg->min) || 745 (*value > arg->max) || 746 (grouped && arg_set->argc < *value)) { 747 *error = arg->error; 748 return -EINVAL; 749 } 750 751 return 0; 752 } 753 754 int dm_read_arg(const struct dm_arg *arg, struct dm_arg_set *arg_set, 755 unsigned *value, char **error) 756 { 757 return validate_next_arg(arg, arg_set, value, error, 0); 758 } 759 EXPORT_SYMBOL(dm_read_arg); 760 761 int dm_read_arg_group(const struct dm_arg *arg, struct dm_arg_set *arg_set, 762 unsigned *value, char **error) 763 { 764 return validate_next_arg(arg, arg_set, value, error, 1); 765 } 766 EXPORT_SYMBOL(dm_read_arg_group); 767 768 const char *dm_shift_arg(struct dm_arg_set *as) 769 { 770 char *r; 771 772 if (as->argc) { 773 as->argc--; 774 r = *as->argv; 775 as->argv++; 776 return r; 777 } 778 779 return NULL; 780 } 781 EXPORT_SYMBOL(dm_shift_arg); 782 783 void dm_consume_args(struct dm_arg_set *as, unsigned num_args) 784 { 785 BUG_ON(as->argc < num_args); 786 as->argc -= num_args; 787 as->argv += num_args; 788 } 789 EXPORT_SYMBOL(dm_consume_args); 790 791 static bool __table_type_bio_based(enum dm_queue_mode table_type) 792 { 793 return (table_type == DM_TYPE_BIO_BASED || 794 table_type == DM_TYPE_DAX_BIO_BASED); 795 } 796 797 static bool __table_type_request_based(enum dm_queue_mode table_type) 798 { 799 return table_type == DM_TYPE_REQUEST_BASED; 800 } 801 802 void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type) 803 { 804 t->type = type; 805 } 806 EXPORT_SYMBOL_GPL(dm_table_set_type); 807 808 /* validate the dax capability of the target device span */ 809 static int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev, 810 sector_t start, sector_t len, void *data) 811 { 812 if (dev->dax_dev) 813 return false; 814 815 DMDEBUG("%pg: error: dax unsupported by block device", dev->bdev); 816 return true; 817 } 818 819 /* Check devices support synchronous DAX */ 820 static int device_not_dax_synchronous_capable(struct dm_target *ti, struct dm_dev *dev, 821 sector_t start, sector_t len, void *data) 822 { 823 return !dev->dax_dev || !dax_synchronous(dev->dax_dev); 824 } 825 826 static bool dm_table_supports_dax(struct dm_table *t, 827 iterate_devices_callout_fn iterate_fn) 828 { 829 /* Ensure that all targets support DAX. */ 830 for (unsigned int i = 0; i < t->num_targets; i++) { 831 struct dm_target *ti = dm_table_get_target(t, i); 832 833 if (!ti->type->direct_access) 834 return false; 835 836 if (!ti->type->iterate_devices || 837 ti->type->iterate_devices(ti, iterate_fn, NULL)) 838 return false; 839 } 840 841 return true; 842 } 843 844 static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev, 845 sector_t start, sector_t len, void *data) 846 { 847 struct block_device *bdev = dev->bdev; 848 struct request_queue *q = bdev_get_queue(bdev); 849 850 /* request-based cannot stack on partitions! */ 851 if (bdev_is_partition(bdev)) 852 return false; 853 854 return queue_is_mq(q); 855 } 856 857 static int dm_table_determine_type(struct dm_table *t) 858 { 859 unsigned bio_based = 0, request_based = 0, hybrid = 0; 860 struct dm_target *ti; 861 struct list_head *devices = dm_table_get_devices(t); 862 enum dm_queue_mode live_md_type = dm_get_md_type(t->md); 863 864 if (t->type != DM_TYPE_NONE) { 865 /* target already set the table's type */ 866 if (t->type == DM_TYPE_BIO_BASED) { 867 /* possibly upgrade to a variant of bio-based */ 868 goto verify_bio_based; 869 } 870 BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED); 871 goto verify_rq_based; 872 } 873 874 for (unsigned int i = 0; i < t->num_targets; i++) { 875 ti = dm_table_get_target(t, i); 876 if (dm_target_hybrid(ti)) 877 hybrid = 1; 878 else if (dm_target_request_based(ti)) 879 request_based = 1; 880 else 881 bio_based = 1; 882 883 if (bio_based && request_based) { 884 DMERR("Inconsistent table: different target types" 885 " can't be mixed up"); 886 return -EINVAL; 887 } 888 } 889 890 if (hybrid && !bio_based && !request_based) { 891 /* 892 * The targets can work either way. 893 * Determine the type from the live device. 894 * Default to bio-based if device is new. 895 */ 896 if (__table_type_request_based(live_md_type)) 897 request_based = 1; 898 else 899 bio_based = 1; 900 } 901 902 if (bio_based) { 903 verify_bio_based: 904 /* We must use this table as bio-based */ 905 t->type = DM_TYPE_BIO_BASED; 906 if (dm_table_supports_dax(t, device_not_dax_capable) || 907 (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) { 908 t->type = DM_TYPE_DAX_BIO_BASED; 909 } 910 return 0; 911 } 912 913 BUG_ON(!request_based); /* No targets in this table */ 914 915 t->type = DM_TYPE_REQUEST_BASED; 916 917 verify_rq_based: 918 /* 919 * Request-based dm supports only tables that have a single target now. 920 * To support multiple targets, request splitting support is needed, 921 * and that needs lots of changes in the block-layer. 922 * (e.g. request completion process for partial completion.) 923 */ 924 if (t->num_targets > 1) { 925 DMERR("request-based DM doesn't support multiple targets"); 926 return -EINVAL; 927 } 928 929 if (list_empty(devices)) { 930 int srcu_idx; 931 struct dm_table *live_table = dm_get_live_table(t->md, &srcu_idx); 932 933 /* inherit live table's type */ 934 if (live_table) 935 t->type = live_table->type; 936 dm_put_live_table(t->md, srcu_idx); 937 return 0; 938 } 939 940 ti = dm_table_get_immutable_target(t); 941 if (!ti) { 942 DMERR("table load rejected: immutable target is required"); 943 return -EINVAL; 944 } else if (ti->max_io_len) { 945 DMERR("table load rejected: immutable target that splits IO is not supported"); 946 return -EINVAL; 947 } 948 949 /* Non-request-stackable devices can't be used for request-based dm */ 950 if (!ti->type->iterate_devices || 951 !ti->type->iterate_devices(ti, device_is_rq_stackable, NULL)) { 952 DMERR("table load rejected: including non-request-stackable devices"); 953 return -EINVAL; 954 } 955 956 return 0; 957 } 958 959 enum dm_queue_mode dm_table_get_type(struct dm_table *t) 960 { 961 return t->type; 962 } 963 964 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t) 965 { 966 return t->immutable_target_type; 967 } 968 969 struct dm_target *dm_table_get_immutable_target(struct dm_table *t) 970 { 971 /* Immutable target is implicitly a singleton */ 972 if (t->num_targets > 1 || 973 !dm_target_is_immutable(t->targets[0].type)) 974 return NULL; 975 976 return t->targets; 977 } 978 979 struct dm_target *dm_table_get_wildcard_target(struct dm_table *t) 980 { 981 for (unsigned int i = 0; i < t->num_targets; i++) { 982 struct dm_target *ti = dm_table_get_target(t, i); 983 984 if (dm_target_is_wildcard(ti->type)) 985 return ti; 986 } 987 988 return NULL; 989 } 990 991 bool dm_table_bio_based(struct dm_table *t) 992 { 993 return __table_type_bio_based(dm_table_get_type(t)); 994 } 995 996 bool dm_table_request_based(struct dm_table *t) 997 { 998 return __table_type_request_based(dm_table_get_type(t)); 999 } 1000 1001 static bool dm_table_supports_poll(struct dm_table *t); 1002 1003 static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) 1004 { 1005 enum dm_queue_mode type = dm_table_get_type(t); 1006 unsigned int per_io_data_size = 0, front_pad, io_front_pad; 1007 unsigned int min_pool_size = 0, pool_size; 1008 struct dm_md_mempools *pools; 1009 1010 if (unlikely(type == DM_TYPE_NONE)) { 1011 DMWARN("no table type is set, can't allocate mempools"); 1012 return -EINVAL; 1013 } 1014 1015 pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); 1016 if (!pools) 1017 return -ENOMEM; 1018 1019 if (type == DM_TYPE_REQUEST_BASED) { 1020 pool_size = dm_get_reserved_rq_based_ios(); 1021 front_pad = offsetof(struct dm_rq_clone_bio_info, clone); 1022 goto init_bs; 1023 } 1024 1025 for (unsigned int i = 0; i < t->num_targets; i++) { 1026 struct dm_target *ti = dm_table_get_target(t, i); 1027 1028 per_io_data_size = max(per_io_data_size, ti->per_io_data_size); 1029 min_pool_size = max(min_pool_size, ti->num_flush_bios); 1030 } 1031 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); 1032 front_pad = roundup(per_io_data_size, 1033 __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; 1034 1035 io_front_pad = roundup(per_io_data_size, 1036 __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; 1037 if (bioset_init(&pools->io_bs, pool_size, io_front_pad, 1038 dm_table_supports_poll(t) ? BIOSET_PERCPU_CACHE : 0)) 1039 goto out_free_pools; 1040 if (t->integrity_supported && 1041 bioset_integrity_create(&pools->io_bs, pool_size)) 1042 goto out_free_pools; 1043 init_bs: 1044 if (bioset_init(&pools->bs, pool_size, front_pad, 0)) 1045 goto out_free_pools; 1046 if (t->integrity_supported && 1047 bioset_integrity_create(&pools->bs, pool_size)) 1048 goto out_free_pools; 1049 1050 t->mempools = pools; 1051 return 0; 1052 1053 out_free_pools: 1054 dm_free_md_mempools(pools); 1055 return -ENOMEM; 1056 } 1057 1058 static int setup_indexes(struct dm_table *t) 1059 { 1060 int i; 1061 unsigned int total = 0; 1062 sector_t *indexes; 1063 1064 /* allocate the space for *all* the indexes */ 1065 for (i = t->depth - 2; i >= 0; i--) { 1066 t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE); 1067 total += t->counts[i]; 1068 } 1069 1070 indexes = kvcalloc(total, NODE_SIZE, GFP_KERNEL); 1071 if (!indexes) 1072 return -ENOMEM; 1073 1074 /* set up internal nodes, bottom-up */ 1075 for (i = t->depth - 2; i >= 0; i--) { 1076 t->index[i] = indexes; 1077 indexes += (KEYS_PER_NODE * t->counts[i]); 1078 setup_btree_index(i, t); 1079 } 1080 1081 return 0; 1082 } 1083 1084 /* 1085 * Builds the btree to index the map. 1086 */ 1087 static int dm_table_build_index(struct dm_table *t) 1088 { 1089 int r = 0; 1090 unsigned int leaf_nodes; 1091 1092 /* how many indexes will the btree have ? */ 1093 leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); 1094 t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); 1095 1096 /* leaf layer has already been set up */ 1097 t->counts[t->depth - 1] = leaf_nodes; 1098 t->index[t->depth - 1] = t->highs; 1099 1100 if (t->depth >= 2) 1101 r = setup_indexes(t); 1102 1103 return r; 1104 } 1105 1106 static bool integrity_profile_exists(struct gendisk *disk) 1107 { 1108 return !!blk_get_integrity(disk); 1109 } 1110 1111 /* 1112 * Get a disk whose integrity profile reflects the table's profile. 1113 * Returns NULL if integrity support was inconsistent or unavailable. 1114 */ 1115 static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t) 1116 { 1117 struct list_head *devices = dm_table_get_devices(t); 1118 struct dm_dev_internal *dd = NULL; 1119 struct gendisk *prev_disk = NULL, *template_disk = NULL; 1120 1121 for (unsigned int i = 0; i < t->num_targets; i++) { 1122 struct dm_target *ti = dm_table_get_target(t, i); 1123 1124 if (!dm_target_passes_integrity(ti->type)) 1125 goto no_integrity; 1126 } 1127 1128 list_for_each_entry(dd, devices, list) { 1129 template_disk = dd->dm_dev->bdev->bd_disk; 1130 if (!integrity_profile_exists(template_disk)) 1131 goto no_integrity; 1132 else if (prev_disk && 1133 blk_integrity_compare(prev_disk, template_disk) < 0) 1134 goto no_integrity; 1135 prev_disk = template_disk; 1136 } 1137 1138 return template_disk; 1139 1140 no_integrity: 1141 if (prev_disk) 1142 DMWARN("%s: integrity not set: %s and %s profile mismatch", 1143 dm_device_name(t->md), 1144 prev_disk->disk_name, 1145 template_disk->disk_name); 1146 return NULL; 1147 } 1148 1149 /* 1150 * Register the mapped device for blk_integrity support if the 1151 * underlying devices have an integrity profile. But all devices may 1152 * not have matching profiles (checking all devices isn't reliable 1153 * during table load because this table may use other DM device(s) which 1154 * must be resumed before they will have an initialized integity 1155 * profile). Consequently, stacked DM devices force a 2 stage integrity 1156 * profile validation: First pass during table load, final pass during 1157 * resume. 1158 */ 1159 static int dm_table_register_integrity(struct dm_table *t) 1160 { 1161 struct mapped_device *md = t->md; 1162 struct gendisk *template_disk = NULL; 1163 1164 /* If target handles integrity itself do not register it here. */ 1165 if (t->integrity_added) 1166 return 0; 1167 1168 template_disk = dm_table_get_integrity_disk(t); 1169 if (!template_disk) 1170 return 0; 1171 1172 if (!integrity_profile_exists(dm_disk(md))) { 1173 t->integrity_supported = true; 1174 /* 1175 * Register integrity profile during table load; we can do 1176 * this because the final profile must match during resume. 1177 */ 1178 blk_integrity_register(dm_disk(md), 1179 blk_get_integrity(template_disk)); 1180 return 0; 1181 } 1182 1183 /* 1184 * If DM device already has an initialized integrity 1185 * profile the new profile should not conflict. 1186 */ 1187 if (blk_integrity_compare(dm_disk(md), template_disk) < 0) { 1188 DMWARN("%s: conflict with existing integrity profile: " 1189 "%s profile mismatch", 1190 dm_device_name(t->md), 1191 template_disk->disk_name); 1192 return 1; 1193 } 1194 1195 /* Preserve existing integrity profile */ 1196 t->integrity_supported = true; 1197 return 0; 1198 } 1199 1200 #ifdef CONFIG_BLK_INLINE_ENCRYPTION 1201 1202 struct dm_crypto_profile { 1203 struct blk_crypto_profile profile; 1204 struct mapped_device *md; 1205 }; 1206 1207 struct dm_keyslot_evict_args { 1208 const struct blk_crypto_key *key; 1209 int err; 1210 }; 1211 1212 static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev, 1213 sector_t start, sector_t len, void *data) 1214 { 1215 struct dm_keyslot_evict_args *args = data; 1216 int err; 1217 1218 err = blk_crypto_evict_key(bdev_get_queue(dev->bdev), args->key); 1219 if (!args->err) 1220 args->err = err; 1221 /* Always try to evict the key from all devices. */ 1222 return 0; 1223 } 1224 1225 /* 1226 * When an inline encryption key is evicted from a device-mapper device, evict 1227 * it from all the underlying devices. 1228 */ 1229 static int dm_keyslot_evict(struct blk_crypto_profile *profile, 1230 const struct blk_crypto_key *key, unsigned int slot) 1231 { 1232 struct mapped_device *md = 1233 container_of(profile, struct dm_crypto_profile, profile)->md; 1234 struct dm_keyslot_evict_args args = { key }; 1235 struct dm_table *t; 1236 int srcu_idx; 1237 1238 t = dm_get_live_table(md, &srcu_idx); 1239 if (!t) 1240 return 0; 1241 1242 for (unsigned int i = 0; i < t->num_targets; i++) { 1243 struct dm_target *ti = dm_table_get_target(t, i); 1244 1245 if (!ti->type->iterate_devices) 1246 continue; 1247 ti->type->iterate_devices(ti, dm_keyslot_evict_callback, &args); 1248 } 1249 1250 dm_put_live_table(md, srcu_idx); 1251 return args.err; 1252 } 1253 1254 static int 1255 device_intersect_crypto_capabilities(struct dm_target *ti, struct dm_dev *dev, 1256 sector_t start, sector_t len, void *data) 1257 { 1258 struct blk_crypto_profile *parent = data; 1259 struct blk_crypto_profile *child = 1260 bdev_get_queue(dev->bdev)->crypto_profile; 1261 1262 blk_crypto_intersect_capabilities(parent, child); 1263 return 0; 1264 } 1265 1266 void dm_destroy_crypto_profile(struct blk_crypto_profile *profile) 1267 { 1268 struct dm_crypto_profile *dmcp = container_of(profile, 1269 struct dm_crypto_profile, 1270 profile); 1271 1272 if (!profile) 1273 return; 1274 1275 blk_crypto_profile_destroy(profile); 1276 kfree(dmcp); 1277 } 1278 1279 static void dm_table_destroy_crypto_profile(struct dm_table *t) 1280 { 1281 dm_destroy_crypto_profile(t->crypto_profile); 1282 t->crypto_profile = NULL; 1283 } 1284 1285 /* 1286 * Constructs and initializes t->crypto_profile with a crypto profile that 1287 * represents the common set of crypto capabilities of the devices described by 1288 * the dm_table. However, if the constructed crypto profile doesn't support all 1289 * crypto capabilities that are supported by the current mapped_device, it 1290 * returns an error instead, since we don't support removing crypto capabilities 1291 * on table changes. Finally, if the constructed crypto profile is "empty" (has 1292 * no crypto capabilities at all), it just sets t->crypto_profile to NULL. 1293 */ 1294 static int dm_table_construct_crypto_profile(struct dm_table *t) 1295 { 1296 struct dm_crypto_profile *dmcp; 1297 struct blk_crypto_profile *profile; 1298 unsigned int i; 1299 bool empty_profile = true; 1300 1301 dmcp = kmalloc(sizeof(*dmcp), GFP_KERNEL); 1302 if (!dmcp) 1303 return -ENOMEM; 1304 dmcp->md = t->md; 1305 1306 profile = &dmcp->profile; 1307 blk_crypto_profile_init(profile, 0); 1308 profile->ll_ops.keyslot_evict = dm_keyslot_evict; 1309 profile->max_dun_bytes_supported = UINT_MAX; 1310 memset(profile->modes_supported, 0xFF, 1311 sizeof(profile->modes_supported)); 1312 1313 for (i = 0; i < t->num_targets; i++) { 1314 struct dm_target *ti = dm_table_get_target(t, i); 1315 1316 if (!dm_target_passes_crypto(ti->type)) { 1317 blk_crypto_intersect_capabilities(profile, NULL); 1318 break; 1319 } 1320 if (!ti->type->iterate_devices) 1321 continue; 1322 ti->type->iterate_devices(ti, 1323 device_intersect_crypto_capabilities, 1324 profile); 1325 } 1326 1327 if (t->md->queue && 1328 !blk_crypto_has_capabilities(profile, 1329 t->md->queue->crypto_profile)) { 1330 DMWARN("Inline encryption capabilities of new DM table were more restrictive than the old table's. This is not supported!"); 1331 dm_destroy_crypto_profile(profile); 1332 return -EINVAL; 1333 } 1334 1335 /* 1336 * If the new profile doesn't actually support any crypto capabilities, 1337 * we may as well represent it with a NULL profile. 1338 */ 1339 for (i = 0; i < ARRAY_SIZE(profile->modes_supported); i++) { 1340 if (profile->modes_supported[i]) { 1341 empty_profile = false; 1342 break; 1343 } 1344 } 1345 1346 if (empty_profile) { 1347 dm_destroy_crypto_profile(profile); 1348 profile = NULL; 1349 } 1350 1351 /* 1352 * t->crypto_profile is only set temporarily while the table is being 1353 * set up, and it gets set to NULL after the profile has been 1354 * transferred to the request_queue. 1355 */ 1356 t->crypto_profile = profile; 1357 1358 return 0; 1359 } 1360 1361 static void dm_update_crypto_profile(struct request_queue *q, 1362 struct dm_table *t) 1363 { 1364 if (!t->crypto_profile) 1365 return; 1366 1367 /* Make the crypto profile less restrictive. */ 1368 if (!q->crypto_profile) { 1369 blk_crypto_register(t->crypto_profile, q); 1370 } else { 1371 blk_crypto_update_capabilities(q->crypto_profile, 1372 t->crypto_profile); 1373 dm_destroy_crypto_profile(t->crypto_profile); 1374 } 1375 t->crypto_profile = NULL; 1376 } 1377 1378 #else /* CONFIG_BLK_INLINE_ENCRYPTION */ 1379 1380 static int dm_table_construct_crypto_profile(struct dm_table *t) 1381 { 1382 return 0; 1383 } 1384 1385 void dm_destroy_crypto_profile(struct blk_crypto_profile *profile) 1386 { 1387 } 1388 1389 static void dm_table_destroy_crypto_profile(struct dm_table *t) 1390 { 1391 } 1392 1393 static void dm_update_crypto_profile(struct request_queue *q, 1394 struct dm_table *t) 1395 { 1396 } 1397 1398 #endif /* !CONFIG_BLK_INLINE_ENCRYPTION */ 1399 1400 /* 1401 * Prepares the table for use by building the indices, 1402 * setting the type, and allocating mempools. 1403 */ 1404 int dm_table_complete(struct dm_table *t) 1405 { 1406 int r; 1407 1408 r = dm_table_determine_type(t); 1409 if (r) { 1410 DMERR("unable to determine table type"); 1411 return r; 1412 } 1413 1414 r = dm_table_build_index(t); 1415 if (r) { 1416 DMERR("unable to build btrees"); 1417 return r; 1418 } 1419 1420 r = dm_table_register_integrity(t); 1421 if (r) { 1422 DMERR("could not register integrity profile."); 1423 return r; 1424 } 1425 1426 r = dm_table_construct_crypto_profile(t); 1427 if (r) { 1428 DMERR("could not construct crypto profile."); 1429 return r; 1430 } 1431 1432 r = dm_table_alloc_md_mempools(t, t->md); 1433 if (r) 1434 DMERR("unable to allocate mempools"); 1435 1436 return r; 1437 } 1438 1439 static DEFINE_MUTEX(_event_lock); 1440 void dm_table_event_callback(struct dm_table *t, 1441 void (*fn)(void *), void *context) 1442 { 1443 mutex_lock(&_event_lock); 1444 t->event_fn = fn; 1445 t->event_context = context; 1446 mutex_unlock(&_event_lock); 1447 } 1448 1449 void dm_table_event(struct dm_table *t) 1450 { 1451 mutex_lock(&_event_lock); 1452 if (t->event_fn) 1453 t->event_fn(t->event_context); 1454 mutex_unlock(&_event_lock); 1455 } 1456 EXPORT_SYMBOL(dm_table_event); 1457 1458 inline sector_t dm_table_get_size(struct dm_table *t) 1459 { 1460 return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; 1461 } 1462 EXPORT_SYMBOL(dm_table_get_size); 1463 1464 /* 1465 * Search the btree for the correct target. 1466 * 1467 * Caller should check returned pointer for NULL 1468 * to trap I/O beyond end of device. 1469 */ 1470 struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) 1471 { 1472 unsigned int l, n = 0, k = 0; 1473 sector_t *node; 1474 1475 if (unlikely(sector >= dm_table_get_size(t))) 1476 return NULL; 1477 1478 for (l = 0; l < t->depth; l++) { 1479 n = get_child(n, k); 1480 node = get_node(t, l, n); 1481 1482 for (k = 0; k < KEYS_PER_NODE; k++) 1483 if (node[k] >= sector) 1484 break; 1485 } 1486 1487 return &t->targets[(KEYS_PER_NODE * n) + k]; 1488 } 1489 1490 static int device_not_poll_capable(struct dm_target *ti, struct dm_dev *dev, 1491 sector_t start, sector_t len, void *data) 1492 { 1493 struct request_queue *q = bdev_get_queue(dev->bdev); 1494 1495 return !test_bit(QUEUE_FLAG_POLL, &q->queue_flags); 1496 } 1497 1498 /* 1499 * type->iterate_devices() should be called when the sanity check needs to 1500 * iterate and check all underlying data devices. iterate_devices() will 1501 * iterate all underlying data devices until it encounters a non-zero return 1502 * code, returned by whether the input iterate_devices_callout_fn, or 1503 * iterate_devices() itself internally. 1504 * 1505 * For some target type (e.g. dm-stripe), one call of iterate_devices() may 1506 * iterate multiple underlying devices internally, in which case a non-zero 1507 * return code returned by iterate_devices_callout_fn will stop the iteration 1508 * in advance. 1509 * 1510 * Cases requiring _any_ underlying device supporting some kind of attribute, 1511 * should use the iteration structure like dm_table_any_dev_attr(), or call 1512 * it directly. @func should handle semantics of positive examples, e.g. 1513 * capable of something. 1514 * 1515 * Cases requiring _all_ underlying devices supporting some kind of attribute, 1516 * should use the iteration structure like dm_table_supports_nowait() or 1517 * dm_table_supports_discards(). Or introduce dm_table_all_devs_attr() that 1518 * uses an @anti_func that handle semantics of counter examples, e.g. not 1519 * capable of something. So: return !dm_table_any_dev_attr(t, anti_func, data); 1520 */ 1521 static bool dm_table_any_dev_attr(struct dm_table *t, 1522 iterate_devices_callout_fn func, void *data) 1523 { 1524 for (unsigned int i = 0; i < t->num_targets; i++) { 1525 struct dm_target *ti = dm_table_get_target(t, i); 1526 1527 if (ti->type->iterate_devices && 1528 ti->type->iterate_devices(ti, func, data)) 1529 return true; 1530 } 1531 1532 return false; 1533 } 1534 1535 static int count_device(struct dm_target *ti, struct dm_dev *dev, 1536 sector_t start, sector_t len, void *data) 1537 { 1538 unsigned *num_devices = data; 1539 1540 (*num_devices)++; 1541 1542 return 0; 1543 } 1544 1545 static bool dm_table_supports_poll(struct dm_table *t) 1546 { 1547 for (unsigned int i = 0; i < t->num_targets; i++) { 1548 struct dm_target *ti = dm_table_get_target(t, i); 1549 1550 if (!ti->type->iterate_devices || 1551 ti->type->iterate_devices(ti, device_not_poll_capable, NULL)) 1552 return false; 1553 } 1554 1555 return true; 1556 } 1557 1558 /* 1559 * Check whether a table has no data devices attached using each 1560 * target's iterate_devices method. 1561 * Returns false if the result is unknown because a target doesn't 1562 * support iterate_devices. 1563 */ 1564 bool dm_table_has_no_data_devices(struct dm_table *t) 1565 { 1566 for (unsigned int i = 0; i < t->num_targets; i++) { 1567 struct dm_target *ti = dm_table_get_target(t, i); 1568 unsigned num_devices = 0; 1569 1570 if (!ti->type->iterate_devices) 1571 return false; 1572 1573 ti->type->iterate_devices(ti, count_device, &num_devices); 1574 if (num_devices) 1575 return false; 1576 } 1577 1578 return true; 1579 } 1580 1581 static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev, 1582 sector_t start, sector_t len, void *data) 1583 { 1584 struct request_queue *q = bdev_get_queue(dev->bdev); 1585 enum blk_zoned_model *zoned_model = data; 1586 1587 return blk_queue_zoned_model(q) != *zoned_model; 1588 } 1589 1590 /* 1591 * Check the device zoned model based on the target feature flag. If the target 1592 * has the DM_TARGET_ZONED_HM feature flag set, host-managed zoned devices are 1593 * also accepted but all devices must have the same zoned model. If the target 1594 * has the DM_TARGET_MIXED_ZONED_MODEL feature set, the devices can have any 1595 * zoned model with all zoned devices having the same zone size. 1596 */ 1597 static bool dm_table_supports_zoned_model(struct dm_table *t, 1598 enum blk_zoned_model zoned_model) 1599 { 1600 for (unsigned int i = 0; i < t->num_targets; i++) { 1601 struct dm_target *ti = dm_table_get_target(t, i); 1602 1603 if (dm_target_supports_zoned_hm(ti->type)) { 1604 if (!ti->type->iterate_devices || 1605 ti->type->iterate_devices(ti, device_not_zoned_model, 1606 &zoned_model)) 1607 return false; 1608 } else if (!dm_target_supports_mixed_zoned_model(ti->type)) { 1609 if (zoned_model == BLK_ZONED_HM) 1610 return false; 1611 } 1612 } 1613 1614 return true; 1615 } 1616 1617 static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev, 1618 sector_t start, sector_t len, void *data) 1619 { 1620 unsigned int *zone_sectors = data; 1621 1622 if (!bdev_is_zoned(dev->bdev)) 1623 return 0; 1624 return bdev_zone_sectors(dev->bdev) != *zone_sectors; 1625 } 1626 1627 /* 1628 * Check consistency of zoned model and zone sectors across all targets. For 1629 * zone sectors, if the destination device is a zoned block device, it shall 1630 * have the specified zone_sectors. 1631 */ 1632 static int validate_hardware_zoned_model(struct dm_table *t, 1633 enum blk_zoned_model zoned_model, 1634 unsigned int zone_sectors) 1635 { 1636 if (zoned_model == BLK_ZONED_NONE) 1637 return 0; 1638 1639 if (!dm_table_supports_zoned_model(t, zoned_model)) { 1640 DMERR("%s: zoned model is not consistent across all devices", 1641 dm_device_name(t->md)); 1642 return -EINVAL; 1643 } 1644 1645 /* Check zone size validity and compatibility */ 1646 if (!zone_sectors || !is_power_of_2(zone_sectors)) 1647 return -EINVAL; 1648 1649 if (dm_table_any_dev_attr(t, device_not_matches_zone_sectors, &zone_sectors)) { 1650 DMERR("%s: zone sectors is not consistent across all zoned devices", 1651 dm_device_name(t->md)); 1652 return -EINVAL; 1653 } 1654 1655 return 0; 1656 } 1657 1658 /* 1659 * Establish the new table's queue_limits and validate them. 1660 */ 1661 int dm_calculate_queue_limits(struct dm_table *t, 1662 struct queue_limits *limits) 1663 { 1664 struct queue_limits ti_limits; 1665 enum blk_zoned_model zoned_model = BLK_ZONED_NONE; 1666 unsigned int zone_sectors = 0; 1667 1668 blk_set_stacking_limits(limits); 1669 1670 for (unsigned int i = 0; i < t->num_targets; i++) { 1671 struct dm_target *ti = dm_table_get_target(t, i); 1672 1673 blk_set_stacking_limits(&ti_limits); 1674 1675 if (!ti->type->iterate_devices) 1676 goto combine_limits; 1677 1678 /* 1679 * Combine queue limits of all the devices this target uses. 1680 */ 1681 ti->type->iterate_devices(ti, dm_set_device_limits, 1682 &ti_limits); 1683 1684 if (zoned_model == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) { 1685 /* 1686 * After stacking all limits, validate all devices 1687 * in table support this zoned model and zone sectors. 1688 */ 1689 zoned_model = ti_limits.zoned; 1690 zone_sectors = ti_limits.chunk_sectors; 1691 } 1692 1693 /* Set I/O hints portion of queue limits */ 1694 if (ti->type->io_hints) 1695 ti->type->io_hints(ti, &ti_limits); 1696 1697 /* 1698 * Check each device area is consistent with the target's 1699 * overall queue limits. 1700 */ 1701 if (ti->type->iterate_devices(ti, device_area_is_invalid, 1702 &ti_limits)) 1703 return -EINVAL; 1704 1705 combine_limits: 1706 /* 1707 * Merge this target's queue limits into the overall limits 1708 * for the table. 1709 */ 1710 if (blk_stack_limits(limits, &ti_limits, 0) < 0) 1711 DMWARN("%s: adding target device " 1712 "(start sect %llu len %llu) " 1713 "caused an alignment inconsistency", 1714 dm_device_name(t->md), 1715 (unsigned long long) ti->begin, 1716 (unsigned long long) ti->len); 1717 } 1718 1719 /* 1720 * Verify that the zoned model and zone sectors, as determined before 1721 * any .io_hints override, are the same across all devices in the table. 1722 * - this is especially relevant if .io_hints is emulating a disk-managed 1723 * zoned model (aka BLK_ZONED_NONE) on host-managed zoned block devices. 1724 * BUT... 1725 */ 1726 if (limits->zoned != BLK_ZONED_NONE) { 1727 /* 1728 * ...IF the above limits stacking determined a zoned model 1729 * validate that all of the table's devices conform to it. 1730 */ 1731 zoned_model = limits->zoned; 1732 zone_sectors = limits->chunk_sectors; 1733 } 1734 if (validate_hardware_zoned_model(t, zoned_model, zone_sectors)) 1735 return -EINVAL; 1736 1737 return validate_hardware_logical_block_alignment(t, limits); 1738 } 1739 1740 /* 1741 * Verify that all devices have an integrity profile that matches the 1742 * DM device's registered integrity profile. If the profiles don't 1743 * match then unregister the DM device's integrity profile. 1744 */ 1745 static void dm_table_verify_integrity(struct dm_table *t) 1746 { 1747 struct gendisk *template_disk = NULL; 1748 1749 if (t->integrity_added) 1750 return; 1751 1752 if (t->integrity_supported) { 1753 /* 1754 * Verify that the original integrity profile 1755 * matches all the devices in this table. 1756 */ 1757 template_disk = dm_table_get_integrity_disk(t); 1758 if (template_disk && 1759 blk_integrity_compare(dm_disk(t->md), template_disk) >= 0) 1760 return; 1761 } 1762 1763 if (integrity_profile_exists(dm_disk(t->md))) { 1764 DMWARN("%s: unable to establish an integrity profile", 1765 dm_device_name(t->md)); 1766 blk_integrity_unregister(dm_disk(t->md)); 1767 } 1768 } 1769 1770 static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, 1771 sector_t start, sector_t len, void *data) 1772 { 1773 unsigned long flush = (unsigned long) data; 1774 struct request_queue *q = bdev_get_queue(dev->bdev); 1775 1776 return (q->queue_flags & flush); 1777 } 1778 1779 static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) 1780 { 1781 /* 1782 * Require at least one underlying device to support flushes. 1783 * t->devices includes internal dm devices such as mirror logs 1784 * so we need to use iterate_devices here, which targets 1785 * supporting flushes must provide. 1786 */ 1787 for (unsigned int i = 0; i < t->num_targets; i++) { 1788 struct dm_target *ti = dm_table_get_target(t, i); 1789 1790 if (!ti->num_flush_bios) 1791 continue; 1792 1793 if (ti->flush_supported) 1794 return true; 1795 1796 if (ti->type->iterate_devices && 1797 ti->type->iterate_devices(ti, device_flush_capable, (void *) flush)) 1798 return true; 1799 } 1800 1801 return false; 1802 } 1803 1804 static int device_dax_write_cache_enabled(struct dm_target *ti, 1805 struct dm_dev *dev, sector_t start, 1806 sector_t len, void *data) 1807 { 1808 struct dax_device *dax_dev = dev->dax_dev; 1809 1810 if (!dax_dev) 1811 return false; 1812 1813 if (dax_write_cache_enabled(dax_dev)) 1814 return true; 1815 return false; 1816 } 1817 1818 static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev, 1819 sector_t start, sector_t len, void *data) 1820 { 1821 return !bdev_nonrot(dev->bdev); 1822 } 1823 1824 static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, 1825 sector_t start, sector_t len, void *data) 1826 { 1827 struct request_queue *q = bdev_get_queue(dev->bdev); 1828 1829 return !blk_queue_add_random(q); 1830 } 1831 1832 static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev, 1833 sector_t start, sector_t len, void *data) 1834 { 1835 struct request_queue *q = bdev_get_queue(dev->bdev); 1836 1837 return !q->limits.max_write_zeroes_sectors; 1838 } 1839 1840 static bool dm_table_supports_write_zeroes(struct dm_table *t) 1841 { 1842 for (unsigned int i = 0; i < t->num_targets; i++) { 1843 struct dm_target *ti = dm_table_get_target(t, i); 1844 1845 if (!ti->num_write_zeroes_bios) 1846 return false; 1847 1848 if (!ti->type->iterate_devices || 1849 ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL)) 1850 return false; 1851 } 1852 1853 return true; 1854 } 1855 1856 static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev, 1857 sector_t start, sector_t len, void *data) 1858 { 1859 return !bdev_nowait(dev->bdev); 1860 } 1861 1862 static bool dm_table_supports_nowait(struct dm_table *t) 1863 { 1864 for (unsigned int i = 0; i < t->num_targets; i++) { 1865 struct dm_target *ti = dm_table_get_target(t, i); 1866 1867 if (!dm_target_supports_nowait(ti->type)) 1868 return false; 1869 1870 if (!ti->type->iterate_devices || 1871 ti->type->iterate_devices(ti, device_not_nowait_capable, NULL)) 1872 return false; 1873 } 1874 1875 return true; 1876 } 1877 1878 static int device_not_discard_capable(struct dm_target *ti, struct dm_dev *dev, 1879 sector_t start, sector_t len, void *data) 1880 { 1881 return !bdev_max_discard_sectors(dev->bdev); 1882 } 1883 1884 static bool dm_table_supports_discards(struct dm_table *t) 1885 { 1886 for (unsigned int i = 0; i < t->num_targets; i++) { 1887 struct dm_target *ti = dm_table_get_target(t, i); 1888 1889 if (!ti->num_discard_bios) 1890 return false; 1891 1892 /* 1893 * Either the target provides discard support (as implied by setting 1894 * 'discards_supported') or it relies on _all_ data devices having 1895 * discard support. 1896 */ 1897 if (!ti->discards_supported && 1898 (!ti->type->iterate_devices || 1899 ti->type->iterate_devices(ti, device_not_discard_capable, NULL))) 1900 return false; 1901 } 1902 1903 return true; 1904 } 1905 1906 static int device_not_secure_erase_capable(struct dm_target *ti, 1907 struct dm_dev *dev, sector_t start, 1908 sector_t len, void *data) 1909 { 1910 return !bdev_max_secure_erase_sectors(dev->bdev); 1911 } 1912 1913 static bool dm_table_supports_secure_erase(struct dm_table *t) 1914 { 1915 for (unsigned int i = 0; i < t->num_targets; i++) { 1916 struct dm_target *ti = dm_table_get_target(t, i); 1917 1918 if (!ti->num_secure_erase_bios) 1919 return false; 1920 1921 if (!ti->type->iterate_devices || 1922 ti->type->iterate_devices(ti, device_not_secure_erase_capable, NULL)) 1923 return false; 1924 } 1925 1926 return true; 1927 } 1928 1929 static int device_requires_stable_pages(struct dm_target *ti, 1930 struct dm_dev *dev, sector_t start, 1931 sector_t len, void *data) 1932 { 1933 return bdev_stable_writes(dev->bdev); 1934 } 1935 1936 int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 1937 struct queue_limits *limits) 1938 { 1939 bool wc = false, fua = false; 1940 int r; 1941 1942 /* 1943 * Copy table's limits to the DM device's request_queue 1944 */ 1945 q->limits = *limits; 1946 1947 if (dm_table_supports_nowait(t)) 1948 blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); 1949 else 1950 blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q); 1951 1952 if (!dm_table_supports_discards(t)) { 1953 q->limits.max_discard_sectors = 0; 1954 q->limits.max_hw_discard_sectors = 0; 1955 q->limits.discard_granularity = 0; 1956 q->limits.discard_alignment = 0; 1957 q->limits.discard_misaligned = 0; 1958 } 1959 1960 if (!dm_table_supports_secure_erase(t)) 1961 q->limits.max_secure_erase_sectors = 0; 1962 1963 if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { 1964 wc = true; 1965 if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA))) 1966 fua = true; 1967 } 1968 blk_queue_write_cache(q, wc, fua); 1969 1970 if (dm_table_supports_dax(t, device_not_dax_capable)) { 1971 blk_queue_flag_set(QUEUE_FLAG_DAX, q); 1972 if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) 1973 set_dax_synchronous(t->md->dax_dev); 1974 } 1975 else 1976 blk_queue_flag_clear(QUEUE_FLAG_DAX, q); 1977 1978 if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) 1979 dax_write_cache(t->md->dax_dev, true); 1980 1981 /* Ensure that all underlying devices are non-rotational. */ 1982 if (dm_table_any_dev_attr(t, device_is_rotational, NULL)) 1983 blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); 1984 else 1985 blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 1986 1987 if (!dm_table_supports_write_zeroes(t)) 1988 q->limits.max_write_zeroes_sectors = 0; 1989 1990 dm_table_verify_integrity(t); 1991 1992 /* 1993 * Some devices don't use blk_integrity but still want stable pages 1994 * because they do their own checksumming. 1995 * If any underlying device requires stable pages, a table must require 1996 * them as well. Only targets that support iterate_devices are considered: 1997 * don't want error, zero, etc to require stable pages. 1998 */ 1999 if (dm_table_any_dev_attr(t, device_requires_stable_pages, NULL)) 2000 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); 2001 else 2002 blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q); 2003 2004 /* 2005 * Determine whether or not this queue's I/O timings contribute 2006 * to the entropy pool, Only request-based targets use this. 2007 * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not 2008 * have it set. 2009 */ 2010 if (blk_queue_add_random(q) && 2011 dm_table_any_dev_attr(t, device_is_not_random, NULL)) 2012 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); 2013 2014 /* 2015 * For a zoned target, setup the zones related queue attributes 2016 * and resources necessary for zone append emulation if necessary. 2017 */ 2018 if (blk_queue_is_zoned(q)) { 2019 r = dm_set_zones_restrictions(t, q); 2020 if (r) 2021 return r; 2022 if (!static_key_enabled(&zoned_enabled.key)) 2023 static_branch_enable(&zoned_enabled); 2024 } 2025 2026 dm_update_crypto_profile(q, t); 2027 disk_update_readahead(t->md->disk); 2028 2029 /* 2030 * Check for request-based device is left to 2031 * dm_mq_init_request_queue()->blk_mq_init_allocated_queue(). 2032 * 2033 * For bio-based device, only set QUEUE_FLAG_POLL when all 2034 * underlying devices supporting polling. 2035 */ 2036 if (__table_type_bio_based(t->type)) { 2037 if (dm_table_supports_poll(t)) 2038 blk_queue_flag_set(QUEUE_FLAG_POLL, q); 2039 else 2040 blk_queue_flag_clear(QUEUE_FLAG_POLL, q); 2041 } 2042 2043 return 0; 2044 } 2045 2046 struct list_head *dm_table_get_devices(struct dm_table *t) 2047 { 2048 return &t->devices; 2049 } 2050 2051 fmode_t dm_table_get_mode(struct dm_table *t) 2052 { 2053 return t->mode; 2054 } 2055 EXPORT_SYMBOL(dm_table_get_mode); 2056 2057 enum suspend_mode { 2058 PRESUSPEND, 2059 PRESUSPEND_UNDO, 2060 POSTSUSPEND, 2061 }; 2062 2063 static void suspend_targets(struct dm_table *t, enum suspend_mode mode) 2064 { 2065 lockdep_assert_held(&t->md->suspend_lock); 2066 2067 for (unsigned int i = 0; i < t->num_targets; i++) { 2068 struct dm_target *ti = dm_table_get_target(t, i); 2069 2070 switch (mode) { 2071 case PRESUSPEND: 2072 if (ti->type->presuspend) 2073 ti->type->presuspend(ti); 2074 break; 2075 case PRESUSPEND_UNDO: 2076 if (ti->type->presuspend_undo) 2077 ti->type->presuspend_undo(ti); 2078 break; 2079 case POSTSUSPEND: 2080 if (ti->type->postsuspend) 2081 ti->type->postsuspend(ti); 2082 break; 2083 } 2084 } 2085 } 2086 2087 void dm_table_presuspend_targets(struct dm_table *t) 2088 { 2089 if (!t) 2090 return; 2091 2092 suspend_targets(t, PRESUSPEND); 2093 } 2094 2095 void dm_table_presuspend_undo_targets(struct dm_table *t) 2096 { 2097 if (!t) 2098 return; 2099 2100 suspend_targets(t, PRESUSPEND_UNDO); 2101 } 2102 2103 void dm_table_postsuspend_targets(struct dm_table *t) 2104 { 2105 if (!t) 2106 return; 2107 2108 suspend_targets(t, POSTSUSPEND); 2109 } 2110 2111 int dm_table_resume_targets(struct dm_table *t) 2112 { 2113 unsigned int i; 2114 int r = 0; 2115 2116 lockdep_assert_held(&t->md->suspend_lock); 2117 2118 for (i = 0; i < t->num_targets; i++) { 2119 struct dm_target *ti = dm_table_get_target(t, i); 2120 2121 if (!ti->type->preresume) 2122 continue; 2123 2124 r = ti->type->preresume(ti); 2125 if (r) { 2126 DMERR("%s: %s: preresume failed, error = %d", 2127 dm_device_name(t->md), ti->type->name, r); 2128 return r; 2129 } 2130 } 2131 2132 for (i = 0; i < t->num_targets; i++) { 2133 struct dm_target *ti = dm_table_get_target(t, i); 2134 2135 if (ti->type->resume) 2136 ti->type->resume(ti); 2137 } 2138 2139 return 0; 2140 } 2141 2142 struct mapped_device *dm_table_get_md(struct dm_table *t) 2143 { 2144 return t->md; 2145 } 2146 EXPORT_SYMBOL(dm_table_get_md); 2147 2148 const char *dm_table_device_name(struct dm_table *t) 2149 { 2150 return dm_device_name(t->md); 2151 } 2152 EXPORT_SYMBOL_GPL(dm_table_device_name); 2153 2154 void dm_table_run_md_queue_async(struct dm_table *t) 2155 { 2156 if (!dm_table_request_based(t)) 2157 return; 2158 2159 if (t->md->queue) 2160 blk_mq_run_hw_queues(t->md->queue, true); 2161 } 2162 EXPORT_SYMBOL(dm_table_run_md_queue_async); 2163 2164