/*
 * Copyright (C) 2001 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-core.h"

#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/blk-mq.h>
#include <linux/mount.h>
#include <linux/dax.h>

#define DM_MSG_PREFIX "table"

/* Upper bound on the number of btree levels (sizes counts[] and index[]). */
#define MAX_DEPTH 16
/* One btree node occupies NODE_SIZE bytes and holds KEYS_PER_NODE keys. */
#define NODE_SIZE L1_CACHE_BYTES
#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)

/*
 * In-core representation of a device-mapper table: the list of targets,
 * the btree index over their end sectors ("highs"), and the devices the
 * targets reference.
 */
struct dm_table {
	struct mapped_device *md;
	enum dm_queue_mode type;

	/* btree table */
	unsigned int depth;
	unsigned int counts[MAX_DEPTH];	/* in nodes */
	sector_t *index[MAX_DEPTH];

	unsigned int num_targets;
	unsigned int num_allocated;
	/* highs and targets share one allocation; see alloc_targets() */
	sector_t *highs;
	struct dm_target *targets;

	struct target_type *immutable_target_type;

	bool integrity_supported:1;
	bool singleton:1;
	unsigned integrity_added:1;

	/*
	 * Indicates the rw permissions for the new logical
	 * device.  This should be a combination of FMODE_READ
	 * and FMODE_WRITE.
	 */
	fmode_t mode;

	/* a list of devices used by this table */
	struct list_head devices;

	/* events get handed up using this callback */
	void (*event_fn)(void *);
	void *event_context;

	struct dm_md_mempools *mempools;

	struct list_head target_callbacks;
};

/*
 * Similar to ceiling(log_size(n))
 *
 * Repeatedly replaces n with dm_div_up(n, base) until n <= 1 and returns
 * the number of iterations that took.
 * NOTE(review): a base of 0 or 1 would never terminate for n > 1; the
 * only caller in this file passes CHILDREN_PER_NODE.
 */
static unsigned int int_log(unsigned int n, unsigned int base)
{
	int result = 0;

	while (n > 1) {
		n = dm_div_up(n, base);
		result++;
	}

	return result;
}

/*
 * Calculate the index of the child node of the n'th node k'th key.
 */
static inline unsigned int get_child(unsigned int n, unsigned int k)
{
	return (n * CHILDREN_PER_NODE) + k;
}

/*
 * Return the n'th node of level l from table t.
 */
static inline sector_t *get_node(struct dm_table *t,
				 unsigned int l, unsigned int n)
{
	return t->index[l] + (n * KEYS_PER_NODE);
}

/*
 * Return the highest key that you could lookup from the n'th
 * node on level l of the btree.
 *
 * Descends along the last child of each node until the leaf level, then
 * returns that leaf's last key.  If the leaf index is beyond the number
 * of leaf nodes, (sector_t) -1 is returned instead.
 */
static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
{
	for (; l < t->depth - 1; l++)
		n = get_child(n, CHILDREN_PER_NODE - 1);

	if (n >= t->counts[l])
		return (sector_t) - 1;

	return get_node(t, l, n)[KEYS_PER_NODE - 1];
}

/*
 * Fills in a level of the btree based on the highs of the level
 * below it.
 *
 * Always returns 0.
 */
static int setup_btree_index(unsigned int l, struct dm_table *t)
{
	unsigned int n, k;
	sector_t *node;

	for (n = 0U; n < t->counts[l]; n++) {
		node = get_node(t, l, n);

		for (k = 0U; k < KEYS_PER_NODE; k++)
			node[k] = high(t, l + 1, get_child(n, k));
	}

	return 0;
}

/*
 * Allocate a zeroed array of nmemb elements of elem_size bytes each using
 * vzalloc().  Returns NULL if nmemb * elem_size would overflow an
 * unsigned long, or whatever vzalloc() returns otherwise.
 *
 * NOTE(review): elem_size == 0 would divide by zero in the overflow check;
 * the callers in this file pass non-zero sizes.
 */
void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
{
	unsigned long size;
	void *addr;

	/*
	 * Check that we're not going to overflow.
	 */
	if (nmemb > (ULONG_MAX / elem_size))
		return NULL;

	size = nmemb * elem_size;
	addr = vzalloc(size);

	return addr;
}
EXPORT_SYMBOL(dm_vcalloc);

/*
 * highs, and targets are managed as dynamic arrays during a
 * table load.
 *
 * Allocates room for num highs followed by num targets in one block,
 * frees any previous block and installs the new pointers.  The previous
 * contents are not copied.  Returns 0 or -ENOMEM.
 */
static int alloc_targets(struct dm_table *t, unsigned int num)
{
	sector_t *n_highs;
	struct dm_target *n_targets;

	/*
	 * Allocate both the target array and offset array at once.
	 */
	n_highs = (sector_t *) dm_vcalloc(num, sizeof(struct dm_target) +
					  sizeof(sector_t));
	if (!n_highs)
		return -ENOMEM;

	/* the target array starts right after the num highs */
	n_targets = (struct dm_target *) (n_highs + num);

	/* all-ones bytes: every high starts out as (sector_t) -1 */
	memset(n_highs, -1, sizeof(*n_highs) * num);
	vfree(t->highs);

	t->num_allocated = num;
	t->highs = n_highs;
	t->targets = n_targets;

	return 0;
}

/*
 * Allocate and initialise an empty table for md with room for num_targets
 * targets (KEYS_PER_NODE if num_targets is 0; rounded with dm_round_up()
 * to KEYS_PER_NODE).  On success stores the table in *result and returns
 * 0; returns -ENOMEM on any failure.
 */
int dm_table_create(struct dm_table **result, fmode_t mode,
		    unsigned num_targets, struct mapped_device *md)
{
	struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL);

	if (!t)
		return -ENOMEM;

	INIT_LIST_HEAD(&t->devices);
	INIT_LIST_HEAD(&t->target_callbacks);

	if (!num_targets)
		num_targets = KEYS_PER_NODE;

	num_targets = dm_round_up(num_targets, KEYS_PER_NODE);

	/* presumably catches the rounding wrapping to zero -- TODO confirm */
	if (!num_targets) {
		kfree(t);
		return -ENOMEM;
	}

	if (alloc_targets(t, num_targets)) {
		kfree(t);
		return -ENOMEM;
	}

	t->type = DM_TYPE_NONE;
	t->mode = mode;
	t->md = md;
	*result = t;
	return 0;
}

/*
 * Release every entry still on the devices list.  Each remaining entry
 * means a dm_put_device() call was missed, so a warning is logged before
 * the device is put and the entry freed.
 */
static void free_devices(struct list_head *devices, struct mapped_device *md)
{
	struct list_head *tmp, *next;

	list_for_each_safe(tmp, next, devices) {
		struct dm_dev_internal *dd =
		    list_entry(tmp, struct dm_dev_internal, list);
		DMWARN("%s: dm_table_destroy: dm_put_device call missing for %s",
		       dm_device_name(md), dd->dm_dev->name);
		dm_put_table_device(md, dd->dm_dev);
		kfree(dd);
	}
}

/*
 * Tear down a table: free the btree index, run each target's destructor
 * and drop its type reference, then free the highs/targets block, any
 * leftover devices, the mempools and the table itself.  NULL is a no-op.
 */
void dm_table_destroy(struct dm_table *t)
{
	unsigned int i;

	if (!t)
		return;

	/* free the indexes */
	/* index[depth - 2] is the start of the block setup_indexes() allocated */
	if (t->depth >= 2)
		vfree(t->index[t->depth - 2]);

	/* free the targets */
	for (i = 0; i < t->num_targets; i++) {
		struct dm_target *tgt = t->targets + i;

		if (tgt->type->dtr)
			tgt->type->dtr(tgt);

		dm_put_target_type(tgt->type);
	}

	/* targets live in the same allocation as highs */
	vfree(t->highs);

	/* free the device list */
	free_devices(&t->devices, t->md);

	dm_free_md_mempools(t->mempools);

	kfree(t);
}

/*
 * See if we've already got a device in the list.
 *
 * Returns the matching entry, or NULL if none has device number dev.
 */
static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
{
	struct dm_dev_internal *dd;

	list_for_each_entry (dd, l, list)
		if (dd->dm_dev->bdev->bd_dev == dev)
			return dd;

	return NULL;
}

/*
 * If possible, this checks an area of a destination device is invalid.
 *
 * data points to the struct queue_limits whose logical_block_size is used
 * for the alignment checks.  Returns 1 if the area [start, start + len)
 * is unusable (a warning is logged), 0 otherwise.
 */
static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
				  sector_t start, sector_t len, void *data)
{
	struct request_queue *q;
	struct queue_limits *limits = data;
	struct block_device *bdev = dev->bdev;
	sector_t dev_size =
		i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
	unsigned short logical_block_size_sectors =
		limits->logical_block_size >> SECTOR_SHIFT;
	char b[BDEVNAME_SIZE];

	/*
	 * Some devices exist without request functions,
	 * such as loop devices not yet bound to backing files.
	 * Forbid the use of such devices.
	 */
	q = bdev_get_queue(bdev);
	if (!q || !q->make_request_fn) {
		DMWARN("%s: %s is not yet initialised: "
		       "start=%llu, len=%llu, dev_size=%llu",
		       dm_device_name(ti->table->md), bdevname(bdev, b),
		       (unsigned long long)start,
		       (unsigned long long)len,
		       (unsigned long long)dev_size);
		return 1;
	}

	/* a zero-sized device skips all remaining checks */
	if (!dev_size)
		return 0;

	if ((start >= dev_size) || (start + len > dev_size)) {
		DMWARN("%s: %s too small for target: "
		       "start=%llu, len=%llu, dev_size=%llu",
		       dm_device_name(ti->table->md), bdevname(bdev, b),
		       (unsigned long long)start,
		       (unsigned long long)len,
		       (unsigned long long)dev_size);
		return 1;
	}

	/*
	 * If the target is mapped to zoned block device(s), check
	 * that the zones are not partially mapped.
	 */
	if (bdev_zoned_model(bdev) != BLK_ZONED_NONE) {
		unsigned int zone_sectors = bdev_zone_sectors(bdev);

		/* the mask test assumes zone_sectors is a power of two */
		if (start & (zone_sectors - 1)) {
			DMWARN("%s: start=%llu not aligned to h/w zone size %u of %s",
			       dm_device_name(ti->table->md),
			       (unsigned long long)start,
			       zone_sectors, bdevname(bdev, b));
			return 1;
		}

		/*
		 * Note: The last zone of a zoned block device may be smaller
		 * than other zones. So for a target mapping the end of a
		 * zoned block device with such a zone, len would not be zone
		 * aligned. We do not allow such last smaller zone to be part
		 * of the mapping here to ensure that mappings with multiple
		 * devices do not end up with a smaller zone in the middle of
		 * the sector range.
		 */
		if (len & (zone_sectors - 1)) {
			DMWARN("%s: len=%llu not aligned to h/w zone size %u of %s",
			       dm_device_name(ti->table->md),
			       (unsigned long long)len,
			       zone_sectors, bdevname(bdev, b));
			return 1;
		}
	}

	/* blocks of one sector or less: everything is aligned */
	if (logical_block_size_sectors <= 1)
		return 0;

	if (start & (logical_block_size_sectors - 1)) {
		DMWARN("%s: start=%llu not aligned to h/w "
		       "logical block size %u of %s",
		       dm_device_name(ti->table->md),
		       (unsigned long long)start,
		       limits->logical_block_size, bdevname(bdev, b));
		return 1;
	}

	if (len & (logical_block_size_sectors - 1)) {
		DMWARN("%s: len=%llu not aligned to h/w "
		       "logical block size %u of %s",
		       dm_device_name(ti->table->md),
		       (unsigned long long)len,
		       limits->logical_block_size, bdevname(bdev, b));
		return 1;
	}

	return 0;
}

/*
 * This upgrades the mode on an already open dm_dev, being
 * careful to leave things as they were if we fail to reopen the
 * device and not to touch the existing bdev field in case
 * it is accessed concurrently inside dm_table_any_congested().
*/
static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
			struct mapped_device *md)
{
	int r;
	struct dm_dev *old_dev, *new_dev;

	old_dev = dd->dm_dev;

	/* get a device opened with the union of the old and new modes */
	r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev,
				dd->dm_dev->mode | new_mode, &new_dev);
	if (r)
		return r;

	/* swap in the new device first, then drop the old one */
	dd->dm_dev = new_dev;
	dm_put_table_device(md, old_dev);

	return 0;
}

/*
 * Convert the path to a device
 *
 * Tries lookup_bdev() first and falls back to name_to_dev_t() if that
 * returns an error pointer.
 */
dev_t dm_get_dev_t(const char *path)
{
	dev_t dev;
	struct block_device *bdev;

	bdev = lookup_bdev(path);
	if (IS_ERR(bdev))
		dev = name_to_dev_t(path);
	else {
		dev = bdev->bd_dev;
		bdput(bdev);
	}

	return dev;
}
EXPORT_SYMBOL_GPL(dm_get_dev_t);

/*
 * Add a device to the list, or just increment the usage count if
 * it's already present.
 *
 * If the device is already listed with a narrower mode, it is reopened
 * with the combined mode via upgrade_mode().  On success *result is set
 * to the dm_dev and 0 is returned.  Returns -ENODEV if path does not
 * resolve to a device, -ENOMEM on allocation failure, or the error from
 * dm_get_table_device()/upgrade_mode().
 */
int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
		  struct dm_dev **result)
{
	int r;
	dev_t dev;
	struct dm_dev_internal *dd;
	struct dm_table *t = ti->table;

	BUG_ON(!t);

	dev = dm_get_dev_t(path);
	if (!dev)
		return -ENODEV;

	dd = find_device(&t->devices, dev);
	if (!dd) {
		dd = kmalloc(sizeof(*dd), GFP_KERNEL);
		if (!dd)
			return -ENOMEM;

		if ((r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev))) {
			kfree(dd);
			return r;
		}

		/* new entry starts with one reference; skip the increment below */
		refcount_set(&dd->count, 1);
		list_add(&dd->list, &t->devices);
		goto out;

	} else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) {
		/* requested mode has bits the open device lacks */
		r = upgrade_mode(dd, mode, t->md);
		if (r)
			return r;
	}
	refcount_inc(&dd->count);
out:
	*result = dd->dm_dev;
	return 0;
}
EXPORT_SYMBOL(dm_get_device);

/*
 * iterate_devices callback: stack the limits of dev's queue into the
 * struct queue_limits passed via data.  An alignment inconsistency is
 * only warned about.  Always returns 0.
 */
static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
				sector_t start, sector_t len, void *data)
{
	struct queue_limits *limits = data;
	struct block_device *bdev = dev->bdev;
	struct request_queue *q = bdev_get_queue(bdev);
	char b[BDEVNAME_SIZE];

	if (unlikely(!q)) {
		DMWARN("%s: Cannot set limits for nonexistent device %s",
		       dm_device_name(ti->table->md), bdevname(bdev, b));
		return 0;
	}

	if (bdev_stack_limits(limits, bdev, start) < 0)
		DMWARN("%s: adding target device %s caused an alignment inconsistency: "
		       "physical_block_size=%u, logical_block_size=%u, "
		       "alignment_offset=%u, start=%llu",
		       dm_device_name(ti->table->md), bdevname(bdev, b),
		       q->limits.physical_block_size,
		       q->limits.logical_block_size,
		       q->limits.alignment_offset,
		       (unsigned long long) start << SECTOR_SHIFT);

	limits->zoned = blk_queue_zoned_model(q);

	return 0;
}

/*
 * Decrement a device's use count and remove it if necessary.
 *
 * Warns and does nothing if d is not on the table's device list.
 */
void dm_put_device(struct dm_target *ti, struct dm_dev *d)
{
	int found = 0;
	struct list_head *devices = &ti->table->devices;
	struct dm_dev_internal *dd;

	list_for_each_entry(dd, devices, list) {
		if (dd->dm_dev == d) {
			found = 1;
			break;
		}
	}
	/* without the flag, dd would not be a valid entry after a full walk */
	if (!found) {
		DMWARN("%s: device %s not in table devices list",
		       dm_device_name(ti->table->md), d->name);
		return;
	}
	if (refcount_dec_and_test(&dd->count)) {
		dm_put_table_device(ti->table->md, d);
		list_del(&dd->list);
		kfree(dd);
	}
}
EXPORT_SYMBOL(dm_put_device);

/*
 * Checks to see if the target joins onto the end of the table.
 *
 * The first target must begin at sector 0; any later target must begin
 * exactly where the previous one ends.  Returns non-zero if it adjoins.
 */
static int adjoin(struct dm_table *table, struct dm_target *ti)
{
	struct dm_target *prev;

	if (!table->num_targets)
		return !ti->begin;

	prev = &table->targets[table->num_targets - 1];
	return (ti->begin == (prev->begin + prev->len));
}

/*
 * Used to dynamically allocate the arg array.
 *
 * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must
 * process messages even if some device is suspended.
These messages have a 543 * small fixed number of arguments. 544 * 545 * On the other hand, dm-switch needs to process bulk data using messages and 546 * excessive use of GFP_NOIO could cause trouble. 547 */ 548 static char **realloc_argv(unsigned *size, char **old_argv) 549 { 550 char **argv; 551 unsigned new_size; 552 gfp_t gfp; 553 554 if (*size) { 555 new_size = *size * 2; 556 gfp = GFP_KERNEL; 557 } else { 558 new_size = 8; 559 gfp = GFP_NOIO; 560 } 561 argv = kmalloc_array(new_size, sizeof(*argv), gfp); 562 if (argv && old_argv) { 563 memcpy(argv, old_argv, *size * sizeof(*argv)); 564 *size = new_size; 565 } 566 567 kfree(old_argv); 568 return argv; 569 } 570 571 /* 572 * Destructively splits up the argument list to pass to ctr. 573 */ 574 int dm_split_args(int *argc, char ***argvp, char *input) 575 { 576 char *start, *end = input, *out, **argv = NULL; 577 unsigned array_size = 0; 578 579 *argc = 0; 580 581 if (!input) { 582 *argvp = NULL; 583 return 0; 584 } 585 586 argv = realloc_argv(&array_size, argv); 587 if (!argv) 588 return -ENOMEM; 589 590 while (1) { 591 /* Skip whitespace */ 592 start = skip_spaces(end); 593 594 if (!*start) 595 break; /* success, we hit the end */ 596 597 /* 'out' is used to remove any back-quotes */ 598 end = out = start; 599 while (*end) { 600 /* Everything apart from '\0' can be quoted */ 601 if (*end == '\\' && *(end + 1)) { 602 *out++ = *(end + 1); 603 end += 2; 604 continue; 605 } 606 607 if (isspace(*end)) 608 break; /* end of token */ 609 610 *out++ = *end++; 611 } 612 613 /* have we already filled the array ? 
*/ 614 if ((*argc + 1) > array_size) { 615 argv = realloc_argv(&array_size, argv); 616 if (!argv) 617 return -ENOMEM; 618 } 619 620 /* we know this is whitespace */ 621 if (*end) 622 end++; 623 624 /* terminate the string and put it in the array */ 625 *out = '\0'; 626 argv[*argc] = start; 627 (*argc)++; 628 } 629 630 *argvp = argv; 631 return 0; 632 } 633 634 /* 635 * Impose necessary and sufficient conditions on a devices's table such 636 * that any incoming bio which respects its logical_block_size can be 637 * processed successfully. If it falls across the boundary between 638 * two or more targets, the size of each piece it gets split into must 639 * be compatible with the logical_block_size of the target processing it. 640 */ 641 static int validate_hardware_logical_block_alignment(struct dm_table *table, 642 struct queue_limits *limits) 643 { 644 /* 645 * This function uses arithmetic modulo the logical_block_size 646 * (in units of 512-byte sectors). 647 */ 648 unsigned short device_logical_block_size_sects = 649 limits->logical_block_size >> SECTOR_SHIFT; 650 651 /* 652 * Offset of the start of the next table entry, mod logical_block_size. 653 */ 654 unsigned short next_target_start = 0; 655 656 /* 657 * Given an aligned bio that extends beyond the end of a 658 * target, how many sectors must the next target handle? 659 */ 660 unsigned short remaining = 0; 661 662 struct dm_target *uninitialized_var(ti); 663 struct queue_limits ti_limits; 664 unsigned i; 665 666 /* 667 * Check each entry in the table in turn. 668 */ 669 for (i = 0; i < dm_table_get_num_targets(table); i++) { 670 ti = dm_table_get_target(table, i); 671 672 blk_set_stacking_limits(&ti_limits); 673 674 /* combine all target devices' limits */ 675 if (ti->type->iterate_devices) 676 ti->type->iterate_devices(ti, dm_set_device_limits, 677 &ti_limits); 678 679 /* 680 * If the remaining sectors fall entirely within this 681 * table entry are they compatible with its logical_block_size? 
682 */ 683 if (remaining < ti->len && 684 remaining & ((ti_limits.logical_block_size >> 685 SECTOR_SHIFT) - 1)) 686 break; /* Error */ 687 688 next_target_start = 689 (unsigned short) ((next_target_start + ti->len) & 690 (device_logical_block_size_sects - 1)); 691 remaining = next_target_start ? 692 device_logical_block_size_sects - next_target_start : 0; 693 } 694 695 if (remaining) { 696 DMWARN("%s: table line %u (start sect %llu len %llu) " 697 "not aligned to h/w logical block size %u", 698 dm_device_name(table->md), i, 699 (unsigned long long) ti->begin, 700 (unsigned long long) ti->len, 701 limits->logical_block_size); 702 return -EINVAL; 703 } 704 705 return 0; 706 } 707 708 int dm_table_add_target(struct dm_table *t, const char *type, 709 sector_t start, sector_t len, char *params) 710 { 711 int r = -EINVAL, argc; 712 char **argv; 713 struct dm_target *tgt; 714 715 if (t->singleton) { 716 DMERR("%s: target type %s must appear alone in table", 717 dm_device_name(t->md), t->targets->type->name); 718 return -EINVAL; 719 } 720 721 BUG_ON(t->num_targets >= t->num_allocated); 722 723 tgt = t->targets + t->num_targets; 724 memset(tgt, 0, sizeof(*tgt)); 725 726 if (!len) { 727 DMERR("%s: zero-length target", dm_device_name(t->md)); 728 return -EINVAL; 729 } 730 731 tgt->type = dm_get_target_type(type); 732 if (!tgt->type) { 733 DMERR("%s: %s: unknown target type", dm_device_name(t->md), type); 734 return -EINVAL; 735 } 736 737 if (dm_target_needs_singleton(tgt->type)) { 738 if (t->num_targets) { 739 tgt->error = "singleton target type must appear alone in table"; 740 goto bad; 741 } 742 t->singleton = true; 743 } 744 745 if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) { 746 tgt->error = "target type may not be included in a read-only table"; 747 goto bad; 748 } 749 750 if (t->immutable_target_type) { 751 if (t->immutable_target_type != tgt->type) { 752 tgt->error = "immutable target type cannot be mixed with other target types"; 753 goto bad; 
754 } 755 } else if (dm_target_is_immutable(tgt->type)) { 756 if (t->num_targets) { 757 tgt->error = "immutable target type cannot be mixed with other target types"; 758 goto bad; 759 } 760 t->immutable_target_type = tgt->type; 761 } 762 763 if (dm_target_has_integrity(tgt->type)) 764 t->integrity_added = 1; 765 766 tgt->table = t; 767 tgt->begin = start; 768 tgt->len = len; 769 tgt->error = "Unknown error"; 770 771 /* 772 * Does this target adjoin the previous one ? 773 */ 774 if (!adjoin(t, tgt)) { 775 tgt->error = "Gap in table"; 776 goto bad; 777 } 778 779 r = dm_split_args(&argc, &argv, params); 780 if (r) { 781 tgt->error = "couldn't split parameters (insufficient memory)"; 782 goto bad; 783 } 784 785 r = tgt->type->ctr(tgt, argc, argv); 786 kfree(argv); 787 if (r) 788 goto bad; 789 790 t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; 791 792 if (!tgt->num_discard_bios && tgt->discards_supported) 793 DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.", 794 dm_device_name(t->md), type); 795 796 return 0; 797 798 bad: 799 DMERR("%s: %s: %s", dm_device_name(t->md), type, tgt->error); 800 dm_put_target_type(tgt->type); 801 return r; 802 } 803 804 /* 805 * Target argument parsing helpers. 
*/
/*
 * Shift the next argument off arg_set and parse it as an unsigned number
 * into *value.  Fails with -EINVAL (and *error = arg->error) if there is
 * no argument, it is not a plain unsigned number (trailing characters are
 * rejected), it is outside [arg->min, arg->max], or, when grouped is set,
 * fewer than *value arguments remain in arg_set.
 */
static int validate_next_arg(const struct dm_arg *arg,
			     struct dm_arg_set *arg_set,
			     unsigned *value, char **error, unsigned grouped)
{
	const char *arg_str = dm_shift_arg(arg_set);
	char dummy;

	/* "%u%c" matching exactly one item means nothing follows the number */
	if (!arg_str ||
	    (sscanf(arg_str, "%u%c", value, &dummy) != 1) ||
	    (*value < arg->min) ||
	    (*value > arg->max) ||
	    (grouped && arg_set->argc < *value)) {
		*error = arg->error;
		return -EINVAL;
	}

	return 0;
}

/* Read and range-check a single numeric argument. */
int dm_read_arg(const struct dm_arg *arg, struct dm_arg_set *arg_set,
		unsigned *value, char **error)
{
	return validate_next_arg(arg, arg_set, value, error, 0);
}
EXPORT_SYMBOL(dm_read_arg);

/*
 * Like dm_read_arg(), but additionally requires that at least *value
 * further arguments remain in arg_set.
 */
int dm_read_arg_group(const struct dm_arg *arg, struct dm_arg_set *arg_set,
		      unsigned *value, char **error)
{
	return validate_next_arg(arg, arg_set, value, error, 1);
}
EXPORT_SYMBOL(dm_read_arg_group);

/* Remove and return the first argument of as, or NULL if none are left. */
const char *dm_shift_arg(struct dm_arg_set *as)
{
	char *r;

	if (as->argc) {
		as->argc--;
		r = *as->argv;
		as->argv++;
		return r;
	}

	return NULL;
}
EXPORT_SYMBOL(dm_shift_arg);

/* Skip num_args arguments of as; BUGs if fewer than that remain. */
void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
{
	BUG_ON(as->argc < num_args);
	as->argc -= num_args;
	as->argv += num_args;
}
EXPORT_SYMBOL(dm_consume_args);

/* True for any of the three bio-based queue modes. */
static bool __table_type_bio_based(enum dm_queue_mode table_type)
{
	return (table_type == DM_TYPE_BIO_BASED ||
		table_type == DM_TYPE_DAX_BIO_BASED ||
		table_type == DM_TYPE_NVME_BIO_BASED);
}

static bool __table_type_request_based(enum dm_queue_mode table_type)
{
	return table_type == DM_TYPE_REQUEST_BASED;
}

/*
 * Set the table's queue mode explicitly; dm_table_determine_type() then
 * verifies it rather than deriving one.
 */
void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
{
	t->type = type;
}
EXPORT_SYMBOL_GPL(dm_table_set_type);

/* validate the dax capability of the target device span */
/* data points to an int holding the block size to check against */
int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
			sector_t start, sector_t len, void *data)
{
	int blocksize = *(int *) data;

	return generic_fsdax_supported(dev->dax_dev, dev->bdev, blocksize,
				       start, len);
}

/* Check devices support synchronous DAX */
static int device_dax_synchronous(struct dm_target *ti, struct dm_dev *dev,
				  sector_t start, sector_t len, void *data)
{
	return dev->dax_dev && dax_synchronous(dev->dax_dev);
}

/*
 * Returns true only if every target has a direct_access method and an
 * iterate_devices method for which iterate_fn (called with blocksize as
 * its data) reports success.
 */
bool dm_table_supports_dax(struct dm_table *t,
			   iterate_devices_callout_fn iterate_fn, int *blocksize)
{
	struct dm_target *ti;
	unsigned i;

	/* Ensure that all targets support DAX. */
	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		ti = dm_table_get_target(t, i);

		if (!ti->type->direct_access)
			return false;

		if (!ti->type->iterate_devices ||
		    !ti->type->iterate_devices(ti, iterate_fn, blocksize))
			return false;
	}

	return true;
}

static bool dm_table_does_not_support_partial_completion(struct dm_table *t);

/*
 * iterate_devices callback: non-zero if dev is a whole device (not a
 * partition) whose queue is blk-mq.
 */
static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev,
				  sector_t start, sector_t len, void *data)
{
	struct block_device *bdev = dev->bdev;
	struct request_queue *q = bdev_get_queue(bdev);

	/* request-based cannot stack on partitions! */
	if (bdev != bdev->bd_contains)
		return false;

	return queue_is_mq(q);
}

/*
 * Decide (or, if a target already set it, verify) the table's queue mode
 * and store it in t->type.  Returns 0 on success or -EINVAL if the
 * targets cannot be combined or do not meet the requirements of the
 * chosen mode.
 */
static int dm_table_determine_type(struct dm_table *t)
{
	unsigned i;
	unsigned bio_based = 0, request_based = 0, hybrid = 0;
	struct dm_target *tgt;
	struct list_head *devices = dm_table_get_devices(t);
	enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
	int page_size = PAGE_SIZE;

	if (t->type != DM_TYPE_NONE) {
		/* target already set the table's type */
		if (t->type == DM_TYPE_BIO_BASED) {
			/* possibly upgrade to a variant of bio-based */
			goto verify_bio_based;
		}
		BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED);
		BUG_ON(t->type == DM_TYPE_NVME_BIO_BASED);
		goto verify_rq_based;
	}

	/* classify every target; bio-based and request-based cannot mix */
	for (i = 0; i < t->num_targets; i++) {
		tgt = t->targets + i;
		if (dm_target_hybrid(tgt))
			hybrid = 1;
		else if (dm_target_request_based(tgt))
			request_based = 1;
		else
			bio_based = 1;

		if (bio_based && request_based) {
			DMERR("Inconsistent table: different target types"
			      " can't be mixed up");
			return -EINVAL;
		}
	}

	if (hybrid && !bio_based && !request_based) {
		/*
		 * The targets can work either way.
		 * Determine the type from the live device.
		 * Default to bio-based if device is new.
		 */
		if (__table_type_request_based(live_md_type))
			request_based = 1;
		else
			bio_based = 1;
	}

	if (bio_based) {
	verify_bio_based:
		/* We must use this table as bio-based */
		t->type = DM_TYPE_BIO_BASED;
		if (dm_table_supports_dax(t, device_supports_dax, &page_size) ||
		    (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) {
			t->type = DM_TYPE_DAX_BIO_BASED;
		} else {
			/* Check if upgrading to NVMe bio-based is valid or required */
			tgt = dm_table_get_immutable_target(t);
			if (tgt && !tgt->max_io_len && dm_table_does_not_support_partial_completion(t)) {
				t->type = DM_TYPE_NVME_BIO_BASED;
				goto verify_rq_based; /* must be stacked directly on NVMe (blk-mq) */
			} else if (list_empty(devices) && live_md_type == DM_TYPE_NVME_BIO_BASED) {
				t->type = DM_TYPE_NVME_BIO_BASED;
			}
		}
		return 0;
	}

	BUG_ON(!request_based); /* No targets in this table */

	t->type = DM_TYPE_REQUEST_BASED;

verify_rq_based:
	/*
	 * Request-based dm supports only tables that have a single target now.
	 * To support multiple targets, request splitting support is needed,
	 * and that needs lots of changes in the block-layer.
	 * (e.g. request completion process for partial completion.)
	 */
	if (t->num_targets > 1) {
		DMERR("%s DM doesn't support multiple targets",
		      t->type == DM_TYPE_NVME_BIO_BASED ? "nvme bio-based" : "request-based");
		return -EINVAL;
	}

	if (list_empty(devices)) {
		int srcu_idx;
		struct dm_table *live_table = dm_get_live_table(t->md, &srcu_idx);

		/* inherit live table's type */
		if (live_table)
			t->type = live_table->type;
		dm_put_live_table(t->md, srcu_idx);
		return 0;
	}

	tgt = dm_table_get_immutable_target(t);
	if (!tgt) {
		DMERR("table load rejected: immutable target is required");
		return -EINVAL;
	} else if (tgt->max_io_len) {
		DMERR("table load rejected: immutable target that splits IO is not supported");
		return -EINVAL;
	}

	/* Non-request-stackable devices can't be used for request-based dm */
	if (!tgt->type->iterate_devices ||
	    !tgt->type->iterate_devices(tgt, device_is_rq_stackable, NULL)) {
		DMERR("table load rejected: including non-request-stackable devices");
		return -EINVAL;
	}

	return 0;
}

enum dm_queue_mode dm_table_get_type(struct dm_table *t)
{
	return t->type;
}

struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)
{
	return t->immutable_target_type;
}

/*
 * Return the table's single target if it is of an immutable type,
 * otherwise NULL.
 * NOTE(review): with num_targets == 0 this still inspects targets[0].type;
 * confirm callers never reach here with an empty table.
 */
struct dm_target *dm_table_get_immutable_target(struct dm_table *t)
{
	/* Immutable target is implicitly a singleton */
	if (t->num_targets > 1 ||
	    !dm_target_is_immutable(t->targets[0].type))
		return NULL;

	return t->targets;
}

/* Return the first target whose type is a wildcard, or NULL if none is. */
struct dm_target *dm_table_get_wildcard_target(struct dm_table *t)
{
	struct dm_target *ti;
	unsigned i;

	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		ti = dm_table_get_target(t, i);
		if (dm_target_is_wildcard(ti->type))
			return ti;
	}

	return NULL;
}

bool dm_table_bio_based(struct dm_table *t)
{
	return __table_type_bio_based(dm_table_get_type(t));
}

bool dm_table_request_based(struct dm_table *t)
{
	return __table_type_request_based(dm_table_get_type(t));
}

/*
 * Allocate the mempools for this table and store them in t->mempools.
 * For bio-based tables the per-io data size and minimum pool size are the
 * maxima of per_io_data_size and num_flush_bios over all targets; for
 * other types both are 0.  Returns 0, -EINVAL if the table type is still
 * DM_TYPE_NONE, or -ENOMEM.
 */
static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
{
	enum dm_queue_mode type = dm_table_get_type(t);
	unsigned per_io_data_size = 0;
	unsigned min_pool_size = 0;
	struct dm_target *ti;
	unsigned i;

	if (unlikely(type == DM_TYPE_NONE)) {
		DMWARN("no table type is set, can't allocate mempools");
		return -EINVAL;
	}

	if (__table_type_bio_based(type))
		for (i = 0; i < t->num_targets; i++) {
			ti = t->targets + i;
			per_io_data_size = max(per_io_data_size, ti->per_io_data_size);
			min_pool_size = max(min_pool_size, ti->num_flush_bios);
		}

	t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported,
					   per_io_data_size, min_pool_size);
	if (!t->mempools)
		return -ENOMEM;

	return 0;
}

/* Free the table's mempools and clear the pointer. */
void dm_table_free_md_mempools(struct dm_table *t)
{
	dm_free_md_mempools(t->mempools);
	t->mempools = NULL;
}

struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t)
{
	return t->mempools;
}

/*
 * Compute the node count of every internal btree level, allocate all
 * internal nodes as one block and fill each level in, starting from the
 * level just above the leaves.  Only called when t->depth >= 2.
 * Returns 0 or -ENOMEM.
 */
static int setup_indexes(struct dm_table *t)
{
	int i;
	unsigned int total = 0;
	sector_t *indexes;

	/* allocate the space for *all* the indexes */
	for (i = t->depth - 2; i >= 0; i--) {
		t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
		total += t->counts[i];
	}

	indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE);
	if (!indexes)
		return -ENOMEM;

	/* set up internal nodes, bottom-up */
	/* level depth - 2 gets the start of the block; dm_table_destroy() frees it */
	for (i = t->depth - 2; i >= 0; i--) {
		t->index[i] = indexes;
		indexes += (KEYS_PER_NODE * t->counts[i]);
		setup_btree_index(i, t);
	}

	return 0;
}

/*
 * Builds the btree to index the map.
1160 */ 1161 static int dm_table_build_index(struct dm_table *t) 1162 { 1163 int r = 0; 1164 unsigned int leaf_nodes; 1165 1166 /* how many indexes will the btree have ? */ 1167 leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); 1168 t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); 1169 1170 /* leaf layer has already been set up */ 1171 t->counts[t->depth - 1] = leaf_nodes; 1172 t->index[t->depth - 1] = t->highs; 1173 1174 if (t->depth >= 2) 1175 r = setup_indexes(t); 1176 1177 return r; 1178 } 1179 1180 static bool integrity_profile_exists(struct gendisk *disk) 1181 { 1182 return !!blk_get_integrity(disk); 1183 } 1184 1185 /* 1186 * Get a disk whose integrity profile reflects the table's profile. 1187 * Returns NULL if integrity support was inconsistent or unavailable. 1188 */ 1189 static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t) 1190 { 1191 struct list_head *devices = dm_table_get_devices(t); 1192 struct dm_dev_internal *dd = NULL; 1193 struct gendisk *prev_disk = NULL, *template_disk = NULL; 1194 unsigned i; 1195 1196 for (i = 0; i < dm_table_get_num_targets(t); i++) { 1197 struct dm_target *ti = dm_table_get_target(t, i); 1198 if (!dm_target_passes_integrity(ti->type)) 1199 goto no_integrity; 1200 } 1201 1202 list_for_each_entry(dd, devices, list) { 1203 template_disk = dd->dm_dev->bdev->bd_disk; 1204 if (!integrity_profile_exists(template_disk)) 1205 goto no_integrity; 1206 else if (prev_disk && 1207 blk_integrity_compare(prev_disk, template_disk) < 0) 1208 goto no_integrity; 1209 prev_disk = template_disk; 1210 } 1211 1212 return template_disk; 1213 1214 no_integrity: 1215 if (prev_disk) 1216 DMWARN("%s: integrity not set: %s and %s profile mismatch", 1217 dm_device_name(t->md), 1218 prev_disk->disk_name, 1219 template_disk->disk_name); 1220 return NULL; 1221 } 1222 1223 /* 1224 * Register the mapped device for blk_integrity support if the 1225 * underlying devices have an integrity profile. 
But all devices may 1226 * not have matching profiles (checking all devices isn't reliable 1227 * during table load because this table may use other DM device(s) which 1228 * must be resumed before they will have an initialized integity 1229 * profile). Consequently, stacked DM devices force a 2 stage integrity 1230 * profile validation: First pass during table load, final pass during 1231 * resume. 1232 */ 1233 static int dm_table_register_integrity(struct dm_table *t) 1234 { 1235 struct mapped_device *md = t->md; 1236 struct gendisk *template_disk = NULL; 1237 1238 /* If target handles integrity itself do not register it here. */ 1239 if (t->integrity_added) 1240 return 0; 1241 1242 template_disk = dm_table_get_integrity_disk(t); 1243 if (!template_disk) 1244 return 0; 1245 1246 if (!integrity_profile_exists(dm_disk(md))) { 1247 t->integrity_supported = true; 1248 /* 1249 * Register integrity profile during table load; we can do 1250 * this because the final profile must match during resume. 1251 */ 1252 blk_integrity_register(dm_disk(md), 1253 blk_get_integrity(template_disk)); 1254 return 0; 1255 } 1256 1257 /* 1258 * If DM device already has an initialized integrity 1259 * profile the new profile should not conflict. 1260 */ 1261 if (blk_integrity_compare(dm_disk(md), template_disk) < 0) { 1262 DMWARN("%s: conflict with existing integrity profile: " 1263 "%s profile mismatch", 1264 dm_device_name(t->md), 1265 template_disk->disk_name); 1266 return 1; 1267 } 1268 1269 /* Preserve existing integrity profile */ 1270 t->integrity_supported = true; 1271 return 0; 1272 } 1273 1274 /* 1275 * Prepares the table for use by building the indices, 1276 * setting the type, and allocating mempools. 
1277 */ 1278 int dm_table_complete(struct dm_table *t) 1279 { 1280 int r; 1281 1282 r = dm_table_determine_type(t); 1283 if (r) { 1284 DMERR("unable to determine table type"); 1285 return r; 1286 } 1287 1288 r = dm_table_build_index(t); 1289 if (r) { 1290 DMERR("unable to build btrees"); 1291 return r; 1292 } 1293 1294 r = dm_table_register_integrity(t); 1295 if (r) { 1296 DMERR("could not register integrity profile."); 1297 return r; 1298 } 1299 1300 r = dm_table_alloc_md_mempools(t, t->md); 1301 if (r) 1302 DMERR("unable to allocate mempools"); 1303 1304 return r; 1305 } 1306 1307 static DEFINE_MUTEX(_event_lock); 1308 void dm_table_event_callback(struct dm_table *t, 1309 void (*fn)(void *), void *context) 1310 { 1311 mutex_lock(&_event_lock); 1312 t->event_fn = fn; 1313 t->event_context = context; 1314 mutex_unlock(&_event_lock); 1315 } 1316 1317 void dm_table_event(struct dm_table *t) 1318 { 1319 /* 1320 * You can no longer call dm_table_event() from interrupt 1321 * context, use a bottom half instead. 1322 */ 1323 BUG_ON(in_interrupt()); 1324 1325 mutex_lock(&_event_lock); 1326 if (t->event_fn) 1327 t->event_fn(t->event_context); 1328 mutex_unlock(&_event_lock); 1329 } 1330 EXPORT_SYMBOL(dm_table_event); 1331 1332 inline sector_t dm_table_get_size(struct dm_table *t) 1333 { 1334 return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; 1335 } 1336 EXPORT_SYMBOL(dm_table_get_size); 1337 1338 struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) 1339 { 1340 if (index >= t->num_targets) 1341 return NULL; 1342 1343 return t->targets + index; 1344 } 1345 1346 /* 1347 * Search the btree for the correct target. 1348 * 1349 * Caller should check returned pointer for NULL 1350 * to trap I/O beyond end of device. 
1351 */ 1352 struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) 1353 { 1354 unsigned int l, n = 0, k = 0; 1355 sector_t *node; 1356 1357 if (unlikely(sector >= dm_table_get_size(t))) 1358 return NULL; 1359 1360 for (l = 0; l < t->depth; l++) { 1361 n = get_child(n, k); 1362 node = get_node(t, l, n); 1363 1364 for (k = 0; k < KEYS_PER_NODE; k++) 1365 if (node[k] >= sector) 1366 break; 1367 } 1368 1369 return &t->targets[(KEYS_PER_NODE * n) + k]; 1370 } 1371 1372 static int count_device(struct dm_target *ti, struct dm_dev *dev, 1373 sector_t start, sector_t len, void *data) 1374 { 1375 unsigned *num_devices = data; 1376 1377 (*num_devices)++; 1378 1379 return 0; 1380 } 1381 1382 /* 1383 * Check whether a table has no data devices attached using each 1384 * target's iterate_devices method. 1385 * Returns false if the result is unknown because a target doesn't 1386 * support iterate_devices. 1387 */ 1388 bool dm_table_has_no_data_devices(struct dm_table *table) 1389 { 1390 struct dm_target *ti; 1391 unsigned i, num_devices; 1392 1393 for (i = 0; i < dm_table_get_num_targets(table); i++) { 1394 ti = dm_table_get_target(table, i); 1395 1396 if (!ti->type->iterate_devices) 1397 return false; 1398 1399 num_devices = 0; 1400 ti->type->iterate_devices(ti, count_device, &num_devices); 1401 if (num_devices) 1402 return false; 1403 } 1404 1405 return true; 1406 } 1407 1408 static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev, 1409 sector_t start, sector_t len, void *data) 1410 { 1411 struct request_queue *q = bdev_get_queue(dev->bdev); 1412 enum blk_zoned_model *zoned_model = data; 1413 1414 return q && blk_queue_zoned_model(q) == *zoned_model; 1415 } 1416 1417 static bool dm_table_supports_zoned_model(struct dm_table *t, 1418 enum blk_zoned_model zoned_model) 1419 { 1420 struct dm_target *ti; 1421 unsigned i; 1422 1423 for (i = 0; i < dm_table_get_num_targets(t); i++) { 1424 ti = dm_table_get_target(t, i); 1425 1426 if (zoned_model 
== BLK_ZONED_HM && 1427 !dm_target_supports_zoned_hm(ti->type)) 1428 return false; 1429 1430 if (!ti->type->iterate_devices || 1431 !ti->type->iterate_devices(ti, device_is_zoned_model, &zoned_model)) 1432 return false; 1433 } 1434 1435 return true; 1436 } 1437 1438 static int device_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev, 1439 sector_t start, sector_t len, void *data) 1440 { 1441 struct request_queue *q = bdev_get_queue(dev->bdev); 1442 unsigned int *zone_sectors = data; 1443 1444 return q && blk_queue_zone_sectors(q) == *zone_sectors; 1445 } 1446 1447 static bool dm_table_matches_zone_sectors(struct dm_table *t, 1448 unsigned int zone_sectors) 1449 { 1450 struct dm_target *ti; 1451 unsigned i; 1452 1453 for (i = 0; i < dm_table_get_num_targets(t); i++) { 1454 ti = dm_table_get_target(t, i); 1455 1456 if (!ti->type->iterate_devices || 1457 !ti->type->iterate_devices(ti, device_matches_zone_sectors, &zone_sectors)) 1458 return false; 1459 } 1460 1461 return true; 1462 } 1463 1464 static int validate_hardware_zoned_model(struct dm_table *table, 1465 enum blk_zoned_model zoned_model, 1466 unsigned int zone_sectors) 1467 { 1468 if (zoned_model == BLK_ZONED_NONE) 1469 return 0; 1470 1471 if (!dm_table_supports_zoned_model(table, zoned_model)) { 1472 DMERR("%s: zoned model is not consistent across all devices", 1473 dm_device_name(table->md)); 1474 return -EINVAL; 1475 } 1476 1477 /* Check zone size validity and compatibility */ 1478 if (!zone_sectors || !is_power_of_2(zone_sectors)) 1479 return -EINVAL; 1480 1481 if (!dm_table_matches_zone_sectors(table, zone_sectors)) { 1482 DMERR("%s: zone sectors is not consistent across all devices", 1483 dm_device_name(table->md)); 1484 return -EINVAL; 1485 } 1486 1487 return 0; 1488 } 1489 1490 /* 1491 * Establish the new table's queue_limits and validate them. 
1492 */ 1493 int dm_calculate_queue_limits(struct dm_table *table, 1494 struct queue_limits *limits) 1495 { 1496 struct dm_target *ti; 1497 struct queue_limits ti_limits; 1498 unsigned i; 1499 enum blk_zoned_model zoned_model = BLK_ZONED_NONE; 1500 unsigned int zone_sectors = 0; 1501 1502 blk_set_stacking_limits(limits); 1503 1504 for (i = 0; i < dm_table_get_num_targets(table); i++) { 1505 blk_set_stacking_limits(&ti_limits); 1506 1507 ti = dm_table_get_target(table, i); 1508 1509 if (!ti->type->iterate_devices) 1510 goto combine_limits; 1511 1512 /* 1513 * Combine queue limits of all the devices this target uses. 1514 */ 1515 ti->type->iterate_devices(ti, dm_set_device_limits, 1516 &ti_limits); 1517 1518 if (zoned_model == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) { 1519 /* 1520 * After stacking all limits, validate all devices 1521 * in table support this zoned model and zone sectors. 1522 */ 1523 zoned_model = ti_limits.zoned; 1524 zone_sectors = ti_limits.chunk_sectors; 1525 } 1526 1527 /* Set I/O hints portion of queue limits */ 1528 if (ti->type->io_hints) 1529 ti->type->io_hints(ti, &ti_limits); 1530 1531 /* 1532 * Check each device area is consistent with the target's 1533 * overall queue limits. 1534 */ 1535 if (ti->type->iterate_devices(ti, device_area_is_invalid, 1536 &ti_limits)) 1537 return -EINVAL; 1538 1539 combine_limits: 1540 /* 1541 * Merge this target's queue limits into the overall limits 1542 * for the table. 
1543 */ 1544 if (blk_stack_limits(limits, &ti_limits, 0) < 0) 1545 DMWARN("%s: adding target device " 1546 "(start sect %llu len %llu) " 1547 "caused an alignment inconsistency", 1548 dm_device_name(table->md), 1549 (unsigned long long) ti->begin, 1550 (unsigned long long) ti->len); 1551 1552 /* 1553 * FIXME: this should likely be moved to blk_stack_limits(), would 1554 * also eliminate limits->zoned stacking hack in dm_set_device_limits() 1555 */ 1556 if (limits->zoned == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) { 1557 /* 1558 * By default, the stacked limits zoned model is set to 1559 * BLK_ZONED_NONE in blk_set_stacking_limits(). Update 1560 * this model using the first target model reported 1561 * that is not BLK_ZONED_NONE. This will be either the 1562 * first target device zoned model or the model reported 1563 * by the target .io_hints. 1564 */ 1565 limits->zoned = ti_limits.zoned; 1566 } 1567 } 1568 1569 /* 1570 * Verify that the zoned model and zone sectors, as determined before 1571 * any .io_hints override, are the same across all devices in the table. 1572 * - this is especially relevant if .io_hints is emulating a disk-managed 1573 * zoned model (aka BLK_ZONED_NONE) on host-managed zoned block devices. 1574 * BUT... 1575 */ 1576 if (limits->zoned != BLK_ZONED_NONE) { 1577 /* 1578 * ...IF the above limits stacking determined a zoned model 1579 * validate that all of the table's devices conform to it. 1580 */ 1581 zoned_model = limits->zoned; 1582 zone_sectors = limits->chunk_sectors; 1583 } 1584 if (validate_hardware_zoned_model(table, zoned_model, zone_sectors)) 1585 return -EINVAL; 1586 1587 return validate_hardware_logical_block_alignment(table, limits); 1588 } 1589 1590 /* 1591 * Verify that all devices have an integrity profile that matches the 1592 * DM device's registered integrity profile. If the profiles don't 1593 * match then unregister the DM device's integrity profile. 
1594 */ 1595 static void dm_table_verify_integrity(struct dm_table *t) 1596 { 1597 struct gendisk *template_disk = NULL; 1598 1599 if (t->integrity_added) 1600 return; 1601 1602 if (t->integrity_supported) { 1603 /* 1604 * Verify that the original integrity profile 1605 * matches all the devices in this table. 1606 */ 1607 template_disk = dm_table_get_integrity_disk(t); 1608 if (template_disk && 1609 blk_integrity_compare(dm_disk(t->md), template_disk) >= 0) 1610 return; 1611 } 1612 1613 if (integrity_profile_exists(dm_disk(t->md))) { 1614 DMWARN("%s: unable to establish an integrity profile", 1615 dm_device_name(t->md)); 1616 blk_integrity_unregister(dm_disk(t->md)); 1617 } 1618 } 1619 1620 static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, 1621 sector_t start, sector_t len, void *data) 1622 { 1623 unsigned long flush = (unsigned long) data; 1624 struct request_queue *q = bdev_get_queue(dev->bdev); 1625 1626 return q && (q->queue_flags & flush); 1627 } 1628 1629 static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) 1630 { 1631 struct dm_target *ti; 1632 unsigned i; 1633 1634 /* 1635 * Require at least one underlying device to support flushes. 1636 * t->devices includes internal dm devices such as mirror logs 1637 * so we need to use iterate_devices here, which targets 1638 * supporting flushes must provide. 
1639 */ 1640 for (i = 0; i < dm_table_get_num_targets(t); i++) { 1641 ti = dm_table_get_target(t, i); 1642 1643 if (!ti->num_flush_bios) 1644 continue; 1645 1646 if (ti->flush_supported) 1647 return true; 1648 1649 if (ti->type->iterate_devices && 1650 ti->type->iterate_devices(ti, device_flush_capable, (void *) flush)) 1651 return true; 1652 } 1653 1654 return false; 1655 } 1656 1657 static int device_dax_write_cache_enabled(struct dm_target *ti, 1658 struct dm_dev *dev, sector_t start, 1659 sector_t len, void *data) 1660 { 1661 struct dax_device *dax_dev = dev->dax_dev; 1662 1663 if (!dax_dev) 1664 return false; 1665 1666 if (dax_write_cache_enabled(dax_dev)) 1667 return true; 1668 return false; 1669 } 1670 1671 static int dm_table_supports_dax_write_cache(struct dm_table *t) 1672 { 1673 struct dm_target *ti; 1674 unsigned i; 1675 1676 for (i = 0; i < dm_table_get_num_targets(t); i++) { 1677 ti = dm_table_get_target(t, i); 1678 1679 if (ti->type->iterate_devices && 1680 ti->type->iterate_devices(ti, 1681 device_dax_write_cache_enabled, NULL)) 1682 return true; 1683 } 1684 1685 return false; 1686 } 1687 1688 static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, 1689 sector_t start, sector_t len, void *data) 1690 { 1691 struct request_queue *q = bdev_get_queue(dev->bdev); 1692 1693 return q && blk_queue_nonrot(q); 1694 } 1695 1696 static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, 1697 sector_t start, sector_t len, void *data) 1698 { 1699 struct request_queue *q = bdev_get_queue(dev->bdev); 1700 1701 return q && !blk_queue_add_random(q); 1702 } 1703 1704 static bool dm_table_all_devices_attribute(struct dm_table *t, 1705 iterate_devices_callout_fn func) 1706 { 1707 struct dm_target *ti; 1708 unsigned i; 1709 1710 for (i = 0; i < dm_table_get_num_targets(t); i++) { 1711 ti = dm_table_get_target(t, i); 1712 1713 if (!ti->type->iterate_devices || 1714 !ti->type->iterate_devices(ti, func, NULL)) 1715 return false; 1716 } 1717 1718 
return true; 1719 } 1720 1721 static int device_no_partial_completion(struct dm_target *ti, struct dm_dev *dev, 1722 sector_t start, sector_t len, void *data) 1723 { 1724 char b[BDEVNAME_SIZE]; 1725 1726 /* For now, NVMe devices are the only devices of this class */ 1727 return (strncmp(bdevname(dev->bdev, b), "nvme", 4) == 0); 1728 } 1729 1730 static bool dm_table_does_not_support_partial_completion(struct dm_table *t) 1731 { 1732 return dm_table_all_devices_attribute(t, device_no_partial_completion); 1733 } 1734 1735 static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev, 1736 sector_t start, sector_t len, void *data) 1737 { 1738 struct request_queue *q = bdev_get_queue(dev->bdev); 1739 1740 return q && !q->limits.max_write_same_sectors; 1741 } 1742 1743 static bool dm_table_supports_write_same(struct dm_table *t) 1744 { 1745 struct dm_target *ti; 1746 unsigned i; 1747 1748 for (i = 0; i < dm_table_get_num_targets(t); i++) { 1749 ti = dm_table_get_target(t, i); 1750 1751 if (!ti->num_write_same_bios) 1752 return false; 1753 1754 if (!ti->type->iterate_devices || 1755 ti->type->iterate_devices(ti, device_not_write_same_capable, NULL)) 1756 return false; 1757 } 1758 1759 return true; 1760 } 1761 1762 static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev, 1763 sector_t start, sector_t len, void *data) 1764 { 1765 struct request_queue *q = bdev_get_queue(dev->bdev); 1766 1767 return q && !q->limits.max_write_zeroes_sectors; 1768 } 1769 1770 static bool dm_table_supports_write_zeroes(struct dm_table *t) 1771 { 1772 struct dm_target *ti; 1773 unsigned i = 0; 1774 1775 while (i < dm_table_get_num_targets(t)) { 1776 ti = dm_table_get_target(t, i++); 1777 1778 if (!ti->num_write_zeroes_bios) 1779 return false; 1780 1781 if (!ti->type->iterate_devices || 1782 ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL)) 1783 return false; 1784 } 1785 1786 return true; 1787 } 1788 1789 static int 
device_not_discard_capable(struct dm_target *ti, struct dm_dev *dev, 1790 sector_t start, sector_t len, void *data) 1791 { 1792 struct request_queue *q = bdev_get_queue(dev->bdev); 1793 1794 return q && !blk_queue_discard(q); 1795 } 1796 1797 static bool dm_table_supports_discards(struct dm_table *t) 1798 { 1799 struct dm_target *ti; 1800 unsigned i; 1801 1802 for (i = 0; i < dm_table_get_num_targets(t); i++) { 1803 ti = dm_table_get_target(t, i); 1804 1805 if (!ti->num_discard_bios) 1806 return false; 1807 1808 /* 1809 * Either the target provides discard support (as implied by setting 1810 * 'discards_supported') or it relies on _all_ data devices having 1811 * discard support. 1812 */ 1813 if (!ti->discards_supported && 1814 (!ti->type->iterate_devices || 1815 ti->type->iterate_devices(ti, device_not_discard_capable, NULL))) 1816 return false; 1817 } 1818 1819 return true; 1820 } 1821 1822 static int device_not_secure_erase_capable(struct dm_target *ti, 1823 struct dm_dev *dev, sector_t start, 1824 sector_t len, void *data) 1825 { 1826 struct request_queue *q = bdev_get_queue(dev->bdev); 1827 1828 return q && !blk_queue_secure_erase(q); 1829 } 1830 1831 static bool dm_table_supports_secure_erase(struct dm_table *t) 1832 { 1833 struct dm_target *ti; 1834 unsigned int i; 1835 1836 for (i = 0; i < dm_table_get_num_targets(t); i++) { 1837 ti = dm_table_get_target(t, i); 1838 1839 if (!ti->num_secure_erase_bios) 1840 return false; 1841 1842 if (!ti->type->iterate_devices || 1843 ti->type->iterate_devices(ti, device_not_secure_erase_capable, NULL)) 1844 return false; 1845 } 1846 1847 return true; 1848 } 1849 1850 static int device_requires_stable_pages(struct dm_target *ti, 1851 struct dm_dev *dev, sector_t start, 1852 sector_t len, void *data) 1853 { 1854 struct request_queue *q = bdev_get_queue(dev->bdev); 1855 1856 return q && bdi_cap_stable_pages_required(q->backing_dev_info); 1857 } 1858 1859 /* 1860 * If any underlying device requires stable pages, a table must 
require 1861 * them as well. Only targets that support iterate_devices are considered: 1862 * don't want error, zero, etc to require stable pages. 1863 */ 1864 static bool dm_table_requires_stable_pages(struct dm_table *t) 1865 { 1866 struct dm_target *ti; 1867 unsigned i; 1868 1869 for (i = 0; i < dm_table_get_num_targets(t); i++) { 1870 ti = dm_table_get_target(t, i); 1871 1872 if (ti->type->iterate_devices && 1873 ti->type->iterate_devices(ti, device_requires_stable_pages, NULL)) 1874 return true; 1875 } 1876 1877 return false; 1878 } 1879 1880 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 1881 struct queue_limits *limits) 1882 { 1883 bool wc = false, fua = false; 1884 int page_size = PAGE_SIZE; 1885 1886 /* 1887 * Copy table's limits to the DM device's request_queue 1888 */ 1889 q->limits = *limits; 1890 1891 if (!dm_table_supports_discards(t)) { 1892 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); 1893 /* Must also clear discard limits... */ 1894 q->limits.max_discard_sectors = 0; 1895 q->limits.max_hw_discard_sectors = 0; 1896 q->limits.discard_granularity = 0; 1897 q->limits.discard_alignment = 0; 1898 q->limits.discard_misaligned = 0; 1899 } else 1900 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); 1901 1902 if (dm_table_supports_secure_erase(t)) 1903 blk_queue_flag_set(QUEUE_FLAG_SECERASE, q); 1904 1905 if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { 1906 wc = true; 1907 if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA))) 1908 fua = true; 1909 } 1910 blk_queue_write_cache(q, wc, fua); 1911 1912 if (dm_table_supports_dax(t, device_supports_dax, &page_size)) { 1913 blk_queue_flag_set(QUEUE_FLAG_DAX, q); 1914 if (dm_table_supports_dax(t, device_dax_synchronous, NULL)) 1915 set_dax_synchronous(t->md->dax_dev); 1916 } 1917 else 1918 blk_queue_flag_clear(QUEUE_FLAG_DAX, q); 1919 1920 if (dm_table_supports_dax_write_cache(t)) 1921 dax_write_cache(t->md->dax_dev, true); 1922 1923 /* Ensure that all underlying devices are 
non-rotational. */ 1924 if (dm_table_all_devices_attribute(t, device_is_nonrot)) 1925 blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 1926 else 1927 blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); 1928 1929 if (!dm_table_supports_write_same(t)) 1930 q->limits.max_write_same_sectors = 0; 1931 if (!dm_table_supports_write_zeroes(t)) 1932 q->limits.max_write_zeroes_sectors = 0; 1933 1934 dm_table_verify_integrity(t); 1935 1936 /* 1937 * Some devices don't use blk_integrity but still want stable pages 1938 * because they do their own checksumming. 1939 */ 1940 if (dm_table_requires_stable_pages(t)) 1941 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; 1942 else 1943 q->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; 1944 1945 /* 1946 * Determine whether or not this queue's I/O timings contribute 1947 * to the entropy pool, Only request-based targets use this. 1948 * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not 1949 * have it set. 1950 */ 1951 if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) 1952 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); 1953 1954 /* 1955 * For a zoned target, the number of zones should be updated for the 1956 * correct value to be exposed in sysfs queue/nr_zones. For a BIO based 1957 * target, this is all that is needed. 
1958 */ 1959 #ifdef CONFIG_BLK_DEV_ZONED 1960 if (blk_queue_is_zoned(q)) { 1961 WARN_ON_ONCE(queue_is_mq(q)); 1962 q->nr_zones = blkdev_nr_zones(t->md->disk); 1963 } 1964 #endif 1965 1966 /* Allow reads to exceed readahead limits */ 1967 q->backing_dev_info->io_pages = limits->max_sectors >> (PAGE_SHIFT - 9); 1968 } 1969 1970 unsigned int dm_table_get_num_targets(struct dm_table *t) 1971 { 1972 return t->num_targets; 1973 } 1974 1975 struct list_head *dm_table_get_devices(struct dm_table *t) 1976 { 1977 return &t->devices; 1978 } 1979 1980 fmode_t dm_table_get_mode(struct dm_table *t) 1981 { 1982 return t->mode; 1983 } 1984 EXPORT_SYMBOL(dm_table_get_mode); 1985 1986 enum suspend_mode { 1987 PRESUSPEND, 1988 PRESUSPEND_UNDO, 1989 POSTSUSPEND, 1990 }; 1991 1992 static void suspend_targets(struct dm_table *t, enum suspend_mode mode) 1993 { 1994 int i = t->num_targets; 1995 struct dm_target *ti = t->targets; 1996 1997 lockdep_assert_held(&t->md->suspend_lock); 1998 1999 while (i--) { 2000 switch (mode) { 2001 case PRESUSPEND: 2002 if (ti->type->presuspend) 2003 ti->type->presuspend(ti); 2004 break; 2005 case PRESUSPEND_UNDO: 2006 if (ti->type->presuspend_undo) 2007 ti->type->presuspend_undo(ti); 2008 break; 2009 case POSTSUSPEND: 2010 if (ti->type->postsuspend) 2011 ti->type->postsuspend(ti); 2012 break; 2013 } 2014 ti++; 2015 } 2016 } 2017 2018 void dm_table_presuspend_targets(struct dm_table *t) 2019 { 2020 if (!t) 2021 return; 2022 2023 suspend_targets(t, PRESUSPEND); 2024 } 2025 2026 void dm_table_presuspend_undo_targets(struct dm_table *t) 2027 { 2028 if (!t) 2029 return; 2030 2031 suspend_targets(t, PRESUSPEND_UNDO); 2032 } 2033 2034 void dm_table_postsuspend_targets(struct dm_table *t) 2035 { 2036 if (!t) 2037 return; 2038 2039 suspend_targets(t, POSTSUSPEND); 2040 } 2041 2042 int dm_table_resume_targets(struct dm_table *t) 2043 { 2044 int i, r = 0; 2045 2046 lockdep_assert_held(&t->md->suspend_lock); 2047 2048 for (i = 0; i < t->num_targets; i++) { 2049 struct 
dm_target *ti = t->targets + i; 2050 2051 if (!ti->type->preresume) 2052 continue; 2053 2054 r = ti->type->preresume(ti); 2055 if (r) { 2056 DMERR("%s: %s: preresume failed, error = %d", 2057 dm_device_name(t->md), ti->type->name, r); 2058 return r; 2059 } 2060 } 2061 2062 for (i = 0; i < t->num_targets; i++) { 2063 struct dm_target *ti = t->targets + i; 2064 2065 if (ti->type->resume) 2066 ti->type->resume(ti); 2067 } 2068 2069 return 0; 2070 } 2071 2072 void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb) 2073 { 2074 list_add(&cb->list, &t->target_callbacks); 2075 } 2076 EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks); 2077 2078 int dm_table_any_congested(struct dm_table *t, int bdi_bits) 2079 { 2080 struct dm_dev_internal *dd; 2081 struct list_head *devices = dm_table_get_devices(t); 2082 struct dm_target_callbacks *cb; 2083 int r = 0; 2084 2085 list_for_each_entry(dd, devices, list) { 2086 struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); 2087 char b[BDEVNAME_SIZE]; 2088 2089 if (likely(q)) 2090 r |= bdi_congested(q->backing_dev_info, bdi_bits); 2091 else 2092 DMWARN_LIMIT("%s: any_congested: nonexistent device %s", 2093 dm_device_name(t->md), 2094 bdevname(dd->dm_dev->bdev, b)); 2095 } 2096 2097 list_for_each_entry(cb, &t->target_callbacks, list) 2098 if (cb->congested_fn) 2099 r |= cb->congested_fn(cb, bdi_bits); 2100 2101 return r; 2102 } 2103 2104 struct mapped_device *dm_table_get_md(struct dm_table *t) 2105 { 2106 return t->md; 2107 } 2108 EXPORT_SYMBOL(dm_table_get_md); 2109 2110 const char *dm_table_device_name(struct dm_table *t) 2111 { 2112 return dm_device_name(t->md); 2113 } 2114 EXPORT_SYMBOL_GPL(dm_table_device_name); 2115 2116 void dm_table_run_md_queue_async(struct dm_table *t) 2117 { 2118 struct mapped_device *md; 2119 struct request_queue *queue; 2120 2121 if (!dm_table_request_based(t)) 2122 return; 2123 2124 md = dm_table_get_md(t); 2125 queue = dm_get_md_queue(md); 2126 if (queue) 2127 
blk_mq_run_hw_queues(queue, true); 2128 } 2129 EXPORT_SYMBOL(dm_table_run_md_queue_async); 2130 2131