/*
 * Copyright (C) 2001 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-core.h"

#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/blk-mq.h>
#include <linux/mount.h>
#include <linux/dax.h>

#define DM_MSG_PREFIX "table"

#define NODE_SIZE L1_CACHE_BYTES
#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)

/*
 * Similar to ceiling(log_size(n))
 */
static unsigned int int_log(unsigned int n, unsigned int base)
{
	int result = 0;

	while (n > 1) {
		n = dm_div_up(n, base);
		result++;
	}

	return result;
}

/*
 * Calculate the index of the child node of the n'th node k'th key.
 */
static inline unsigned int get_child(unsigned int n, unsigned int k)
{
	return (n * CHILDREN_PER_NODE) + k;
}

/*
 * Return the n'th node of level l from table t.
 */
static inline sector_t *get_node(struct dm_table *t,
				 unsigned int l, unsigned int n)
{
	return t->index[l] + (n * KEYS_PER_NODE);
}

/*
 * Return the highest key that you could lookup from the n'th
 * node on level l of the btree.
 */
static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
{
	for (; l < t->depth - 1; l++)
		n = get_child(n, CHILDREN_PER_NODE - 1);

	if (n >= t->counts[l])
		return (sector_t) - 1;

	return get_node(t, l, n)[KEYS_PER_NODE - 1];
}

/*
 * Fills in a level of the btree based on the highs of the level
 * below it.
 */
static int setup_btree_index(unsigned int l, struct dm_table *t)
{
	unsigned int n, k;
	sector_t *node;

	for (n = 0U; n < t->counts[l]; n++) {
		node = get_node(t, l, n);

		for (k = 0U; k < KEYS_PER_NODE; k++)
			node[k] = high(t, l + 1, get_child(n, k));
	}

	return 0;
}

void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
{
	unsigned long size;
	void *addr;

	/*
	 * Check that we're not going to overflow.
	 */
	if (nmemb > (ULONG_MAX / elem_size))
		return NULL;

	size = nmemb * elem_size;
	addr = vzalloc(size);

	return addr;
}
EXPORT_SYMBOL(dm_vcalloc);

/*
 * highs, and targets are managed as dynamic arrays during a
 * table load.
 */
static int alloc_targets(struct dm_table *t, unsigned int num)
{
	sector_t *n_highs;
	struct dm_target *n_targets;

	/*
	 * Allocate both the target array and offset array at once.
	 */
	n_highs = (sector_t *) dm_vcalloc(num, sizeof(struct dm_target) +
					  sizeof(sector_t));
	if (!n_highs)
		return -ENOMEM;

	n_targets = (struct dm_target *) (n_highs + num);

	memset(n_highs, -1, sizeof(*n_highs) * num);
	vfree(t->highs);

	t->num_allocated = num;
	t->highs = n_highs;
	t->targets = n_targets;

	return 0;
}

int dm_table_create(struct dm_table **result, fmode_t mode,
		    unsigned num_targets, struct mapped_device *md)
{
	struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL);

	if (!t)
		return -ENOMEM;

	INIT_LIST_HEAD(&t->devices);

	if (!num_targets)
		num_targets = KEYS_PER_NODE;

	num_targets = dm_round_up(num_targets, KEYS_PER_NODE);

	if (!num_targets) {
		kfree(t);
		return -ENOMEM;
	}

	if (alloc_targets(t, num_targets)) {
		kfree(t);
		return -ENOMEM;
	}

	t->type = DM_TYPE_NONE;
	t->mode = mode;
	t->md = md;
	*result = t;
	return 0;
}

static void free_devices(struct list_head *devices, struct mapped_device *md)
{
	struct list_head *tmp, *next;

	list_for_each_safe(tmp, next, devices) {
		struct dm_dev_internal *dd =
		    list_entry(tmp, struct dm_dev_internal, list);
		DMWARN("%s: dm_table_destroy: dm_put_device call missing for %s",
		       dm_device_name(md), dd->dm_dev->name);
		dm_put_table_device(md, dd->dm_dev);
		kfree(dd);
	}
}

void dm_table_destroy(struct dm_table *t)
{
	unsigned int i;

	if (!t)
		return;

	/* free the indexes */
	if (t->depth >= 2)
		vfree(t->index[t->depth - 2]);

	/* free the targets */
	for (i = 0; i < t->num_targets; i++) {
		struct dm_target *tgt = t->targets + i;

		if (tgt->type->dtr)
			tgt->type->dtr(tgt);

		dm_put_target_type(tgt->type);
	}

	vfree(t->highs);

	/* free the device list */
	free_devices(&t->devices, t->md);

	dm_free_md_mempools(t->mempools);

	kfree(t);
}

/*
 * See if we've already got a device in the list.
 */
static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
{
	struct dm_dev_internal *dd;

	list_for_each_entry (dd, l, list)
		if (dd->dm_dev->bdev->bd_dev == dev)
			return dd;

	return NULL;
}

/*
 * If possible, this checks whether an area of a destination device is invalid.
 */
static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
				  sector_t start, sector_t len, void *data)
{
	struct queue_limits *limits = data;
	struct block_device *bdev = dev->bdev;
	sector_t dev_size =
		i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
	unsigned short logical_block_size_sectors =
		limits->logical_block_size >> SECTOR_SHIFT;
	char b[BDEVNAME_SIZE];

	if (!dev_size)
		return 0;

	if ((start >= dev_size) || (start + len > dev_size)) {
		DMWARN("%s: %s too small for target: "
		       "start=%llu, len=%llu, dev_size=%llu",
		       dm_device_name(ti->table->md), bdevname(bdev, b),
		       (unsigned long long)start,
		       (unsigned long long)len,
		       (unsigned long long)dev_size);
		return 1;
	}

	/*
	 * If the target is mapped to zoned block device(s), check
	 * that the zones are not partially mapped.
	 */
	if (bdev_zoned_model(bdev) != BLK_ZONED_NONE) {
		unsigned int zone_sectors = bdev_zone_sectors(bdev);

		if (start & (zone_sectors - 1)) {
			DMWARN("%s: start=%llu not aligned to h/w zone size %u of %s",
			       dm_device_name(ti->table->md),
			       (unsigned long long)start,
			       zone_sectors, bdevname(bdev, b));
			return 1;
		}

		/*
		 * Note: The last zone of a zoned block device may be smaller
		 * than other zones. So for a target mapping the end of a
		 * zoned block device with such a zone, len would not be zone
		 * aligned. We do not allow such last smaller zone to be part
		 * of the mapping here to ensure that mappings with multiple
		 * devices do not end up with a smaller zone in the middle of
		 * the sector range.
		 */
		if (len & (zone_sectors - 1)) {
			DMWARN("%s: len=%llu not aligned to h/w zone size %u of %s",
			       dm_device_name(ti->table->md),
			       (unsigned long long)len,
			       zone_sectors, bdevname(bdev, b));
			return 1;
		}
	}

	if (logical_block_size_sectors <= 1)
		return 0;

	if (start & (logical_block_size_sectors - 1)) {
		DMWARN("%s: start=%llu not aligned to h/w "
		       "logical block size %u of %s",
		       dm_device_name(ti->table->md),
		       (unsigned long long)start,
		       limits->logical_block_size, bdevname(bdev, b));
		return 1;
	}

	if (len & (logical_block_size_sectors - 1)) {
		DMWARN("%s: len=%llu not aligned to h/w "
		       "logical block size %u of %s",
		       dm_device_name(ti->table->md),
		       (unsigned long long)len,
		       limits->logical_block_size, bdevname(bdev, b));
		return 1;
	}

	return 0;
}

/*
 * This upgrades the mode on an already open dm_dev, being
 * careful to leave things as they were if we fail to reopen the
 * device and not to touch the existing bdev field in case
 * it is accessed concurrently.
 */
static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
			struct mapped_device *md)
{
	int r;
	struct dm_dev *old_dev, *new_dev;

	old_dev = dd->dm_dev;

	r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev,
				dd->dm_dev->mode | new_mode, &new_dev);
	if (r)
		return r;

	dd->dm_dev = new_dev;
	dm_put_table_device(md, old_dev);

	return 0;
}

/*
 * Convert the path to a device
 */
dev_t dm_get_dev_t(const char *path)
{
	dev_t dev;

	if (lookup_bdev(path, &dev))
		dev = name_to_dev_t(path);
	return dev;
}
EXPORT_SYMBOL_GPL(dm_get_dev_t);

/*
 * Add a device to the list, or just increment the usage count if
 * it's already present.
 */
int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
		  struct dm_dev **result)
{
	int r;
	dev_t dev;
	unsigned int major, minor;
	char dummy;
	struct dm_dev_internal *dd;
	struct dm_table *t = ti->table;

	BUG_ON(!t);

	if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) {
		/* Extract the major/minor numbers */
		dev = MKDEV(major, minor);
		if (MAJOR(dev) != major || MINOR(dev) != minor)
			return -EOVERFLOW;
	} else {
		dev = dm_get_dev_t(path);
		if (!dev)
			return -ENODEV;
	}

	dd = find_device(&t->devices, dev);
	if (!dd) {
		dd = kmalloc(sizeof(*dd), GFP_KERNEL);
		if (!dd)
			return -ENOMEM;

		if ((r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev))) {
			kfree(dd);
			return r;
		}

		refcount_set(&dd->count, 1);
		list_add(&dd->list, &t->devices);
		goto out;

	} else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) {
		r = upgrade_mode(dd, mode, t->md);
		if (r)
			return r;
	}
	refcount_inc(&dd->count);
out:
	*result = dd->dm_dev;
	return 0;
}
EXPORT_SYMBOL(dm_get_device);

static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
				sector_t start, sector_t len, void *data)
{
	struct queue_limits *limits = data;
	struct block_device *bdev = dev->bdev;
	struct request_queue *q = bdev_get_queue(bdev);
	char b[BDEVNAME_SIZE];

	if (unlikely(!q)) {
		DMWARN("%s: Cannot set limits for nonexistent device %s",
		       dm_device_name(ti->table->md), bdevname(bdev, b));
		return 0;
	}

	if (blk_stack_limits(limits, &q->limits,
			get_start_sect(bdev) + start) < 0)
		DMWARN("%s: adding target device %s caused an alignment inconsistency: "
		       "physical_block_size=%u, logical_block_size=%u, "
		       "alignment_offset=%u, start=%llu",
		       dm_device_name(ti->table->md), bdevname(bdev, b),
		       q->limits.physical_block_size,
		       q->limits.logical_block_size,
		       q->limits.alignment_offset,
		       (unsigned long long) start << SECTOR_SHIFT);
	return 0;
}

/*
 * Decrement a device's use count and remove it if necessary.
 */
void dm_put_device(struct dm_target *ti, struct dm_dev *d)
{
	int found = 0;
	struct list_head *devices = &ti->table->devices;
	struct dm_dev_internal *dd;

	list_for_each_entry(dd, devices, list) {
		if (dd->dm_dev == d) {
			found = 1;
			break;
		}
	}
	if (!found) {
		DMWARN("%s: device %s not in table devices list",
		       dm_device_name(ti->table->md), d->name);
		return;
	}
	if (refcount_dec_and_test(&dd->count)) {
		dm_put_table_device(ti->table->md, d);
		list_del(&dd->list);
		kfree(dd);
	}
}
EXPORT_SYMBOL(dm_put_device);

/*
 * Checks to see if the target joins onto the end of the table.
 */
static int adjoin(struct dm_table *table, struct dm_target *ti)
{
	struct dm_target *prev;

	if (!table->num_targets)
		return !ti->begin;

	prev = &table->targets[table->num_targets - 1];
	return (ti->begin == (prev->begin + prev->len));
}

/*
 * Used to dynamically allocate the arg array.
 *
 * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must
 * process messages even if some device is suspended. These messages have a
 * small fixed number of arguments.
 *
 * On the other hand, dm-switch needs to process bulk data using messages and
 * excessive use of GFP_NOIO could cause trouble.
 */
static char **realloc_argv(unsigned *size, char **old_argv)
{
	char **argv;
	unsigned new_size;
	gfp_t gfp;

	if (*size) {
		new_size = *size * 2;
		gfp = GFP_KERNEL;
	} else {
		new_size = 8;
		gfp = GFP_NOIO;
	}
	argv = kmalloc_array(new_size, sizeof(*argv), gfp);
	if (argv && old_argv) {
		memcpy(argv, old_argv, *size * sizeof(*argv));
		*size = new_size;
	}

	kfree(old_argv);
	return argv;
}

/*
 * Destructively splits up the argument list to pass to ctr.
 */
int dm_split_args(int *argc, char ***argvp, char *input)
{
	char *start, *end = input, *out, **argv = NULL;
	unsigned array_size = 0;

	*argc = 0;

	if (!input) {
		*argvp = NULL;
		return 0;
	}

	argv = realloc_argv(&array_size, argv);
	if (!argv)
		return -ENOMEM;

	while (1) {
		/* Skip whitespace */
		start = skip_spaces(end);

		if (!*start)
			break;	/* success, we hit the end */

		/* 'out' is used to remove any back-quotes */
		end = out = start;
		while (*end) {
			/* Everything apart from '\0' can be quoted */
			if (*end == '\\' && *(end + 1)) {
				*out++ = *(end + 1);
				end += 2;
				continue;
			}

			if (isspace(*end))
				break;	/* end of token */

			*out++ = *end++;
		}

		/* have we already filled the array ? */
		if ((*argc + 1) > array_size) {
			argv = realloc_argv(&array_size, argv);
			if (!argv)
				return -ENOMEM;
		}

		/* we know this is whitespace */
		if (*end)
			end++;

		/* terminate the string and put it in the array */
		*out = '\0';
		argv[*argc] = start;
		(*argc)++;
	}

	*argvp = argv;
	return 0;
}

/*
 * Impose necessary and sufficient conditions on a device's table such
 * that any incoming bio which respects its logical_block_size can be
 * processed successfully.  If it falls across the boundary between
 * two or more targets, the size of each piece it gets split into must
 * be compatible with the logical_block_size of the target processing it.
 */
static int validate_hardware_logical_block_alignment(struct dm_table *table,
						     struct queue_limits *limits)
{
	/*
	 * This function uses arithmetic modulo the logical_block_size
	 * (in units of 512-byte sectors).
	 */
	unsigned short device_logical_block_size_sects =
		limits->logical_block_size >> SECTOR_SHIFT;

	/*
	 * Offset of the start of the next table entry, mod logical_block_size.
	 */
	unsigned short next_target_start = 0;

	/*
	 * Given an aligned bio that extends beyond the end of a
	 * target, how many sectors must the next target handle?
	 */
	unsigned short remaining = 0;

	struct dm_target *ti;
	struct queue_limits ti_limits;
	unsigned i;

	/*
	 * Check each entry in the table in turn.
	 */
	for (i = 0; i < dm_table_get_num_targets(table); i++) {
		ti = dm_table_get_target(table, i);

		blk_set_stacking_limits(&ti_limits);

		/* combine all target devices' limits */
		if (ti->type->iterate_devices)
			ti->type->iterate_devices(ti, dm_set_device_limits,
						  &ti_limits);

		/*
		 * If the remaining sectors fall entirely within this
		 * table entry are they compatible with its logical_block_size?
		 */
		if (remaining < ti->len &&
		    remaining & ((ti_limits.logical_block_size >>
				  SECTOR_SHIFT) - 1))
			break;	/* Error */

		next_target_start =
		    (unsigned short) ((next_target_start + ti->len) &
				      (device_logical_block_size_sects - 1));
		remaining = next_target_start ?
		    device_logical_block_size_sects - next_target_start : 0;
	}

	if (remaining) {
		DMWARN("%s: table line %u (start sect %llu len %llu) "
		       "not aligned to h/w logical block size %u",
		       dm_device_name(table->md), i,
		       (unsigned long long) ti->begin,
		       (unsigned long long) ti->len,
		       limits->logical_block_size);
		return -EINVAL;
	}

	return 0;
}

int dm_table_add_target(struct dm_table *t, const char *type,
			sector_t start, sector_t len, char *params)
{
	int r = -EINVAL, argc;
	char **argv;
	struct dm_target *tgt;

	if (t->singleton) {
		DMERR("%s: target type %s must appear alone in table",
		      dm_device_name(t->md), t->targets->type->name);
		return -EINVAL;
	}

	BUG_ON(t->num_targets >= t->num_allocated);

	tgt = t->targets + t->num_targets;
	memset(tgt, 0, sizeof(*tgt));

	if (!len) {
		DMERR("%s: zero-length target", dm_device_name(t->md));
		return -EINVAL;
	}

	tgt->type = dm_get_target_type(type);
	if (!tgt->type) {
		DMERR("%s: %s: unknown target type", dm_device_name(t->md), type);
		return -EINVAL;
	}

	if (dm_target_needs_singleton(tgt->type)) {
		if (t->num_targets) {
			tgt->error = "singleton target type must appear alone in table";
			goto bad;
		}
		t->singleton = true;
	}

	if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) {
		tgt->error = "target type may not be included in a read-only table";
		goto bad;
	}

	if (t->immutable_target_type) {
		if (t->immutable_target_type != tgt->type) {
			tgt->error = "immutable target type cannot be mixed with other target types";
			goto bad;
		}
	} else if (dm_target_is_immutable(tgt->type)) {
		if (t->num_targets) {
			tgt->error = "immutable target type cannot be mixed with other target types";
			goto bad;
		}
		t->immutable_target_type = tgt->type;
	}

	if (dm_target_has_integrity(tgt->type))
		t->integrity_added = 1;

	tgt->table = t;
	tgt->begin = start;
	tgt->len = len;
	tgt->error = "Unknown error";

	/*
	 * Does this target adjoin the previous one ?
	 */
	if (!adjoin(t, tgt)) {
		tgt->error = "Gap in table";
		goto bad;
	}

	r = dm_split_args(&argc, &argv, params);
	if (r) {
		tgt->error = "couldn't split parameters (insufficient memory)";
		goto bad;
	}

	r = tgt->type->ctr(tgt, argc, argv);
	kfree(argv);
	if (r)
		goto bad;

	t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;

	if (!tgt->num_discard_bios && tgt->discards_supported)
		DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.",
		       dm_device_name(t->md), type);

	return 0;

 bad:
	DMERR("%s: %s: %s", dm_device_name(t->md), type, tgt->error);
	dm_put_target_type(tgt->type);
	return r;
}

/*
 * Target argument parsing helpers.
 */
static int validate_next_arg(const struct dm_arg *arg,
			     struct dm_arg_set *arg_set,
			     unsigned *value, char **error, unsigned grouped)
{
	const char *arg_str = dm_shift_arg(arg_set);
	char dummy;

	if (!arg_str ||
	    (sscanf(arg_str, "%u%c", value, &dummy) != 1) ||
	    (*value < arg->min) ||
	    (*value > arg->max) ||
	    (grouped && arg_set->argc < *value)) {
		*error = arg->error;
		return -EINVAL;
	}

	return 0;
}

int dm_read_arg(const struct dm_arg *arg, struct dm_arg_set *arg_set,
		unsigned *value, char **error)
{
	return validate_next_arg(arg, arg_set, value, error, 0);
}
EXPORT_SYMBOL(dm_read_arg);

int dm_read_arg_group(const struct dm_arg *arg, struct dm_arg_set *arg_set,
		      unsigned *value, char **error)
{
	return validate_next_arg(arg, arg_set, value, error, 1);
}
EXPORT_SYMBOL(dm_read_arg_group);

const char *dm_shift_arg(struct dm_arg_set *as)
{
	char *r;

	if (as->argc) {
		as->argc--;
		r = *as->argv;
		as->argv++;
		return r;
	}

	return NULL;
}
EXPORT_SYMBOL(dm_shift_arg);

void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
{
	BUG_ON(as->argc < num_args);
	as->argc -= num_args;
	as->argv += num_args;
}
EXPORT_SYMBOL(dm_consume_args);

static bool __table_type_bio_based(enum dm_queue_mode table_type)
{
	return (table_type == DM_TYPE_BIO_BASED ||
		table_type == DM_TYPE_DAX_BIO_BASED);
}

static bool __table_type_request_based(enum dm_queue_mode table_type)
{
	return table_type == DM_TYPE_REQUEST_BASED;
}

void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
{
	t->type = type;
}
EXPORT_SYMBOL_GPL(dm_table_set_type);

/* validate the dax capability of the target device span */
int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
			sector_t start, sector_t len, void *data)
{
	int blocksize = *(int *) data, id;
	bool rc;

	id = dax_read_lock();
	rc = dax_supported(dev->dax_dev, dev->bdev, blocksize, start, len);
	dax_read_unlock(id);

	return rc;
}

/* Check devices support synchronous DAX */
static int device_dax_synchronous(struct dm_target *ti, struct dm_dev *dev,
				  sector_t start, sector_t len, void *data)
{
	return dev->dax_dev && dax_synchronous(dev->dax_dev);
}

bool dm_table_supports_dax(struct dm_table *t,
			   iterate_devices_callout_fn iterate_fn, int *blocksize)
{
	struct dm_target *ti;
	unsigned i;

	/* Ensure that all targets support DAX. */
	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		ti = dm_table_get_target(t, i);

		if (!ti->type->direct_access)
			return false;

		if (!ti->type->iterate_devices ||
		    !ti->type->iterate_devices(ti, iterate_fn, blocksize))
			return false;
	}

	return true;
}

static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev,
				  sector_t start, sector_t len, void *data)
{
	struct block_device *bdev = dev->bdev;
	struct request_queue *q = bdev_get_queue(bdev);

	/* request-based cannot stack on partitions! */
	if (bdev_is_partition(bdev))
		return false;

	return queue_is_mq(q);
}

static int dm_table_determine_type(struct dm_table *t)
{
	unsigned i;
	unsigned bio_based = 0, request_based = 0, hybrid = 0;
	struct dm_target *tgt;
	struct list_head *devices = dm_table_get_devices(t);
	enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
	int page_size = PAGE_SIZE;

	if (t->type != DM_TYPE_NONE) {
		/* target already set the table's type */
		if (t->type == DM_TYPE_BIO_BASED) {
			/* possibly upgrade to a variant of bio-based */
			goto verify_bio_based;
		}
		BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED);
		goto verify_rq_based;
	}

	for (i = 0; i < t->num_targets; i++) {
		tgt = t->targets + i;
		if (dm_target_hybrid(tgt))
			hybrid = 1;
		else if (dm_target_request_based(tgt))
			request_based = 1;
		else
			bio_based = 1;

		if (bio_based && request_based) {
			DMERR("Inconsistent table: different target types"
			      " can't be mixed up");
			return -EINVAL;
		}
	}

	if (hybrid && !bio_based && !request_based) {
		/*
		 * The targets can work either way.
		 * Determine the type from the live device.
		 * Default to bio-based if device is new.
		 */
		if (__table_type_request_based(live_md_type))
			request_based = 1;
		else
			bio_based = 1;
	}

	if (bio_based) {
verify_bio_based:
		/* We must use this table as bio-based */
		t->type = DM_TYPE_BIO_BASED;
		if (dm_table_supports_dax(t, device_supports_dax, &page_size) ||
		    (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) {
			t->type = DM_TYPE_DAX_BIO_BASED;
		}
		return 0;
	}

	BUG_ON(!request_based); /* No targets in this table */

	t->type = DM_TYPE_REQUEST_BASED;

verify_rq_based:
	/*
	 * Request-based dm supports only tables that have a single target now.
	 * To support multiple targets, request splitting support is needed,
	 * and that needs lots of changes in the block-layer.
	 * (e.g. request completion process for partial completion.)
	 */
	if (t->num_targets > 1) {
		DMERR("request-based DM doesn't support multiple targets");
		return -EINVAL;
	}

	if (list_empty(devices)) {
		int srcu_idx;
		struct dm_table *live_table = dm_get_live_table(t->md, &srcu_idx);

		/* inherit live table's type */
		if (live_table)
			t->type = live_table->type;
		dm_put_live_table(t->md, srcu_idx);
		return 0;
	}

	tgt = dm_table_get_immutable_target(t);
	if (!tgt) {
		DMERR("table load rejected: immutable target is required");
		return -EINVAL;
	} else if (tgt->max_io_len) {
		DMERR("table load rejected: immutable target that splits IO is not supported");
		return -EINVAL;
	}

	/* Non-request-stackable devices can't be used for request-based dm */
	if (!tgt->type->iterate_devices ||
	    !tgt->type->iterate_devices(tgt, device_is_rq_stackable, NULL)) {
		DMERR("table load rejected: including non-request-stackable devices");
		return -EINVAL;
	}

	return 0;
}

enum dm_queue_mode dm_table_get_type(struct dm_table *t)
{
	return t->type;
}

struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)
{
	return t->immutable_target_type;
}

struct dm_target *dm_table_get_immutable_target(struct dm_table *t)
{
	/* Immutable target is implicitly a singleton */
	if (t->num_targets > 1 ||
	    !dm_target_is_immutable(t->targets[0].type))
		return NULL;

	return t->targets;
}

struct dm_target *dm_table_get_wildcard_target(struct dm_table *t)
{
	struct dm_target *ti;
	unsigned i;

	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		ti = dm_table_get_target(t, i);
		if (dm_target_is_wildcard(ti->type))
			return ti;
	}

	return NULL;
}

bool dm_table_bio_based(struct dm_table *t)
{
	return __table_type_bio_based(dm_table_get_type(t));
}

bool dm_table_request_based(struct dm_table *t)
{
	return __table_type_request_based(dm_table_get_type(t));
}

static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
{
	enum dm_queue_mode type = dm_table_get_type(t);
	unsigned per_io_data_size = 0;
	unsigned min_pool_size = 0;
	struct dm_target *ti;
	unsigned i;

	if (unlikely(type == DM_TYPE_NONE)) {
		DMWARN("no table type is set, can't allocate mempools");
		return -EINVAL;
	}

	if (__table_type_bio_based(type))
		for (i = 0; i < t->num_targets; i++) {
			ti = t->targets + i;
			per_io_data_size = max(per_io_data_size, ti->per_io_data_size);
			min_pool_size = max(min_pool_size, ti->num_flush_bios);
		}

	t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported,
					   per_io_data_size, min_pool_size);
	if (!t->mempools)
		return -ENOMEM;

	return 0;
}

void dm_table_free_md_mempools(struct dm_table *t)
{
	dm_free_md_mempools(t->mempools);
	t->mempools = NULL;
}

struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t)
{
	return t->mempools;
}

static int setup_indexes(struct dm_table *t)
{
	int i;
	unsigned int total = 0;
	sector_t *indexes;

	/* allocate the space for *all* the indexes */
	for (i = t->depth - 2; i >= 0; i--) {
		t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
		total += t->counts[i];
	}

	indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE);
	if (!indexes)
		return -ENOMEM;

	/* set up internal nodes, bottom-up */
	for (i = t->depth - 2; i >= 0; i--) {
		t->index[i] = indexes;
		indexes += (KEYS_PER_NODE * t->counts[i]);
		setup_btree_index(i, t);
	}

	return 0;
}

/*
 * Builds the btree to index the map.
 */
static int dm_table_build_index(struct dm_table *t)
{
	int r = 0;
	unsigned int leaf_nodes;

	/* how many indexes will the btree have ? */
	leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
	t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);

	/* leaf layer has already been set up */
	t->counts[t->depth - 1] = leaf_nodes;
	t->index[t->depth - 1] = t->highs;

	if (t->depth >= 2)
		r = setup_indexes(t);

	return r;
}

static bool integrity_profile_exists(struct gendisk *disk)
{
	return !!blk_get_integrity(disk);
}

/*
 * Get a disk whose integrity profile reflects the table's profile.
 * Returns NULL if integrity support was inconsistent or unavailable.
 */
static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t)
{
	struct list_head *devices = dm_table_get_devices(t);
	struct dm_dev_internal *dd = NULL;
	struct gendisk *prev_disk = NULL, *template_disk = NULL;
	unsigned i;

	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		struct dm_target *ti = dm_table_get_target(t, i);
		if (!dm_target_passes_integrity(ti->type))
			goto no_integrity;
	}

	list_for_each_entry(dd, devices, list) {
		template_disk = dd->dm_dev->bdev->bd_disk;
		if (!integrity_profile_exists(template_disk))
			goto no_integrity;
		else if (prev_disk &&
			 blk_integrity_compare(prev_disk, template_disk) < 0)
			goto no_integrity;
		prev_disk = template_disk;
	}

	return template_disk;

no_integrity:
	if (prev_disk)
		DMWARN("%s: integrity not set: %s and %s profile mismatch",
		       dm_device_name(t->md),
		       prev_disk->disk_name,
		       template_disk->disk_name);
	return NULL;
}

/*
 * Register the mapped device for blk_integrity support if the
 * underlying devices have an integrity profile.  But all devices may
 * not have matching profiles (checking all devices isn't reliable
 * during table load because this table may use other DM device(s) which
 * must be resumed before they will have an initialized integrity
 * profile).  Consequently, stacked DM devices force a 2 stage integrity
 * profile validation: First pass during table load, final pass during
 * resume.
 */
static int dm_table_register_integrity(struct dm_table *t)
{
	struct mapped_device *md = t->md;
	struct gendisk *template_disk = NULL;

	/* If target handles integrity itself do not register it here. */
	if (t->integrity_added)
		return 0;

	template_disk = dm_table_get_integrity_disk(t);
	if (!template_disk)
		return 0;

	if (!integrity_profile_exists(dm_disk(md))) {
		t->integrity_supported = true;
		/*
		 * Register integrity profile during table load; we can do
		 * this because the final profile must match during resume.
		 */
		blk_integrity_register(dm_disk(md),
				       blk_get_integrity(template_disk));
		return 0;
	}

	/*
	 * If DM device already has an initialized integrity
	 * profile the new profile should not conflict.
	 */
	if (blk_integrity_compare(dm_disk(md), template_disk) < 0) {
		DMWARN("%s: conflict with existing integrity profile: "
		       "%s profile mismatch",
		       dm_device_name(t->md),
		       template_disk->disk_name);
		return 1;
	}

	/* Preserve existing integrity profile */
	t->integrity_supported = true;
	return 0;
}

/*
 * Prepares the table for use by building the indices,
 * setting the type, and allocating mempools.
 */
int dm_table_complete(struct dm_table *t)
{
	int r;

	r = dm_table_determine_type(t);
	if (r) {
		DMERR("unable to determine table type");
		return r;
	}

	r = dm_table_build_index(t);
	if (r) {
		DMERR("unable to build btrees");
		return r;
	}

	r = dm_table_register_integrity(t);
	if (r) {
		DMERR("could not register integrity profile.");
		return r;
	}

	r = dm_table_alloc_md_mempools(t, t->md);
	if (r)
		DMERR("unable to allocate mempools");

	return r;
}

static DEFINE_MUTEX(_event_lock);
void dm_table_event_callback(struct dm_table *t,
			     void (*fn)(void *), void *context)
{
	mutex_lock(&_event_lock);
	t->event_fn = fn;
	t->event_context = context;
	mutex_unlock(&_event_lock);
}

void dm_table_event(struct dm_table *t)
{
	mutex_lock(&_event_lock);
	if (t->event_fn)
		t->event_fn(t->event_context);
	mutex_unlock(&_event_lock);
}
EXPORT_SYMBOL(dm_table_event);

inline sector_t dm_table_get_size(struct dm_table *t)
{
	return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
}
EXPORT_SYMBOL(dm_table_get_size);

struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
{
	if (index >= t->num_targets)
		return NULL;

	return t->targets + index;
}

/*
 * Search the btree for the correct target.
 *
 * Caller should check returned pointer for NULL
 * to trap I/O beyond end of device.
 */
struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
{
	unsigned int l, n = 0, k = 0;
	sector_t *node;

	if (unlikely(sector >= dm_table_get_size(t)))
		return NULL;

	for (l = 0; l < t->depth; l++) {
		n = get_child(n, k);
		node = get_node(t, l, n);

		for (k = 0; k < KEYS_PER_NODE; k++)
			if (node[k] >= sector)
				break;
	}

	return &t->targets[(KEYS_PER_NODE * n) + k];
}

static int count_device(struct dm_target *ti, struct dm_dev *dev,
			sector_t start, sector_t len, void *data)
{
	unsigned *num_devices = data;

	(*num_devices)++;

	return 0;
}

/*
 * Check whether a table has no data devices attached using each
 * target's iterate_devices method.
 * Returns false if the result is unknown because a target doesn't
 * support iterate_devices.
 */
bool dm_table_has_no_data_devices(struct dm_table *table)
{
	struct dm_target *ti;
	unsigned i, num_devices;

	for (i = 0; i < dm_table_get_num_targets(table); i++) {
		ti = dm_table_get_target(table, i);

		if (!ti->type->iterate_devices)
			return false;

		num_devices = 0;
		ti->type->iterate_devices(ti, count_device, &num_devices);
		if (num_devices)
			return false;
	}

	return true;
}

static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev,
				 sector_t start, sector_t len, void *data)
{
	struct request_queue *q = bdev_get_queue(dev->bdev);
	enum blk_zoned_model *zoned_model = data;

	return q && blk_queue_zoned_model(q) == *zoned_model;
}

static bool dm_table_supports_zoned_model(struct dm_table *t,
					  enum blk_zoned_model zoned_model)
{
	struct dm_target *ti;
	unsigned i;

	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		ti = dm_table_get_target(t, i);

		if (zoned_model == BLK_ZONED_HM &&
		    !dm_target_supports_zoned_hm(ti->type))
			return false;

		if (!ti->type->iterate_devices ||
		    !ti->type->iterate_devices(ti, device_is_zoned_model, &zoned_model))
			return false;
	}

	return true;
}

static int device_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev,
				       sector_t start, sector_t len, void *data)
{
	struct request_queue *q = bdev_get_queue(dev->bdev);
	unsigned int *zone_sectors = data;

	return q && blk_queue_zone_sectors(q) == *zone_sectors;
}

static bool dm_table_matches_zone_sectors(struct dm_table *t,
					  unsigned int zone_sectors)
{
	struct dm_target *ti;
	unsigned i;

	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		ti = dm_table_get_target(t, i);

		if (!ti->type->iterate_devices ||
		    !ti->type->iterate_devices(ti, device_matches_zone_sectors, &zone_sectors))
			return false;
	}

	return true;
}

static int validate_hardware_zoned_model(struct dm_table *table,
					 enum blk_zoned_model zoned_model,
					 unsigned int zone_sectors)
{
	if (zoned_model == BLK_ZONED_NONE)
		return 0;

	if (!dm_table_supports_zoned_model(table, zoned_model)) {
		DMERR("%s: zoned model is not consistent across all devices",
		      dm_device_name(table->md));
		return -EINVAL;
	}

	/* Check zone size validity and compatibility */
	if (!zone_sectors || !is_power_of_2(zone_sectors))
		return -EINVAL;

	if (!dm_table_matches_zone_sectors(table, zone_sectors)) {
		DMERR("%s: zone sectors is not consistent across all devices",
		      dm_device_name(table->md));
		return -EINVAL;
	}

	return 0;
}

/*
 * Establish the new table's queue_limits and validate them.
 */
int dm_calculate_queue_limits(struct dm_table *table,
			      struct queue_limits *limits)
{
	struct dm_target *ti;
	struct queue_limits ti_limits;
	unsigned i;
	enum blk_zoned_model zoned_model = BLK_ZONED_NONE;
	unsigned int zone_sectors = 0;

	blk_set_stacking_limits(limits);

	for (i = 0; i < dm_table_get_num_targets(table); i++) {
		blk_set_stacking_limits(&ti_limits);

		ti = dm_table_get_target(table, i);

		if (!ti->type->iterate_devices)
			goto combine_limits;

		/*
		 * Combine queue limits of all the devices this target uses.
		 */
		ti->type->iterate_devices(ti, dm_set_device_limits,
					  &ti_limits);

		if (zoned_model == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) {
			/*
			 * After stacking all limits, validate all devices
			 * in table support this zoned model and zone sectors.
			 */
			zoned_model = ti_limits.zoned;
			zone_sectors = ti_limits.chunk_sectors;
		}

		/* Set I/O hints portion of queue limits */
		if (ti->type->io_hints)
			ti->type->io_hints(ti, &ti_limits);

		/*
		 * Check each device area is consistent with the target's
		 * overall queue limits.
		 */
		if (ti->type->iterate_devices(ti, device_area_is_invalid,
					      &ti_limits))
			return -EINVAL;

combine_limits:
		/*
		 * Merge this target's queue limits into the overall limits
		 * for the table.
		 */
		if (blk_stack_limits(limits, &ti_limits, 0) < 0)
			DMWARN("%s: adding target device "
			       "(start sect %llu len %llu) "
			       "caused an alignment inconsistency",
			       dm_device_name(table->md),
			       (unsigned long long) ti->begin,
			       (unsigned long long) ti->len);
	}

	/*
	 * Verify that the zoned model and zone sectors, as determined before
	 * any .io_hints override, are the same across all devices in the table.
	 * - this is especially relevant if .io_hints is emulating a disk-managed
	 *   zoned model (aka BLK_ZONED_NONE) on host-managed zoned block devices.
	 * BUT...
	 */
	if (limits->zoned != BLK_ZONED_NONE) {
		/*
		 * ...IF the above limits stacking determined a zoned model
		 * validate that all of the table's devices conform to it.
		 */
		zoned_model = limits->zoned;
		zone_sectors = limits->chunk_sectors;
	}
	if (validate_hardware_zoned_model(table, zoned_model, zone_sectors))
		return -EINVAL;

	return validate_hardware_logical_block_alignment(table, limits);
}

/*
 * Verify that all devices have an integrity profile that matches the
 * DM device's registered integrity profile.  If the profiles don't
 * match then unregister the DM device's integrity profile.
 */
static void dm_table_verify_integrity(struct dm_table *t)
{
	struct gendisk *template_disk = NULL;

	if (t->integrity_added)
		return;

	if (t->integrity_supported) {
		/*
		 * Verify that the original integrity profile
		 * matches all the devices in this table.
		 */
		template_disk = dm_table_get_integrity_disk(t);
		if (template_disk &&
		    blk_integrity_compare(dm_disk(t->md), template_disk) >= 0)
			return;
	}

	if (integrity_profile_exists(dm_disk(t->md))) {
		DMWARN("%s: unable to establish an integrity profile",
		       dm_device_name(t->md));
		blk_integrity_unregister(dm_disk(t->md));
	}
}

static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
				sector_t start, sector_t len, void *data)
{
	unsigned long flush = (unsigned long) data;
	struct request_queue *q = bdev_get_queue(dev->bdev);

	return q && (q->queue_flags & flush);
}

static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
{
	struct dm_target *ti;
	unsigned i;

	/*
	 * Require at least one underlying device to support flushes.
	 * t->devices includes internal dm devices such as mirror logs
	 * so we need to use iterate_devices here, which targets
	 * supporting flushes must provide.
	 */
	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		ti = dm_table_get_target(t, i);

		if (!ti->num_flush_bios)
			continue;

		if (ti->flush_supported)
			return true;

		if (ti->type->iterate_devices &&
		    ti->type->iterate_devices(ti, device_flush_capable, (void *) flush))
			return true;
	}

	return false;
}

static int device_dax_write_cache_enabled(struct dm_target *ti,
					  struct dm_dev *dev, sector_t start,
					  sector_t len, void *data)
{
	struct dax_device *dax_dev = dev->dax_dev;

	if (!dax_dev)
		return false;

	if (dax_write_cache_enabled(dax_dev))
		return true;
	return false;
}

static int dm_table_supports_dax_write_cache(struct dm_table *t)
{
	struct dm_target *ti;
	unsigned i;

	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		ti = dm_table_get_target(t, i);

		if (ti->type->iterate_devices &&
		    ti->type->iterate_devices(ti,
				device_dax_write_cache_enabled, NULL))
			return true;
	}

	return false;
}

static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
			    sector_t start, sector_t len, void *data)
{
	struct request_queue *q = bdev_get_queue(dev->bdev);

	return q && blk_queue_nonrot(q);
}

static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
				sector_t start, sector_t len, void *data)
{
	struct request_queue *q = bdev_get_queue(dev->bdev);

	return q && !blk_queue_add_random(q);
}

static bool dm_table_all_devices_attribute(struct dm_table *t,
					   iterate_devices_callout_fn func)
{
	struct dm_target *ti;
	unsigned i;

	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		ti = dm_table_get_target(t, i);

		if (!ti->type->iterate_devices ||
		    !ti->type->iterate_devices(ti, func, NULL))
			return false;
	}

	return true;
}

static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
					 sector_t start, sector_t len, void *data)
{
	struct request_queue *q = bdev_get_queue(dev->bdev);

	return q && !q->limits.max_write_same_sectors;
}

static bool dm_table_supports_write_same(struct dm_table *t)
{
	struct dm_target *ti;
	unsigned i;

	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		ti = dm_table_get_target(t, i);

		if (!ti->num_write_same_bios)
			return false;

		if (!ti->type->iterate_devices ||
		    ti->type->iterate_devices(ti, device_not_write_same_capable, NULL))
			return false;
	}

	return true;
}

static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev,
					   sector_t start, sector_t len, void *data)
{
	struct request_queue *q = bdev_get_queue(dev->bdev);

	return q && !q->limits.max_write_zeroes_sectors;
}

static bool dm_table_supports_write_zeroes(struct dm_table *t)
{
	struct dm_target *ti;
	unsigned i = 0;

	while (i < dm_table_get_num_targets(t)) {
		ti = dm_table_get_target(t, i++);

		if (!ti->num_write_zeroes_bios)
			return false;

		if (!ti->type->iterate_devices ||
		    ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL))
			return false;
	}

	return true;
}

static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev,
				     sector_t start, sector_t len, void *data)
{
	struct request_queue *q = bdev_get_queue(dev->bdev);

	return q && !blk_queue_nowait(q);
}

static bool dm_table_supports_nowait(struct dm_table *t)
{
	struct dm_target *ti;
	unsigned i = 0;

	while (i < dm_table_get_num_targets(t)) {
		ti = dm_table_get_target(t, i++);

		if (!dm_target_supports_nowait(ti->type))
			return false;

		if (!ti->type->iterate_devices ||
		    ti->type->iterate_devices(ti, device_not_nowait_capable, NULL))
			return false;
	}

	return true;
}

static int device_not_discard_capable(struct dm_target *ti, struct dm_dev *dev,
				      sector_t start, sector_t len, void *data)
{
	struct request_queue *q = bdev_get_queue(dev->bdev);

	return q && !blk_queue_discard(q);
}

static bool dm_table_supports_discards(struct dm_table *t)
{
	struct dm_target *ti;
	unsigned i;

	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		ti = dm_table_get_target(t, i);

		if (!ti->num_discard_bios)
			return false;

		/*
		 * Either the target provides discard support (as implied by setting
		 * 'discards_supported') or it relies on _all_ data devices having
		 * discard support.
		 */
		if (!ti->discards_supported &&
		    (!ti->type->iterate_devices ||
		     ti->type->iterate_devices(ti, device_not_discard_capable, NULL)))
			return false;
	}

	return true;
}

static int device_not_secure_erase_capable(struct dm_target *ti,
					   struct dm_dev *dev, sector_t start,
					   sector_t len, void *data)
{
	struct request_queue *q = bdev_get_queue(dev->bdev);

	return q && !blk_queue_secure_erase(q);
}

static bool dm_table_supports_secure_erase(struct dm_table *t)
{
	struct dm_target *ti;
	unsigned int i;

	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		ti = dm_table_get_target(t, i);

		if (!ti->num_secure_erase_bios)
			return false;

		if (!ti->type->iterate_devices ||
		    ti->type->iterate_devices(ti, device_not_secure_erase_capable, NULL))
			return false;
	}

	return true;
}

static int device_requires_stable_pages(struct dm_target *ti,
					struct dm_dev *dev, sector_t start,
					sector_t len, void *data)
{
	struct request_queue *q = bdev_get_queue(dev->bdev);

	return q && blk_queue_stable_writes(q);
}

/*
 * If any underlying device requires stable pages, a table must require
 * them as well.  Only targets that support iterate_devices are considered:
 * don't want error, zero, etc to require stable pages.
 */
static bool dm_table_requires_stable_pages(struct dm_table *t)
{
	struct dm_target *ti;
	unsigned i;

	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		ti = dm_table_get_target(t, i);

		if (ti->type->iterate_devices &&
		    ti->type->iterate_devices(ti, device_requires_stable_pages, NULL))
			return true;
	}

	return false;
}

void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
			       struct queue_limits *limits)
{
	bool wc = false, fua = false;
	int page_size = PAGE_SIZE;

	/*
	 * Copy table's limits to the DM device's request_queue
	 */
	q->limits = *limits;

	if (dm_table_supports_nowait(t))
		blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
	else
		blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q);

	if (!dm_table_supports_discards(t)) {
		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
		/* Must also clear discard limits... */
		q->limits.max_discard_sectors = 0;
		q->limits.max_hw_discard_sectors = 0;
		q->limits.discard_granularity = 0;
		q->limits.discard_alignment = 0;
		q->limits.discard_misaligned = 0;
	} else
		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);

	if (dm_table_supports_secure_erase(t))
		blk_queue_flag_set(QUEUE_FLAG_SECERASE, q);

	if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
		wc = true;
		if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA)))
			fua = true;
	}
	blk_queue_write_cache(q, wc, fua);

	if (dm_table_supports_dax(t, device_supports_dax, &page_size)) {
		blk_queue_flag_set(QUEUE_FLAG_DAX, q);
		if (dm_table_supports_dax(t, device_dax_synchronous, NULL))
			set_dax_synchronous(t->md->dax_dev);
	}
	else
		blk_queue_flag_clear(QUEUE_FLAG_DAX, q);

	if (dm_table_supports_dax_write_cache(t))
		dax_write_cache(t->md->dax_dev, true);

	/* Ensure that all underlying devices are non-rotational. */
	if (dm_table_all_devices_attribute(t, device_is_nonrot))
		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
	else
		blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);

	if (!dm_table_supports_write_same(t))
		q->limits.max_write_same_sectors = 0;
	if (!dm_table_supports_write_zeroes(t))
		q->limits.max_write_zeroes_sectors = 0;

	dm_table_verify_integrity(t);

	/*
	 * Some devices don't use blk_integrity but still want stable pages
	 * because they do their own checksumming.
	 */
	if (dm_table_requires_stable_pages(t))
		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
	else
		blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);

	/*
	 * Determine whether or not this queue's I/O timings contribute
	 * to the entropy pool.  Only request-based targets use this.
	 * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not
	 * have it set.
	 */
	if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random))
		blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);

	/*
	 * For a zoned target, the number of zones should be updated for the
	 * correct value to be exposed in sysfs queue/nr_zones. For a BIO based
	 * target, this is all that is needed.
	 */
#ifdef CONFIG_BLK_DEV_ZONED
	if (blk_queue_is_zoned(q)) {
		WARN_ON_ONCE(queue_is_mq(q));
		q->nr_zones = blkdev_nr_zones(t->md->disk);
	}
#endif

	blk_queue_update_readahead(q);
}

unsigned int dm_table_get_num_targets(struct dm_table *t)
{
	return t->num_targets;
}

struct list_head *dm_table_get_devices(struct dm_table *t)
{
	return &t->devices;
}

fmode_t dm_table_get_mode(struct dm_table *t)
{
	return t->mode;
}
EXPORT_SYMBOL(dm_table_get_mode);

enum suspend_mode {
	PRESUSPEND,
	PRESUSPEND_UNDO,
	POSTSUSPEND,
};

static void suspend_targets(struct dm_table *t, enum suspend_mode mode)
{
	int i = t->num_targets;
	struct dm_target *ti = t->targets;

	lockdep_assert_held(&t->md->suspend_lock);

	while (i--) {
		switch (mode) {
		case PRESUSPEND:
			if (ti->type->presuspend)
				ti->type->presuspend(ti);
			break;
		case PRESUSPEND_UNDO:
			if (ti->type->presuspend_undo)
				ti->type->presuspend_undo(ti);
			break;
		case POSTSUSPEND:
			if (ti->type->postsuspend)
				ti->type->postsuspend(ti);
			break;
		}
		ti++;
	}
}

void dm_table_presuspend_targets(struct dm_table *t)
{
	if (!t)
		return;

	suspend_targets(t, PRESUSPEND);
}

void dm_table_presuspend_undo_targets(struct dm_table *t)
{
	if (!t)
		return;

	suspend_targets(t, PRESUSPEND_UNDO);
}

void dm_table_postsuspend_targets(struct dm_table *t)
{
	if (!t)
		return;

	suspend_targets(t, POSTSUSPEND);
}

int dm_table_resume_targets(struct dm_table *t)
{
	int i, r = 0;

	lockdep_assert_held(&t->md->suspend_lock);

	for (i = 0; i < t->num_targets; i++) {
		struct dm_target *ti = t->targets + i;

		if (!ti->type->preresume)
			continue;

		r = ti->type->preresume(ti);
		if (r) {
			DMERR("%s: %s: preresume failed, error = %d",
			      dm_device_name(t->md), ti->type->name, r);
			return r;
		}
	}

	for (i = 0; i < t->num_targets; i++) {
		struct dm_target *ti = t->targets + i;

		if (ti->type->resume)
			ti->type->resume(ti);
	}

	return 0;
}

struct mapped_device *dm_table_get_md(struct dm_table *t)
{
	return t->md;
}
EXPORT_SYMBOL(dm_table_get_md);

const char *dm_table_device_name(struct dm_table *t)
{
	return dm_device_name(t->md);
}
EXPORT_SYMBOL_GPL(dm_table_device_name);

void dm_table_run_md_queue_async(struct dm_table *t)
{
	if (!dm_table_request_based(t))
		return;

	if (t->md->queue)
		blk_mq_run_hw_queues(t->md->queue, true);
}
EXPORT_SYMBOL(dm_table_run_md_queue_async);