// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 */

#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <trace/events/block.h>
#include "nvme.h"

static bool multipath = true;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_unfreeze_queue(h->disk->queue);
}

void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_freeze_queue_wait(h->disk->queue);
}

void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_freeze_queue_start(h->disk->queue);
}

/*
 * If multipathing is enabled we need to always use the subsystem instance
 * number for numbering our devices to avoid conflicts between subsystems that
 * have multiple controllers and thus use the multipath-aware subsystem node
 * and those that have a single controller and use the controller node
 * directly.
 */
bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags)
{
	if (!multipath)
		return false;
	if (!ns->head->disk) {
		sprintf(disk_name, "nvme%dn%d", ns->ctrl->subsys->instance,
			ns->head->instance);
		return true;
	}
	sprintf(disk_name, "nvme%dc%dn%d", ns->ctrl->subsys->instance,
		ns->ctrl->instance, ns->head->instance);
	*flags = GENHD_FL_HIDDEN;
	return true;
}

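/*
 * Requeue a request that failed on one path so it can be retried on another.
 * The bios are pointed back at the multipath node and parked on the ns_head
 * requeue list; the original request is completed successfully so the block
 * layer does not see the path error.
 */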
void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & 0x7ff;
	unsigned long flags;
	struct bio *bio;

	nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but not
	 * ready to serve this namespace.  Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	for (bio = req->bio; bio; bio = bio->bi_next)
		bio_set_dev(bio, ns->head->disk->part0);
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	blk_mq_end_request(req, 0);
	kblockd_schedule_work(&ns->head->requeue_work);
}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (ns->head->disk)
			kblockd_schedule_work(&ns->head->requeue_work);
	}
	up_read(&ctrl->namespaces_rwsem);
}

static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};

bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	bool changed = false;
	int node;

	if (!head)
		goto out;

	for_each_node(node) {
		if (ns == rcu_access_pointer(head->current_path[node])) {
			rcu_assign_pointer(head->current_path[node], NULL);
			changed = true;
		}
	}
out:
	return changed;
}

void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	mutex_lock(&ctrl->scan_lock);
	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		if (nvme_mpath_clear_current_path(ns))
			kblockd_schedule_work(&ns->head->requeue_work);
	up_read(&ctrl->namespaces_rwsem);
	mutex_unlock(&ctrl->scan_lock);
}

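/*
 * Path selection: __nvme_find_path() picks the best usable path for a given
 * NUMA node based on ANA state (and NUMA distance when the "numa" iopolicy
 * is active), while nvme_round_robin_path() rotates between usable paths.
 * The chosen path is cached per node in head->current_path[].
 */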
static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
	/*
	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
	 * still be able to complete assuming that the controller is connected.
	 * Otherwise it will fail immediately and return to the requeue list.
	 */
	if (ns->ctrl->state != NVME_CTRL_LIVE &&
	    ns->ctrl->state != NVME_CTRL_DELETING)
		return true;
	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
	    test_bit(NVME_NS_REMOVING, &ns->flags))
		return true;
	return false;
}

static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
	struct nvme_ns *found = NULL, *fallback = NULL, *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
			distance = node_distance(node, ns->ctrl->numa_node);
		else
			distance = LOCAL_DISTANCE;

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (distance < found_distance) {
				found_distance = distance;
				found = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (distance < fallback_distance) {
				fallback_distance = distance;
				fallback = ns;
			}
			break;
		default:
			break;
		}
	}

	if (!found)
		found = fallback;
	if (found)
		rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
		struct nvme_ns *ns)
{
	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
			siblings);
	if (ns)
		return ns;
	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}

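/*
 * Round-robin path selection: start from the path last used for this node
 * and pick the next usable path, preferring optimized paths over
 * non-optimized ones.
 */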
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
		int node, struct nvme_ns *old)
{
	struct nvme_ns *ns, *found = NULL;

	if (list_is_singular(&head->list)) {
		if (nvme_path_is_disabled(old))
			return NULL;
		return old;
	}

	for (ns = nvme_next_ns(head, old);
	     ns && ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;
			goto out;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			found = ns;
	}

	/*
	 * The loop above skips the current path for round-robin semantics.
	 * Fall back to the current path if either:
	 *  - no other optimized path found and current is optimized,
	 *  - no other usable path found and current is usable.
	 */
	if (!nvme_path_is_disabled(old) &&
	    (old->ana_state == NVME_ANA_OPTIMIZED ||
	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
		return old;

	if (!found)
		return NULL;
out:
	rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
	return ns->ctrl->state == NVME_CTRL_LIVE &&
		ns->ana_state == NVME_ANA_OPTIMIZED;
}

inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	int node = numa_node_id();
	struct nvme_ns *ns;

	ns = srcu_dereference(head->current_path[node], &head->srcu);
	if (unlikely(!ns))
		return __nvme_find_path(head, node);

	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
		return nvme_round_robin_path(head, node, ns);
	if (unlikely(!nvme_path_is_optimized(ns)))
		return __nvme_find_path(head, node);
	return ns;
}

static bool nvme_available_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
			continue;
		switch (ns->ctrl->state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			/* fallthru */
			return true;
		default:
			break;
		}
	}
	return false;
}

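/*
 * submit_bio entry point for the multipath node.  Pick a usable path under
 * the SRCU read lock and resubmit the bio to that path's queue.  If no path
 * is currently usable but one may become available, park the bio on the
 * requeue list; otherwise fail it.
 */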
static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
{
	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	blk_qc_t ret = BLK_QC_T_NONE;
	int srcu_idx;

	/*
	 * The namespace might be going away and the bio might be moved to a
	 * different queue via blk_steal_bios(), so we need to use the bio_split
	 * pool from the original queue to allocate the bvecs from.
	 */
	blk_queue_split(&bio);

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio_set_dev(bio, ns->disk->part0);
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		ret = submit_bio_noacct(bio);
	} else if (nvme_available_path(head)) {
		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no available path - failing I/O\n");

		bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}

static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
{
	if (!nvme_tryget_ns_head(bdev->bd_disk->private_data))
		return -ENXIO;
	return 0;
}

static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
{
	nvme_put_ns_head(disk->private_data);
}

#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (ns)
		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}
#else
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */

const struct block_device_operations nvme_ns_head_ops = {
	.owner		= THIS_MODULE,
	.submit_bio	= nvme_ns_head_submit_bio,
	.open		= nvme_ns_head_open,
	.release	= nvme_ns_head_release,
	.ioctl		= nvme_ns_head_ioctl,
	.getgeo		= nvme_getgeo,
	.report_zones	= nvme_ns_head_report_zones,
	.pr_ops		= &nvme_pr_ops,
};

static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
{
	return container_of(cdev, struct nvme_ns_head, cdev);
}

static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
{
	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
		return -ENXIO;
	return 0;
}

static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
{
	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
	return 0;
}

static const struct file_operations nvme_ns_head_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_head_chr_open,
	.release	= nvme_ns_head_chr_release,
	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
};

static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
{
	int ret;

	head->cdev_device.parent = &head->subsys->dev;
	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
			   head->subsys->instance, head->instance);
	if (ret)
		return ret;
	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
			    &nvme_ns_head_chr_fops, THIS_MODULE);
	if (ret)
		kfree_const(head->cdev_device.kobj.name);
	return ret;
}

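/*
 * Flush the requeue list.  The bios were redirected to the multipath node
 * (either at submission time or by nvme_failover_req()), so resubmitting
 * them here goes through path selection again.
 */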
static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_next;
		bio->bi_next = NULL;

		submit_bio_noacct(bio);
	}
}

int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	bool vwc = false;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);

	/*
	 * Add a multipath node if the subsystem supports multiple controllers.
	 * We also do this for private namespaces as the namespace sharing
	 * information could change after a rescan.
	 */
	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath)
		return 0;

	head->disk = blk_alloc_disk(ctrl->numa_node);
	if (!head->disk)
		return -ENOMEM;
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);

	blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
	/* set to a default value of 512 until the disk is validated */
	blk_queue_logical_block_size(head->disk->queue, 512);
	blk_set_stacking_limits(&head->disk->queue->limits);

	/* we need to propagate up the VWC settings */
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		vwc = true;
	blk_queue_write_cache(head->disk->queue, vwc, vwc);
	return 0;
}

static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;

	if (!head->disk)
		return;

	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		device_add_disk(&head->subsys->dev, head->disk,
				nvme_ns_id_attr_groups);
		nvme_add_ns_head_cdev(head);
	}

	mutex_lock(&head->lock);
	if (nvme_path_is_optimized(ns)) {
		int node, srcu_idx;

		srcu_idx = srcu_read_lock(&head->srcu);
		for_each_node(node)
			__nvme_find_path(head, node);
		srcu_read_unlock(&head->srcu, srcu_idx);
	}
	mutex_unlock(&head->lock);

	synchronize_srcu(&head->srcu);
	kblockd_schedule_work(&head->requeue_work);
}

static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
			  void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids;
		size_t nsid_buf_size;

		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;

		nr_nsids = le32_to_cpu(desc->nnsids);
		nsid_buf_size = nr_nsids * sizeof(__le32);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
	}

	return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

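/*
 * Apply an ANA group descriptor to a namespace: record the group id and
 * state, and bring the path online if the new state can serve I/O.
 */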
static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);

	if (nvme_state_is_live(ns->ana_state))
		nvme_mpath_set_live(ns);
}

static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		unsigned nsid = le32_to_cpu(desc->nsids[n]);

		if (ns->head->ns_id < nsid)
			continue;
		if (ns->head->ns_id == nsid)
			nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
	}
	up_read(&ctrl->namespaces_rwsem);
	return 0;
}

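/*
 * Read the ANA log page from the controller and update the ANA state of all
 * namespaces.  If any group reports the "change" state, (re)arm the ANATT
 * timer so a stuck transition eventually triggers a controller reset.
 */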
static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
				   nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might
	 * enter the change state at different times.  But that is a lot of
	 * overhead just to protect against a target that keeps entering new
	 * change states while never finishing previous ones.  As we'll still
	 * eventually time out once all groups are in change state, this isn't
	 * a big deal.
	 *
	 * We also double the ANATT value to provide some slack for transports
	 * or AEN processing overhead.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		del_timer_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}

static void nvme_ana_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

	if (ctrl->state != NVME_CTRL_LIVE)
		return;

	nvme_read_ana_log(ctrl);
}

static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	del_timer_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}

#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)

static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",
};

static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);

	return sysfs_emit(buf, "%s\n",
			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}

static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);
	int i;

	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
			WRITE_ONCE(subsys->iopolicy, i);
			return count;
		}
	}

	return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	struct nvme_ana_group_desc *dst = data;

	if (desc->grpid != dst->grpid)
		return 0;

	*dst = *desc;
	return -ENXIO; /* just break out of the loop */
}

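/*
 * Called when a namespace is added to a controller.  When ANA is in use,
 * look up the namespace's group descriptor in the cached log to initialize
 * its ANA state (or trigger a log re-read if the group isn't there yet);
 * otherwise treat the single path as optimized right away.
 */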
void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		struct nvme_ana_group_desc desc = {
			.grpid = id->anagrpid,
			.state = 0,
		};

		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(id->anagrpid);
		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
		mutex_unlock(&ns->ctrl->ana_lock);
		if (desc.state) {
			/* found the group desc: update */
			nvme_update_ns_ana_state(&desc, ns);
		} else {
			/* group desc not found: trigger a re-read */
			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
			queue_work(nvme_wq, &ns->ctrl->ana_work);
		}
	} else {
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
	}

	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
				   ns->head->disk->queue);
#ifdef CONFIG_BLK_DEV_ZONED
	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
		ns->head->disk->queue->nr_zones = ns->queue->nr_zones;
#endif
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	if (head->disk->flags & GENHD_FL_UP) {
		nvme_cdev_del(&head->cdev, &head->cdev_device);
		del_gendisk(head->disk);
	}
	blk_set_queue_dying(head->disk->queue);
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	blk_cleanup_disk(head->disk);
}

void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
{
	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
}

int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
	size_t ana_log_size;
	int error = 0;

	/* check if multipath is enabled and we have the capability */
	if (!multipath || !ctrl->subsys ||
	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
		return 0;

	if (!ctrl->max_namespaces ||
	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
		dev_err(ctrl->device,
			"Invalid MNAN value %u\n", ctrl->max_namespaces);
		return -EINVAL;
	}

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
		ctrl->max_namespaces * sizeof(__le32);
	if (ana_log_size > max_transfer_size) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%zd).\n",
			ana_log_size, max_transfer_size);
		dev_err(ctrl->device, "disabling ANA support.\n");
		goto out_uninit;
	}
	if (ana_log_size > ctrl->ana_log_size) {
		nvme_mpath_stop(ctrl);
		kfree(ctrl->ana_log_buf);
		ctrl->ana_log_buf = kmalloc(ana_log_size, GFP_KERNEL);
		if (!ctrl->ana_log_buf)
			return -ENOMEM;
	}
	ctrl->ana_log_size = ana_log_size;
	error = nvme_read_ana_log(ctrl);
	if (error)
		goto out_uninit;
	return 0;

out_uninit:
	nvme_mpath_uninit(ctrl);
	return error;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
}