1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2017-2018 Christoph Hellwig. 4 */ 5 6 #include <linux/backing-dev.h> 7 #include <linux/moduleparam.h> 8 #include <trace/events/block.h> 9 #include "nvme.h" 10 11 static bool multipath = true; 12 module_param(multipath, bool, 0444); 13 MODULE_PARM_DESC(multipath, 14 "turn on native support for multiple controllers per subsystem"); 15 16 void nvme_mpath_unfreeze(struct nvme_subsystem *subsys) 17 { 18 struct nvme_ns_head *h; 19 20 lockdep_assert_held(&subsys->lock); 21 list_for_each_entry(h, &subsys->nsheads, entry) 22 if (h->disk) 23 blk_mq_unfreeze_queue(h->disk->queue); 24 } 25 26 void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys) 27 { 28 struct nvme_ns_head *h; 29 30 lockdep_assert_held(&subsys->lock); 31 list_for_each_entry(h, &subsys->nsheads, entry) 32 if (h->disk) 33 blk_mq_freeze_queue_wait(h->disk->queue); 34 } 35 36 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) 37 { 38 struct nvme_ns_head *h; 39 40 lockdep_assert_held(&subsys->lock); 41 list_for_each_entry(h, &subsys->nsheads, entry) 42 if (h->disk) 43 blk_freeze_queue_start(h->disk->queue); 44 } 45 46 /* 47 * If multipathing is enabled we need to always use the subsystem instance 48 * number for numbering our devices to avoid conflicts between subsystems that 49 * have multiple controllers and thus use the multipath-aware subsystem node 50 * and those that have a single controller and use the controller node 51 * directly. 52 */ 53 bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags) 54 { 55 if (!multipath) 56 return false; 57 if (!ns->head->disk) { 58 sprintf(disk_name, "nvme%dn%d", ns->ctrl->subsys->instance, 59 ns->head->instance); 60 return true; 61 } 62 sprintf(disk_name, "nvme%dc%dn%d", ns->ctrl->subsys->instance, 63 ns->ctrl->instance, ns->head->instance); 64 *flags = GENHD_FL_HIDDEN; 65 return true; 66 } 67 68 void nvme_failover_req(struct request *req) 69 { 70 struct nvme_ns *ns = req->q->queuedata; 71 u16 status = nvme_req(req)->status & 0x7ff; 72 unsigned long flags; 73 struct bio *bio; 74 75 nvme_mpath_clear_current_path(ns); 76 77 /* 78 * If we got back an ANA error, we know the controller is alive but not 79 * ready to serve this namespace. Kick of a re-read of the ANA 80 * information page, and just try any other available path for now. 81 */ 82 if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) { 83 set_bit(NVME_NS_ANA_PENDING, &ns->flags); 84 queue_work(nvme_wq, &ns->ctrl->ana_work); 85 } 86 87 spin_lock_irqsave(&ns->head->requeue_lock, flags); 88 for (bio = req->bio; bio; bio = bio->bi_next) 89 bio_set_dev(bio, ns->head->disk->part0); 90 blk_steal_bios(&ns->head->requeue_list, req); 91 spin_unlock_irqrestore(&ns->head->requeue_lock, flags); 92 93 blk_mq_end_request(req, 0); 94 kblockd_schedule_work(&ns->head->requeue_work); 95 } 96 97 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) 98 { 99 struct nvme_ns *ns; 100 101 down_read(&ctrl->namespaces_rwsem); 102 list_for_each_entry(ns, &ctrl->namespaces, list) { 103 if (ns->head->disk) 104 kblockd_schedule_work(&ns->head->requeue_work); 105 } 106 up_read(&ctrl->namespaces_rwsem); 107 } 108 109 static const char *nvme_ana_state_names[] = { 110 [0] = "invalid state", 111 [NVME_ANA_OPTIMIZED] = "optimized", 112 [NVME_ANA_NONOPTIMIZED] = "non-optimized", 113 [NVME_ANA_INACCESSIBLE] = "inaccessible", 114 [NVME_ANA_PERSISTENT_LOSS] = "persistent-loss", 115 [NVME_ANA_CHANGE] = "change", 116 }; 117 118 bool nvme_mpath_clear_current_path(struct nvme_ns *ns) 119 { 120 struct nvme_ns_head *head = ns->head; 121 bool changed = false; 122 int node; 123 124 if (!head) 125 goto out; 126 127 for_each_node(node) { 128 if (ns == rcu_access_pointer(head->current_path[node])) { 129 rcu_assign_pointer(head->current_path[node], NULL); 130 changed = true; 131 } 132 } 133 out: 134 return changed; 135 } 136 137 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) 138 { 139 struct nvme_ns *ns; 140 141 mutex_lock(&ctrl->scan_lock); 142 down_read(&ctrl->namespaces_rwsem); 143 list_for_each_entry(ns, &ctrl->namespaces, list) 144 if (nvme_mpath_clear_current_path(ns)) 145 kblockd_schedule_work(&ns->head->requeue_work); 146 up_read(&ctrl->namespaces_rwsem); 147 mutex_unlock(&ctrl->scan_lock); 148 } 149 150 static bool nvme_path_is_disabled(struct nvme_ns *ns) 151 { 152 /* 153 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should 154 * still be able to complete assuming that the controller is connected. 155 * Otherwise it will fail immediately and return to the requeue list. 156 */ 157 if (ns->ctrl->state != NVME_CTRL_LIVE && 158 ns->ctrl->state != NVME_CTRL_DELETING) 159 return true; 160 if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) || 161 test_bit(NVME_NS_REMOVING, &ns->flags)) 162 return true; 163 return false; 164 } 165 166 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) 167 { 168 int found_distance = INT_MAX, fallback_distance = INT_MAX, distance; 169 struct nvme_ns *found = NULL, *fallback = NULL, *ns; 170 171 list_for_each_entry_rcu(ns, &head->list, siblings) { 172 if (nvme_path_is_disabled(ns)) 173 continue; 174 175 if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA) 176 distance = node_distance(node, ns->ctrl->numa_node); 177 else 178 distance = LOCAL_DISTANCE; 179 180 switch (ns->ana_state) { 181 case NVME_ANA_OPTIMIZED: 182 if (distance < found_distance) { 183 found_distance = distance; 184 found = ns; 185 } 186 break; 187 case NVME_ANA_NONOPTIMIZED: 188 if (distance < fallback_distance) { 189 fallback_distance = distance; 190 fallback = ns; 191 } 192 break; 193 default: 194 break; 195 } 196 } 197 198 if (!found) 199 found = fallback; 200 if (found) 201 rcu_assign_pointer(head->current_path[node], found); 202 return found; 203 } 204 205 static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head, 206 struct nvme_ns *ns) 207 { 208 ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns, 209 siblings); 210 if (ns) 211 return ns; 212 return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings); 213 } 214 215 static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, 216 int node, struct nvme_ns *old) 217 { 218 struct nvme_ns *ns, *found = NULL; 219 220 if (list_is_singular(&head->list)) { 221 if (nvme_path_is_disabled(old)) 222 return NULL; 223 return old; 224 } 225 226 for (ns = nvme_next_ns(head, old); 227 ns && ns != old; 228 ns = nvme_next_ns(head, ns)) { 229 if (nvme_path_is_disabled(ns)) 230 continue; 231 232 if (ns->ana_state == NVME_ANA_OPTIMIZED) { 233 found = ns; 234 goto out; 235 } 236 if (ns->ana_state == NVME_ANA_NONOPTIMIZED) 237 found = ns; 238 } 239 240 /* 241 * The loop above skips the current path for round-robin semantics. 242 * Fall back to the current path if either: 243 * - no other optimized path found and current is optimized, 244 * - no other usable path found and current is usable. 245 */ 246 if (!nvme_path_is_disabled(old) && 247 (old->ana_state == NVME_ANA_OPTIMIZED || 248 (!found && old->ana_state == NVME_ANA_NONOPTIMIZED))) 249 return old; 250 251 if (!found) 252 return NULL; 253 out: 254 rcu_assign_pointer(head->current_path[node], found); 255 return found; 256 } 257 258 static inline bool nvme_path_is_optimized(struct nvme_ns *ns) 259 { 260 return ns->ctrl->state == NVME_CTRL_LIVE && 261 ns->ana_state == NVME_ANA_OPTIMIZED; 262 } 263 264 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) 265 { 266 int node = numa_node_id(); 267 struct nvme_ns *ns; 268 269 ns = srcu_dereference(head->current_path[node], &head->srcu); 270 if (unlikely(!ns)) 271 return __nvme_find_path(head, node); 272 273 if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR) 274 return nvme_round_robin_path(head, node, ns); 275 if (unlikely(!nvme_path_is_optimized(ns))) 276 return __nvme_find_path(head, node); 277 return ns; 278 } 279 280 static bool nvme_available_path(struct nvme_ns_head *head) 281 { 282 struct nvme_ns *ns; 283 284 list_for_each_entry_rcu(ns, &head->list, siblings) { 285 if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags)) 286 continue; 287 switch (ns->ctrl->state) { 288 case NVME_CTRL_LIVE: 289 case NVME_CTRL_RESETTING: 290 case NVME_CTRL_CONNECTING: 291 /* fallthru */ 292 return true; 293 default: 294 break; 295 } 296 } 297 return false; 298 } 299 300 static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) 301 { 302 struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data; 303 struct device *dev = disk_to_dev(head->disk); 304 struct nvme_ns *ns; 305 blk_qc_t ret = BLK_QC_T_NONE; 306 int srcu_idx; 307 308 /* 309 * The namespace might be going away and the bio might be moved to a 310 * different queue via blk_steal_bios(), so we need to use the bio_split 311 * pool from the original queue to allocate the bvecs from. 312 */ 313 blk_queue_split(&bio); 314 315 srcu_idx = srcu_read_lock(&head->srcu); 316 ns = nvme_find_path(head); 317 if (likely(ns)) { 318 bio_set_dev(bio, ns->disk->part0); 319 bio->bi_opf |= REQ_NVME_MPATH; 320 trace_block_bio_remap(bio, disk_devt(ns->head->disk), 321 bio->bi_iter.bi_sector); 322 ret = submit_bio_noacct(bio); 323 } else if (nvme_available_path(head)) { 324 dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n"); 325 326 spin_lock_irq(&head->requeue_lock); 327 bio_list_add(&head->requeue_list, bio); 328 spin_unlock_irq(&head->requeue_lock); 329 } else { 330 dev_warn_ratelimited(dev, "no available path - failing I/O\n"); 331 332 bio->bi_status = BLK_STS_IOERR; 333 bio_endio(bio); 334 } 335 336 srcu_read_unlock(&head->srcu, srcu_idx); 337 return ret; 338 } 339 340 static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) 341 { 342 if (!nvme_tryget_ns_head(bdev->bd_disk->private_data)) 343 return -ENXIO; 344 return 0; 345 } 346 347 static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) 348 { 349 nvme_put_ns_head(disk->private_data); 350 } 351 352 const struct block_device_operations nvme_ns_head_ops = { 353 .owner = THIS_MODULE, 354 .submit_bio = nvme_ns_head_submit_bio, 355 .open = nvme_ns_head_open, 356 .release = nvme_ns_head_release, 357 .ioctl = nvme_ns_head_ioctl, 358 .getgeo = nvme_getgeo, 359 .report_zones = nvme_report_zones, 360 .pr_ops = &nvme_pr_ops, 361 }; 362 363 static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev) 364 { 365 return container_of(cdev, struct nvme_ns_head, cdev); 366 } 367 368 static int nvme_ns_head_chr_open(struct inode *inode, struct file *file) 369 { 370 if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev))) 371 return -ENXIO; 372 return 0; 373 } 374 375 static int nvme_ns_head_chr_release(struct inode *inode, struct file *file) 376 { 377 nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev)); 378 return 0; 379 } 380 381 static const struct file_operations nvme_ns_head_chr_fops = { 382 .owner = THIS_MODULE, 383 .open = nvme_ns_head_chr_open, 384 .release = nvme_ns_head_chr_release, 385 .unlocked_ioctl = nvme_ns_head_chr_ioctl, 386 .compat_ioctl = compat_ptr_ioctl, 387 }; 388 389 static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) 390 { 391 int ret; 392 393 head->cdev_device.parent = &head->subsys->dev; 394 ret = dev_set_name(&head->cdev_device, "ng%dn%d", 395 head->subsys->instance, head->instance); 396 if (ret) 397 return ret; 398 ret = nvme_cdev_add(&head->cdev, &head->cdev_device, 399 &nvme_ns_head_chr_fops, THIS_MODULE); 400 if (ret) 401 kfree_const(head->cdev_device.kobj.name); 402 return ret; 403 } 404 405 static void nvme_requeue_work(struct work_struct *work) 406 { 407 struct nvme_ns_head *head = 408 container_of(work, struct nvme_ns_head, requeue_work); 409 struct bio *bio, *next; 410 411 spin_lock_irq(&head->requeue_lock); 412 next = bio_list_get(&head->requeue_list); 413 spin_unlock_irq(&head->requeue_lock); 414 415 while ((bio = next) != NULL) { 416 next = bio->bi_next; 417 bio->bi_next = NULL; 418 419 /* 420 * Reset disk to the mpath node and resubmit to select a new 421 * path. 422 */ 423 bio_set_dev(bio, head->disk->part0); 424 submit_bio_noacct(bio); 425 } 426 } 427 428 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) 429 { 430 struct request_queue *q; 431 bool vwc = false; 432 433 mutex_init(&head->lock); 434 bio_list_init(&head->requeue_list); 435 spin_lock_init(&head->requeue_lock); 436 INIT_WORK(&head->requeue_work, nvme_requeue_work); 437 438 /* 439 * Add a multipath node if the subsystems supports multiple controllers. 440 * We also do this for private namespaces as the namespace sharing data could 441 * change after a rescan. 442 */ 443 if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath) 444 return 0; 445 446 q = blk_alloc_queue(ctrl->numa_node); 447 if (!q) 448 goto out; 449 blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 450 /* set to a default value for 512 until disk is validated */ 451 blk_queue_logical_block_size(q, 512); 452 blk_set_stacking_limits(&q->limits); 453 454 /* we need to propagate up the VMC settings */ 455 if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) 456 vwc = true; 457 blk_queue_write_cache(q, vwc, vwc); 458 459 head->disk = alloc_disk(0); 460 if (!head->disk) 461 goto out_cleanup_queue; 462 head->disk->fops = &nvme_ns_head_ops; 463 head->disk->private_data = head; 464 head->disk->queue = q; 465 head->disk->flags = GENHD_FL_EXT_DEVT; 466 sprintf(head->disk->disk_name, "nvme%dn%d", 467 ctrl->subsys->instance, head->instance); 468 return 0; 469 470 out_cleanup_queue: 471 blk_cleanup_queue(q); 472 out: 473 return -ENOMEM; 474 } 475 476 static void nvme_mpath_set_live(struct nvme_ns *ns) 477 { 478 struct nvme_ns_head *head = ns->head; 479 480 if (!head->disk) 481 return; 482 483 if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { 484 device_add_disk(&head->subsys->dev, head->disk, 485 nvme_ns_id_attr_groups); 486 nvme_add_ns_head_cdev(head); 487 } 488 489 mutex_lock(&head->lock); 490 if (nvme_path_is_optimized(ns)) { 491 int node, srcu_idx; 492 493 srcu_idx = srcu_read_lock(&head->srcu); 494 for_each_node(node) 495 __nvme_find_path(head, node); 496 srcu_read_unlock(&head->srcu, srcu_idx); 497 } 498 mutex_unlock(&head->lock); 499 500 synchronize_srcu(&head->srcu); 501 kblockd_schedule_work(&head->requeue_work); 502 } 503 504 static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data, 505 int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *, 506 void *)) 507 { 508 void *base = ctrl->ana_log_buf; 509 size_t offset = sizeof(struct nvme_ana_rsp_hdr); 510 int error, i; 511 512 lockdep_assert_held(&ctrl->ana_lock); 513 514 for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) { 515 struct nvme_ana_group_desc *desc = base + offset; 516 u32 nr_nsids; 517 size_t nsid_buf_size; 518 519 if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc))) 520 return -EINVAL; 521 522 nr_nsids = le32_to_cpu(desc->nnsids); 523 nsid_buf_size = nr_nsids * sizeof(__le32); 524 525 if (WARN_ON_ONCE(desc->grpid == 0)) 526 return -EINVAL; 527 if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax)) 528 return -EINVAL; 529 if (WARN_ON_ONCE(desc->state == 0)) 530 return -EINVAL; 531 if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE)) 532 return -EINVAL; 533 534 offset += sizeof(*desc); 535 if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size)) 536 return -EINVAL; 537 538 error = cb(ctrl, desc, data); 539 if (error) 540 return error; 541 542 offset += nsid_buf_size; 543 } 544 545 return 0; 546 } 547 548 static inline bool nvme_state_is_live(enum nvme_ana_state state) 549 { 550 return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED; 551 } 552 553 static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, 554 struct nvme_ns *ns) 555 { 556 ns->ana_grpid = le32_to_cpu(desc->grpid); 557 ns->ana_state = desc->state; 558 clear_bit(NVME_NS_ANA_PENDING, &ns->flags); 559 560 if (nvme_state_is_live(ns->ana_state)) 561 nvme_mpath_set_live(ns); 562 } 563 564 static int nvme_update_ana_state(struct nvme_ctrl *ctrl, 565 struct nvme_ana_group_desc *desc, void *data) 566 { 567 u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0; 568 unsigned *nr_change_groups = data; 569 struct nvme_ns *ns; 570 571 dev_dbg(ctrl->device, "ANA group %d: %s.\n", 572 le32_to_cpu(desc->grpid), 573 nvme_ana_state_names[desc->state]); 574 575 if (desc->state == NVME_ANA_CHANGE) 576 (*nr_change_groups)++; 577 578 if (!nr_nsids) 579 return 0; 580 581 down_read(&ctrl->namespaces_rwsem); 582 list_for_each_entry(ns, &ctrl->namespaces, list) { 583 unsigned nsid = le32_to_cpu(desc->nsids[n]); 584 585 if (ns->head->ns_id < nsid) 586 continue; 587 if (ns->head->ns_id == nsid) 588 nvme_update_ns_ana_state(desc, ns); 589 if (++n == nr_nsids) 590 break; 591 } 592 up_read(&ctrl->namespaces_rwsem); 593 return 0; 594 } 595 596 static int nvme_read_ana_log(struct nvme_ctrl *ctrl) 597 { 598 u32 nr_change_groups = 0; 599 int error; 600 601 mutex_lock(&ctrl->ana_lock); 602 error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM, 603 ctrl->ana_log_buf, ctrl->ana_log_size, 0); 604 if (error) { 605 dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); 606 goto out_unlock; 607 } 608 609 error = nvme_parse_ana_log(ctrl, &nr_change_groups, 610 nvme_update_ana_state); 611 if (error) 612 goto out_unlock; 613 614 /* 615 * In theory we should have an ANATT timer per group as they might enter 616 * the change state at different times. But that is a lot of overhead 617 * just to protect against a target that keeps entering new changes 618 * states while never finishing previous ones. But we'll still 619 * eventually time out once all groups are in change state, so this 620 * isn't a big deal. 621 * 622 * We also double the ANATT value to provide some slack for transports 623 * or AEN processing overhead. 624 */ 625 if (nr_change_groups) 626 mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies); 627 else 628 del_timer_sync(&ctrl->anatt_timer); 629 out_unlock: 630 mutex_unlock(&ctrl->ana_lock); 631 return error; 632 } 633 634 static void nvme_ana_work(struct work_struct *work) 635 { 636 struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work); 637 638 if (ctrl->state != NVME_CTRL_LIVE) 639 return; 640 641 nvme_read_ana_log(ctrl); 642 } 643 644 static void nvme_anatt_timeout(struct timer_list *t) 645 { 646 struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer); 647 648 dev_info(ctrl->device, "ANATT timeout, resetting controller.\n"); 649 nvme_reset_ctrl(ctrl); 650 } 651 652 void nvme_mpath_stop(struct nvme_ctrl *ctrl) 653 { 654 if (!nvme_ctrl_use_ana(ctrl)) 655 return; 656 del_timer_sync(&ctrl->anatt_timer); 657 cancel_work_sync(&ctrl->ana_work); 658 } 659 660 #define SUBSYS_ATTR_RW(_name, _mode, _show, _store) \ 661 struct device_attribute subsys_attr_##_name = \ 662 __ATTR(_name, _mode, _show, _store) 663 664 static const char *nvme_iopolicy_names[] = { 665 [NVME_IOPOLICY_NUMA] = "numa", 666 [NVME_IOPOLICY_RR] = "round-robin", 667 }; 668 669 static ssize_t nvme_subsys_iopolicy_show(struct device *dev, 670 struct device_attribute *attr, char *buf) 671 { 672 struct nvme_subsystem *subsys = 673 container_of(dev, struct nvme_subsystem, dev); 674 675 return sysfs_emit(buf, "%s\n", 676 nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]); 677 } 678 679 static ssize_t nvme_subsys_iopolicy_store(struct device *dev, 680 struct device_attribute *attr, const char *buf, size_t count) 681 { 682 struct nvme_subsystem *subsys = 683 container_of(dev, struct nvme_subsystem, dev); 684 int i; 685 686 for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) { 687 if (sysfs_streq(buf, nvme_iopolicy_names[i])) { 688 WRITE_ONCE(subsys->iopolicy, i); 689 return count; 690 } 691 } 692 693 return -EINVAL; 694 } 695 SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR, 696 nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store); 697 698 static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, 699 char *buf) 700 { 701 return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid); 702 } 703 DEVICE_ATTR_RO(ana_grpid); 704 705 static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, 706 char *buf) 707 { 708 struct nvme_ns *ns = nvme_get_ns_from_dev(dev); 709 710 return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]); 711 } 712 DEVICE_ATTR_RO(ana_state); 713 714 static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, 715 struct nvme_ana_group_desc *desc, void *data) 716 { 717 struct nvme_ana_group_desc *dst = data; 718 719 if (desc->grpid != dst->grpid) 720 return 0; 721 722 *dst = *desc; 723 return -ENXIO; /* just break out of the loop */ 724 } 725 726 void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) 727 { 728 if (nvme_ctrl_use_ana(ns->ctrl)) { 729 struct nvme_ana_group_desc desc = { 730 .grpid = id->anagrpid, 731 .state = 0, 732 }; 733 734 mutex_lock(&ns->ctrl->ana_lock); 735 ns->ana_grpid = le32_to_cpu(id->anagrpid); 736 nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc); 737 mutex_unlock(&ns->ctrl->ana_lock); 738 if (desc.state) { 739 /* found the group desc: update */ 740 nvme_update_ns_ana_state(&desc, ns); 741 } else { 742 /* group desc not found: trigger a re-read */ 743 set_bit(NVME_NS_ANA_PENDING, &ns->flags); 744 queue_work(nvme_wq, &ns->ctrl->ana_work); 745 } 746 } else { 747 ns->ana_state = NVME_ANA_OPTIMIZED; 748 nvme_mpath_set_live(ns); 749 } 750 751 if (blk_queue_stable_writes(ns->queue) && ns->head->disk) 752 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, 753 ns->head->disk->queue); 754 #ifdef CONFIG_BLK_DEV_ZONED 755 if (blk_queue_is_zoned(ns->queue) && ns->head->disk) 756 ns->head->disk->queue->nr_zones = ns->queue->nr_zones; 757 #endif 758 } 759 760 void nvme_mpath_remove_disk(struct nvme_ns_head *head) 761 { 762 if (!head->disk) 763 return; 764 if (head->disk->flags & GENHD_FL_UP) { 765 nvme_cdev_del(&head->cdev, &head->cdev_device); 766 del_gendisk(head->disk); 767 } 768 blk_set_queue_dying(head->disk->queue); 769 /* make sure all pending bios are cleaned up */ 770 kblockd_schedule_work(&head->requeue_work); 771 flush_work(&head->requeue_work); 772 blk_cleanup_queue(head->disk->queue); 773 if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { 774 /* 775 * if device_add_disk wasn't called, prevent 776 * disk release to put a bogus reference on the 777 * request queue 778 */ 779 head->disk->queue = NULL; 780 } 781 put_disk(head->disk); 782 } 783 784 void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl) 785 { 786 mutex_init(&ctrl->ana_lock); 787 timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); 788 INIT_WORK(&ctrl->ana_work, nvme_ana_work); 789 } 790 791 int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) 792 { 793 size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT; 794 size_t ana_log_size; 795 int error = 0; 796 797 /* check if multipath is enabled and we have the capability */ 798 if (!multipath || !ctrl->subsys || 799 !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA)) 800 return 0; 801 802 ctrl->anacap = id->anacap; 803 ctrl->anatt = id->anatt; 804 ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); 805 ctrl->anagrpmax = le32_to_cpu(id->anagrpmax); 806 807 ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + 808 ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) + 809 ctrl->max_namespaces * sizeof(__le32); 810 if (ana_log_size > max_transfer_size) { 811 dev_err(ctrl->device, 812 "ANA log page size (%zd) larger than MDTS (%zd).\n", 813 ana_log_size, max_transfer_size); 814 dev_err(ctrl->device, "disabling ANA support.\n"); 815 goto out_uninit; 816 } 817 if (ana_log_size > ctrl->ana_log_size) { 818 nvme_mpath_stop(ctrl); 819 kfree(ctrl->ana_log_buf); 820 ctrl->ana_log_buf = kmalloc(ana_log_size, GFP_KERNEL); 821 if (!ctrl->ana_log_buf) 822 return -ENOMEM; 823 } 824 ctrl->ana_log_size = ana_log_size; 825 error = nvme_read_ana_log(ctrl); 826 if (error) 827 goto out_uninit; 828 return 0; 829 830 out_uninit: 831 nvme_mpath_uninit(ctrl); 832 return error; 833 } 834 835 void nvme_mpath_uninit(struct nvme_ctrl *ctrl) 836 { 837 kfree(ctrl->ana_log_buf); 838 ctrl->ana_log_buf = NULL; 839 } 840