1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2017-2018 Christoph Hellwig. 4 */ 5 6 #include <linux/backing-dev.h> 7 #include <linux/moduleparam.h> 8 #include <trace/events/block.h> 9 #include "nvme.h" 10 11 static bool multipath = true; 12 module_param(multipath, bool, 0444); 13 MODULE_PARM_DESC(multipath, 14 "turn on native support for multiple controllers per subsystem"); 15 16 void nvme_mpath_unfreeze(struct nvme_subsystem *subsys) 17 { 18 struct nvme_ns_head *h; 19 20 lockdep_assert_held(&subsys->lock); 21 list_for_each_entry(h, &subsys->nsheads, entry) 22 if (h->disk) 23 blk_mq_unfreeze_queue(h->disk->queue); 24 } 25 26 void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys) 27 { 28 struct nvme_ns_head *h; 29 30 lockdep_assert_held(&subsys->lock); 31 list_for_each_entry(h, &subsys->nsheads, entry) 32 if (h->disk) 33 blk_mq_freeze_queue_wait(h->disk->queue); 34 } 35 36 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) 37 { 38 struct nvme_ns_head *h; 39 40 lockdep_assert_held(&subsys->lock); 41 list_for_each_entry(h, &subsys->nsheads, entry) 42 if (h->disk) 43 blk_freeze_queue_start(h->disk->queue); 44 } 45 46 /* 47 * If multipathing is enabled we need to always use the subsystem instance 48 * number for numbering our devices to avoid conflicts between subsystems that 49 * have multiple controllers and thus use the multipath-aware subsystem node 50 * and those that have a single controller and use the controller node 51 * directly. 52 */ 53 bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags) 54 { 55 if (!multipath) 56 return false; 57 if (!ns->head->disk) { 58 sprintf(disk_name, "nvme%dn%d", ns->ctrl->subsys->instance, 59 ns->head->instance); 60 return true; 61 } 62 sprintf(disk_name, "nvme%dc%dn%d", ns->ctrl->subsys->instance, 63 ns->ctrl->instance, ns->head->instance); 64 *flags = GENHD_FL_HIDDEN; 65 return true; 66 } 67 68 void nvme_failover_req(struct request *req) 69 { 70 struct nvme_ns *ns = req->q->queuedata; 71 u16 status = nvme_req(req)->status & 0x7ff; 72 unsigned long flags; 73 74 nvme_mpath_clear_current_path(ns); 75 76 /* 77 * If we got back an ANA error, we know the controller is alive but not 78 * ready to serve this namespace. Kick of a re-read of the ANA 79 * information page, and just try any other available path for now. 80 */ 81 if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) { 82 set_bit(NVME_NS_ANA_PENDING, &ns->flags); 83 queue_work(nvme_wq, &ns->ctrl->ana_work); 84 } 85 86 spin_lock_irqsave(&ns->head->requeue_lock, flags); 87 blk_steal_bios(&ns->head->requeue_list, req); 88 spin_unlock_irqrestore(&ns->head->requeue_lock, flags); 89 90 blk_mq_end_request(req, 0); 91 kblockd_schedule_work(&ns->head->requeue_work); 92 } 93 94 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) 95 { 96 struct nvme_ns *ns; 97 98 down_read(&ctrl->namespaces_rwsem); 99 list_for_each_entry(ns, &ctrl->namespaces, list) { 100 if (ns->head->disk) 101 kblockd_schedule_work(&ns->head->requeue_work); 102 } 103 up_read(&ctrl->namespaces_rwsem); 104 } 105 106 static const char *nvme_ana_state_names[] = { 107 [0] = "invalid state", 108 [NVME_ANA_OPTIMIZED] = "optimized", 109 [NVME_ANA_NONOPTIMIZED] = "non-optimized", 110 [NVME_ANA_INACCESSIBLE] = "inaccessible", 111 [NVME_ANA_PERSISTENT_LOSS] = "persistent-loss", 112 [NVME_ANA_CHANGE] = "change", 113 }; 114 115 bool nvme_mpath_clear_current_path(struct nvme_ns *ns) 116 { 117 struct nvme_ns_head *head = ns->head; 118 bool changed = false; 119 int node; 120 121 if (!head) 122 goto out; 123 124 for_each_node(node) { 125 if (ns == rcu_access_pointer(head->current_path[node])) { 126 rcu_assign_pointer(head->current_path[node], NULL); 127 changed = true; 128 } 129 } 130 out: 131 return changed; 132 } 133 134 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) 135 { 136 struct nvme_ns *ns; 137 138 mutex_lock(&ctrl->scan_lock); 139 down_read(&ctrl->namespaces_rwsem); 140 list_for_each_entry(ns, &ctrl->namespaces, list) 141 if (nvme_mpath_clear_current_path(ns)) 142 kblockd_schedule_work(&ns->head->requeue_work); 143 up_read(&ctrl->namespaces_rwsem); 144 mutex_unlock(&ctrl->scan_lock); 145 } 146 147 static bool nvme_path_is_disabled(struct nvme_ns *ns) 148 { 149 /* 150 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should 151 * still be able to complete assuming that the controller is connected. 152 * Otherwise it will fail immediately and return to the requeue list. 153 */ 154 if (ns->ctrl->state != NVME_CTRL_LIVE && 155 ns->ctrl->state != NVME_CTRL_DELETING) 156 return true; 157 if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) || 158 test_bit(NVME_NS_REMOVING, &ns->flags)) 159 return true; 160 return false; 161 } 162 163 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) 164 { 165 int found_distance = INT_MAX, fallback_distance = INT_MAX, distance; 166 struct nvme_ns *found = NULL, *fallback = NULL, *ns; 167 168 list_for_each_entry_rcu(ns, &head->list, siblings) { 169 if (nvme_path_is_disabled(ns)) 170 continue; 171 172 if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA) 173 distance = node_distance(node, ns->ctrl->numa_node); 174 else 175 distance = LOCAL_DISTANCE; 176 177 switch (ns->ana_state) { 178 case NVME_ANA_OPTIMIZED: 179 if (distance < found_distance) { 180 found_distance = distance; 181 found = ns; 182 } 183 break; 184 case NVME_ANA_NONOPTIMIZED: 185 if (distance < fallback_distance) { 186 fallback_distance = distance; 187 fallback = ns; 188 } 189 break; 190 default: 191 break; 192 } 193 } 194 195 if (!found) 196 found = fallback; 197 if (found) 198 rcu_assign_pointer(head->current_path[node], found); 199 return found; 200 } 201 202 static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head, 203 struct nvme_ns *ns) 204 { 205 ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns, 206 siblings); 207 if (ns) 208 return ns; 209 return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings); 210 } 211 212 static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, 213 int node, struct nvme_ns *old) 214 { 215 struct nvme_ns *ns, *found = NULL; 216 217 if (list_is_singular(&head->list)) { 218 if (nvme_path_is_disabled(old)) 219 return NULL; 220 return old; 221 } 222 223 for (ns = nvme_next_ns(head, old); 224 ns && ns != old; 225 ns = nvme_next_ns(head, ns)) { 226 if (nvme_path_is_disabled(ns)) 227 continue; 228 229 if (ns->ana_state == NVME_ANA_OPTIMIZED) { 230 found = ns; 231 goto out; 232 } 233 if (ns->ana_state == NVME_ANA_NONOPTIMIZED) 234 found = ns; 235 } 236 237 /* 238 * The loop above skips the current path for round-robin semantics. 239 * Fall back to the current path if either: 240 * - no other optimized path found and current is optimized, 241 * - no other usable path found and current is usable. 242 */ 243 if (!nvme_path_is_disabled(old) && 244 (old->ana_state == NVME_ANA_OPTIMIZED || 245 (!found && old->ana_state == NVME_ANA_NONOPTIMIZED))) 246 return old; 247 248 if (!found) 249 return NULL; 250 out: 251 rcu_assign_pointer(head->current_path[node], found); 252 return found; 253 } 254 255 static inline bool nvme_path_is_optimized(struct nvme_ns *ns) 256 { 257 return ns->ctrl->state == NVME_CTRL_LIVE && 258 ns->ana_state == NVME_ANA_OPTIMIZED; 259 } 260 261 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) 262 { 263 int node = numa_node_id(); 264 struct nvme_ns *ns; 265 266 ns = srcu_dereference(head->current_path[node], &head->srcu); 267 if (unlikely(!ns)) 268 return __nvme_find_path(head, node); 269 270 if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR) 271 return nvme_round_robin_path(head, node, ns); 272 if (unlikely(!nvme_path_is_optimized(ns))) 273 return __nvme_find_path(head, node); 274 return ns; 275 } 276 277 static bool nvme_available_path(struct nvme_ns_head *head) 278 { 279 struct nvme_ns *ns; 280 281 list_for_each_entry_rcu(ns, &head->list, siblings) { 282 if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags)) 283 continue; 284 switch (ns->ctrl->state) { 285 case NVME_CTRL_LIVE: 286 case NVME_CTRL_RESETTING: 287 case NVME_CTRL_CONNECTING: 288 /* fallthru */ 289 return true; 290 default: 291 break; 292 } 293 } 294 return false; 295 } 296 297 static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) 298 { 299 struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data; 300 struct device *dev = disk_to_dev(head->disk); 301 struct nvme_ns *ns; 302 blk_qc_t ret = BLK_QC_T_NONE; 303 int srcu_idx; 304 305 /* 306 * The namespace might be going away and the bio might be moved to a 307 * different queue via blk_steal_bios(), so we need to use the bio_split 308 * pool from the original queue to allocate the bvecs from. 309 */ 310 blk_queue_split(&bio); 311 312 srcu_idx = srcu_read_lock(&head->srcu); 313 ns = nvme_find_path(head); 314 if (likely(ns)) { 315 bio_set_dev(bio, ns->disk->part0); 316 bio->bi_opf |= REQ_NVME_MPATH; 317 trace_block_bio_remap(bio, disk_devt(ns->head->disk), 318 bio->bi_iter.bi_sector); 319 ret = submit_bio_noacct(bio); 320 } else if (nvme_available_path(head)) { 321 dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n"); 322 323 spin_lock_irq(&head->requeue_lock); 324 bio_list_add(&head->requeue_list, bio); 325 spin_unlock_irq(&head->requeue_lock); 326 } else { 327 dev_warn_ratelimited(dev, "no available path - failing I/O\n"); 328 329 bio->bi_status = BLK_STS_IOERR; 330 bio_endio(bio); 331 } 332 333 srcu_read_unlock(&head->srcu, srcu_idx); 334 return ret; 335 } 336 337 static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) 338 { 339 if (!nvme_tryget_ns_head(bdev->bd_disk->private_data)) 340 return -ENXIO; 341 return 0; 342 } 343 344 static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) 345 { 346 nvme_put_ns_head(disk->private_data); 347 } 348 349 const struct block_device_operations nvme_ns_head_ops = { 350 .owner = THIS_MODULE, 351 .submit_bio = nvme_ns_head_submit_bio, 352 .open = nvme_ns_head_open, 353 .release = nvme_ns_head_release, 354 .ioctl = nvme_ns_head_ioctl, 355 .getgeo = nvme_getgeo, 356 .report_zones = nvme_report_zones, 357 .pr_ops = &nvme_pr_ops, 358 }; 359 360 static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev) 361 { 362 return container_of(cdev, struct nvme_ns_head, cdev); 363 } 364 365 static int nvme_ns_head_chr_open(struct inode *inode, struct file *file) 366 { 367 if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev))) 368 return -ENXIO; 369 return 0; 370 } 371 372 static int nvme_ns_head_chr_release(struct inode *inode, struct file *file) 373 { 374 nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev)); 375 return 0; 376 } 377 378 static const struct file_operations nvme_ns_head_chr_fops = { 379 .owner = THIS_MODULE, 380 .open = nvme_ns_head_chr_open, 381 .release = nvme_ns_head_chr_release, 382 .unlocked_ioctl = nvme_ns_head_chr_ioctl, 383 .compat_ioctl = compat_ptr_ioctl, 384 }; 385 386 static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) 387 { 388 int ret; 389 390 head->cdev_device.parent = &head->subsys->dev; 391 ret = dev_set_name(&head->cdev_device, "ng%dn%d", 392 head->subsys->instance, head->instance); 393 if (ret) 394 return ret; 395 ret = nvme_cdev_add(&head->cdev, &head->cdev_device, 396 &nvme_ns_head_chr_fops, THIS_MODULE); 397 if (ret) 398 kfree_const(head->cdev_device.kobj.name); 399 return ret; 400 } 401 402 static void nvme_requeue_work(struct work_struct *work) 403 { 404 struct nvme_ns_head *head = 405 container_of(work, struct nvme_ns_head, requeue_work); 406 struct bio *bio, *next; 407 408 spin_lock_irq(&head->requeue_lock); 409 next = bio_list_get(&head->requeue_list); 410 spin_unlock_irq(&head->requeue_lock); 411 412 while ((bio = next) != NULL) { 413 next = bio->bi_next; 414 bio->bi_next = NULL; 415 416 /* 417 * Reset disk to the mpath node and resubmit to select a new 418 * path. 419 */ 420 bio_set_dev(bio, head->disk->part0); 421 submit_bio_noacct(bio); 422 } 423 } 424 425 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) 426 { 427 struct request_queue *q; 428 bool vwc = false; 429 430 mutex_init(&head->lock); 431 bio_list_init(&head->requeue_list); 432 spin_lock_init(&head->requeue_lock); 433 INIT_WORK(&head->requeue_work, nvme_requeue_work); 434 435 /* 436 * Add a multipath node if the subsystems supports multiple controllers. 437 * We also do this for private namespaces as the namespace sharing data could 438 * change after a rescan. 439 */ 440 if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath) 441 return 0; 442 443 q = blk_alloc_queue(ctrl->numa_node); 444 if (!q) 445 goto out; 446 blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 447 /* set to a default value for 512 until disk is validated */ 448 blk_queue_logical_block_size(q, 512); 449 blk_set_stacking_limits(&q->limits); 450 451 /* we need to propagate up the VMC settings */ 452 if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) 453 vwc = true; 454 blk_queue_write_cache(q, vwc, vwc); 455 456 head->disk = alloc_disk(0); 457 if (!head->disk) 458 goto out_cleanup_queue; 459 head->disk->fops = &nvme_ns_head_ops; 460 head->disk->private_data = head; 461 head->disk->queue = q; 462 head->disk->flags = GENHD_FL_EXT_DEVT; 463 sprintf(head->disk->disk_name, "nvme%dn%d", 464 ctrl->subsys->instance, head->instance); 465 return 0; 466 467 out_cleanup_queue: 468 blk_cleanup_queue(q); 469 out: 470 return -ENOMEM; 471 } 472 473 static void nvme_mpath_set_live(struct nvme_ns *ns) 474 { 475 struct nvme_ns_head *head = ns->head; 476 477 if (!head->disk) 478 return; 479 480 if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { 481 device_add_disk(&head->subsys->dev, head->disk, 482 nvme_ns_id_attr_groups); 483 nvme_add_ns_head_cdev(head); 484 } 485 486 mutex_lock(&head->lock); 487 if (nvme_path_is_optimized(ns)) { 488 int node, srcu_idx; 489 490 srcu_idx = srcu_read_lock(&head->srcu); 491 for_each_node(node) 492 __nvme_find_path(head, node); 493 srcu_read_unlock(&head->srcu, srcu_idx); 494 } 495 mutex_unlock(&head->lock); 496 497 synchronize_srcu(&head->srcu); 498 kblockd_schedule_work(&head->requeue_work); 499 } 500 501 static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data, 502 int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *, 503 void *)) 504 { 505 void *base = ctrl->ana_log_buf; 506 size_t offset = sizeof(struct nvme_ana_rsp_hdr); 507 int error, i; 508 509 lockdep_assert_held(&ctrl->ana_lock); 510 511 for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) { 512 struct nvme_ana_group_desc *desc = base + offset; 513 u32 nr_nsids; 514 size_t nsid_buf_size; 515 516 if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc))) 517 return -EINVAL; 518 519 nr_nsids = le32_to_cpu(desc->nnsids); 520 nsid_buf_size = nr_nsids * sizeof(__le32); 521 522 if (WARN_ON_ONCE(desc->grpid == 0)) 523 return -EINVAL; 524 if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax)) 525 return -EINVAL; 526 if (WARN_ON_ONCE(desc->state == 0)) 527 return -EINVAL; 528 if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE)) 529 return -EINVAL; 530 531 offset += sizeof(*desc); 532 if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size)) 533 return -EINVAL; 534 535 error = cb(ctrl, desc, data); 536 if (error) 537 return error; 538 539 offset += nsid_buf_size; 540 } 541 542 return 0; 543 } 544 545 static inline bool nvme_state_is_live(enum nvme_ana_state state) 546 { 547 return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED; 548 } 549 550 static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, 551 struct nvme_ns *ns) 552 { 553 ns->ana_grpid = le32_to_cpu(desc->grpid); 554 ns->ana_state = desc->state; 555 clear_bit(NVME_NS_ANA_PENDING, &ns->flags); 556 557 if (nvme_state_is_live(ns->ana_state)) 558 nvme_mpath_set_live(ns); 559 } 560 561 static int nvme_update_ana_state(struct nvme_ctrl *ctrl, 562 struct nvme_ana_group_desc *desc, void *data) 563 { 564 u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0; 565 unsigned *nr_change_groups = data; 566 struct nvme_ns *ns; 567 568 dev_dbg(ctrl->device, "ANA group %d: %s.\n", 569 le32_to_cpu(desc->grpid), 570 nvme_ana_state_names[desc->state]); 571 572 if (desc->state == NVME_ANA_CHANGE) 573 (*nr_change_groups)++; 574 575 if (!nr_nsids) 576 return 0; 577 578 down_read(&ctrl->namespaces_rwsem); 579 list_for_each_entry(ns, &ctrl->namespaces, list) { 580 unsigned nsid = le32_to_cpu(desc->nsids[n]); 581 582 if (ns->head->ns_id < nsid) 583 continue; 584 if (ns->head->ns_id == nsid) 585 nvme_update_ns_ana_state(desc, ns); 586 if (++n == nr_nsids) 587 break; 588 } 589 up_read(&ctrl->namespaces_rwsem); 590 return 0; 591 } 592 593 static int nvme_read_ana_log(struct nvme_ctrl *ctrl) 594 { 595 u32 nr_change_groups = 0; 596 int error; 597 598 mutex_lock(&ctrl->ana_lock); 599 error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM, 600 ctrl->ana_log_buf, ctrl->ana_log_size, 0); 601 if (error) { 602 dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); 603 goto out_unlock; 604 } 605 606 error = nvme_parse_ana_log(ctrl, &nr_change_groups, 607 nvme_update_ana_state); 608 if (error) 609 goto out_unlock; 610 611 /* 612 * In theory we should have an ANATT timer per group as they might enter 613 * the change state at different times. But that is a lot of overhead 614 * just to protect against a target that keeps entering new changes 615 * states while never finishing previous ones. But we'll still 616 * eventually time out once all groups are in change state, so this 617 * isn't a big deal. 618 * 619 * We also double the ANATT value to provide some slack for transports 620 * or AEN processing overhead. 621 */ 622 if (nr_change_groups) 623 mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies); 624 else 625 del_timer_sync(&ctrl->anatt_timer); 626 out_unlock: 627 mutex_unlock(&ctrl->ana_lock); 628 return error; 629 } 630 631 static void nvme_ana_work(struct work_struct *work) 632 { 633 struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work); 634 635 if (ctrl->state != NVME_CTRL_LIVE) 636 return; 637 638 nvme_read_ana_log(ctrl); 639 } 640 641 static void nvme_anatt_timeout(struct timer_list *t) 642 { 643 struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer); 644 645 dev_info(ctrl->device, "ANATT timeout, resetting controller.\n"); 646 nvme_reset_ctrl(ctrl); 647 } 648 649 void nvme_mpath_stop(struct nvme_ctrl *ctrl) 650 { 651 if (!nvme_ctrl_use_ana(ctrl)) 652 return; 653 del_timer_sync(&ctrl->anatt_timer); 654 cancel_work_sync(&ctrl->ana_work); 655 } 656 657 #define SUBSYS_ATTR_RW(_name, _mode, _show, _store) \ 658 struct device_attribute subsys_attr_##_name = \ 659 __ATTR(_name, _mode, _show, _store) 660 661 static const char *nvme_iopolicy_names[] = { 662 [NVME_IOPOLICY_NUMA] = "numa", 663 [NVME_IOPOLICY_RR] = "round-robin", 664 }; 665 666 static ssize_t nvme_subsys_iopolicy_show(struct device *dev, 667 struct device_attribute *attr, char *buf) 668 { 669 struct nvme_subsystem *subsys = 670 container_of(dev, struct nvme_subsystem, dev); 671 672 return sysfs_emit(buf, "%s\n", 673 nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]); 674 } 675 676 static ssize_t nvme_subsys_iopolicy_store(struct device *dev, 677 struct device_attribute *attr, const char *buf, size_t count) 678 { 679 struct nvme_subsystem *subsys = 680 container_of(dev, struct nvme_subsystem, dev); 681 int i; 682 683 for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) { 684 if (sysfs_streq(buf, nvme_iopolicy_names[i])) { 685 WRITE_ONCE(subsys->iopolicy, i); 686 return count; 687 } 688 } 689 690 return -EINVAL; 691 } 692 SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR, 693 nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store); 694 695 static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, 696 char *buf) 697 { 698 return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid); 699 } 700 DEVICE_ATTR_RO(ana_grpid); 701 702 static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, 703 char *buf) 704 { 705 struct nvme_ns *ns = nvme_get_ns_from_dev(dev); 706 707 return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]); 708 } 709 DEVICE_ATTR_RO(ana_state); 710 711 static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, 712 struct nvme_ana_group_desc *desc, void *data) 713 { 714 struct nvme_ana_group_desc *dst = data; 715 716 if (desc->grpid != dst->grpid) 717 return 0; 718 719 *dst = *desc; 720 return -ENXIO; /* just break out of the loop */ 721 } 722 723 void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) 724 { 725 if (nvme_ctrl_use_ana(ns->ctrl)) { 726 struct nvme_ana_group_desc desc = { 727 .grpid = id->anagrpid, 728 .state = 0, 729 }; 730 731 mutex_lock(&ns->ctrl->ana_lock); 732 ns->ana_grpid = le32_to_cpu(id->anagrpid); 733 nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc); 734 mutex_unlock(&ns->ctrl->ana_lock); 735 if (desc.state) { 736 /* found the group desc: update */ 737 nvme_update_ns_ana_state(&desc, ns); 738 } else { 739 /* group desc not found: trigger a re-read */ 740 set_bit(NVME_NS_ANA_PENDING, &ns->flags); 741 queue_work(nvme_wq, &ns->ctrl->ana_work); 742 } 743 } else { 744 ns->ana_state = NVME_ANA_OPTIMIZED; 745 nvme_mpath_set_live(ns); 746 } 747 748 if (blk_queue_stable_writes(ns->queue) && ns->head->disk) 749 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, 750 ns->head->disk->queue); 751 #ifdef CONFIG_BLK_DEV_ZONED 752 if (blk_queue_is_zoned(ns->queue) && ns->head->disk) 753 ns->head->disk->queue->nr_zones = ns->queue->nr_zones; 754 #endif 755 } 756 757 void nvme_mpath_remove_disk(struct nvme_ns_head *head) 758 { 759 if (!head->disk) 760 return; 761 if (head->disk->flags & GENHD_FL_UP) { 762 nvme_cdev_del(&head->cdev, &head->cdev_device); 763 del_gendisk(head->disk); 764 } 765 blk_set_queue_dying(head->disk->queue); 766 /* make sure all pending bios are cleaned up */ 767 kblockd_schedule_work(&head->requeue_work); 768 flush_work(&head->requeue_work); 769 blk_cleanup_queue(head->disk->queue); 770 if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { 771 /* 772 * if device_add_disk wasn't called, prevent 773 * disk release to put a bogus reference on the 774 * request queue 775 */ 776 head->disk->queue = NULL; 777 } 778 put_disk(head->disk); 779 } 780 781 int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) 782 { 783 int error; 784 785 /* check if multipath is enabled and we have the capability */ 786 if (!multipath || !ctrl->subsys || 787 !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA)) 788 return 0; 789 790 ctrl->anacap = id->anacap; 791 ctrl->anatt = id->anatt; 792 ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); 793 ctrl->anagrpmax = le32_to_cpu(id->anagrpmax); 794 795 mutex_init(&ctrl->ana_lock); 796 timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); 797 ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + 798 ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc); 799 ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32); 800 801 if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) { 802 dev_err(ctrl->device, 803 "ANA log page size (%zd) larger than MDTS (%d).\n", 804 ctrl->ana_log_size, 805 ctrl->max_hw_sectors << SECTOR_SHIFT); 806 dev_err(ctrl->device, "disabling ANA support.\n"); 807 return 0; 808 } 809 810 INIT_WORK(&ctrl->ana_work, nvme_ana_work); 811 kfree(ctrl->ana_log_buf); 812 ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); 813 if (!ctrl->ana_log_buf) { 814 error = -ENOMEM; 815 goto out; 816 } 817 818 error = nvme_read_ana_log(ctrl); 819 if (error) 820 goto out_free_ana_log_buf; 821 return 0; 822 out_free_ana_log_buf: 823 kfree(ctrl->ana_log_buf); 824 ctrl->ana_log_buf = NULL; 825 out: 826 return error; 827 } 828 829 void nvme_mpath_uninit(struct nvme_ctrl *ctrl) 830 { 831 kfree(ctrl->ana_log_buf); 832 ctrl->ana_log_buf = NULL; 833 } 834