// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 */

#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <trace/events/block.h>
#include "nvme.h"

static bool multipath = true;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_unfreeze_queue(h->disk->queue);
}

void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_freeze_queue_wait(h->disk->queue);
}

void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_freeze_queue_start(h->disk->queue);
}

/*
 * If multipathing is enabled we need to always use the subsystem instance
 * number for numbering our devices to avoid conflicts between subsystems that
 * have multiple controllers and thus use the multipath-aware subsystem node
 * and those that have a single controller and use the controller node
 * directly.
 */
bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags)
{
	if (!multipath)
		return false;
	if (!ns->head->disk) {
		sprintf(disk_name, "nvme%dn%d", ns->ctrl->subsys->instance,
			ns->head->instance);
		return true;
	}
	sprintf(disk_name, "nvme%dc%dn%d", ns->ctrl->subsys->instance,
		ns->ctrl->instance, ns->head->instance);
	*flags = GENHD_FL_HIDDEN;
	return true;
}

void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & 0x7ff;
	unsigned long flags;
	struct bio *bio;

	nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but not
	 * ready to serve this namespace.  Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
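	 *
	 * In either case the bios below are re-targeted at the multipath node,
	 * moved to the head's requeue list and resubmitted by
	 * nvme_requeue_work() on whichever path nvme_find_path() picks next.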
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	for (bio = req->bio; bio; bio = bio->bi_next) {
		bio_set_dev(bio, ns->head->disk->part0);
		if (bio->bi_opf & REQ_POLLED) {
			bio->bi_opf &= ~REQ_POLLED;
			bio->bi_cookie = BLK_QC_T_NONE;
		}
	}
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	blk_mq_end_request(req, 0);
	kblockd_schedule_work(&ns->head->requeue_work);
}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (!ns->head->disk)
			continue;
		kblockd_schedule_work(&ns->head->requeue_work);
		if (ctrl->state == NVME_CTRL_LIVE)
			disk_uevent(ns->head->disk, KOBJ_CHANGE);
	}
	up_read(&ctrl->namespaces_rwsem);
}

static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};

bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	bool changed = false;
	int node;

	if (!head)
		goto out;

	for_each_node(node) {
		if (ns == rcu_access_pointer(head->current_path[node])) {
			rcu_assign_pointer(head->current_path[node], NULL);
			changed = true;
		}
	}
out:
	return changed;
}

void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		nvme_mpath_clear_current_path(ns);
		kblockd_schedule_work(&ns->head->requeue_work);
	}
	up_read(&ctrl->namespaces_rwsem);
}

void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	sector_t capacity = get_capacity(head->disk);
	int node;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (capacity != get_capacity(ns->disk))
			clear_bit(NVME_NS_READY, &ns->flags);
	}

	for_each_node(node)
		rcu_assign_pointer(head->current_path[node], NULL);
}

static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
	/*
	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
	 * still be able to complete assuming that the controller is connected.
	 * Otherwise it will fail immediately and return to the requeue list.
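	 *
	 * Paths with a pending ANA state re-read or that have not been marked
	 * ready are skipped as well.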
	 */
	if (ns->ctrl->state != NVME_CTRL_LIVE &&
	    ns->ctrl->state != NVME_CTRL_DELETING)
		return true;
	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
	    !test_bit(NVME_NS_READY, &ns->flags))
		return true;
	return false;
}

static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
	struct nvme_ns *found = NULL, *fallback = NULL, *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
			distance = node_distance(node, ns->ctrl->numa_node);
		else
			distance = LOCAL_DISTANCE;

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (distance < found_distance) {
				found_distance = distance;
				found = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (distance < fallback_distance) {
				fallback_distance = distance;
				fallback = ns;
			}
			break;
		default:
			break;
		}
	}

	if (!found)
		found = fallback;
	if (found)
		rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
		struct nvme_ns *ns)
{
	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
			siblings);
	if (ns)
		return ns;
	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}

static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
		int node, struct nvme_ns *old)
{
	struct nvme_ns *ns, *found = NULL;

	if (list_is_singular(&head->list)) {
		if (nvme_path_is_disabled(old))
			return NULL;
		return old;
	}

	for (ns = nvme_next_ns(head, old);
	     ns && ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;
			goto out;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			found = ns;
	}

	/*
	 * The loop above skips the current path for round-robin semantics.
	 * Fall back to the current path if either:
	 *  - no other optimized path found and current is optimized,
	 *  - no other usable path found and current is usable.
	 */
	if (!nvme_path_is_disabled(old) &&
	    (old->ana_state == NVME_ANA_OPTIMIZED ||
	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
		return old;

	if (!found)
		return NULL;
out:
	rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
	return ns->ctrl->state == NVME_CTRL_LIVE &&
		ns->ana_state == NVME_ANA_OPTIMIZED;
}

inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	int node = numa_node_id();
	struct nvme_ns *ns;

	ns = srcu_dereference(head->current_path[node], &head->srcu);
	if (unlikely(!ns))
		return __nvme_find_path(head, node);

	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
		return nvme_round_robin_path(head, node, ns);
	if (unlikely(!nvme_path_is_optimized(ns)))
		return __nvme_find_path(head, node);
	return ns;
}

static bool nvme_available_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
			continue;
		switch (ns->ctrl->state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			/* fallthru */
			return true;
		default:
			break;
		}
	}
	return false;
}

static void nvme_ns_head_submit_bio(struct bio *bio)
{
	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	int srcu_idx;

	/*
	 * The namespace might be going away and the bio might be moved to a
	 * different queue via blk_steal_bios(), so we need to use the bio_split
	 * pool from the original queue to allocate the bvecs from.
	 */
	blk_queue_split(&bio);

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio_set_dev(bio, ns->disk->part0);
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		submit_bio_noacct(bio);
	} else if (nvme_available_path(head)) {
		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no available path - failing I/O\n");

		bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
}

static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
{
	if (!nvme_tryget_ns_head(bdev->bd_disk->private_data))
		return -ENXIO;
	return 0;
}

static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
{
	nvme_put_ns_head(disk->private_data);
}

#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (ns)
		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}
#else
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */

const struct block_device_operations nvme_ns_head_ops = {
	.owner		= THIS_MODULE,
	.submit_bio	= nvme_ns_head_submit_bio,
	.open		= nvme_ns_head_open,
	.release	= nvme_ns_head_release,
	.ioctl		= nvme_ns_head_ioctl,
	.getgeo		= nvme_getgeo,
	.report_zones	= nvme_ns_head_report_zones,
	.pr_ops		= &nvme_pr_ops,
};

static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
{
	return container_of(cdev, struct nvme_ns_head, cdev);
}

static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
{
	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
		return -ENXIO;
	return 0;
}

static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
{
	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
	return 0;
}

static const struct file_operations nvme_ns_head_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_head_chr_open,
	.release	= nvme_ns_head_chr_release,
	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
};

static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
{
	int ret;

	head->cdev_device.parent = &head->subsys->dev;
	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
			   head->subsys->instance, head->instance);
	if (ret)
		return ret;
	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
			    &nvme_ns_head_chr_fops, THIS_MODULE);
	return ret;
}

static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
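		/*
		 * The requeue list is chained through bi_next; detach each bio
		 * before handing it back to the block layer.
		 */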
		next = bio->bi_next;
		bio->bi_next = NULL;

		submit_bio_noacct(bio);
	}
}

int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	bool vwc = false;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);

	/*
	 * Add a multipath node if the subsystem supports multiple controllers.
	 * We also do this for private namespaces as the namespace sharing data
	 * could change after a rescan.
	 */
	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath)
		return 0;

	head->disk = blk_alloc_disk(ctrl->numa_node);
	if (!head->disk)
		return -ENOMEM;
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);

	blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
	/*
	 * This assumes all controllers that refer to a namespace either
	 * support poll queues or not.  That is not a strict guarantee,
	 * but if the assumption is wrong the effect is only suboptimal
	 * performance, not a correctness problem.
	 */
	if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
	    ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
		blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);

	/* set to a default value of 512 until the disk is validated */
	blk_queue_logical_block_size(head->disk->queue, 512);
	blk_set_stacking_limits(&head->disk->queue->limits);

	/* we need to propagate up the VWC settings */
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		vwc = true;
	blk_queue_write_cache(head->disk->queue, vwc, vwc);
	return 0;
}

static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	int rc;

	if (!head->disk)
		return;

	/*
	 * test_and_set_bit() is used because it is protecting against two nvme
	 * paths simultaneously calling device_add_disk() on the same namespace
	 * head.
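	 *
	 * If device_add_disk() fails the bit is cleared again so that a later
	 * path can retry registering the disk.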
	 */
	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		rc = device_add_disk(&head->subsys->dev, head->disk,
				     nvme_ns_id_attr_groups);
		if (rc) {
			clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
			return;
		}
		nvme_add_ns_head_cdev(head);
	}

	mutex_lock(&head->lock);
	if (nvme_path_is_optimized(ns)) {
		int node, srcu_idx;

		srcu_idx = srcu_read_lock(&head->srcu);
		for_each_node(node)
			__nvme_find_path(head, node);
		srcu_read_unlock(&head->srcu, srcu_idx);
	}
	mutex_unlock(&head->lock);

	synchronize_srcu(&head->srcu);
	kblockd_schedule_work(&head->requeue_work);
}

static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
			void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids;
		size_t nsid_buf_size;

		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;

		nr_nsids = le32_to_cpu(desc->nnsids);
		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
	}

	return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);

	if (nvme_state_is_live(ns->ana_state))
		nvme_mpath_set_live(ns);
}

static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		unsigned nsid;
again:
		nsid = le32_to_cpu(desc->nsids[n]);
		if (ns->head->ns_id < nsid)
			continue;
		if (ns->head->ns_id == nsid)
			nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
		if (ns->head->ns_id > nsid)
			goto again;
	}
	up_read(&ctrl->namespaces_rwsem);
	return 0;
}

static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
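		/* keep the previously recorded ANA state if the log read fails */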
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
				   nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might enter
	 * the change state at different times.  But that is a lot of overhead
	 * just to protect against a target that keeps entering new change
	 * states while never finishing previous ones.  As we'll still
	 * eventually time out once all groups are in change state, this
	 * isn't a big deal.
	 *
	 * We also double the ANATT value to provide some slack for transports
	 * or AEN processing overhead.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		del_timer_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}

static void nvme_ana_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

	if (ctrl->state != NVME_CTRL_LIVE)
		return;

	nvme_read_ana_log(ctrl);
}

static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	del_timer_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}

#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)	\
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)

static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",
};

static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);

	return sysfs_emit(buf, "%s\n",
			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}

static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);
	int i;

	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
			WRITE_ONCE(subsys->iopolicy, i);
			return count;
		}
	}

	return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	struct nvme_ana_group_desc *dst = data;

	if (desc->grpid != dst->grpid)
		return 0;

	*dst = *desc;
	return -ENXIO; /* just break out of the loop */
}

void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		struct nvme_ana_group_desc desc = {
			.grpid = id->anagrpid,
			.state = 0,
		};

		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(id->anagrpid);
		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
		mutex_unlock(&ns->ctrl->ana_lock);
		if (desc.state) {
			/* found the group desc: update */
			nvme_update_ns_ana_state(&desc, ns);
		} else {
			/* group desc not found: trigger a re-read */
			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
			queue_work(nvme_wq, &ns->ctrl->ana_work);
		}
	} else {
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
	}

	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
				   ns->head->disk->queue);
#ifdef CONFIG_BLK_DEV_ZONED
	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
		ns->head->disk->queue->nr_zones = ns->queue->nr_zones;
#endif
}

void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	kblockd_schedule_work(&head->requeue_work);
	if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		nvme_cdev_del(&head->cdev, &head->cdev_device);
		del_gendisk(head->disk);
	}
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	blk_set_queue_dying(head->disk->queue);
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	blk_cleanup_disk(head->disk);
}

void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
{
	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
}

int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
	size_t ana_log_size;
	int error = 0;

	/* check if multipath is enabled and we have the capability */
	if (!multipath || !ctrl->subsys ||
	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
		return 0;

	if (!ctrl->max_namespaces ||
	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
		dev_err(ctrl->device,
			"Invalid MNAN value %u\n", ctrl->max_namespaces);
		return -EINVAL;
	}

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
		ctrl->max_namespaces * sizeof(__le32);
	if (ana_log_size > max_transfer_size) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%zd).\n",
			ana_log_size, max_transfer_size);
		dev_err(ctrl->device, "disabling ANA support.\n");
		goto out_uninit;
	}
	if (ana_log_size > ctrl->ana_log_size) {
		nvme_mpath_stop(ctrl);
		kfree(ctrl->ana_log_buf);
		ctrl->ana_log_buf = kmalloc(ana_log_size, GFP_KERNEL);
		if (!ctrl->ana_log_buf)
			return -ENOMEM;
	}
	ctrl->ana_log_size = ana_log_size;
	error = nvme_read_ana_log(ctrl);
	if (error)
		goto out_uninit;
	return 0;

out_uninit:
	nvme_mpath_uninit(ctrl);
	return error;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
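	/* release the ANA log buffer allocated in nvme_mpath_init_identify() */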
	kfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
}