/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/moduleparam.h>
#include <trace/events/block.h>
#include "nvme.h"

static bool multipath = true;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
{
	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
}

/*
 * If multipathing is enabled we need to always use the subsystem instance
 * number for numbering our devices to avoid conflicts between subsystems that
 * have multiple controllers and thus use the multipath-aware subsystem node
 * and those that have a single controller and use the controller node
 * directly.
 */
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
			struct nvme_ctrl *ctrl, int *flags)
{
	if (!multipath) {
		sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
	} else if (ns->head->disk) {
		sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
				ctrl->cntlid, ns->head->instance);
		*flags = GENHD_FL_HIDDEN;
	} else {
		sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
				ns->head->instance);
	}
}

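/*
 * Fail over a request to another path: steal its bios onto the per-head
 * requeue list and complete the original request successfully.  ANA-specific
 * status codes trigger a re-read of the ANA log page, while any other error
 * is treated as a broken controller and triggers a reset.  The requeue work
 * is then kicked so the stolen bios are resubmitted through the multipath
 * node and pick a new path.
 */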
void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status;
	unsigned long flags;

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
	blk_mq_end_request(req, 0);

	switch (status & 0x7ff) {
	case NVME_SC_ANA_TRANSITION:
	case NVME_SC_ANA_INACCESSIBLE:
	case NVME_SC_ANA_PERSISTENT_LOSS:
		/*
		 * If we got back an ANA error we know the controller is alive,
		 * but not ready to serve this namespace.  The spec suggests
		 * we should update our general state here, but because the
		 * admin and I/O queues are not serialized that is
		 * fundamentally racy.  So instead just clear the current path,
		 * mark the path as pending and kick off a re-read of the ANA
		 * log page ASAP.
		 */
		nvme_mpath_clear_current_path(ns);
		if (ns->ctrl->ana_log_buf) {
			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
			queue_work(nvme_wq, &ns->ctrl->ana_work);
		}
		break;
	default:
		/*
		 * Reset the controller for any non-ANA error as we don't know
		 * what caused the error.
		 */
		nvme_reset_ctrl(ns->ctrl);
		break;
	}

	kblockd_schedule_work(&ns->head->requeue_work);
}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (ns->head->disk)
			kblockd_schedule_work(&ns->head->requeue_work);
	}
	up_read(&ctrl->namespaces_rwsem);
}

static const char *nvme_ana_state_names[] = {
	[0] = "invalid state",
	[NVME_ANA_OPTIMIZED] = "optimized",
	[NVME_ANA_NONOPTIMIZED] = "non-optimized",
	[NVME_ANA_INACCESSIBLE] = "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS] = "persistent-loss",
	[NVME_ANA_CHANGE] = "change",
};

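/*
 * Select a new current path for a namespace head: skip paths whose controller
 * is not live or that have an ANA log re-read pending, prefer an ANA-optimized
 * path, and otherwise fall back to a non-optimized one.  The choice is cached
 * in head->current_path under RCU so nvme_find_path() can reuse it on the
 * fast path.
 */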
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns, *fallback = NULL;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (ns->ctrl->state != NVME_CTRL_LIVE ||
		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
			continue;
		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			rcu_assign_pointer(head->current_path, ns);
			return ns;
		case NVME_ANA_NONOPTIMIZED:
			fallback = ns;
			break;
		default:
			break;
		}
	}

	if (fallback)
		rcu_assign_pointer(head->current_path, fallback);
	return fallback;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
	return ns->ctrl->state == NVME_CTRL_LIVE &&
		ns->ana_state == NVME_ANA_OPTIMIZED;
}

inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);

	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
		ns = __nvme_find_path(head);
	return ns;
}

static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
		struct bio *bio)
{
	struct nvme_ns_head *head = q->queuedata;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	blk_qc_t ret = BLK_QC_T_NONE;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio->bi_disk = ns->disk;
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio->bi_disk->queue, bio,
				      disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		ret = direct_make_request(bio);
	} else if (!list_empty_careful(&head->list)) {
		dev_warn_ratelimited(dev, "no path available - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no path - failing I/O\n");

		bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}

static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
{
	struct nvme_ns_head *head = q->queuedata;
	struct nvme_ns *ns;
	bool found = false;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = srcu_dereference(head->current_path, &head->srcu);
	if (likely(ns && nvme_path_is_optimized(ns)))
		found = ns->queue->poll_fn(q, qc);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return found;
}

static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_next;
		bio->bi_next = NULL;

		/*
		 * Reset disk to the mpath node and resubmit to select a new
		 * path.
		 */
		bio->bi_disk = head->disk;
		generic_make_request(bio);
	}
}

int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	struct request_queue *q;
	bool vwc = false;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);

	/*
	 * Add a multipath node if the subsystem supports multiple controllers.
	 * We also do this for private namespaces as the namespace sharing data
	 * could change after a rescan.
	 */
	if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
		return 0;

	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
	if (!q)
		goto out;
	q->queuedata = head;
	blk_queue_make_request(q, nvme_ns_head_make_request);
	q->poll_fn = nvme_ns_head_poll;
	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
	/* set to a default value of 512 until the disk is validated */
	blk_queue_logical_block_size(q, 512);

	/* we need to propagate up the VWC settings */
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		vwc = true;
	blk_queue_write_cache(q, vwc, vwc);

	head->disk = alloc_disk(0);
	if (!head->disk)
		goto out_cleanup_queue;
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;
	head->disk->queue = q;
	head->disk->flags = GENHD_FL_EXT_DEVT;
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);
	return 0;

out_cleanup_queue:
	blk_cleanup_queue(q);
out:
	return -ENOMEM;
}

static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;

	lockdep_assert_held(&ns->head->lock);

	if (!head->disk)
		return;

	if (!(head->disk->flags & GENHD_FL_UP)) {
		device_add_disk(&head->subsys->dev, head->disk);
		if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
				&nvme_ns_id_attr_group))
			dev_warn(&head->subsys->dev,
				 "failed to create id group.\n");
	}

	kblockd_schedule_work(&ns->head->requeue_work);
}

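/*
 * Walk the ANA log page and invoke @cb once per group descriptor.  The log
 * consists of a header followed by a variable number of group descriptors,
 * each of which is followed by its (possibly empty) list of namespace IDs:
 *
 *	struct nvme_ana_rsp_hdr
 *	struct nvme_ana_group_desc	(group 1)
 *	__le32 nsid[nnsids]		(group 1)
 *	struct nvme_ana_group_desc	(group 2)
 *	...
 *
 * Each descriptor is sanity checked against the controller limits before the
 * callback runs, and the running offset is bounds checked so that a malformed
 * log page cannot take us past the end of the buffer.
 */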
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
				void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids = le32_to_cpu(desc->nnsids);
		size_t nsid_buf_size = nr_nsids * sizeof(__le32);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;
	}

	return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	enum nvme_ana_state old;

	mutex_lock(&ns->head->lock);
	old = ns->ana_state;
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);

	if (nvme_state_is_live(ns->ana_state) && !nvme_state_is_live(old))
		nvme_mpath_set_live(ns);
	mutex_unlock(&ns->head->lock);
}

static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;

	dev_info(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	down_write(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (ns->head->ns_id != le32_to_cpu(desc->nsids[n]))
			continue;
		nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
	}
	up_write(&ctrl->namespaces_rwsem);
	WARN_ON_ONCE(n < nr_nsids);
	return 0;
}

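/*
 * Fetch the ANA log page (optionally with the RGO bit set so only group
 * states are returned) and apply it via nvme_update_ana_state().  If any
 * group is still in the "change" state the ANATT timer is (re)armed so a
 * target that never leaves that state eventually triggers a controller
 * reset; otherwise the timer is stopped.
 */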
static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA,
			groups_only ? NVME_ANA_LOG_RGO : 0,
			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
			nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might
	 * enter the change state at different times.  But that is a lot of
	 * overhead just to protect against a target that keeps entering new
	 * change states while never finishing previous ones.  We'll still
	 * eventually time out once all groups are in change state, so this
	 * isn't a big deal.
	 *
	 * We also double the ANATT value to provide some slack for transports
	 * or AEN processing overhead.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		del_timer_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}

static void nvme_ana_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

	nvme_read_ana_log(ctrl, false);
}

static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	del_timer_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	struct nvme_ns *ns = data;

	if (ns->ana_grpid == le32_to_cpu(desc->grpid)) {
		nvme_update_ns_ana_state(desc, ns);
		return -ENXIO; /* just break out of the loop */
	}

	return 0;
}

void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(id->anagrpid);
		nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state);
		mutex_unlock(&ns->ctrl->ana_lock);
	} else {
		mutex_lock(&ns->head->lock);
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
		mutex_unlock(&ns->head->lock);
	}
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	if (head->disk->flags & GENHD_FL_UP) {
		sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
				   &nvme_ns_id_attr_group);
		del_gendisk(head->disk);
	}
	blk_set_queue_dying(head->disk->queue);
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	blk_cleanup_queue(head->disk->queue);
	put_disk(head->disk);
}

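/*
 * Set up ANA state for a controller: cache the relevant Identify Controller
 * fields, size and allocate the ANA log buffer, and do an initial groups-only
 * read of the log page.  The buffer has to hold the log header plus one group
 * descriptor per ANA group; unless ANACAP bit 6 is set it also needs room for
 * one namespace ID entry per namespace.  If the resulting log page would not
 * fit into a single MDTS-sized transfer, ANA support is disabled without
 * failing controller initialization.
 */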
int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	int error;

	if (!nvme_ctrl_use_ana(ctrl))
		return 0;

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
	if (!(ctrl->anacap & (1 << 6)))
		ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);

	if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%d).\n",
			ctrl->ana_log_size,
			ctrl->max_hw_sectors << SECTOR_SHIFT);
		dev_err(ctrl->device, "disabling ANA support.\n");
		return 0;
	}

	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
	ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
	if (!ctrl->ana_log_buf)
		goto out;

	error = nvme_read_ana_log(ctrl, true);
	if (error)
		goto out_free_ana_log_buf;
	return 0;
out_free_ana_log_buf:
	kfree(ctrl->ana_log_buf);
out:
	return -ENOMEM;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kfree(ctrl->ana_log_buf);
}