1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * NVM Express device driver 4 * Copyright (c) 2011-2014, Intel Corporation. 5 */ 6 7 #include <linux/blkdev.h> 8 #include <linux/blk-mq.h> 9 #include <linux/compat.h> 10 #include <linux/delay.h> 11 #include <linux/errno.h> 12 #include <linux/hdreg.h> 13 #include <linux/kernel.h> 14 #include <linux/module.h> 15 #include <linux/backing-dev.h> 16 #include <linux/list_sort.h> 17 #include <linux/slab.h> 18 #include <linux/types.h> 19 #include <linux/pr.h> 20 #include <linux/ptrace.h> 21 #include <linux/nvme_ioctl.h> 22 #include <linux/pm_qos.h> 23 #include <asm/unaligned.h> 24 25 #include "nvme.h" 26 #include "fabrics.h" 27 28 #define CREATE_TRACE_POINTS 29 #include "trace.h" 30 31 #define NVME_MINORS (1U << MINORBITS) 32 33 unsigned int admin_timeout = 60; 34 module_param(admin_timeout, uint, 0644); 35 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); 36 EXPORT_SYMBOL_GPL(admin_timeout); 37 38 unsigned int nvme_io_timeout = 30; 39 module_param_named(io_timeout, nvme_io_timeout, uint, 0644); 40 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); 41 EXPORT_SYMBOL_GPL(nvme_io_timeout); 42 43 static unsigned char shutdown_timeout = 5; 44 module_param(shutdown_timeout, byte, 0644); 45 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); 46 47 static u8 nvme_max_retries = 5; 48 module_param_named(max_retries, nvme_max_retries, byte, 0644); 49 MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); 50 51 static unsigned long default_ps_max_latency_us = 100000; 52 module_param(default_ps_max_latency_us, ulong, 0644); 53 MODULE_PARM_DESC(default_ps_max_latency_us, 54 "max power saving latency for new devices; use PM QOS to change per device"); 55 56 static bool force_apst; 57 module_param(force_apst, bool, 0644); 58 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off"); 59 60 static bool streams; 61 module_param(streams, bool, 0644); 62 MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); 63 64 /* 65 * nvme_wq - hosts nvme related works that are not reset or delete 66 * nvme_reset_wq - hosts nvme reset works 67 * nvme_delete_wq - hosts nvme delete works 68 * 69 * nvme_wq will host works such as scan, aen handling, fw activation, 70 * keep-alive, periodic reconnects etc. nvme_reset_wq 71 * runs reset works which also flush works hosted on nvme_wq for 72 * serialization purposes. nvme_delete_wq host controller deletion 73 * works which flush reset works for serialization. 74 */ 75 struct workqueue_struct *nvme_wq; 76 EXPORT_SYMBOL_GPL(nvme_wq); 77 78 struct workqueue_struct *nvme_reset_wq; 79 EXPORT_SYMBOL_GPL(nvme_reset_wq); 80 81 struct workqueue_struct *nvme_delete_wq; 82 EXPORT_SYMBOL_GPL(nvme_delete_wq); 83 84 static LIST_HEAD(nvme_subsystems); 85 static DEFINE_MUTEX(nvme_subsystems_lock); 86 87 static DEFINE_IDA(nvme_instance_ida); 88 static dev_t nvme_ctrl_base_chr_devt; 89 static struct class *nvme_class; 90 static struct class *nvme_subsys_class; 91 92 static void nvme_put_subsystem(struct nvme_subsystem *subsys); 93 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, 94 unsigned nsid); 95 96 /* 97 * Prepare a queue for teardown. 98 * 99 * This must forcibly unquiesce queues to avoid blocking dispatch, and only set 100 * the capacity to 0 after that to avoid blocking dispatchers that may be 101 * holding bd_butex. This will end buffered writers dirtying pages that can't 102 * be synced. 
103 */ 104 static void nvme_set_queue_dying(struct nvme_ns *ns) 105 { 106 if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) 107 return; 108 109 blk_set_queue_dying(ns->queue); 110 blk_mq_unquiesce_queue(ns->queue); 111 112 set_capacity_and_notify(ns->disk, 0); 113 } 114 115 static void nvme_queue_scan(struct nvme_ctrl *ctrl) 116 { 117 /* 118 * Only new queue scan work when admin and IO queues are both alive 119 */ 120 if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset) 121 queue_work(nvme_wq, &ctrl->scan_work); 122 } 123 124 /* 125 * Use this function to proceed with scheduling reset_work for a controller 126 * that had previously been set to the resetting state. This is intended for 127 * code paths that can't be interrupted by other reset attempts. A hot removal 128 * may prevent this from succeeding. 129 */ 130 int nvme_try_sched_reset(struct nvme_ctrl *ctrl) 131 { 132 if (ctrl->state != NVME_CTRL_RESETTING) 133 return -EBUSY; 134 if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) 135 return -EBUSY; 136 return 0; 137 } 138 EXPORT_SYMBOL_GPL(nvme_try_sched_reset); 139 140 static void nvme_failfast_work(struct work_struct *work) 141 { 142 struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), 143 struct nvme_ctrl, failfast_work); 144 145 if (ctrl->state != NVME_CTRL_CONNECTING) 146 return; 147 148 set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); 149 dev_info(ctrl->device, "failfast expired\n"); 150 nvme_kick_requeue_lists(ctrl); 151 } 152 153 static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl) 154 { 155 if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1) 156 return; 157 158 schedule_delayed_work(&ctrl->failfast_work, 159 ctrl->opts->fast_io_fail_tmo * HZ); 160 } 161 162 static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl) 163 { 164 if (!ctrl->opts) 165 return; 166 167 cancel_delayed_work_sync(&ctrl->failfast_work); 168 clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); 169 } 170 171 172 int nvme_reset_ctrl(struct nvme_ctrl *ctrl) 173 { 174 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) 175 return -EBUSY; 176 if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) 177 return -EBUSY; 178 return 0; 179 } 180 EXPORT_SYMBOL_GPL(nvme_reset_ctrl); 181 182 static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) 183 { 184 int ret; 185 186 ret = nvme_reset_ctrl(ctrl); 187 if (!ret) { 188 flush_work(&ctrl->reset_work); 189 if (ctrl->state != NVME_CTRL_LIVE) 190 ret = -ENETRESET; 191 } 192 193 return ret; 194 } 195 196 static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl) 197 { 198 dev_info(ctrl->device, 199 "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn); 200 201 flush_work(&ctrl->reset_work); 202 nvme_stop_ctrl(ctrl); 203 nvme_remove_namespaces(ctrl); 204 ctrl->ops->delete_ctrl(ctrl); 205 nvme_uninit_ctrl(ctrl); 206 } 207 208 static void nvme_delete_ctrl_work(struct work_struct *work) 209 { 210 struct nvme_ctrl *ctrl = 211 container_of(work, struct nvme_ctrl, delete_work); 212 213 nvme_do_delete_ctrl(ctrl); 214 } 215 216 int nvme_delete_ctrl(struct nvme_ctrl *ctrl) 217 { 218 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) 219 return -EBUSY; 220 if (!queue_work(nvme_delete_wq, &ctrl->delete_work)) 221 return -EBUSY; 222 return 0; 223 } 224 EXPORT_SYMBOL_GPL(nvme_delete_ctrl); 225 226 static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) 227 { 228 /* 229 * Keep a reference until nvme_do_delete_ctrl() complete, 230 * since ->delete_ctrl can free the controller. 
231 */ 232 nvme_get_ctrl(ctrl); 233 if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) 234 nvme_do_delete_ctrl(ctrl); 235 nvme_put_ctrl(ctrl); 236 } 237 238 static blk_status_t nvme_error_status(u16 status) 239 { 240 switch (status & 0x7ff) { 241 case NVME_SC_SUCCESS: 242 return BLK_STS_OK; 243 case NVME_SC_CAP_EXCEEDED: 244 return BLK_STS_NOSPC; 245 case NVME_SC_LBA_RANGE: 246 case NVME_SC_CMD_INTERRUPTED: 247 case NVME_SC_NS_NOT_READY: 248 return BLK_STS_TARGET; 249 case NVME_SC_BAD_ATTRIBUTES: 250 case NVME_SC_ONCS_NOT_SUPPORTED: 251 case NVME_SC_INVALID_OPCODE: 252 case NVME_SC_INVALID_FIELD: 253 case NVME_SC_INVALID_NS: 254 return BLK_STS_NOTSUPP; 255 case NVME_SC_WRITE_FAULT: 256 case NVME_SC_READ_ERROR: 257 case NVME_SC_UNWRITTEN_BLOCK: 258 case NVME_SC_ACCESS_DENIED: 259 case NVME_SC_READ_ONLY: 260 case NVME_SC_COMPARE_FAILED: 261 return BLK_STS_MEDIUM; 262 case NVME_SC_GUARD_CHECK: 263 case NVME_SC_APPTAG_CHECK: 264 case NVME_SC_REFTAG_CHECK: 265 case NVME_SC_INVALID_PI: 266 return BLK_STS_PROTECTION; 267 case NVME_SC_RESERVATION_CONFLICT: 268 return BLK_STS_NEXUS; 269 case NVME_SC_HOST_PATH_ERROR: 270 return BLK_STS_TRANSPORT; 271 case NVME_SC_ZONE_TOO_MANY_ACTIVE: 272 return BLK_STS_ZONE_ACTIVE_RESOURCE; 273 case NVME_SC_ZONE_TOO_MANY_OPEN: 274 return BLK_STS_ZONE_OPEN_RESOURCE; 275 default: 276 return BLK_STS_IOERR; 277 } 278 } 279 280 static void nvme_retry_req(struct request *req) 281 { 282 unsigned long delay = 0; 283 u16 crd; 284 285 /* The mask and shift result must be <= 3 */ 286 crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11; 287 if (crd) 288 delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100; 289 290 nvme_req(req)->retries++; 291 blk_mq_requeue_request(req, false); 292 blk_mq_delay_kick_requeue_list(req->q, delay); 293 } 294 295 enum nvme_disposition { 296 COMPLETE, 297 RETRY, 298 FAILOVER, 299 }; 300 301 static inline enum nvme_disposition nvme_decide_disposition(struct request *req) 302 { 303 if (likely(nvme_req(req)->status == 0)) 304 return COMPLETE; 305 306 if (blk_noretry_request(req) || 307 (nvme_req(req)->status & NVME_SC_DNR) || 308 nvme_req(req)->retries >= nvme_max_retries) 309 return COMPLETE; 310 311 if (req->cmd_flags & REQ_NVME_MPATH) { 312 if (nvme_is_path_error(nvme_req(req)->status) || 313 blk_queue_dying(req->q)) 314 return FAILOVER; 315 } else { 316 if (blk_queue_dying(req->q)) 317 return COMPLETE; 318 } 319 320 return RETRY; 321 } 322 323 static inline void nvme_end_req(struct request *req) 324 { 325 blk_status_t status = nvme_error_status(nvme_req(req)->status); 326 327 if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && 328 req_op(req) == REQ_OP_ZONE_APPEND) 329 req->__sector = nvme_lba_to_sect(req->q->queuedata, 330 le64_to_cpu(nvme_req(req)->result.u64)); 331 332 nvme_trace_bio_complete(req); 333 blk_mq_end_request(req, status); 334 } 335 336 void nvme_complete_rq(struct request *req) 337 { 338 trace_nvme_complete_rq(req); 339 nvme_cleanup_cmd(req); 340 341 if (nvme_req(req)->ctrl->kas) 342 nvme_req(req)->ctrl->comp_seen = true; 343 344 switch (nvme_decide_disposition(req)) { 345 case COMPLETE: 346 nvme_end_req(req); 347 return; 348 case RETRY: 349 nvme_retry_req(req); 350 return; 351 case FAILOVER: 352 nvme_failover_req(req); 353 return; 354 } 355 } 356 EXPORT_SYMBOL_GPL(nvme_complete_rq); 357 358 /* 359 * Called to unwind from ->queue_rq on a failed command submission so that the 360 * multipathing code gets called to potentially failover to another path. 
361 * The caller needs to unwind all transport specific resource allocations and 362 * must return propagate the return value. 363 */ 364 blk_status_t nvme_host_path_error(struct request *req) 365 { 366 nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR; 367 blk_mq_set_request_complete(req); 368 nvme_complete_rq(req); 369 return BLK_STS_OK; 370 } 371 EXPORT_SYMBOL_GPL(nvme_host_path_error); 372 373 bool nvme_cancel_request(struct request *req, void *data, bool reserved) 374 { 375 dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, 376 "Cancelling I/O %d", req->tag); 377 378 /* don't abort one completed request */ 379 if (blk_mq_request_completed(req)) 380 return true; 381 382 nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD; 383 nvme_req(req)->flags |= NVME_REQ_CANCELLED; 384 blk_mq_complete_request(req); 385 return true; 386 } 387 EXPORT_SYMBOL_GPL(nvme_cancel_request); 388 389 void nvme_cancel_tagset(struct nvme_ctrl *ctrl) 390 { 391 if (ctrl->tagset) { 392 blk_mq_tagset_busy_iter(ctrl->tagset, 393 nvme_cancel_request, ctrl); 394 blk_mq_tagset_wait_completed_request(ctrl->tagset); 395 } 396 } 397 EXPORT_SYMBOL_GPL(nvme_cancel_tagset); 398 399 void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl) 400 { 401 if (ctrl->admin_tagset) { 402 blk_mq_tagset_busy_iter(ctrl->admin_tagset, 403 nvme_cancel_request, ctrl); 404 blk_mq_tagset_wait_completed_request(ctrl->admin_tagset); 405 } 406 } 407 EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset); 408 409 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, 410 enum nvme_ctrl_state new_state) 411 { 412 enum nvme_ctrl_state old_state; 413 unsigned long flags; 414 bool changed = false; 415 416 spin_lock_irqsave(&ctrl->lock, flags); 417 418 old_state = ctrl->state; 419 switch (new_state) { 420 case NVME_CTRL_LIVE: 421 switch (old_state) { 422 case NVME_CTRL_NEW: 423 case NVME_CTRL_RESETTING: 424 case NVME_CTRL_CONNECTING: 425 changed = true; 426 fallthrough; 427 default: 428 break; 429 } 430 break; 431 case NVME_CTRL_RESETTING: 432 switch (old_state) { 433 case NVME_CTRL_NEW: 434 case NVME_CTRL_LIVE: 435 changed = true; 436 fallthrough; 437 default: 438 break; 439 } 440 break; 441 case NVME_CTRL_CONNECTING: 442 switch (old_state) { 443 case NVME_CTRL_NEW: 444 case NVME_CTRL_RESETTING: 445 changed = true; 446 fallthrough; 447 default: 448 break; 449 } 450 break; 451 case NVME_CTRL_DELETING: 452 switch (old_state) { 453 case NVME_CTRL_LIVE: 454 case NVME_CTRL_RESETTING: 455 case NVME_CTRL_CONNECTING: 456 changed = true; 457 fallthrough; 458 default: 459 break; 460 } 461 break; 462 case NVME_CTRL_DELETING_NOIO: 463 switch (old_state) { 464 case NVME_CTRL_DELETING: 465 case NVME_CTRL_DEAD: 466 changed = true; 467 fallthrough; 468 default: 469 break; 470 } 471 break; 472 case NVME_CTRL_DEAD: 473 switch (old_state) { 474 case NVME_CTRL_DELETING: 475 changed = true; 476 fallthrough; 477 default: 478 break; 479 } 480 break; 481 default: 482 break; 483 } 484 485 if (changed) { 486 ctrl->state = new_state; 487 wake_up_all(&ctrl->state_wq); 488 } 489 490 spin_unlock_irqrestore(&ctrl->lock, flags); 491 if (!changed) 492 return false; 493 494 if (ctrl->state == NVME_CTRL_LIVE) { 495 if (old_state == NVME_CTRL_CONNECTING) 496 nvme_stop_failfast_work(ctrl); 497 nvme_kick_requeue_lists(ctrl); 498 } else if (ctrl->state == NVME_CTRL_CONNECTING && 499 old_state == NVME_CTRL_RESETTING) { 500 nvme_start_failfast_work(ctrl); 501 } 502 return changed; 503 } 504 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); 505 506 /* 507 * Returns true for sink states that can't ever transition 
back to live. 508 */ 509 static bool nvme_state_terminal(struct nvme_ctrl *ctrl) 510 { 511 switch (ctrl->state) { 512 case NVME_CTRL_NEW: 513 case NVME_CTRL_LIVE: 514 case NVME_CTRL_RESETTING: 515 case NVME_CTRL_CONNECTING: 516 return false; 517 case NVME_CTRL_DELETING: 518 case NVME_CTRL_DELETING_NOIO: 519 case NVME_CTRL_DEAD: 520 return true; 521 default: 522 WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state); 523 return true; 524 } 525 } 526 527 /* 528 * Waits for the controller state to be resetting, or returns false if it is 529 * not possible to ever transition to that state. 530 */ 531 bool nvme_wait_reset(struct nvme_ctrl *ctrl) 532 { 533 wait_event(ctrl->state_wq, 534 nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) || 535 nvme_state_terminal(ctrl)); 536 return ctrl->state == NVME_CTRL_RESETTING; 537 } 538 EXPORT_SYMBOL_GPL(nvme_wait_reset); 539 540 static void nvme_free_ns_head(struct kref *ref) 541 { 542 struct nvme_ns_head *head = 543 container_of(ref, struct nvme_ns_head, ref); 544 545 nvme_mpath_remove_disk(head); 546 ida_simple_remove(&head->subsys->ns_ida, head->instance); 547 cleanup_srcu_struct(&head->srcu); 548 nvme_put_subsystem(head->subsys); 549 kfree(head); 550 } 551 552 static void nvme_put_ns_head(struct nvme_ns_head *head) 553 { 554 kref_put(&head->ref, nvme_free_ns_head); 555 } 556 557 static void nvme_free_ns(struct kref *kref) 558 { 559 struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); 560 561 if (ns->ndev) 562 nvme_nvm_unregister(ns); 563 564 put_disk(ns->disk); 565 nvme_put_ns_head(ns->head); 566 nvme_put_ctrl(ns->ctrl); 567 kfree(ns); 568 } 569 570 void nvme_put_ns(struct nvme_ns *ns) 571 { 572 kref_put(&ns->kref, nvme_free_ns); 573 } 574 EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU); 575 576 static inline void nvme_clear_nvme_request(struct request *req) 577 { 578 if (!(req->rq_flags & RQF_DONTPREP)) { 579 nvme_req(req)->retries = 0; 580 nvme_req(req)->flags = 0; 581 req->rq_flags |= RQF_DONTPREP; 582 } 583 } 584 585 static inline unsigned int nvme_req_op(struct nvme_command *cmd) 586 { 587 return nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; 588 } 589 590 static inline void nvme_init_request(struct request *req, 591 struct nvme_command *cmd) 592 { 593 if (req->q->queuedata) 594 req->timeout = NVME_IO_TIMEOUT; 595 else /* no queuedata implies admin queue */ 596 req->timeout = NVME_ADMIN_TIMEOUT; 597 598 req->cmd_flags |= REQ_FAILFAST_DRIVER; 599 nvme_clear_nvme_request(req); 600 nvme_req(req)->cmd = cmd; 601 } 602 603 struct request *nvme_alloc_request(struct request_queue *q, 604 struct nvme_command *cmd, blk_mq_req_flags_t flags) 605 { 606 struct request *req; 607 608 req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags); 609 if (!IS_ERR(req)) 610 nvme_init_request(req, cmd); 611 return req; 612 } 613 EXPORT_SYMBOL_GPL(nvme_alloc_request); 614 615 static struct request *nvme_alloc_request_qid(struct request_queue *q, 616 struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) 617 { 618 struct request *req; 619 620 req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags, 621 qid ? 
qid - 1 : 0); 622 if (!IS_ERR(req)) 623 nvme_init_request(req, cmd); 624 return req; 625 } 626 627 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) 628 { 629 struct nvme_command c; 630 631 memset(&c, 0, sizeof(c)); 632 633 c.directive.opcode = nvme_admin_directive_send; 634 c.directive.nsid = cpu_to_le32(NVME_NSID_ALL); 635 c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE; 636 c.directive.dtype = NVME_DIR_IDENTIFY; 637 c.directive.tdtype = NVME_DIR_STREAMS; 638 c.directive.endir = enable ? NVME_DIR_ENDIR : 0; 639 640 return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0); 641 } 642 643 static int nvme_disable_streams(struct nvme_ctrl *ctrl) 644 { 645 return nvme_toggle_streams(ctrl, false); 646 } 647 648 static int nvme_enable_streams(struct nvme_ctrl *ctrl) 649 { 650 return nvme_toggle_streams(ctrl, true); 651 } 652 653 static int nvme_get_stream_params(struct nvme_ctrl *ctrl, 654 struct streams_directive_params *s, u32 nsid) 655 { 656 struct nvme_command c; 657 658 memset(&c, 0, sizeof(c)); 659 memset(s, 0, sizeof(*s)); 660 661 c.directive.opcode = nvme_admin_directive_recv; 662 c.directive.nsid = cpu_to_le32(nsid); 663 c.directive.numd = cpu_to_le32(nvme_bytes_to_numd(sizeof(*s))); 664 c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM; 665 c.directive.dtype = NVME_DIR_STREAMS; 666 667 return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s)); 668 } 669 670 static int nvme_configure_directives(struct nvme_ctrl *ctrl) 671 { 672 struct streams_directive_params s; 673 int ret; 674 675 if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES)) 676 return 0; 677 if (!streams) 678 return 0; 679 680 ret = nvme_enable_streams(ctrl); 681 if (ret) 682 return ret; 683 684 ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL); 685 if (ret) 686 goto out_disable_stream; 687 688 ctrl->nssa = le16_to_cpu(s.nssa); 689 if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) { 690 dev_info(ctrl->device, "too few streams (%u) available\n", 691 ctrl->nssa); 692 goto out_disable_stream; 693 } 694 695 ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); 696 dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); 697 return 0; 698 699 out_disable_stream: 700 nvme_disable_streams(ctrl); 701 return ret; 702 } 703 704 /* 705 * Check if 'req' has a write hint associated with it. If it does, assign 706 * a valid namespace stream to the write. 
707 */ 708 static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, 709 struct request *req, u16 *control, 710 u32 *dsmgmt) 711 { 712 enum rw_hint streamid = req->write_hint; 713 714 if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE) 715 streamid = 0; 716 else { 717 streamid--; 718 if (WARN_ON_ONCE(streamid > ctrl->nr_streams)) 719 return; 720 721 *control |= NVME_RW_DTYPE_STREAMS; 722 *dsmgmt |= streamid << 16; 723 } 724 725 if (streamid < ARRAY_SIZE(req->q->write_hints)) 726 req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; 727 } 728 729 static void nvme_setup_passthrough(struct request *req, 730 struct nvme_command *cmd) 731 { 732 memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); 733 /* passthru commands should let the driver set the SGL flags */ 734 cmd->common.flags &= ~NVME_CMD_SGL_ALL; 735 } 736 737 static inline void nvme_setup_flush(struct nvme_ns *ns, 738 struct nvme_command *cmnd) 739 { 740 cmnd->common.opcode = nvme_cmd_flush; 741 cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); 742 } 743 744 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, 745 struct nvme_command *cmnd) 746 { 747 unsigned short segments = blk_rq_nr_discard_segments(req), n = 0; 748 struct nvme_dsm_range *range; 749 struct bio *bio; 750 751 /* 752 * Some devices do not consider the DSM 'Number of Ranges' field when 753 * determining how much data to DMA. Always allocate memory for maximum 754 * number of segments to prevent device reading beyond end of buffer. 755 */ 756 static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES; 757 758 range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN); 759 if (!range) { 760 /* 761 * If we fail allocation our range, fallback to the controller 762 * discard page. If that's also busy, it's safe to return 763 * busy, as we know we can make progress once that's freed. 
764 */ 765 if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy)) 766 return BLK_STS_RESOURCE; 767 768 range = page_address(ns->ctrl->discard_page); 769 } 770 771 __rq_for_each_bio(bio, req) { 772 u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector); 773 u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; 774 775 if (n < segments) { 776 range[n].cattr = cpu_to_le32(0); 777 range[n].nlb = cpu_to_le32(nlb); 778 range[n].slba = cpu_to_le64(slba); 779 } 780 n++; 781 } 782 783 if (WARN_ON_ONCE(n != segments)) { 784 if (virt_to_page(range) == ns->ctrl->discard_page) 785 clear_bit_unlock(0, &ns->ctrl->discard_page_busy); 786 else 787 kfree(range); 788 return BLK_STS_IOERR; 789 } 790 791 cmnd->dsm.opcode = nvme_cmd_dsm; 792 cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id); 793 cmnd->dsm.nr = cpu_to_le32(segments - 1); 794 cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); 795 796 req->special_vec.bv_page = virt_to_page(range); 797 req->special_vec.bv_offset = offset_in_page(range); 798 req->special_vec.bv_len = alloc_size; 799 req->rq_flags |= RQF_SPECIAL_PAYLOAD; 800 801 return BLK_STS_OK; 802 } 803 804 static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, 805 struct request *req, struct nvme_command *cmnd) 806 { 807 if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) 808 return nvme_setup_discard(ns, req, cmnd); 809 810 cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes; 811 cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id); 812 cmnd->write_zeroes.slba = 813 cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); 814 cmnd->write_zeroes.length = 815 cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); 816 cmnd->write_zeroes.control = 0; 817 return BLK_STS_OK; 818 } 819 820 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, 821 struct request *req, struct nvme_command *cmnd, 822 enum nvme_opcode op) 823 { 824 struct nvme_ctrl *ctrl = ns->ctrl; 825 u16 control = 0; 826 u32 dsmgmt = 0; 827 828 if (req->cmd_flags & REQ_FUA) 829 control |= NVME_RW_FUA; 830 if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) 831 control |= NVME_RW_LR; 832 833 if (req->cmd_flags & REQ_RAHEAD) 834 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; 835 836 cmnd->rw.opcode = op; 837 cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); 838 cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); 839 cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); 840 841 if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams) 842 nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); 843 844 if (ns->ms) { 845 /* 846 * If formated with metadata, the block layer always provides a 847 * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else 848 * we enable the PRACT bit for protection information or set the 849 * namespace capacity to zero to prevent any I/O. 
850 */ 851 if (!blk_integrity_rq(req)) { 852 if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) 853 return BLK_STS_NOTSUPP; 854 control |= NVME_RW_PRINFO_PRACT; 855 } 856 857 switch (ns->pi_type) { 858 case NVME_NS_DPS_PI_TYPE3: 859 control |= NVME_RW_PRINFO_PRCHK_GUARD; 860 break; 861 case NVME_NS_DPS_PI_TYPE1: 862 case NVME_NS_DPS_PI_TYPE2: 863 control |= NVME_RW_PRINFO_PRCHK_GUARD | 864 NVME_RW_PRINFO_PRCHK_REF; 865 if (op == nvme_cmd_zone_append) 866 control |= NVME_RW_APPEND_PIREMAP; 867 cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req)); 868 break; 869 } 870 } 871 872 cmnd->rw.control = cpu_to_le16(control); 873 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); 874 return 0; 875 } 876 877 void nvme_cleanup_cmd(struct request *req) 878 { 879 if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { 880 struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; 881 struct page *page = req->special_vec.bv_page; 882 883 if (page == ctrl->discard_page) 884 clear_bit_unlock(0, &ctrl->discard_page_busy); 885 else 886 kfree(page_address(page) + req->special_vec.bv_offset); 887 } 888 } 889 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); 890 891 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, 892 struct nvme_command *cmd) 893 { 894 blk_status_t ret = BLK_STS_OK; 895 896 nvme_clear_nvme_request(req); 897 898 memset(cmd, 0, sizeof(*cmd)); 899 switch (req_op(req)) { 900 case REQ_OP_DRV_IN: 901 case REQ_OP_DRV_OUT: 902 nvme_setup_passthrough(req, cmd); 903 break; 904 case REQ_OP_FLUSH: 905 nvme_setup_flush(ns, cmd); 906 break; 907 case REQ_OP_ZONE_RESET_ALL: 908 case REQ_OP_ZONE_RESET: 909 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET); 910 break; 911 case REQ_OP_ZONE_OPEN: 912 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN); 913 break; 914 case REQ_OP_ZONE_CLOSE: 915 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE); 916 break; 917 case REQ_OP_ZONE_FINISH: 918 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH); 919 break; 920 case REQ_OP_WRITE_ZEROES: 921 ret = nvme_setup_write_zeroes(ns, req, cmd); 922 break; 923 case REQ_OP_DISCARD: 924 ret = nvme_setup_discard(ns, req, cmd); 925 break; 926 case REQ_OP_READ: 927 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read); 928 break; 929 case REQ_OP_WRITE: 930 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write); 931 break; 932 case REQ_OP_ZONE_APPEND: 933 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append); 934 break; 935 default: 936 WARN_ON_ONCE(1); 937 return BLK_STS_IOERR; 938 } 939 940 cmd->common.command_id = req->tag; 941 trace_nvme_setup_cmd(req, cmd); 942 return ret; 943 } 944 EXPORT_SYMBOL_GPL(nvme_setup_cmd); 945 946 static void nvme_end_sync_rq(struct request *rq, blk_status_t error) 947 { 948 struct completion *waiting = rq->end_io_data; 949 950 rq->end_io_data = NULL; 951 complete(waiting); 952 } 953 954 static void nvme_execute_rq_polled(struct request_queue *q, 955 struct gendisk *bd_disk, struct request *rq, int at_head) 956 { 957 DECLARE_COMPLETION_ONSTACK(wait); 958 959 WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)); 960 961 rq->cmd_flags |= REQ_HIPRI; 962 rq->end_io_data = &wait; 963 blk_execute_rq_nowait(bd_disk, rq, at_head, nvme_end_sync_rq); 964 965 while (!completion_done(&wait)) { 966 blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true); 967 cond_resched(); 968 } 969 } 970 971 /* 972 * Returns 0 on success. 
If the result is negative, it's a Linux error code; 973 * if the result is positive, it's an NVM Express status code 974 */ 975 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 976 union nvme_result *result, void *buffer, unsigned bufflen, 977 unsigned timeout, int qid, int at_head, 978 blk_mq_req_flags_t flags, bool poll) 979 { 980 struct request *req; 981 int ret; 982 983 if (qid == NVME_QID_ANY) 984 req = nvme_alloc_request(q, cmd, flags); 985 else 986 req = nvme_alloc_request_qid(q, cmd, flags, qid); 987 if (IS_ERR(req)) 988 return PTR_ERR(req); 989 990 if (timeout) 991 req->timeout = timeout; 992 993 if (buffer && bufflen) { 994 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); 995 if (ret) 996 goto out; 997 } 998 999 if (poll) 1000 nvme_execute_rq_polled(req->q, NULL, req, at_head); 1001 else 1002 blk_execute_rq(NULL, req, at_head); 1003 if (result) 1004 *result = nvme_req(req)->result; 1005 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) 1006 ret = -EINTR; 1007 else 1008 ret = nvme_req(req)->status; 1009 out: 1010 blk_mq_free_request(req); 1011 return ret; 1012 } 1013 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd); 1014 1015 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 1016 void *buffer, unsigned bufflen) 1017 { 1018 return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, 1019 NVME_QID_ANY, 0, 0, false); 1020 } 1021 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); 1022 1023 static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, 1024 unsigned len, u32 seed, bool write) 1025 { 1026 struct bio_integrity_payload *bip; 1027 int ret = -ENOMEM; 1028 void *buf; 1029 1030 buf = kmalloc(len, GFP_KERNEL); 1031 if (!buf) 1032 goto out; 1033 1034 ret = -EFAULT; 1035 if (write && copy_from_user(buf, ubuf, len)) 1036 goto out_free_meta; 1037 1038 bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); 1039 if (IS_ERR(bip)) { 1040 ret = PTR_ERR(bip); 1041 goto out_free_meta; 1042 } 1043 1044 bip->bip_iter.bi_size = len; 1045 bip->bip_iter.bi_sector = seed; 1046 ret = bio_integrity_add_page(bio, virt_to_page(buf), len, 1047 offset_in_page(buf)); 1048 if (ret == len) 1049 return buf; 1050 ret = -ENOMEM; 1051 out_free_meta: 1052 kfree(buf); 1053 out: 1054 return ERR_PTR(ret); 1055 } 1056 1057 static u32 nvme_known_admin_effects(u8 opcode) 1058 { 1059 switch (opcode) { 1060 case nvme_admin_format_nvm: 1061 return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC | 1062 NVME_CMD_EFFECTS_CSE_MASK; 1063 case nvme_admin_sanitize_nvm: 1064 return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK; 1065 default: 1066 break; 1067 } 1068 return 0; 1069 } 1070 1071 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) 1072 { 1073 u32 effects = 0; 1074 1075 if (ns) { 1076 if (ns->head->effects) 1077 effects = le32_to_cpu(ns->head->effects->iocs[opcode]); 1078 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) 1079 dev_warn(ctrl->device, 1080 "IO command:%02x has unhandled effects:%08x\n", 1081 opcode, effects); 1082 return 0; 1083 } 1084 1085 if (ctrl->effects) 1086 effects = le32_to_cpu(ctrl->effects->acs[opcode]); 1087 effects |= nvme_known_admin_effects(opcode); 1088 1089 return effects; 1090 } 1091 EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU); 1092 1093 static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 1094 u8 opcode) 1095 { 1096 u32 effects = nvme_command_effects(ctrl, ns, opcode); 1097 1098 /* 1099 * For simplicity, IO to all namespaces is quiesced even if 
the command 1100 * effects say only one namespace is affected. 1101 */ 1102 if (effects & NVME_CMD_EFFECTS_CSE_MASK) { 1103 mutex_lock(&ctrl->scan_lock); 1104 mutex_lock(&ctrl->subsys->lock); 1105 nvme_mpath_start_freeze(ctrl->subsys); 1106 nvme_mpath_wait_freeze(ctrl->subsys); 1107 nvme_start_freeze(ctrl); 1108 nvme_wait_freeze(ctrl); 1109 } 1110 return effects; 1111 } 1112 1113 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) 1114 { 1115 if (effects & NVME_CMD_EFFECTS_CSE_MASK) { 1116 nvme_unfreeze(ctrl); 1117 nvme_mpath_unfreeze(ctrl->subsys); 1118 mutex_unlock(&ctrl->subsys->lock); 1119 nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); 1120 mutex_unlock(&ctrl->scan_lock); 1121 } 1122 if (effects & NVME_CMD_EFFECTS_CCC) 1123 nvme_init_identify(ctrl); 1124 if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { 1125 nvme_queue_scan(ctrl); 1126 flush_work(&ctrl->scan_work); 1127 } 1128 } 1129 1130 void nvme_execute_passthru_rq(struct request *rq) 1131 { 1132 struct nvme_command *cmd = nvme_req(rq)->cmd; 1133 struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; 1134 struct nvme_ns *ns = rq->q->queuedata; 1135 struct gendisk *disk = ns ? ns->disk : NULL; 1136 u32 effects; 1137 1138 effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); 1139 blk_execute_rq(disk, rq, 0); 1140 nvme_passthru_end(ctrl, effects); 1141 } 1142 EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); 1143 1144 static int nvme_submit_user_cmd(struct request_queue *q, 1145 struct nvme_command *cmd, void __user *ubuffer, 1146 unsigned bufflen, void __user *meta_buffer, unsigned meta_len, 1147 u32 meta_seed, u64 *result, unsigned timeout) 1148 { 1149 bool write = nvme_is_write(cmd); 1150 struct nvme_ns *ns = q->queuedata; 1151 struct block_device *bdev = ns ? 
ns->disk->part0 : NULL; 1152 struct request *req; 1153 struct bio *bio = NULL; 1154 void *meta = NULL; 1155 int ret; 1156 1157 req = nvme_alloc_request(q, cmd, 0); 1158 if (IS_ERR(req)) 1159 return PTR_ERR(req); 1160 1161 if (timeout) 1162 req->timeout = timeout; 1163 nvme_req(req)->flags |= NVME_REQ_USERCMD; 1164 1165 if (ubuffer && bufflen) { 1166 ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, 1167 GFP_KERNEL); 1168 if (ret) 1169 goto out; 1170 bio = req->bio; 1171 if (bdev) 1172 bio_set_dev(bio, bdev); 1173 if (bdev && meta_buffer && meta_len) { 1174 meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, 1175 meta_seed, write); 1176 if (IS_ERR(meta)) { 1177 ret = PTR_ERR(meta); 1178 goto out_unmap; 1179 } 1180 req->cmd_flags |= REQ_INTEGRITY; 1181 } 1182 } 1183 1184 nvme_execute_passthru_rq(req); 1185 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) 1186 ret = -EINTR; 1187 else 1188 ret = nvme_req(req)->status; 1189 if (result) 1190 *result = le64_to_cpu(nvme_req(req)->result.u64); 1191 if (meta && !ret && !write) { 1192 if (copy_to_user(meta_buffer, meta, meta_len)) 1193 ret = -EFAULT; 1194 } 1195 kfree(meta); 1196 out_unmap: 1197 if (bio) 1198 blk_rq_unmap_user(bio); 1199 out: 1200 blk_mq_free_request(req); 1201 return ret; 1202 } 1203 1204 static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) 1205 { 1206 struct nvme_ctrl *ctrl = rq->end_io_data; 1207 unsigned long flags; 1208 bool startka = false; 1209 1210 blk_mq_free_request(rq); 1211 1212 if (status) { 1213 dev_err(ctrl->device, 1214 "failed nvme_keep_alive_end_io error=%d\n", 1215 status); 1216 return; 1217 } 1218 1219 ctrl->comp_seen = false; 1220 spin_lock_irqsave(&ctrl->lock, flags); 1221 if (ctrl->state == NVME_CTRL_LIVE || 1222 ctrl->state == NVME_CTRL_CONNECTING) 1223 startka = true; 1224 spin_unlock_irqrestore(&ctrl->lock, flags); 1225 if (startka) 1226 queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); 1227 } 1228 1229 static void nvme_keep_alive_work(struct work_struct *work) 1230 { 1231 struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), 1232 struct nvme_ctrl, ka_work); 1233 bool comp_seen = ctrl->comp_seen; 1234 struct request *rq; 1235 1236 if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) { 1237 dev_dbg(ctrl->device, 1238 "reschedule traffic based keep-alive timer\n"); 1239 ctrl->comp_seen = false; 1240 queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); 1241 return; 1242 } 1243 1244 rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, 1245 BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); 1246 if (IS_ERR(rq)) { 1247 /* allocation failure, reset the controller */ 1248 dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq)); 1249 nvme_reset_ctrl(ctrl); 1250 return; 1251 } 1252 1253 rq->timeout = ctrl->kato * HZ; 1254 rq->end_io_data = ctrl; 1255 blk_execute_rq_nowait(NULL, rq, 0, nvme_keep_alive_end_io); 1256 } 1257 1258 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) 1259 { 1260 if (unlikely(ctrl->kato == 0)) 1261 return; 1262 1263 queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); 1264 } 1265 1266 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) 1267 { 1268 if (unlikely(ctrl->kato == 0)) 1269 return; 1270 1271 cancel_delayed_work_sync(&ctrl->ka_work); 1272 } 1273 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive); 1274 1275 /* 1276 * In NVMe 1.0 the CNS field was just a binary controller or namespace 1277 * flag, thus sending any new CNS opcodes has a big chance of not working. 
1278 * Qemu unfortunately had that bug after reporting a 1.1 version compliance 1279 * (but not for any later version). 1280 */ 1281 static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl) 1282 { 1283 if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS) 1284 return ctrl->vs < NVME_VS(1, 2, 0); 1285 return ctrl->vs < NVME_VS(1, 1, 0); 1286 } 1287 1288 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) 1289 { 1290 struct nvme_command c = { }; 1291 int error; 1292 1293 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ 1294 c.identify.opcode = nvme_admin_identify; 1295 c.identify.cns = NVME_ID_CNS_CTRL; 1296 1297 *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); 1298 if (!*id) 1299 return -ENOMEM; 1300 1301 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, 1302 sizeof(struct nvme_id_ctrl)); 1303 if (error) 1304 kfree(*id); 1305 return error; 1306 } 1307 1308 static bool nvme_multi_css(struct nvme_ctrl *ctrl) 1309 { 1310 return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; 1311 } 1312 1313 static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, 1314 struct nvme_ns_id_desc *cur, bool *csi_seen) 1315 { 1316 const char *warn_str = "ctrl returned bogus length:"; 1317 void *data = cur; 1318 1319 switch (cur->nidt) { 1320 case NVME_NIDT_EUI64: 1321 if (cur->nidl != NVME_NIDT_EUI64_LEN) { 1322 dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n", 1323 warn_str, cur->nidl); 1324 return -1; 1325 } 1326 memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN); 1327 return NVME_NIDT_EUI64_LEN; 1328 case NVME_NIDT_NGUID: 1329 if (cur->nidl != NVME_NIDT_NGUID_LEN) { 1330 dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n", 1331 warn_str, cur->nidl); 1332 return -1; 1333 } 1334 memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN); 1335 return NVME_NIDT_NGUID_LEN; 1336 case NVME_NIDT_UUID: 1337 if (cur->nidl != NVME_NIDT_UUID_LEN) { 1338 dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n", 1339 warn_str, cur->nidl); 1340 return -1; 1341 } 1342 uuid_copy(&ids->uuid, data + sizeof(*cur)); 1343 return NVME_NIDT_UUID_LEN; 1344 case NVME_NIDT_CSI: 1345 if (cur->nidl != NVME_NIDT_CSI_LEN) { 1346 dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n", 1347 warn_str, cur->nidl); 1348 return -1; 1349 } 1350 memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN); 1351 *csi_seen = true; 1352 return NVME_NIDT_CSI_LEN; 1353 default: 1354 /* Skip unknown types */ 1355 return cur->nidl; 1356 } 1357 } 1358 1359 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, 1360 struct nvme_ns_ids *ids) 1361 { 1362 struct nvme_command c = { }; 1363 bool csi_seen = false; 1364 int status, pos, len; 1365 void *data; 1366 1367 if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl)) 1368 return 0; 1369 if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST) 1370 return 0; 1371 1372 c.identify.opcode = nvme_admin_identify; 1373 c.identify.nsid = cpu_to_le32(nsid); 1374 c.identify.cns = NVME_ID_CNS_NS_DESC_LIST; 1375 1376 data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); 1377 if (!data) 1378 return -ENOMEM; 1379 1380 status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data, 1381 NVME_IDENTIFY_DATA_SIZE); 1382 if (status) { 1383 dev_warn(ctrl->device, 1384 "Identify Descriptors failed (nsid=%u, status=0x%x)\n", 1385 nsid, status); 1386 goto free_data; 1387 } 1388 1389 for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) { 1390 struct nvme_ns_id_desc *cur = data + pos; 1391 1392 if (cur->nidl == 0) 1393 break; 1394 1395 len = 
nvme_process_ns_desc(ctrl, ids, cur, &csi_seen); 1396 if (len < 0) 1397 break; 1398 1399 len += sizeof(*cur); 1400 } 1401 1402 if (nvme_multi_css(ctrl) && !csi_seen) { 1403 dev_warn(ctrl->device, "Command set not reported for nsid:%d\n", 1404 nsid); 1405 status = -EINVAL; 1406 } 1407 1408 free_data: 1409 kfree(data); 1410 return status; 1411 } 1412 1413 static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, 1414 struct nvme_ns_ids *ids, struct nvme_id_ns **id) 1415 { 1416 struct nvme_command c = { }; 1417 int error; 1418 1419 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ 1420 c.identify.opcode = nvme_admin_identify; 1421 c.identify.nsid = cpu_to_le32(nsid); 1422 c.identify.cns = NVME_ID_CNS_NS; 1423 1424 *id = kmalloc(sizeof(**id), GFP_KERNEL); 1425 if (!*id) 1426 return -ENOMEM; 1427 1428 error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id)); 1429 if (error) { 1430 dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); 1431 goto out_free_id; 1432 } 1433 1434 error = NVME_SC_INVALID_NS | NVME_SC_DNR; 1435 if ((*id)->ncap == 0) /* namespace not allocated or attached */ 1436 goto out_free_id; 1437 1438 if (ctrl->vs >= NVME_VS(1, 1, 0) && 1439 !memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) 1440 memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64)); 1441 if (ctrl->vs >= NVME_VS(1, 2, 0) && 1442 !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) 1443 memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid)); 1444 1445 return 0; 1446 1447 out_free_id: 1448 kfree(*id); 1449 return error; 1450 } 1451 1452 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, 1453 unsigned int dword11, void *buffer, size_t buflen, u32 *result) 1454 { 1455 union nvme_result res = { 0 }; 1456 struct nvme_command c; 1457 int ret; 1458 1459 memset(&c, 0, sizeof(c)); 1460 c.features.opcode = op; 1461 c.features.fid = cpu_to_le32(fid); 1462 c.features.dword11 = cpu_to_le32(dword11); 1463 1464 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, 1465 buffer, buflen, 0, NVME_QID_ANY, 0, 0, false); 1466 if (ret >= 0 && result) 1467 *result = le32_to_cpu(res.u32); 1468 return ret; 1469 } 1470 1471 int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid, 1472 unsigned int dword11, void *buffer, size_t buflen, 1473 u32 *result) 1474 { 1475 return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer, 1476 buflen, result); 1477 } 1478 EXPORT_SYMBOL_GPL(nvme_set_features); 1479 1480 int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, 1481 unsigned int dword11, void *buffer, size_t buflen, 1482 u32 *result) 1483 { 1484 return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer, 1485 buflen, result); 1486 } 1487 EXPORT_SYMBOL_GPL(nvme_get_features); 1488 1489 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) 1490 { 1491 u32 q_count = (*count - 1) | ((*count - 1) << 16); 1492 u32 result; 1493 int status, nr_io_queues; 1494 1495 status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0, 1496 &result); 1497 if (status < 0) 1498 return status; 1499 1500 /* 1501 * Degraded controllers might return an error when setting the queue 1502 * count. We still want to be able to bring them online and offer 1503 * access to the admin queue, as that might be only way to fix them up. 
1504 */ 1505 if (status > 0) { 1506 dev_err(ctrl->device, "Could not set queue count (%d)\n", status); 1507 *count = 0; 1508 } else { 1509 nr_io_queues = min(result & 0xffff, result >> 16) + 1; 1510 *count = min(*count, nr_io_queues); 1511 } 1512 1513 return 0; 1514 } 1515 EXPORT_SYMBOL_GPL(nvme_set_queue_count); 1516 1517 #define NVME_AEN_SUPPORTED \ 1518 (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \ 1519 NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE) 1520 1521 static void nvme_enable_aen(struct nvme_ctrl *ctrl) 1522 { 1523 u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED; 1524 int status; 1525 1526 if (!supported_aens) 1527 return; 1528 1529 status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens, 1530 NULL, 0, &result); 1531 if (status) 1532 dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n", 1533 supported_aens); 1534 1535 queue_work(nvme_wq, &ctrl->async_event_work); 1536 } 1537 1538 /* 1539 * Convert integer values from ioctl structures to user pointers, silently 1540 * ignoring the upper bits in the compat case to match behaviour of 32-bit 1541 * kernels. 1542 */ 1543 static void __user *nvme_to_user_ptr(uintptr_t ptrval) 1544 { 1545 if (in_compat_syscall()) 1546 ptrval = (compat_uptr_t)ptrval; 1547 return (void __user *)ptrval; 1548 } 1549 1550 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) 1551 { 1552 struct nvme_user_io io; 1553 struct nvme_command c; 1554 unsigned length, meta_len; 1555 void __user *metadata; 1556 1557 if (copy_from_user(&io, uio, sizeof(io))) 1558 return -EFAULT; 1559 if (io.flags) 1560 return -EINVAL; 1561 1562 switch (io.opcode) { 1563 case nvme_cmd_write: 1564 case nvme_cmd_read: 1565 case nvme_cmd_compare: 1566 break; 1567 default: 1568 return -EINVAL; 1569 } 1570 1571 length = (io.nblocks + 1) << ns->lba_shift; 1572 1573 if ((io.control & NVME_RW_PRINFO_PRACT) && 1574 ns->ms == sizeof(struct t10_pi_tuple)) { 1575 /* 1576 * Protection information is stripped/inserted by the 1577 * controller. 
1578 */ 1579 if (nvme_to_user_ptr(io.metadata)) 1580 return -EINVAL; 1581 meta_len = 0; 1582 metadata = NULL; 1583 } else { 1584 meta_len = (io.nblocks + 1) * ns->ms; 1585 metadata = nvme_to_user_ptr(io.metadata); 1586 } 1587 1588 if (ns->features & NVME_NS_EXT_LBAS) { 1589 length += meta_len; 1590 meta_len = 0; 1591 } else if (meta_len) { 1592 if ((io.metadata & 3) || !io.metadata) 1593 return -EINVAL; 1594 } 1595 1596 memset(&c, 0, sizeof(c)); 1597 c.rw.opcode = io.opcode; 1598 c.rw.flags = io.flags; 1599 c.rw.nsid = cpu_to_le32(ns->head->ns_id); 1600 c.rw.slba = cpu_to_le64(io.slba); 1601 c.rw.length = cpu_to_le16(io.nblocks); 1602 c.rw.control = cpu_to_le16(io.control); 1603 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); 1604 c.rw.reftag = cpu_to_le32(io.reftag); 1605 c.rw.apptag = cpu_to_le16(io.apptag); 1606 c.rw.appmask = cpu_to_le16(io.appmask); 1607 1608 return nvme_submit_user_cmd(ns->queue, &c, 1609 nvme_to_user_ptr(io.addr), length, 1610 metadata, meta_len, lower_32_bits(io.slba), NULL, 0); 1611 } 1612 1613 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 1614 struct nvme_passthru_cmd __user *ucmd) 1615 { 1616 struct nvme_passthru_cmd cmd; 1617 struct nvme_command c; 1618 unsigned timeout = 0; 1619 u64 result; 1620 int status; 1621 1622 if (!capable(CAP_SYS_ADMIN)) 1623 return -EACCES; 1624 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 1625 return -EFAULT; 1626 if (cmd.flags) 1627 return -EINVAL; 1628 1629 memset(&c, 0, sizeof(c)); 1630 c.common.opcode = cmd.opcode; 1631 c.common.flags = cmd.flags; 1632 c.common.nsid = cpu_to_le32(cmd.nsid); 1633 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 1634 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 1635 c.common.cdw10 = cpu_to_le32(cmd.cdw10); 1636 c.common.cdw11 = cpu_to_le32(cmd.cdw11); 1637 c.common.cdw12 = cpu_to_le32(cmd.cdw12); 1638 c.common.cdw13 = cpu_to_le32(cmd.cdw13); 1639 c.common.cdw14 = cpu_to_le32(cmd.cdw14); 1640 c.common.cdw15 = cpu_to_le32(cmd.cdw15); 1641 1642 if (cmd.timeout_ms) 1643 timeout = msecs_to_jiffies(cmd.timeout_ms); 1644 1645 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, 1646 nvme_to_user_ptr(cmd.addr), cmd.data_len, 1647 nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 1648 0, &result, timeout); 1649 1650 if (status >= 0) { 1651 if (put_user(result, &ucmd->result)) 1652 return -EFAULT; 1653 } 1654 1655 return status; 1656 } 1657 1658 static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 1659 struct nvme_passthru_cmd64 __user *ucmd) 1660 { 1661 struct nvme_passthru_cmd64 cmd; 1662 struct nvme_command c; 1663 unsigned timeout = 0; 1664 int status; 1665 1666 if (!capable(CAP_SYS_ADMIN)) 1667 return -EACCES; 1668 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 1669 return -EFAULT; 1670 if (cmd.flags) 1671 return -EINVAL; 1672 1673 memset(&c, 0, sizeof(c)); 1674 c.common.opcode = cmd.opcode; 1675 c.common.flags = cmd.flags; 1676 c.common.nsid = cpu_to_le32(cmd.nsid); 1677 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 1678 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 1679 c.common.cdw10 = cpu_to_le32(cmd.cdw10); 1680 c.common.cdw11 = cpu_to_le32(cmd.cdw11); 1681 c.common.cdw12 = cpu_to_le32(cmd.cdw12); 1682 c.common.cdw13 = cpu_to_le32(cmd.cdw13); 1683 c.common.cdw14 = cpu_to_le32(cmd.cdw14); 1684 c.common.cdw15 = cpu_to_le32(cmd.cdw15); 1685 1686 if (cmd.timeout_ms) 1687 timeout = msecs_to_jiffies(cmd.timeout_ms); 1688 1689 status = nvme_submit_user_cmd(ns ? 
ns->queue : ctrl->admin_q, &c, 1690 nvme_to_user_ptr(cmd.addr), cmd.data_len, 1691 nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 1692 0, &cmd.result, timeout); 1693 1694 if (status >= 0) { 1695 if (put_user(cmd.result, &ucmd->result)) 1696 return -EFAULT; 1697 } 1698 1699 return status; 1700 } 1701 1702 /* 1703 * Issue ioctl requests on the first available path. Note that unlike normal 1704 * block layer requests we will not retry failed request on another controller. 1705 */ 1706 struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, 1707 struct nvme_ns_head **head, int *srcu_idx) 1708 { 1709 #ifdef CONFIG_NVME_MULTIPATH 1710 if (disk->fops == &nvme_ns_head_ops) { 1711 struct nvme_ns *ns; 1712 1713 *head = disk->private_data; 1714 *srcu_idx = srcu_read_lock(&(*head)->srcu); 1715 ns = nvme_find_path(*head); 1716 if (!ns) 1717 srcu_read_unlock(&(*head)->srcu, *srcu_idx); 1718 return ns; 1719 } 1720 #endif 1721 *head = NULL; 1722 *srcu_idx = -1; 1723 return disk->private_data; 1724 } 1725 1726 void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) 1727 { 1728 if (head) 1729 srcu_read_unlock(&head->srcu, idx); 1730 } 1731 1732 static bool is_ctrl_ioctl(unsigned int cmd) 1733 { 1734 if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) 1735 return true; 1736 if (is_sed_ioctl(cmd)) 1737 return true; 1738 return false; 1739 } 1740 1741 static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, 1742 void __user *argp, 1743 struct nvme_ns_head *head, 1744 int srcu_idx) 1745 { 1746 struct nvme_ctrl *ctrl = ns->ctrl; 1747 int ret; 1748 1749 nvme_get_ctrl(ns->ctrl); 1750 nvme_put_ns_from_disk(head, srcu_idx); 1751 1752 switch (cmd) { 1753 case NVME_IOCTL_ADMIN_CMD: 1754 ret = nvme_user_cmd(ctrl, NULL, argp); 1755 break; 1756 case NVME_IOCTL_ADMIN64_CMD: 1757 ret = nvme_user_cmd64(ctrl, NULL, argp); 1758 break; 1759 default: 1760 ret = sed_ioctl(ctrl->opal_dev, cmd, argp); 1761 break; 1762 } 1763 nvme_put_ctrl(ctrl); 1764 return ret; 1765 } 1766 1767 static int nvme_ioctl(struct block_device *bdev, fmode_t mode, 1768 unsigned int cmd, unsigned long arg) 1769 { 1770 struct nvme_ns_head *head = NULL; 1771 void __user *argp = (void __user *)arg; 1772 struct nvme_ns *ns; 1773 int srcu_idx, ret; 1774 1775 ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); 1776 if (unlikely(!ns)) 1777 return -EWOULDBLOCK; 1778 1779 /* 1780 * Handle ioctls that apply to the controller instead of the namespace 1781 * seperately and drop the ns SRCU reference early. This avoids a 1782 * deadlock when deleting namespaces using the passthrough interface. 
1783 */ 1784 if (is_ctrl_ioctl(cmd)) 1785 return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); 1786 1787 switch (cmd) { 1788 case NVME_IOCTL_ID: 1789 force_successful_syscall_return(); 1790 ret = ns->head->ns_id; 1791 break; 1792 case NVME_IOCTL_IO_CMD: 1793 ret = nvme_user_cmd(ns->ctrl, ns, argp); 1794 break; 1795 case NVME_IOCTL_SUBMIT_IO: 1796 ret = nvme_submit_io(ns, argp); 1797 break; 1798 case NVME_IOCTL_IO64_CMD: 1799 ret = nvme_user_cmd64(ns->ctrl, ns, argp); 1800 break; 1801 default: 1802 if (ns->ndev) 1803 ret = nvme_nvm_ioctl(ns, cmd, arg); 1804 else 1805 ret = -ENOTTY; 1806 } 1807 1808 nvme_put_ns_from_disk(head, srcu_idx); 1809 return ret; 1810 } 1811 1812 #ifdef CONFIG_COMPAT 1813 struct nvme_user_io32 { 1814 __u8 opcode; 1815 __u8 flags; 1816 __u16 control; 1817 __u16 nblocks; 1818 __u16 rsvd; 1819 __u64 metadata; 1820 __u64 addr; 1821 __u64 slba; 1822 __u32 dsmgmt; 1823 __u32 reftag; 1824 __u16 apptag; 1825 __u16 appmask; 1826 } __attribute__((__packed__)); 1827 1828 #define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) 1829 1830 static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, 1831 unsigned int cmd, unsigned long arg) 1832 { 1833 /* 1834 * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO 1835 * between 32 bit programs and 64 bit kernel. 1836 * The cause is that the results of sizeof(struct nvme_user_io), 1837 * which is used to define NVME_IOCTL_SUBMIT_IO, 1838 * are not same between 32 bit compiler and 64 bit compiler. 1839 * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling 1840 * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs. 1841 * Other IOCTL numbers are same between 32 bit and 64 bit. 1842 * So there is nothing to do regarding to other IOCTL numbers. 1843 */ 1844 if (cmd == NVME_IOCTL_SUBMIT_IO32) 1845 return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg); 1846 1847 return nvme_ioctl(bdev, mode, cmd, arg); 1848 } 1849 #else 1850 #define nvme_compat_ioctl NULL 1851 #endif /* CONFIG_COMPAT */ 1852 1853 static int nvme_open(struct block_device *bdev, fmode_t mode) 1854 { 1855 struct nvme_ns *ns = bdev->bd_disk->private_data; 1856 1857 #ifdef CONFIG_NVME_MULTIPATH 1858 /* should never be called due to GENHD_FL_HIDDEN */ 1859 if (WARN_ON_ONCE(ns->head->disk)) 1860 goto fail; 1861 #endif 1862 if (!kref_get_unless_zero(&ns->kref)) 1863 goto fail; 1864 if (!try_module_get(ns->ctrl->ops->module)) 1865 goto fail_put_ns; 1866 1867 return 0; 1868 1869 fail_put_ns: 1870 nvme_put_ns(ns); 1871 fail: 1872 return -ENXIO; 1873 } 1874 1875 static void nvme_release(struct gendisk *disk, fmode_t mode) 1876 { 1877 struct nvme_ns *ns = disk->private_data; 1878 1879 module_put(ns->ctrl->ops->module); 1880 nvme_put_ns(ns); 1881 } 1882 1883 static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) 1884 { 1885 /* some standard values */ 1886 geo->heads = 1 << 6; 1887 geo->sectors = 1 << 5; 1888 geo->cylinders = get_capacity(bdev->bd_disk) >> 11; 1889 return 0; 1890 } 1891 1892 #ifdef CONFIG_BLK_DEV_INTEGRITY 1893 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type, 1894 u32 max_integrity_segments) 1895 { 1896 struct blk_integrity integrity; 1897 1898 memset(&integrity, 0, sizeof(integrity)); 1899 switch (pi_type) { 1900 case NVME_NS_DPS_PI_TYPE3: 1901 integrity.profile = &t10_pi_type3_crc; 1902 integrity.tag_size = sizeof(u16) + sizeof(u32); 1903 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; 1904 break; 1905 case NVME_NS_DPS_PI_TYPE1: 1906 case NVME_NS_DPS_PI_TYPE2: 1907 integrity.profile = 
&t10_pi_type1_crc; 1908 integrity.tag_size = sizeof(u16); 1909 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; 1910 break; 1911 default: 1912 integrity.profile = NULL; 1913 break; 1914 } 1915 integrity.tuple_size = ms; 1916 blk_integrity_register(disk, &integrity); 1917 blk_queue_max_integrity_segments(disk->queue, max_integrity_segments); 1918 } 1919 #else 1920 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type, 1921 u32 max_integrity_segments) 1922 { 1923 } 1924 #endif /* CONFIG_BLK_DEV_INTEGRITY */ 1925 1926 static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) 1927 { 1928 struct nvme_ctrl *ctrl = ns->ctrl; 1929 struct request_queue *queue = disk->queue; 1930 u32 size = queue_logical_block_size(queue); 1931 1932 if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) { 1933 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue); 1934 return; 1935 } 1936 1937 if (ctrl->nr_streams && ns->sws && ns->sgs) 1938 size *= ns->sws * ns->sgs; 1939 1940 BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < 1941 NVME_DSM_MAX_RANGES); 1942 1943 queue->limits.discard_alignment = 0; 1944 queue->limits.discard_granularity = size; 1945 1946 /* If discard is already enabled, don't reset queue limits */ 1947 if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue)) 1948 return; 1949 1950 blk_queue_max_discard_sectors(queue, UINT_MAX); 1951 blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); 1952 1953 if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) 1954 blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); 1955 } 1956 1957 /* 1958 * Even though NVMe spec explicitly states that MDTS is not applicable to the 1959 * write-zeroes, we are cautious and limit the size to the controllers 1960 * max_hw_sectors value, which is based on the MDTS field and possibly other 1961 * limiting factors. 1962 */ 1963 static void nvme_config_write_zeroes(struct request_queue *q, 1964 struct nvme_ctrl *ctrl) 1965 { 1966 if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) && 1967 !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) 1968 blk_queue_max_write_zeroes_sectors(q, ctrl->max_hw_sectors); 1969 } 1970 1971 static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) 1972 { 1973 return !uuid_is_null(&ids->uuid) || 1974 memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) || 1975 memchr_inv(ids->eui64, 0, sizeof(ids->eui64)); 1976 } 1977 1978 static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) 1979 { 1980 return uuid_equal(&a->uuid, &b->uuid) && 1981 memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 && 1982 memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 && 1983 a->csi == b->csi; 1984 } 1985 1986 static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 1987 u32 *phys_bs, u32 *io_opt) 1988 { 1989 struct streams_directive_params s; 1990 int ret; 1991 1992 if (!ctrl->nr_streams) 1993 return 0; 1994 1995 ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id); 1996 if (ret) 1997 return ret; 1998 1999 ns->sws = le32_to_cpu(s.sws); 2000 ns->sgs = le16_to_cpu(s.sgs); 2001 2002 if (ns->sws) { 2003 *phys_bs = ns->sws * (1 << ns->lba_shift); 2004 if (ns->sgs) 2005 *io_opt = *phys_bs * ns->sgs; 2006 } 2007 2008 return 0; 2009 } 2010 2011 static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) 2012 { 2013 struct nvme_ctrl *ctrl = ns->ctrl; 2014 2015 /* 2016 * The PI implementation requires the metadata size to be equal to the 2017 * t10 pi tuple size. 
2018 */ 2019 ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); 2020 if (ns->ms == sizeof(struct t10_pi_tuple)) 2021 ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; 2022 else 2023 ns->pi_type = 0; 2024 2025 ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); 2026 if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) 2027 return 0; 2028 if (ctrl->ops->flags & NVME_F_FABRICS) { 2029 /* 2030 * The NVMe over Fabrics specification only supports metadata as 2031 * part of the extended data LBA. We rely on HCA/HBA support to 2032 * remap the separate metadata buffer from the block layer. 2033 */ 2034 if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT))) 2035 return -EINVAL; 2036 if (ctrl->max_integrity_segments) 2037 ns->features |= 2038 (NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); 2039 } else { 2040 /* 2041 * For PCIe controllers, we can't easily remap the separate 2042 * metadata buffer from the block layer and thus require a 2043 * separate metadata buffer for block layer metadata/PI support. 2044 * We allow extended LBAs for the passthrough interface, though. 2045 */ 2046 if (id->flbas & NVME_NS_FLBAS_META_EXT) 2047 ns->features |= NVME_NS_EXT_LBAS; 2048 else 2049 ns->features |= NVME_NS_METADATA_SUPPORTED; 2050 } 2051 2052 return 0; 2053 } 2054 2055 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, 2056 struct request_queue *q) 2057 { 2058 bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT; 2059 2060 if (ctrl->max_hw_sectors) { 2061 u32 max_segments = 2062 (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1; 2063 2064 max_segments = min_not_zero(max_segments, ctrl->max_segments); 2065 blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); 2066 blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); 2067 } 2068 blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); 2069 blk_queue_dma_alignment(q, 7); 2070 blk_queue_write_cache(q, vwc, vwc); 2071 } 2072 2073 static void nvme_update_disk_info(struct gendisk *disk, 2074 struct nvme_ns *ns, struct nvme_id_ns *id) 2075 { 2076 sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze)); 2077 unsigned short bs = 1 << ns->lba_shift; 2078 u32 atomic_bs, phys_bs, io_opt = 0; 2079 2080 /* 2081 * The block layer can't support LBA sizes larger than the page size 2082 * yet, so catch this early and don't allow block I/O. 2083 */ 2084 if (ns->lba_shift > PAGE_SHIFT) { 2085 capacity = 0; 2086 bs = (1 << 9); 2087 } 2088 2089 blk_integrity_unregister(disk); 2090 2091 atomic_bs = phys_bs = bs; 2092 nvme_setup_streams_ns(ns->ctrl, ns, &phys_bs, &io_opt); 2093 if (id->nabo == 0) { 2094 /* 2095 * Bit 1 indicates whether NAWUPF is defined for this namespace 2096 * and whether it should be used instead of AWUPF. If NAWUPF == 2097 * 0 then AWUPF must be used instead. 2098 */ 2099 if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) 2100 atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; 2101 else 2102 atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; 2103 } 2104 2105 if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { 2106 /* NPWG = Namespace Preferred Write Granularity */ 2107 phys_bs = bs * (1 + le16_to_cpu(id->npwg)); 2108 /* NOWS = Namespace Optimal Write Size */ 2109 io_opt = bs * (1 + le16_to_cpu(id->nows)); 2110 } 2111 2112 blk_queue_logical_block_size(disk->queue, bs); 2113 /* 2114 * Linux filesystems assume writing a single physical block is 2115 * an atomic operation. Hence limit the physical block size to the 2116 * value of the Atomic Write Unit Power Fail parameter. 
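 * (Worked example, assuming a namespace with 4KiB logical blocks whose
 * Identify data reports NAWUPF = 7: atomic_bs = (1 + 7) * 4KiB = 32KiB, so
 * the physical block size set below becomes min(phys_bs, 32KiB).)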
2117 */ 2118 blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs)); 2119 blk_queue_io_min(disk->queue, phys_bs); 2120 blk_queue_io_opt(disk->queue, io_opt); 2121 2122 /* 2123 * Register a metadata profile for PI, or the plain non-integrity NVMe 2124 * metadata masquerading as Type 0 if supported, otherwise reject block 2125 * I/O to namespaces with metadata except when the namespace supports 2126 * PI, as it can strip/insert in that case. 2127 */ 2128 if (ns->ms) { 2129 if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && 2130 (ns->features & NVME_NS_METADATA_SUPPORTED)) 2131 nvme_init_integrity(disk, ns->ms, ns->pi_type, 2132 ns->ctrl->max_integrity_segments); 2133 else if (!nvme_ns_has_pi(ns)) 2134 capacity = 0; 2135 } 2136 2137 set_capacity_and_notify(disk, capacity); 2138 2139 nvme_config_discard(disk, ns); 2140 nvme_config_write_zeroes(disk->queue, ns->ctrl); 2141 2142 set_disk_ro(disk, (id->nsattr & NVME_NS_ATTR_RO) || 2143 test_bit(NVME_NS_FORCE_RO, &ns->flags)); 2144 } 2145 2146 static inline bool nvme_first_scan(struct gendisk *disk) 2147 { 2148 /* nvme_alloc_ns() scans the disk prior to adding it */ 2149 return !(disk->flags & GENHD_FL_UP); 2150 } 2151 2152 static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) 2153 { 2154 struct nvme_ctrl *ctrl = ns->ctrl; 2155 u32 iob; 2156 2157 if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && 2158 is_power_of_2(ctrl->max_hw_sectors)) 2159 iob = ctrl->max_hw_sectors; 2160 else 2161 iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); 2162 2163 if (!iob) 2164 return; 2165 2166 if (!is_power_of_2(iob)) { 2167 if (nvme_first_scan(ns->disk)) 2168 pr_warn("%s: ignoring unaligned IO boundary:%u\n", 2169 ns->disk->disk_name, iob); 2170 return; 2171 } 2172 2173 if (blk_queue_is_zoned(ns->disk->queue)) { 2174 if (nvme_first_scan(ns->disk)) 2175 pr_warn("%s: ignoring zoned namespace IO boundary\n", 2176 ns->disk->disk_name); 2177 return; 2178 } 2179 2180 blk_queue_chunk_sectors(ns->queue, iob); 2181 } 2182 2183 static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) 2184 { 2185 unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; 2186 int ret; 2187 2188 blk_mq_freeze_queue(ns->disk->queue); 2189 ns->lba_shift = id->lbaf[lbaf].ds; 2190 nvme_set_queue_limits(ns->ctrl, ns->queue); 2191 2192 ret = nvme_configure_metadata(ns, id); 2193 if (ret) 2194 goto out_unfreeze; 2195 nvme_set_chunk_sectors(ns, id); 2196 nvme_update_disk_info(ns->disk, ns, id); 2197 2198 if (ns->head->ids.csi == NVME_CSI_ZNS) { 2199 ret = nvme_update_zone_info(ns, lbaf); 2200 if (ret) 2201 goto out_unfreeze; 2202 } 2203 2204 blk_mq_unfreeze_queue(ns->disk->queue); 2205 2206 if (blk_queue_is_zoned(ns->queue)) { 2207 ret = nvme_revalidate_zones(ns); 2208 if (ret && !nvme_first_scan(ns->disk)) 2209 return ret; 2210 } 2211 2212 #ifdef CONFIG_NVME_MULTIPATH 2213 if (ns->head->disk) { 2214 blk_mq_freeze_queue(ns->head->disk->queue); 2215 nvme_update_disk_info(ns->head->disk, ns, id); 2216 blk_stack_limits(&ns->head->disk->queue->limits, 2217 &ns->queue->limits, 0); 2218 blk_queue_update_readahead(ns->head->disk->queue); 2219 blk_mq_unfreeze_queue(ns->head->disk->queue); 2220 } 2221 #endif 2222 return 0; 2223 2224 out_unfreeze: 2225 blk_mq_unfreeze_queue(ns->disk->queue); 2226 return ret; 2227 } 2228 2229 static char nvme_pr_type(enum pr_type type) 2230 { 2231 switch (type) { 2232 case PR_WRITE_EXCLUSIVE: 2233 return 1; 2234 case PR_EXCLUSIVE_ACCESS: 2235 return 2; 2236 case PR_WRITE_EXCLUSIVE_REG_ONLY: 2237 return 3; 2238 case 
PR_EXCLUSIVE_ACCESS_REG_ONLY: 2239 return 4; 2240 case PR_WRITE_EXCLUSIVE_ALL_REGS: 2241 return 5; 2242 case PR_EXCLUSIVE_ACCESS_ALL_REGS: 2243 return 6; 2244 default: 2245 return 0; 2246 } 2247 }; 2248 2249 static int nvme_pr_command(struct block_device *bdev, u32 cdw10, 2250 u64 key, u64 sa_key, u8 op) 2251 { 2252 struct nvme_ns_head *head = NULL; 2253 struct nvme_ns *ns; 2254 struct nvme_command c; 2255 int srcu_idx, ret; 2256 u8 data[16] = { 0, }; 2257 2258 ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); 2259 if (unlikely(!ns)) 2260 return -EWOULDBLOCK; 2261 2262 put_unaligned_le64(key, &data[0]); 2263 put_unaligned_le64(sa_key, &data[8]); 2264 2265 memset(&c, 0, sizeof(c)); 2266 c.common.opcode = op; 2267 c.common.nsid = cpu_to_le32(ns->head->ns_id); 2268 c.common.cdw10 = cpu_to_le32(cdw10); 2269 2270 ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16); 2271 nvme_put_ns_from_disk(head, srcu_idx); 2272 return ret; 2273 } 2274 2275 static int nvme_pr_register(struct block_device *bdev, u64 old, 2276 u64 new, unsigned flags) 2277 { 2278 u32 cdw10; 2279 2280 if (flags & ~PR_FL_IGNORE_KEY) 2281 return -EOPNOTSUPP; 2282 2283 cdw10 = old ? 2 : 0; 2284 cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; 2285 cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ 2286 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); 2287 } 2288 2289 static int nvme_pr_reserve(struct block_device *bdev, u64 key, 2290 enum pr_type type, unsigned flags) 2291 { 2292 u32 cdw10; 2293 2294 if (flags & ~PR_FL_IGNORE_KEY) 2295 return -EOPNOTSUPP; 2296 2297 cdw10 = nvme_pr_type(type) << 8; 2298 cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); 2299 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); 2300 } 2301 2302 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, 2303 enum pr_type type, bool abort) 2304 { 2305 u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1); 2306 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); 2307 } 2308 2309 static int nvme_pr_clear(struct block_device *bdev, u64 key) 2310 { 2311 u32 cdw10 = 1 | (key ? 1 << 3 : 0); 2312 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); 2313 } 2314 2315 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) 2316 { 2317 u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 
1 << 3 : 0); 2318 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); 2319 } 2320 2321 static const struct pr_ops nvme_pr_ops = { 2322 .pr_register = nvme_pr_register, 2323 .pr_reserve = nvme_pr_reserve, 2324 .pr_release = nvme_pr_release, 2325 .pr_preempt = nvme_pr_preempt, 2326 .pr_clear = nvme_pr_clear, 2327 }; 2328 2329 #ifdef CONFIG_BLK_SED_OPAL 2330 int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, 2331 bool send) 2332 { 2333 struct nvme_ctrl *ctrl = data; 2334 struct nvme_command cmd; 2335 2336 memset(&cmd, 0, sizeof(cmd)); 2337 if (send) 2338 cmd.common.opcode = nvme_admin_security_send; 2339 else 2340 cmd.common.opcode = nvme_admin_security_recv; 2341 cmd.common.nsid = 0; 2342 cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); 2343 cmd.common.cdw11 = cpu_to_le32(len); 2344 2345 return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0, 2346 NVME_QID_ANY, 1, 0, false); 2347 } 2348 EXPORT_SYMBOL_GPL(nvme_sec_submit); 2349 #endif /* CONFIG_BLK_SED_OPAL */ 2350 2351 static const struct block_device_operations nvme_bdev_ops = { 2352 .owner = THIS_MODULE, 2353 .ioctl = nvme_ioctl, 2354 .compat_ioctl = nvme_compat_ioctl, 2355 .open = nvme_open, 2356 .release = nvme_release, 2357 .getgeo = nvme_getgeo, 2358 .report_zones = nvme_report_zones, 2359 .pr_ops = &nvme_pr_ops, 2360 }; 2361 2362 #ifdef CONFIG_NVME_MULTIPATH 2363 static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) 2364 { 2365 struct nvme_ns_head *head = bdev->bd_disk->private_data; 2366 2367 if (!kref_get_unless_zero(&head->ref)) 2368 return -ENXIO; 2369 return 0; 2370 } 2371 2372 static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) 2373 { 2374 nvme_put_ns_head(disk->private_data); 2375 } 2376 2377 const struct block_device_operations nvme_ns_head_ops = { 2378 .owner = THIS_MODULE, 2379 .submit_bio = nvme_ns_head_submit_bio, 2380 .open = nvme_ns_head_open, 2381 .release = nvme_ns_head_release, 2382 .ioctl = nvme_ioctl, 2383 .compat_ioctl = nvme_compat_ioctl, 2384 .getgeo = nvme_getgeo, 2385 .report_zones = nvme_report_zones, 2386 .pr_ops = &nvme_pr_ops, 2387 }; 2388 #endif /* CONFIG_NVME_MULTIPATH */ 2389 2390 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) 2391 { 2392 unsigned long timeout = 2393 ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; 2394 u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; 2395 int ret; 2396 2397 while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { 2398 if (csts == ~0) 2399 return -ENODEV; 2400 if ((csts & NVME_CSTS_RDY) == bit) 2401 break; 2402 2403 usleep_range(1000, 2000); 2404 if (fatal_signal_pending(current)) 2405 return -EINTR; 2406 if (time_after(jiffies, timeout)) { 2407 dev_err(ctrl->device, 2408 "Device not ready; aborting %s, CSTS=0x%x\n", 2409 enabled ? "initialisation" : "reset", csts); 2410 return -ENODEV; 2411 } 2412 } 2413 2414 return ret; 2415 } 2416 2417 /* 2418 * If the device has been passed off to us in an enabled state, just clear 2419 * the enabled bit. The spec says we should set the 'shutdown notification 2420 * bits', but doing so may cause the device to complete commands to the 2421 * admin queue ... and we don't know what memory that might be pointing at! 
2422 */ 2423 int nvme_disable_ctrl(struct nvme_ctrl *ctrl) 2424 { 2425 int ret; 2426 2427 ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; 2428 ctrl->ctrl_config &= ~NVME_CC_ENABLE; 2429 2430 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); 2431 if (ret) 2432 return ret; 2433 2434 if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) 2435 msleep(NVME_QUIRK_DELAY_AMOUNT); 2436 2437 return nvme_wait_ready(ctrl, ctrl->cap, false); 2438 } 2439 EXPORT_SYMBOL_GPL(nvme_disable_ctrl); 2440 2441 int nvme_enable_ctrl(struct nvme_ctrl *ctrl) 2442 { 2443 unsigned dev_page_min; 2444 int ret; 2445 2446 ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); 2447 if (ret) { 2448 dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); 2449 return ret; 2450 } 2451 dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12; 2452 2453 if (NVME_CTRL_PAGE_SHIFT < dev_page_min) { 2454 dev_err(ctrl->device, 2455 "Minimum device page size %u too large for host (%u)\n", 2456 1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT); 2457 return -ENODEV; 2458 } 2459 2460 if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI) 2461 ctrl->ctrl_config = NVME_CC_CSS_CSI; 2462 else 2463 ctrl->ctrl_config = NVME_CC_CSS_NVM; 2464 ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; 2465 ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; 2466 ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; 2467 ctrl->ctrl_config |= NVME_CC_ENABLE; 2468 2469 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); 2470 if (ret) 2471 return ret; 2472 return nvme_wait_ready(ctrl, ctrl->cap, true); 2473 } 2474 EXPORT_SYMBOL_GPL(nvme_enable_ctrl); 2475 2476 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) 2477 { 2478 unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ); 2479 u32 csts; 2480 int ret; 2481 2482 ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; 2483 ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; 2484 2485 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); 2486 if (ret) 2487 return ret; 2488 2489 while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { 2490 if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) 2491 break; 2492 2493 msleep(100); 2494 if (fatal_signal_pending(current)) 2495 return -EINTR; 2496 if (time_after(jiffies, timeout)) { 2497 dev_err(ctrl->device, 2498 "Device shutdown incomplete; abort shutdown\n"); 2499 return -ENODEV; 2500 } 2501 } 2502 2503 return ret; 2504 } 2505 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); 2506 2507 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) 2508 { 2509 __le64 ts; 2510 int ret; 2511 2512 if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP)) 2513 return 0; 2514 2515 ts = cpu_to_le64(ktime_to_ms(ktime_get_real())); 2516 ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts), 2517 NULL); 2518 if (ret) 2519 dev_warn_once(ctrl->device, 2520 "could not set timestamp (%d)\n", ret); 2521 return ret; 2522 } 2523 2524 static int nvme_configure_acre(struct nvme_ctrl *ctrl) 2525 { 2526 struct nvme_feat_host_behavior *host; 2527 int ret; 2528 2529 /* Don't bother enabling the feature if retry delay is not reported */ 2530 if (!ctrl->crdt[0]) 2531 return 0; 2532 2533 host = kzalloc(sizeof(*host), GFP_KERNEL); 2534 if (!host) 2535 return 0; 2536 2537 host->acre = NVME_ENABLE_ACRE; 2538 ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0, 2539 host, sizeof(*host), NULL); 2540 kfree(host); 2541 return ret; 2542 } 2543 2544 static int nvme_configure_apst(struct nvme_ctrl *ctrl) 2545 { 2546 /* 2547 * APST (Autonomous Power State 
Transition) lets us program a 2548 * table of power state transitions that the controller will 2549 * perform automatically. We configure it with a simple 2550 * heuristic: we are willing to spend at most 2% of the time 2551 * transitioning between power states. Therefore, when running 2552 * in any given state, we will enter the next lower-power 2553 * non-operational state after waiting 50 * (enlat + exlat) 2554 * microseconds, as long as that state's exit latency is under 2555 * the requested maximum latency. 2556 * 2557 * We will not autonomously enter any non-operational state for 2558 * which the total latency exceeds ps_max_latency_us. Users 2559 * can set ps_max_latency_us to zero to turn off APST. 2560 */ 2561 2562 unsigned apste; 2563 struct nvme_feat_auto_pst *table; 2564 u64 max_lat_us = 0; 2565 int max_ps = -1; 2566 int ret; 2567 2568 /* 2569 * If APST isn't supported or if we haven't been initialized yet, 2570 * then don't do anything. 2571 */ 2572 if (!ctrl->apsta) 2573 return 0; 2574 2575 if (ctrl->npss > 31) { 2576 dev_warn(ctrl->device, "NPSS is invalid; not using APST\n"); 2577 return 0; 2578 } 2579 2580 table = kzalloc(sizeof(*table), GFP_KERNEL); 2581 if (!table) 2582 return 0; 2583 2584 if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) { 2585 /* Turn off APST. */ 2586 apste = 0; 2587 dev_dbg(ctrl->device, "APST disabled\n"); 2588 } else { 2589 __le64 target = cpu_to_le64(0); 2590 int state; 2591 2592 /* 2593 * Walk through all states from lowest- to highest-power. 2594 * According to the spec, lower-numbered states use more 2595 * power. NPSS, despite the name, is the index of the 2596 * lowest-power state, not the number of states. 2597 */ 2598 for (state = (int)ctrl->npss; state >= 0; state--) { 2599 u64 total_latency_us, exit_latency_us, transition_ms; 2600 2601 if (target) 2602 table->entries[state] = target; 2603 2604 /* 2605 * Don't allow transitions to the deepest state 2606 * if it's quirked off. 2607 */ 2608 if (state == ctrl->npss && 2609 (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) 2610 continue; 2611 2612 /* 2613 * Is this state a useful non-operational state for 2614 * higher-power states to autonomously transition to? 2615 */ 2616 if (!(ctrl->psd[state].flags & 2617 NVME_PS_FLAGS_NON_OP_STATE)) 2618 continue; 2619 2620 exit_latency_us = 2621 (u64)le32_to_cpu(ctrl->psd[state].exit_lat); 2622 if (exit_latency_us > ctrl->ps_max_latency_us) 2623 continue; 2624 2625 total_latency_us = 2626 exit_latency_us + 2627 le32_to_cpu(ctrl->psd[state].entry_lat); 2628 2629 /* 2630 * This state is good. Use it as the APST idle 2631 * target for higher power states. 
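 * (The idle timeout programmed below is total_latency_us / 20, rounded up,
 * expressed in milliseconds; that equals 50 * (enlat + exlat) microseconds
 * rounded up to a whole millisecond, i.e. the 2% transition budget described
 * at the top of this function.)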
2632 */
2633 transition_ms = total_latency_us + 19;
2634 do_div(transition_ms, 20);
2635 if (transition_ms > (1 << 24) - 1)
2636 transition_ms = (1 << 24) - 1;
2637
2638 target = cpu_to_le64((state << 3) |
2639 (transition_ms << 8));
2640
2641 if (max_ps == -1)
2642 max_ps = state;
2643
2644 if (total_latency_us > max_lat_us)
2645 max_lat_us = total_latency_us;
2646 }
2647
2648 apste = 1;
2649
2650 if (max_ps == -1) {
2651 dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
2652 } else {
2653 dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
2654 max_ps, max_lat_us, (int)sizeof(*table), table);
2655 }
2656 }
2657
2658 ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
2659 table, sizeof(*table), NULL);
2660 if (ret)
2661 dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
2662
2663 kfree(table);
2664 return ret;
2665 }
2666
2667 static void nvme_set_latency_tolerance(struct device *dev, s32 val)
2668 {
2669 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2670 u64 latency;
2671
2672 switch (val) {
2673 case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
2674 case PM_QOS_LATENCY_ANY:
2675 latency = U64_MAX;
2676 break;
2677
2678 default:
2679 latency = val;
2680 }
2681
2682 if (ctrl->ps_max_latency_us != latency) {
2683 ctrl->ps_max_latency_us = latency;
2684 nvme_configure_apst(ctrl);
2685 }
2686 }
2687
2688 struct nvme_core_quirk_entry {
2689 /*
2690 * NVMe model and firmware strings are padded with spaces. For
2691 * simplicity, strings in the quirk table are padded with NULLs
2692 * instead.
2693 */
2694 u16 vid;
2695 const char *mn;
2696 const char *fr;
2697 unsigned long quirks;
2698 };
2699
2700 static const struct nvme_core_quirk_entry core_quirks[] = {
2701 {
2702 /*
2703 * This Toshiba device seems to die using any APST states. See:
2704 * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
2705 */
2706 .vid = 0x1179,
2707 .mn = "THNSF5256GPUK TOSHIBA",
2708 .quirks = NVME_QUIRK_NO_APST,
2709 },
2710 {
2711 /*
2712 * This LiteON CL1-3D*-Q11 firmware version has a race
2713 * condition associated with actions related to suspend to idle.
2714 * LiteON has resolved the problem in future firmware.
2715 */
2716 .vid = 0x14a4,
2717 .fr = "22301111",
2718 .quirks = NVME_QUIRK_SIMPLE_SUSPEND,
2719 }
2720 };
2721
2722 /* match is null-terminated but idstr is space-padded.
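 * For instance, a quirk with .mn = "THNSF5256GPUK TOSHIBA" matches an
 * Identify model field holding that string followed only by trailing spaces.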
*/ 2723 static bool string_matches(const char *idstr, const char *match, size_t len) 2724 { 2725 size_t matchlen; 2726 2727 if (!match) 2728 return true; 2729 2730 matchlen = strlen(match); 2731 WARN_ON_ONCE(matchlen > len); 2732 2733 if (memcmp(idstr, match, matchlen)) 2734 return false; 2735 2736 for (; matchlen < len; matchlen++) 2737 if (idstr[matchlen] != ' ') 2738 return false; 2739 2740 return true; 2741 } 2742 2743 static bool quirk_matches(const struct nvme_id_ctrl *id, 2744 const struct nvme_core_quirk_entry *q) 2745 { 2746 return q->vid == le16_to_cpu(id->vid) && 2747 string_matches(id->mn, q->mn, sizeof(id->mn)) && 2748 string_matches(id->fr, q->fr, sizeof(id->fr)); 2749 } 2750 2751 static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl, 2752 struct nvme_id_ctrl *id) 2753 { 2754 size_t nqnlen; 2755 int off; 2756 2757 if(!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) { 2758 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); 2759 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { 2760 strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); 2761 return; 2762 } 2763 2764 if (ctrl->vs >= NVME_VS(1, 2, 1)) 2765 dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); 2766 } 2767 2768 /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ 2769 off = snprintf(subsys->subnqn, NVMF_NQN_SIZE, 2770 "nqn.2014.08.org.nvmexpress:%04x%04x", 2771 le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); 2772 memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn)); 2773 off += sizeof(id->sn); 2774 memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn)); 2775 off += sizeof(id->mn); 2776 memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off); 2777 } 2778 2779 static void nvme_release_subsystem(struct device *dev) 2780 { 2781 struct nvme_subsystem *subsys = 2782 container_of(dev, struct nvme_subsystem, dev); 2783 2784 if (subsys->instance >= 0) 2785 ida_simple_remove(&nvme_instance_ida, subsys->instance); 2786 kfree(subsys); 2787 } 2788 2789 static void nvme_destroy_subsystem(struct kref *ref) 2790 { 2791 struct nvme_subsystem *subsys = 2792 container_of(ref, struct nvme_subsystem, ref); 2793 2794 mutex_lock(&nvme_subsystems_lock); 2795 list_del(&subsys->entry); 2796 mutex_unlock(&nvme_subsystems_lock); 2797 2798 ida_destroy(&subsys->ns_ida); 2799 device_del(&subsys->dev); 2800 put_device(&subsys->dev); 2801 } 2802 2803 static void nvme_put_subsystem(struct nvme_subsystem *subsys) 2804 { 2805 kref_put(&subsys->ref, nvme_destroy_subsystem); 2806 } 2807 2808 static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) 2809 { 2810 struct nvme_subsystem *subsys; 2811 2812 lockdep_assert_held(&nvme_subsystems_lock); 2813 2814 /* 2815 * Fail matches for discovery subsystems. This results 2816 * in each discovery controller bound to a unique subsystem. 2817 * This avoids issues with validating controller values 2818 * that can only be true when there is a single unique subsystem. 2819 * There may be multiple and completely independent entities 2820 * that provide discovery controllers. 
2821 */ 2822 if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME)) 2823 return NULL; 2824 2825 list_for_each_entry(subsys, &nvme_subsystems, entry) { 2826 if (strcmp(subsys->subnqn, subsysnqn)) 2827 continue; 2828 if (!kref_get_unless_zero(&subsys->ref)) 2829 continue; 2830 return subsys; 2831 } 2832 2833 return NULL; 2834 } 2835 2836 #define SUBSYS_ATTR_RO(_name, _mode, _show) \ 2837 struct device_attribute subsys_attr_##_name = \ 2838 __ATTR(_name, _mode, _show, NULL) 2839 2840 static ssize_t nvme_subsys_show_nqn(struct device *dev, 2841 struct device_attribute *attr, 2842 char *buf) 2843 { 2844 struct nvme_subsystem *subsys = 2845 container_of(dev, struct nvme_subsystem, dev); 2846 2847 return sysfs_emit(buf, "%s\n", subsys->subnqn); 2848 } 2849 static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn); 2850 2851 #define nvme_subsys_show_str_function(field) \ 2852 static ssize_t subsys_##field##_show(struct device *dev, \ 2853 struct device_attribute *attr, char *buf) \ 2854 { \ 2855 struct nvme_subsystem *subsys = \ 2856 container_of(dev, struct nvme_subsystem, dev); \ 2857 return sprintf(buf, "%.*s\n", \ 2858 (int)sizeof(subsys->field), subsys->field); \ 2859 } \ 2860 static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); 2861 2862 nvme_subsys_show_str_function(model); 2863 nvme_subsys_show_str_function(serial); 2864 nvme_subsys_show_str_function(firmware_rev); 2865 2866 static struct attribute *nvme_subsys_attrs[] = { 2867 &subsys_attr_model.attr, 2868 &subsys_attr_serial.attr, 2869 &subsys_attr_firmware_rev.attr, 2870 &subsys_attr_subsysnqn.attr, 2871 #ifdef CONFIG_NVME_MULTIPATH 2872 &subsys_attr_iopolicy.attr, 2873 #endif 2874 NULL, 2875 }; 2876 2877 static const struct attribute_group nvme_subsys_attrs_group = { 2878 .attrs = nvme_subsys_attrs, 2879 }; 2880 2881 static const struct attribute_group *nvme_subsys_attrs_groups[] = { 2882 &nvme_subsys_attrs_group, 2883 NULL, 2884 }; 2885 2886 static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl) 2887 { 2888 return ctrl->opts && ctrl->opts->discovery_nqn; 2889 } 2890 2891 static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, 2892 struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) 2893 { 2894 struct nvme_ctrl *tmp; 2895 2896 lockdep_assert_held(&nvme_subsystems_lock); 2897 2898 list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) { 2899 if (nvme_state_terminal(tmp)) 2900 continue; 2901 2902 if (tmp->cntlid == ctrl->cntlid) { 2903 dev_err(ctrl->device, 2904 "Duplicate cntlid %u with %s, rejecting\n", 2905 ctrl->cntlid, dev_name(tmp->device)); 2906 return false; 2907 } 2908 2909 if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || 2910 nvme_discovery_ctrl(ctrl)) 2911 continue; 2912 2913 dev_err(ctrl->device, 2914 "Subsystem does not support multiple controllers\n"); 2915 return false; 2916 } 2917 2918 return true; 2919 } 2920 2921 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) 2922 { 2923 struct nvme_subsystem *subsys, *found; 2924 int ret; 2925 2926 subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); 2927 if (!subsys) 2928 return -ENOMEM; 2929 2930 subsys->instance = -1; 2931 mutex_init(&subsys->lock); 2932 kref_init(&subsys->ref); 2933 INIT_LIST_HEAD(&subsys->ctrls); 2934 INIT_LIST_HEAD(&subsys->nsheads); 2935 nvme_init_subnqn(subsys, ctrl, id); 2936 memcpy(subsys->serial, id->sn, sizeof(subsys->serial)); 2937 memcpy(subsys->model, id->mn, sizeof(subsys->model)); 2938 memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); 2939 subsys->vendor_id = le16_to_cpu(id->vid); 2940 
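/*
 * The identity fields copied above (serial, model, firmware revision and
 * vendor ID) are what the nvme-subsysN sysfs attributes report, and what
 * wwid_show() falls back to for namespaces without a UUID, NGUID or EUI-64.
 */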
subsys->cmic = id->cmic; 2941 subsys->awupf = le16_to_cpu(id->awupf); 2942 #ifdef CONFIG_NVME_MULTIPATH 2943 subsys->iopolicy = NVME_IOPOLICY_NUMA; 2944 #endif 2945 2946 subsys->dev.class = nvme_subsys_class; 2947 subsys->dev.release = nvme_release_subsystem; 2948 subsys->dev.groups = nvme_subsys_attrs_groups; 2949 dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance); 2950 device_initialize(&subsys->dev); 2951 2952 mutex_lock(&nvme_subsystems_lock); 2953 found = __nvme_find_get_subsystem(subsys->subnqn); 2954 if (found) { 2955 put_device(&subsys->dev); 2956 subsys = found; 2957 2958 if (!nvme_validate_cntlid(subsys, ctrl, id)) { 2959 ret = -EINVAL; 2960 goto out_put_subsystem; 2961 } 2962 } else { 2963 ret = device_add(&subsys->dev); 2964 if (ret) { 2965 dev_err(ctrl->device, 2966 "failed to register subsystem device.\n"); 2967 put_device(&subsys->dev); 2968 goto out_unlock; 2969 } 2970 ida_init(&subsys->ns_ida); 2971 list_add_tail(&subsys->entry, &nvme_subsystems); 2972 } 2973 2974 ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, 2975 dev_name(ctrl->device)); 2976 if (ret) { 2977 dev_err(ctrl->device, 2978 "failed to create sysfs link from subsystem.\n"); 2979 goto out_put_subsystem; 2980 } 2981 2982 if (!found) 2983 subsys->instance = ctrl->instance; 2984 ctrl->subsys = subsys; 2985 list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); 2986 mutex_unlock(&nvme_subsystems_lock); 2987 return 0; 2988 2989 out_put_subsystem: 2990 nvme_put_subsystem(subsys); 2991 out_unlock: 2992 mutex_unlock(&nvme_subsystems_lock); 2993 return ret; 2994 } 2995 2996 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, 2997 void *log, size_t size, u64 offset) 2998 { 2999 struct nvme_command c = { }; 3000 u32 dwlen = nvme_bytes_to_numd(size); 3001 3002 c.get_log_page.opcode = nvme_admin_get_log_page; 3003 c.get_log_page.nsid = cpu_to_le32(nsid); 3004 c.get_log_page.lid = log_page; 3005 c.get_log_page.lsp = lsp; 3006 c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1)); 3007 c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); 3008 c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); 3009 c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset)); 3010 c.get_log_page.csi = csi; 3011 3012 return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); 3013 } 3014 3015 static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, 3016 struct nvme_effects_log **log) 3017 { 3018 struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi); 3019 int ret; 3020 3021 if (cel) 3022 goto out; 3023 3024 cel = kzalloc(sizeof(*cel), GFP_KERNEL); 3025 if (!cel) 3026 return -ENOMEM; 3027 3028 ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi, 3029 cel, sizeof(*cel), 0); 3030 if (ret) { 3031 kfree(cel); 3032 return ret; 3033 } 3034 3035 xa_store(&ctrl->cels, csi, cel, GFP_KERNEL); 3036 out: 3037 *log = cel; 3038 return 0; 3039 } 3040 3041 /* 3042 * Initialize the cached copies of the Identify data and various controller 3043 * register in our nvme_ctrl structure. This should be called as soon as 3044 * the admin queue is fully up and running. 
3045 */ 3046 int nvme_init_identify(struct nvme_ctrl *ctrl) 3047 { 3048 struct nvme_id_ctrl *id; 3049 int ret, page_shift; 3050 u32 max_hw_sectors; 3051 bool prev_apst_enabled; 3052 3053 ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); 3054 if (ret) { 3055 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); 3056 return ret; 3057 } 3058 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; 3059 ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); 3060 3061 if (ctrl->vs >= NVME_VS(1, 1, 0)) 3062 ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); 3063 3064 ret = nvme_identify_ctrl(ctrl, &id); 3065 if (ret) { 3066 dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret); 3067 return -EIO; 3068 } 3069 3070 if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { 3071 ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects); 3072 if (ret < 0) 3073 goto out_free; 3074 } 3075 3076 if (!(ctrl->ops->flags & NVME_F_FABRICS)) 3077 ctrl->cntlid = le16_to_cpu(id->cntlid); 3078 3079 if (!ctrl->identified) { 3080 int i; 3081 3082 ret = nvme_init_subsystem(ctrl, id); 3083 if (ret) 3084 goto out_free; 3085 3086 /* 3087 * Check for quirks. Quirk can depend on firmware version, 3088 * so, in principle, the set of quirks present can change 3089 * across a reset. As a possible future enhancement, we 3090 * could re-scan for quirks every time we reinitialize 3091 * the device, but we'd have to make sure that the driver 3092 * behaves intelligently if the quirks change. 3093 */ 3094 for (i = 0; i < ARRAY_SIZE(core_quirks); i++) { 3095 if (quirk_matches(id, &core_quirks[i])) 3096 ctrl->quirks |= core_quirks[i].quirks; 3097 } 3098 } 3099 3100 if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) { 3101 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n"); 3102 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS; 3103 } 3104 3105 ctrl->crdt[0] = le16_to_cpu(id->crdt1); 3106 ctrl->crdt[1] = le16_to_cpu(id->crdt2); 3107 ctrl->crdt[2] = le16_to_cpu(id->crdt3); 3108 3109 ctrl->oacs = le16_to_cpu(id->oacs); 3110 ctrl->oncs = le16_to_cpu(id->oncs); 3111 ctrl->mtfa = le16_to_cpu(id->mtfa); 3112 ctrl->oaes = le32_to_cpu(id->oaes); 3113 ctrl->wctemp = le16_to_cpu(id->wctemp); 3114 ctrl->cctemp = le16_to_cpu(id->cctemp); 3115 3116 atomic_set(&ctrl->abort_limit, id->acl + 1); 3117 ctrl->vwc = id->vwc; 3118 if (id->mdts) 3119 max_hw_sectors = 1 << (id->mdts + page_shift - 9); 3120 else 3121 max_hw_sectors = UINT_MAX; 3122 ctrl->max_hw_sectors = 3123 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors); 3124 3125 nvme_set_queue_limits(ctrl, ctrl->admin_q); 3126 ctrl->sgls = le32_to_cpu(id->sgls); 3127 ctrl->kas = le16_to_cpu(id->kas); 3128 ctrl->max_namespaces = le32_to_cpu(id->mnan); 3129 ctrl->ctratt = le32_to_cpu(id->ctratt); 3130 3131 if (id->rtd3e) { 3132 /* us -> s */ 3133 u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC; 3134 3135 ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time, 3136 shutdown_timeout, 60); 3137 3138 if (ctrl->shutdown_timeout != shutdown_timeout) 3139 dev_info(ctrl->device, 3140 "Shutdown timeout set to %u seconds\n", 3141 ctrl->shutdown_timeout); 3142 } else 3143 ctrl->shutdown_timeout = shutdown_timeout; 3144 3145 ctrl->npss = id->npss; 3146 ctrl->apsta = id->apsta; 3147 prev_apst_enabled = ctrl->apst_enabled; 3148 if (ctrl->quirks & NVME_QUIRK_NO_APST) { 3149 if (force_apst && id->apsta) { 3150 dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n"); 3151 
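/*
 * Setting apst_enabled only marks the feature as usable; nvme_configure_apst()
 * still decides whether and how to program the transition table based on
 * ps_max_latency_us and the reported power state data.
 */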
ctrl->apst_enabled = true; 3152 } else { 3153 ctrl->apst_enabled = false; 3154 } 3155 } else { 3156 ctrl->apst_enabled = id->apsta; 3157 } 3158 memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); 3159 3160 if (ctrl->ops->flags & NVME_F_FABRICS) { 3161 ctrl->icdoff = le16_to_cpu(id->icdoff); 3162 ctrl->ioccsz = le32_to_cpu(id->ioccsz); 3163 ctrl->iorcsz = le32_to_cpu(id->iorcsz); 3164 ctrl->maxcmd = le16_to_cpu(id->maxcmd); 3165 3166 /* 3167 * In fabrics we need to verify the cntlid matches the 3168 * admin connect 3169 */ 3170 if (ctrl->cntlid != le16_to_cpu(id->cntlid)) { 3171 dev_err(ctrl->device, 3172 "Mismatching cntlid: Connect %u vs Identify " 3173 "%u, rejecting\n", 3174 ctrl->cntlid, le16_to_cpu(id->cntlid)); 3175 ret = -EINVAL; 3176 goto out_free; 3177 } 3178 3179 if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) { 3180 dev_err(ctrl->device, 3181 "keep-alive support is mandatory for fabrics\n"); 3182 ret = -EINVAL; 3183 goto out_free; 3184 } 3185 } else { 3186 ctrl->hmpre = le32_to_cpu(id->hmpre); 3187 ctrl->hmmin = le32_to_cpu(id->hmmin); 3188 ctrl->hmminds = le32_to_cpu(id->hmminds); 3189 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); 3190 } 3191 3192 ret = nvme_mpath_init(ctrl, id); 3193 kfree(id); 3194 3195 if (ret < 0) 3196 return ret; 3197 3198 if (ctrl->apst_enabled && !prev_apst_enabled) 3199 dev_pm_qos_expose_latency_tolerance(ctrl->device); 3200 else if (!ctrl->apst_enabled && prev_apst_enabled) 3201 dev_pm_qos_hide_latency_tolerance(ctrl->device); 3202 3203 ret = nvme_configure_apst(ctrl); 3204 if (ret < 0) 3205 return ret; 3206 3207 ret = nvme_configure_timestamp(ctrl); 3208 if (ret < 0) 3209 return ret; 3210 3211 ret = nvme_configure_directives(ctrl); 3212 if (ret < 0) 3213 return ret; 3214 3215 ret = nvme_configure_acre(ctrl); 3216 if (ret < 0) 3217 return ret; 3218 3219 if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) { 3220 ret = nvme_hwmon_init(ctrl); 3221 if (ret < 0) 3222 return ret; 3223 } 3224 3225 ctrl->identified = true; 3226 3227 return 0; 3228 3229 out_free: 3230 kfree(id); 3231 return ret; 3232 } 3233 EXPORT_SYMBOL_GPL(nvme_init_identify); 3234 3235 static int nvme_dev_open(struct inode *inode, struct file *file) 3236 { 3237 struct nvme_ctrl *ctrl = 3238 container_of(inode->i_cdev, struct nvme_ctrl, cdev); 3239 3240 switch (ctrl->state) { 3241 case NVME_CTRL_LIVE: 3242 break; 3243 default: 3244 return -EWOULDBLOCK; 3245 } 3246 3247 nvme_get_ctrl(ctrl); 3248 if (!try_module_get(ctrl->ops->module)) { 3249 nvme_put_ctrl(ctrl); 3250 return -EINVAL; 3251 } 3252 3253 file->private_data = ctrl; 3254 return 0; 3255 } 3256 3257 static int nvme_dev_release(struct inode *inode, struct file *file) 3258 { 3259 struct nvme_ctrl *ctrl = 3260 container_of(inode->i_cdev, struct nvme_ctrl, cdev); 3261 3262 module_put(ctrl->ops->module); 3263 nvme_put_ctrl(ctrl); 3264 return 0; 3265 } 3266 3267 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) 3268 { 3269 struct nvme_ns *ns; 3270 int ret; 3271 3272 down_read(&ctrl->namespaces_rwsem); 3273 if (list_empty(&ctrl->namespaces)) { 3274 ret = -ENOTTY; 3275 goto out_unlock; 3276 } 3277 3278 ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); 3279 if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { 3280 dev_warn(ctrl->device, 3281 "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); 3282 ret = -EINVAL; 3283 goto out_unlock; 3284 } 3285 3286 dev_warn(ctrl->device, 3287 "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); 3288 kref_get(&ns->kref); 3289 
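/*
 * Hold a reference taken under namespaces_rwsem so the namespace cannot be
 * freed once the lock is dropped; it is released via nvme_put_ns() after the
 * passthrough command completes.
 */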
up_read(&ctrl->namespaces_rwsem); 3290 3291 ret = nvme_user_cmd(ctrl, ns, argp); 3292 nvme_put_ns(ns); 3293 return ret; 3294 3295 out_unlock: 3296 up_read(&ctrl->namespaces_rwsem); 3297 return ret; 3298 } 3299 3300 static long nvme_dev_ioctl(struct file *file, unsigned int cmd, 3301 unsigned long arg) 3302 { 3303 struct nvme_ctrl *ctrl = file->private_data; 3304 void __user *argp = (void __user *)arg; 3305 3306 switch (cmd) { 3307 case NVME_IOCTL_ADMIN_CMD: 3308 return nvme_user_cmd(ctrl, NULL, argp); 3309 case NVME_IOCTL_ADMIN64_CMD: 3310 return nvme_user_cmd64(ctrl, NULL, argp); 3311 case NVME_IOCTL_IO_CMD: 3312 return nvme_dev_user_cmd(ctrl, argp); 3313 case NVME_IOCTL_RESET: 3314 dev_warn(ctrl->device, "resetting controller\n"); 3315 return nvme_reset_ctrl_sync(ctrl); 3316 case NVME_IOCTL_SUBSYS_RESET: 3317 return nvme_reset_subsystem(ctrl); 3318 case NVME_IOCTL_RESCAN: 3319 nvme_queue_scan(ctrl); 3320 return 0; 3321 default: 3322 return -ENOTTY; 3323 } 3324 } 3325 3326 static const struct file_operations nvme_dev_fops = { 3327 .owner = THIS_MODULE, 3328 .open = nvme_dev_open, 3329 .release = nvme_dev_release, 3330 .unlocked_ioctl = nvme_dev_ioctl, 3331 .compat_ioctl = compat_ptr_ioctl, 3332 }; 3333 3334 static ssize_t nvme_sysfs_reset(struct device *dev, 3335 struct device_attribute *attr, const char *buf, 3336 size_t count) 3337 { 3338 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3339 int ret; 3340 3341 ret = nvme_reset_ctrl_sync(ctrl); 3342 if (ret < 0) 3343 return ret; 3344 return count; 3345 } 3346 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); 3347 3348 static ssize_t nvme_sysfs_rescan(struct device *dev, 3349 struct device_attribute *attr, const char *buf, 3350 size_t count) 3351 { 3352 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3353 3354 nvme_queue_scan(ctrl); 3355 return count; 3356 } 3357 static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); 3358 3359 static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) 3360 { 3361 struct gendisk *disk = dev_to_disk(dev); 3362 3363 if (disk->fops == &nvme_bdev_ops) 3364 return nvme_get_ns_from_dev(dev)->head; 3365 else 3366 return disk->private_data; 3367 } 3368 3369 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, 3370 char *buf) 3371 { 3372 struct nvme_ns_head *head = dev_to_ns_head(dev); 3373 struct nvme_ns_ids *ids = &head->ids; 3374 struct nvme_subsystem *subsys = head->subsys; 3375 int serial_len = sizeof(subsys->serial); 3376 int model_len = sizeof(subsys->model); 3377 3378 if (!uuid_is_null(&ids->uuid)) 3379 return sprintf(buf, "uuid.%pU\n", &ids->uuid); 3380 3381 if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) 3382 return sprintf(buf, "eui.%16phN\n", ids->nguid); 3383 3384 if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) 3385 return sprintf(buf, "eui.%8phN\n", ids->eui64); 3386 3387 while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || 3388 subsys->serial[serial_len - 1] == '\0')) 3389 serial_len--; 3390 while (model_len > 0 && (subsys->model[model_len - 1] == ' ' || 3391 subsys->model[model_len - 1] == '\0')) 3392 model_len--; 3393 3394 return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, 3395 serial_len, subsys->serial, model_len, subsys->model, 3396 head->ns_id); 3397 } 3398 static DEVICE_ATTR_RO(wwid); 3399 3400 static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, 3401 char *buf) 3402 { 3403 return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); 3404 } 3405 static 
DEVICE_ATTR_RO(nguid); 3406 3407 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, 3408 char *buf) 3409 { 3410 struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; 3411 3412 /* For backward compatibility expose the NGUID to userspace if 3413 * we have no UUID set 3414 */ 3415 if (uuid_is_null(&ids->uuid)) { 3416 printk_ratelimited(KERN_WARNING 3417 "No UUID available providing old NGUID\n"); 3418 return sprintf(buf, "%pU\n", ids->nguid); 3419 } 3420 return sprintf(buf, "%pU\n", &ids->uuid); 3421 } 3422 static DEVICE_ATTR_RO(uuid); 3423 3424 static ssize_t eui_show(struct device *dev, struct device_attribute *attr, 3425 char *buf) 3426 { 3427 return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); 3428 } 3429 static DEVICE_ATTR_RO(eui); 3430 3431 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, 3432 char *buf) 3433 { 3434 return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id); 3435 } 3436 static DEVICE_ATTR_RO(nsid); 3437 3438 static struct attribute *nvme_ns_id_attrs[] = { 3439 &dev_attr_wwid.attr, 3440 &dev_attr_uuid.attr, 3441 &dev_attr_nguid.attr, 3442 &dev_attr_eui.attr, 3443 &dev_attr_nsid.attr, 3444 #ifdef CONFIG_NVME_MULTIPATH 3445 &dev_attr_ana_grpid.attr, 3446 &dev_attr_ana_state.attr, 3447 #endif 3448 NULL, 3449 }; 3450 3451 static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, 3452 struct attribute *a, int n) 3453 { 3454 struct device *dev = container_of(kobj, struct device, kobj); 3455 struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; 3456 3457 if (a == &dev_attr_uuid.attr) { 3458 if (uuid_is_null(&ids->uuid) && 3459 !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) 3460 return 0; 3461 } 3462 if (a == &dev_attr_nguid.attr) { 3463 if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) 3464 return 0; 3465 } 3466 if (a == &dev_attr_eui.attr) { 3467 if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) 3468 return 0; 3469 } 3470 #ifdef CONFIG_NVME_MULTIPATH 3471 if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) { 3472 if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */ 3473 return 0; 3474 if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) 3475 return 0; 3476 } 3477 #endif 3478 return a->mode; 3479 } 3480 3481 static const struct attribute_group nvme_ns_id_attr_group = { 3482 .attrs = nvme_ns_id_attrs, 3483 .is_visible = nvme_ns_id_attrs_are_visible, 3484 }; 3485 3486 const struct attribute_group *nvme_ns_id_attr_groups[] = { 3487 &nvme_ns_id_attr_group, 3488 #ifdef CONFIG_NVM 3489 &nvme_nvm_attr_group, 3490 #endif 3491 NULL, 3492 }; 3493 3494 #define nvme_show_str_function(field) \ 3495 static ssize_t field##_show(struct device *dev, \ 3496 struct device_attribute *attr, char *buf) \ 3497 { \ 3498 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ 3499 return sprintf(buf, "%.*s\n", \ 3500 (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \ 3501 } \ 3502 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); 3503 3504 nvme_show_str_function(model); 3505 nvme_show_str_function(serial); 3506 nvme_show_str_function(firmware_rev); 3507 3508 #define nvme_show_int_function(field) \ 3509 static ssize_t field##_show(struct device *dev, \ 3510 struct device_attribute *attr, char *buf) \ 3511 { \ 3512 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ 3513 return sprintf(buf, "%d\n", ctrl->field); \ 3514 } \ 3515 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); 3516 3517 nvme_show_int_function(cntlid); 3518 nvme_show_int_function(numa_node); 3519 
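/*
 * For illustration, the nvme_show_int_function(queue_count) invocation below
 * expands to (modulo whitespace):
 *
 *	static ssize_t queue_count_show(struct device *dev,
 *			struct device_attribute *attr, char *buf)
 *	{
 *		struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
 *		return sprintf(buf, "%d\n", ctrl->queue_count);
 *	}
 *	static DEVICE_ATTR(queue_count, S_IRUGO, queue_count_show, NULL);
 */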
nvme_show_int_function(queue_count); 3520 nvme_show_int_function(sqsize); 3521 3522 static ssize_t nvme_sysfs_delete(struct device *dev, 3523 struct device_attribute *attr, const char *buf, 3524 size_t count) 3525 { 3526 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3527 3528 if (device_remove_file_self(dev, attr)) 3529 nvme_delete_ctrl_sync(ctrl); 3530 return count; 3531 } 3532 static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); 3533 3534 static ssize_t nvme_sysfs_show_transport(struct device *dev, 3535 struct device_attribute *attr, 3536 char *buf) 3537 { 3538 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3539 3540 return sysfs_emit(buf, "%s\n", ctrl->ops->name); 3541 } 3542 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL); 3543 3544 static ssize_t nvme_sysfs_show_state(struct device *dev, 3545 struct device_attribute *attr, 3546 char *buf) 3547 { 3548 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3549 static const char *const state_name[] = { 3550 [NVME_CTRL_NEW] = "new", 3551 [NVME_CTRL_LIVE] = "live", 3552 [NVME_CTRL_RESETTING] = "resetting", 3553 [NVME_CTRL_CONNECTING] = "connecting", 3554 [NVME_CTRL_DELETING] = "deleting", 3555 [NVME_CTRL_DELETING_NOIO]= "deleting (no IO)", 3556 [NVME_CTRL_DEAD] = "dead", 3557 }; 3558 3559 if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) && 3560 state_name[ctrl->state]) 3561 return sprintf(buf, "%s\n", state_name[ctrl->state]); 3562 3563 return sprintf(buf, "unknown state\n"); 3564 } 3565 3566 static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL); 3567 3568 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, 3569 struct device_attribute *attr, 3570 char *buf) 3571 { 3572 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3573 3574 return sysfs_emit(buf, "%s\n", ctrl->subsys->subnqn); 3575 } 3576 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); 3577 3578 static ssize_t nvme_sysfs_show_hostnqn(struct device *dev, 3579 struct device_attribute *attr, 3580 char *buf) 3581 { 3582 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3583 3584 return sysfs_emit(buf, "%s\n", ctrl->opts->host->nqn); 3585 } 3586 static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL); 3587 3588 static ssize_t nvme_sysfs_show_hostid(struct device *dev, 3589 struct device_attribute *attr, 3590 char *buf) 3591 { 3592 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3593 3594 return sysfs_emit(buf, "%pU\n", &ctrl->opts->host->id); 3595 } 3596 static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL); 3597 3598 static ssize_t nvme_sysfs_show_address(struct device *dev, 3599 struct device_attribute *attr, 3600 char *buf) 3601 { 3602 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3603 3604 return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE); 3605 } 3606 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL); 3607 3608 static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev, 3609 struct device_attribute *attr, char *buf) 3610 { 3611 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3612 struct nvmf_ctrl_options *opts = ctrl->opts; 3613 3614 if (ctrl->opts->max_reconnects == -1) 3615 return sprintf(buf, "off\n"); 3616 return sprintf(buf, "%d\n", 3617 opts->max_reconnects * opts->reconnect_delay); 3618 } 3619 3620 static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, 3621 struct device_attribute *attr, const char *buf, size_t count) 3622 { 3623 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3624 struct nvmf_ctrl_options *opts = ctrl->opts; 3625 int ctrl_loss_tmo, err; 
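/*
 * ctrl_loss_tmo is expressed in seconds.  A negative value maps to
 * max_reconnects = -1, i.e. reconnect indefinitely ("off"); otherwise it is
 * converted into a reconnect attempt budget.  For example (hypothetical
 * values), ctrl_loss_tmo = 600 with reconnect_delay = 10 yields
 * max_reconnects = 60.
 */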
3626 3627 err = kstrtoint(buf, 10, &ctrl_loss_tmo); 3628 if (err) 3629 return -EINVAL; 3630 3631 else if (ctrl_loss_tmo < 0) 3632 opts->max_reconnects = -1; 3633 else 3634 opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, 3635 opts->reconnect_delay); 3636 return count; 3637 } 3638 static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR, 3639 nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store); 3640 3641 static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev, 3642 struct device_attribute *attr, char *buf) 3643 { 3644 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3645 3646 if (ctrl->opts->reconnect_delay == -1) 3647 return sprintf(buf, "off\n"); 3648 return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay); 3649 } 3650 3651 static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, 3652 struct device_attribute *attr, const char *buf, size_t count) 3653 { 3654 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3655 unsigned int v; 3656 int err; 3657 3658 err = kstrtou32(buf, 10, &v); 3659 if (err) 3660 return err; 3661 3662 ctrl->opts->reconnect_delay = v; 3663 return count; 3664 } 3665 static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, 3666 nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store); 3667 3668 static struct attribute *nvme_dev_attrs[] = { 3669 &dev_attr_reset_controller.attr, 3670 &dev_attr_rescan_controller.attr, 3671 &dev_attr_model.attr, 3672 &dev_attr_serial.attr, 3673 &dev_attr_firmware_rev.attr, 3674 &dev_attr_cntlid.attr, 3675 &dev_attr_delete_controller.attr, 3676 &dev_attr_transport.attr, 3677 &dev_attr_subsysnqn.attr, 3678 &dev_attr_address.attr, 3679 &dev_attr_state.attr, 3680 &dev_attr_numa_node.attr, 3681 &dev_attr_queue_count.attr, 3682 &dev_attr_sqsize.attr, 3683 &dev_attr_hostnqn.attr, 3684 &dev_attr_hostid.attr, 3685 &dev_attr_ctrl_loss_tmo.attr, 3686 &dev_attr_reconnect_delay.attr, 3687 NULL 3688 }; 3689 3690 static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, 3691 struct attribute *a, int n) 3692 { 3693 struct device *dev = container_of(kobj, struct device, kobj); 3694 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3695 3696 if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl) 3697 return 0; 3698 if (a == &dev_attr_address.attr && !ctrl->ops->get_address) 3699 return 0; 3700 if (a == &dev_attr_hostnqn.attr && !ctrl->opts) 3701 return 0; 3702 if (a == &dev_attr_hostid.attr && !ctrl->opts) 3703 return 0; 3704 if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts) 3705 return 0; 3706 if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts) 3707 return 0; 3708 3709 return a->mode; 3710 } 3711 3712 static const struct attribute_group nvme_dev_attrs_group = { 3713 .attrs = nvme_dev_attrs, 3714 .is_visible = nvme_dev_attrs_are_visible, 3715 }; 3716 3717 static const struct attribute_group *nvme_dev_attr_groups[] = { 3718 &nvme_dev_attrs_group, 3719 NULL, 3720 }; 3721 3722 static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys, 3723 unsigned nsid) 3724 { 3725 struct nvme_ns_head *h; 3726 3727 lockdep_assert_held(&subsys->lock); 3728 3729 list_for_each_entry(h, &subsys->nsheads, entry) { 3730 if (h->ns_id == nsid && kref_get_unless_zero(&h->ref)) 3731 return h; 3732 } 3733 3734 return NULL; 3735 } 3736 3737 static int __nvme_check_ids(struct nvme_subsystem *subsys, 3738 struct nvme_ns_head *new) 3739 { 3740 struct nvme_ns_head *h; 3741 3742 lockdep_assert_held(&subsys->lock); 3743 3744 list_for_each_entry(h, &subsys->nsheads, entry) { 3745 if (nvme_ns_ids_valid(&new->ids) && 3746 
nvme_ns_ids_equal(&new->ids, &h->ids)) 3747 return -EINVAL; 3748 } 3749 3750 return 0; 3751 } 3752 3753 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, 3754 unsigned nsid, struct nvme_ns_ids *ids) 3755 { 3756 struct nvme_ns_head *head; 3757 size_t size = sizeof(*head); 3758 int ret = -ENOMEM; 3759 3760 #ifdef CONFIG_NVME_MULTIPATH 3761 size += num_possible_nodes() * sizeof(struct nvme_ns *); 3762 #endif 3763 3764 head = kzalloc(size, GFP_KERNEL); 3765 if (!head) 3766 goto out; 3767 ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL); 3768 if (ret < 0) 3769 goto out_free_head; 3770 head->instance = ret; 3771 INIT_LIST_HEAD(&head->list); 3772 ret = init_srcu_struct(&head->srcu); 3773 if (ret) 3774 goto out_ida_remove; 3775 head->subsys = ctrl->subsys; 3776 head->ns_id = nsid; 3777 head->ids = *ids; 3778 kref_init(&head->ref); 3779 3780 ret = __nvme_check_ids(ctrl->subsys, head); 3781 if (ret) { 3782 dev_err(ctrl->device, 3783 "duplicate IDs for nsid %d\n", nsid); 3784 goto out_cleanup_srcu; 3785 } 3786 3787 if (head->ids.csi) { 3788 ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects); 3789 if (ret) 3790 goto out_cleanup_srcu; 3791 } else 3792 head->effects = ctrl->effects; 3793 3794 ret = nvme_mpath_alloc_disk(ctrl, head); 3795 if (ret) 3796 goto out_cleanup_srcu; 3797 3798 list_add_tail(&head->entry, &ctrl->subsys->nsheads); 3799 3800 kref_get(&ctrl->subsys->ref); 3801 3802 return head; 3803 out_cleanup_srcu: 3804 cleanup_srcu_struct(&head->srcu); 3805 out_ida_remove: 3806 ida_simple_remove(&ctrl->subsys->ns_ida, head->instance); 3807 out_free_head: 3808 kfree(head); 3809 out: 3810 if (ret > 0) 3811 ret = blk_status_to_errno(nvme_error_status(ret)); 3812 return ERR_PTR(ret); 3813 } 3814 3815 static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, 3816 struct nvme_ns_ids *ids, bool is_shared) 3817 { 3818 struct nvme_ctrl *ctrl = ns->ctrl; 3819 struct nvme_ns_head *head = NULL; 3820 int ret = 0; 3821 3822 mutex_lock(&ctrl->subsys->lock); 3823 head = nvme_find_ns_head(ctrl->subsys, nsid); 3824 if (!head) { 3825 head = nvme_alloc_ns_head(ctrl, nsid, ids); 3826 if (IS_ERR(head)) { 3827 ret = PTR_ERR(head); 3828 goto out_unlock; 3829 } 3830 head->shared = is_shared; 3831 } else { 3832 ret = -EINVAL; 3833 if (!is_shared || !head->shared) { 3834 dev_err(ctrl->device, 3835 "Duplicate unshared namespace %d\n", nsid); 3836 goto out_put_ns_head; 3837 } 3838 if (!nvme_ns_ids_equal(&head->ids, ids)) { 3839 dev_err(ctrl->device, 3840 "IDs don't match for shared namespace %d\n", 3841 nsid); 3842 goto out_put_ns_head; 3843 } 3844 } 3845 3846 list_add_tail_rcu(&ns->siblings, &head->list); 3847 ns->head = head; 3848 mutex_unlock(&ctrl->subsys->lock); 3849 return 0; 3850 3851 out_put_ns_head: 3852 nvme_put_ns_head(head); 3853 out_unlock: 3854 mutex_unlock(&ctrl->subsys->lock); 3855 return ret; 3856 } 3857 3858 static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) 3859 { 3860 struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); 3861 struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); 3862 3863 return nsa->head->ns_id - nsb->head->ns_id; 3864 } 3865 3866 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) 3867 { 3868 struct nvme_ns *ns, *ret = NULL; 3869 3870 down_read(&ctrl->namespaces_rwsem); 3871 list_for_each_entry(ns, &ctrl->namespaces, list) { 3872 if (ns->head->ns_id == nsid) { 3873 if (!kref_get_unless_zero(&ns->kref)) 3874 continue; 3875 ret = ns; 3876 break; 3877 } 3878 if (ns->head->ns_id > 
nsid) 3879 break; 3880 } 3881 up_read(&ctrl->namespaces_rwsem); 3882 return ret; 3883 } 3884 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU); 3885 3886 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, 3887 struct nvme_ns_ids *ids) 3888 { 3889 struct nvme_ns *ns; 3890 struct gendisk *disk; 3891 struct nvme_id_ns *id; 3892 char disk_name[DISK_NAME_LEN]; 3893 int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT; 3894 3895 if (nvme_identify_ns(ctrl, nsid, ids, &id)) 3896 return; 3897 3898 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); 3899 if (!ns) 3900 goto out_free_id; 3901 3902 ns->queue = blk_mq_init_queue(ctrl->tagset); 3903 if (IS_ERR(ns->queue)) 3904 goto out_free_ns; 3905 3906 if (ctrl->opts && ctrl->opts->data_digest) 3907 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue); 3908 3909 blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); 3910 if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) 3911 blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); 3912 3913 ns->queue->queuedata = ns; 3914 ns->ctrl = ctrl; 3915 kref_init(&ns->kref); 3916 3917 if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED)) 3918 goto out_free_queue; 3919 nvme_set_disk_name(disk_name, ns, ctrl, &flags); 3920 3921 disk = alloc_disk_node(0, node); 3922 if (!disk) 3923 goto out_unlink_ns; 3924 3925 disk->fops = &nvme_bdev_ops; 3926 disk->private_data = ns; 3927 disk->queue = ns->queue; 3928 disk->flags = flags; 3929 memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); 3930 ns->disk = disk; 3931 3932 if (nvme_update_ns_info(ns, id)) 3933 goto out_put_disk; 3934 3935 if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { 3936 if (nvme_nvm_register(ns, disk_name, node)) { 3937 dev_warn(ctrl->device, "LightNVM init failure\n"); 3938 goto out_put_disk; 3939 } 3940 } 3941 3942 down_write(&ctrl->namespaces_rwsem); 3943 list_add_tail(&ns->list, &ctrl->namespaces); 3944 up_write(&ctrl->namespaces_rwsem); 3945 3946 nvme_get_ctrl(ctrl); 3947 3948 device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups); 3949 3950 nvme_mpath_add_disk(ns, id); 3951 nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name); 3952 kfree(id); 3953 3954 return; 3955 out_put_disk: 3956 /* prevent double queue cleanup */ 3957 ns->disk->queue = NULL; 3958 put_disk(ns->disk); 3959 out_unlink_ns: 3960 mutex_lock(&ctrl->subsys->lock); 3961 list_del_rcu(&ns->siblings); 3962 if (list_empty(&ns->head->list)) 3963 list_del_init(&ns->head->entry); 3964 mutex_unlock(&ctrl->subsys->lock); 3965 nvme_put_ns_head(ns->head); 3966 out_free_queue: 3967 blk_cleanup_queue(ns->queue); 3968 out_free_ns: 3969 kfree(ns); 3970 out_free_id: 3971 kfree(id); 3972 } 3973 3974 static void nvme_ns_remove(struct nvme_ns *ns) 3975 { 3976 if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) 3977 return; 3978 3979 set_capacity(ns->disk, 0); 3980 nvme_fault_inject_fini(&ns->fault_inject); 3981 3982 mutex_lock(&ns->ctrl->subsys->lock); 3983 list_del_rcu(&ns->siblings); 3984 if (list_empty(&ns->head->list)) 3985 list_del_init(&ns->head->entry); 3986 mutex_unlock(&ns->ctrl->subsys->lock); 3987 3988 synchronize_rcu(); /* guarantee not available in head->list */ 3989 nvme_mpath_clear_current_path(ns); 3990 synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ 3991 3992 if (ns->disk->flags & GENHD_FL_UP) { 3993 del_gendisk(ns->disk); 3994 blk_cleanup_queue(ns->queue); 3995 if (blk_get_integrity(ns->disk)) 3996 blk_integrity_unregister(ns->disk); 3997 } 3998 3999 down_write(&ns->ctrl->namespaces_rwsem); 4000 
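/*
 * Drop the namespace from the controller's list under the write lock so that
 * readers iterating ctrl->namespaces (e.g. nvme_find_get_ns()) never observe
 * a half-removed entry.
 */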
list_del_init(&ns->list); 4001 up_write(&ns->ctrl->namespaces_rwsem); 4002 4003 nvme_mpath_check_last_path(ns); 4004 nvme_put_ns(ns); 4005 } 4006 4007 static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid) 4008 { 4009 struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid); 4010 4011 if (ns) { 4012 nvme_ns_remove(ns); 4013 nvme_put_ns(ns); 4014 } 4015 } 4016 4017 static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids) 4018 { 4019 struct nvme_id_ns *id; 4020 int ret = NVME_SC_INVALID_NS | NVME_SC_DNR; 4021 4022 if (test_bit(NVME_NS_DEAD, &ns->flags)) 4023 goto out; 4024 4025 ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id); 4026 if (ret) 4027 goto out; 4028 4029 ret = NVME_SC_INVALID_NS | NVME_SC_DNR; 4030 if (!nvme_ns_ids_equal(&ns->head->ids, ids)) { 4031 dev_err(ns->ctrl->device, 4032 "identifiers changed for nsid %d\n", ns->head->ns_id); 4033 goto out_free_id; 4034 } 4035 4036 ret = nvme_update_ns_info(ns, id); 4037 4038 out_free_id: 4039 kfree(id); 4040 out: 4041 /* 4042 * Only remove the namespace if we got a fatal error back from the 4043 * device, otherwise ignore the error and just move on. 4044 * 4045 * TODO: we should probably schedule a delayed retry here. 4046 */ 4047 if (ret > 0 && (ret & NVME_SC_DNR)) 4048 nvme_ns_remove(ns); 4049 } 4050 4051 static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) 4052 { 4053 struct nvme_ns_ids ids = { }; 4054 struct nvme_ns *ns; 4055 4056 if (nvme_identify_ns_descs(ctrl, nsid, &ids)) 4057 return; 4058 4059 ns = nvme_find_get_ns(ctrl, nsid); 4060 if (ns) { 4061 nvme_validate_ns(ns, &ids); 4062 nvme_put_ns(ns); 4063 return; 4064 } 4065 4066 switch (ids.csi) { 4067 case NVME_CSI_NVM: 4068 nvme_alloc_ns(ctrl, nsid, &ids); 4069 break; 4070 case NVME_CSI_ZNS: 4071 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { 4072 dev_warn(ctrl->device, 4073 "nsid %u not supported without CONFIG_BLK_DEV_ZONED\n", 4074 nsid); 4075 break; 4076 } 4077 if (!nvme_multi_css(ctrl)) { 4078 dev_warn(ctrl->device, 4079 "command set not reported for nsid: %d\n", 4080 nsid); 4081 break; 4082 } 4083 nvme_alloc_ns(ctrl, nsid, &ids); 4084 break; 4085 default: 4086 dev_warn(ctrl->device, "unknown csi %u for nsid %u\n", 4087 ids.csi, nsid); 4088 break; 4089 } 4090 } 4091 4092 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, 4093 unsigned nsid) 4094 { 4095 struct nvme_ns *ns, *next; 4096 LIST_HEAD(rm_list); 4097 4098 down_write(&ctrl->namespaces_rwsem); 4099 list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { 4100 if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags)) 4101 list_move_tail(&ns->list, &rm_list); 4102 } 4103 up_write(&ctrl->namespaces_rwsem); 4104 4105 list_for_each_entry_safe(ns, next, &rm_list, list) 4106 nvme_ns_remove(ns); 4107 4108 } 4109 4110 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) 4111 { 4112 const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32); 4113 __le32 *ns_list; 4114 u32 prev = 0; 4115 int ret = 0, i; 4116 4117 if (nvme_ctrl_limited_cns(ctrl)) 4118 return -EOPNOTSUPP; 4119 4120 ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); 4121 if (!ns_list) 4122 return -ENOMEM; 4123 4124 for (;;) { 4125 struct nvme_command cmd = { 4126 .identify.opcode = nvme_admin_identify, 4127 .identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST, 4128 .identify.nsid = cpu_to_le32(prev), 4129 }; 4130 4131 ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list, 4132 NVME_IDENTIFY_DATA_SIZE); 4133 if (ret) { 4134 dev_warn(ctrl->device, 4135 "Identify NS List failed 
(status=0x%x)\n", ret); 4136 goto free; 4137 } 4138 4139 for (i = 0; i < nr_entries; i++) { 4140 u32 nsid = le32_to_cpu(ns_list[i]); 4141 4142 if (!nsid) /* end of the list? */ 4143 goto out; 4144 nvme_validate_or_alloc_ns(ctrl, nsid); 4145 while (++prev < nsid) 4146 nvme_ns_remove_by_nsid(ctrl, prev); 4147 } 4148 } 4149 out: 4150 nvme_remove_invalid_namespaces(ctrl, prev); 4151 free: 4152 kfree(ns_list); 4153 return ret; 4154 } 4155 4156 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl) 4157 { 4158 struct nvme_id_ctrl *id; 4159 u32 nn, i; 4160 4161 if (nvme_identify_ctrl(ctrl, &id)) 4162 return; 4163 nn = le32_to_cpu(id->nn); 4164 kfree(id); 4165 4166 for (i = 1; i <= nn; i++) 4167 nvme_validate_or_alloc_ns(ctrl, i); 4168 4169 nvme_remove_invalid_namespaces(ctrl, nn); 4170 } 4171 4172 static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) 4173 { 4174 size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32); 4175 __le32 *log; 4176 int error; 4177 4178 log = kzalloc(log_size, GFP_KERNEL); 4179 if (!log) 4180 return; 4181 4182 /* 4183 * We need to read the log to clear the AEN, but we don't want to rely 4184 * on it for the changed namespace information as userspace could have 4185 * raced with us in reading the log page, which could cause us to miss 4186 * updates. 4187 */ 4188 error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, 4189 NVME_CSI_NVM, log, log_size, 0); 4190 if (error) 4191 dev_warn(ctrl->device, 4192 "reading changed ns log failed: %d\n", error); 4193 4194 kfree(log); 4195 } 4196 4197 static void nvme_scan_work(struct work_struct *work) 4198 { 4199 struct nvme_ctrl *ctrl = 4200 container_of(work, struct nvme_ctrl, scan_work); 4201 4202 /* No tagset on a live ctrl means IO queues could not be created */ 4203 if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset) 4204 return; 4205 4206 if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) { 4207 dev_info(ctrl->device, "rescanning namespaces.\n"); 4208 nvme_clear_changed_ns_log(ctrl); 4209 } 4210 4211 mutex_lock(&ctrl->scan_lock); 4212 if (nvme_scan_ns_list(ctrl) != 0) 4213 nvme_scan_ns_sequential(ctrl); 4214 mutex_unlock(&ctrl->scan_lock); 4215 4216 down_write(&ctrl->namespaces_rwsem); 4217 list_sort(NULL, &ctrl->namespaces, ns_cmp); 4218 up_write(&ctrl->namespaces_rwsem); 4219 } 4220 4221 /* 4222 * This function iterates the namespace list unlocked to allow recovery from 4223 * controller failure. It is up to the caller to ensure the namespace list is 4224 * not modified by scan work while this function is executing. 4225 */ 4226 void nvme_remove_namespaces(struct nvme_ctrl *ctrl) 4227 { 4228 struct nvme_ns *ns, *next; 4229 LIST_HEAD(ns_list); 4230 4231 /* 4232 * make sure to requeue I/O to all namespaces as these requests 4233 * might result from the scan itself and must complete 4234 * for the scan_work to make progress 4235 */ 4236 nvme_mpath_clear_ctrl_paths(ctrl); 4237 4238 /* prevent racing with ns scanning */ 4239 flush_work(&ctrl->scan_work); 4240 4241 /* 4242 * The dead state indicates the controller was not gracefully 4243 * disconnected. In that case, we won't be able to flush any data while 4244 * removing the namespaces' disks; fail all the queues now to avoid 4245 * potentially having to clean up the failed sync later. 
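*
* nvme_kill_queues() marks each namespace queue dying and unquiesces it, so blocked dispatchers make forward progress and queued I/O is failed rather than left hanging.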
4246 */ 4247 if (ctrl->state == NVME_CTRL_DEAD) 4248 nvme_kill_queues(ctrl); 4249 4250 /* this is a no-op when called from the controller reset handler */ 4251 nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO); 4252 4253 down_write(&ctrl->namespaces_rwsem); 4254 list_splice_init(&ctrl->namespaces, &ns_list); 4255 up_write(&ctrl->namespaces_rwsem); 4256 4257 list_for_each_entry_safe(ns, next, &ns_list, list) 4258 nvme_ns_remove(ns); 4259 } 4260 EXPORT_SYMBOL_GPL(nvme_remove_namespaces); 4261 4262 static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env) 4263 { 4264 struct nvme_ctrl *ctrl = 4265 container_of(dev, struct nvme_ctrl, ctrl_device); 4266 struct nvmf_ctrl_options *opts = ctrl->opts; 4267 int ret; 4268 4269 ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name); 4270 if (ret) 4271 return ret; 4272 4273 if (opts) { 4274 ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr); 4275 if (ret) 4276 return ret; 4277 4278 ret = add_uevent_var(env, "NVME_TRSVCID=%s", 4279 opts->trsvcid ?: "none"); 4280 if (ret) 4281 return ret; 4282 4283 ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s", 4284 opts->host_traddr ?: "none"); 4285 } 4286 return ret; 4287 } 4288 4289 static void nvme_aen_uevent(struct nvme_ctrl *ctrl) 4290 { 4291 char *envp[2] = { NULL, NULL }; 4292 u32 aen_result = ctrl->aen_result; 4293 4294 ctrl->aen_result = 0; 4295 if (!aen_result) 4296 return; 4297 4298 envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result); 4299 if (!envp[0]) 4300 return; 4301 kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp); 4302 kfree(envp[0]); 4303 } 4304 4305 static void nvme_async_event_work(struct work_struct *work) 4306 { 4307 struct nvme_ctrl *ctrl = 4308 container_of(work, struct nvme_ctrl, async_event_work); 4309 4310 nvme_aen_uevent(ctrl); 4311 ctrl->ops->submit_async_event(ctrl); 4312 } 4313 4314 static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) 4315 { 4316 4317 u32 csts; 4318 4319 if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) 4320 return false; 4321 4322 if (csts == ~0) 4323 return false; 4324 4325 return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP)); 4326 } 4327 4328 static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) 4329 { 4330 struct nvme_fw_slot_info_log *log; 4331 4332 log = kmalloc(sizeof(*log), GFP_KERNEL); 4333 if (!log) 4334 return; 4335 4336 if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM, 4337 log, sizeof(*log), 0)) 4338 dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); 4339 kfree(log); 4340 } 4341 4342 static void nvme_fw_act_work(struct work_struct *work) 4343 { 4344 struct nvme_ctrl *ctrl = container_of(work, 4345 struct nvme_ctrl, fw_act_work); 4346 unsigned long fw_act_timeout; 4347 4348 if (ctrl->mtfa) 4349 fw_act_timeout = jiffies + 4350 msecs_to_jiffies(ctrl->mtfa * 100); 4351 else 4352 fw_act_timeout = jiffies + 4353 msecs_to_jiffies(admin_timeout * 1000); 4354 4355 nvme_stop_queues(ctrl); 4356 while (nvme_ctrl_pp_status(ctrl)) { 4357 if (time_after(jiffies, fw_act_timeout)) { 4358 dev_warn(ctrl->device, 4359 "Fw activation timeout, reset controller\n"); 4360 nvme_try_sched_reset(ctrl); 4361 return; 4362 } 4363 msleep(100); 4364 } 4365 4366 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) 4367 return; 4368 4369 nvme_start_queues(ctrl); 4370 /* read FW slot information to clear the AER */ 4371 nvme_get_fw_slot_info(ctrl); 4372 } 4373 4374 static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) 4375 { 4376 u32 aer_notice_type = (result & 
0xff00) >> 8; 4377 4378 trace_nvme_async_event(ctrl, aer_notice_type); 4379 4380 switch (aer_notice_type) { 4381 case NVME_AER_NOTICE_NS_CHANGED: 4382 set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events); 4383 nvme_queue_scan(ctrl); 4384 break; 4385 case NVME_AER_NOTICE_FW_ACT_STARTING: 4386 /* 4387 * We are (ab)using the RESETTING state to prevent subsequent 4388 * recovery actions from interfering with the controller's 4389 * firmware activation. 4390 */ 4391 if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) 4392 queue_work(nvme_wq, &ctrl->fw_act_work); 4393 break; 4394 #ifdef CONFIG_NVME_MULTIPATH 4395 case NVME_AER_NOTICE_ANA: 4396 if (!ctrl->ana_log_buf) 4397 break; 4398 queue_work(nvme_wq, &ctrl->ana_work); 4399 break; 4400 #endif 4401 case NVME_AER_NOTICE_DISC_CHANGED: 4402 ctrl->aen_result = result; 4403 break; 4404 default: 4405 dev_warn(ctrl->device, "async event result %08x\n", result); 4406 } 4407 } 4408 4409 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, 4410 volatile union nvme_result *res) 4411 { 4412 u32 result = le32_to_cpu(res->u32); 4413 u32 aer_type = result & 0x07; 4414 4415 if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) 4416 return; 4417 4418 switch (aer_type) { 4419 case NVME_AER_NOTICE: 4420 nvme_handle_aen_notice(ctrl, result); 4421 break; 4422 case NVME_AER_ERROR: 4423 case NVME_AER_SMART: 4424 case NVME_AER_CSS: 4425 case NVME_AER_VS: 4426 trace_nvme_async_event(ctrl, aer_type); 4427 ctrl->aen_result = result; 4428 break; 4429 default: 4430 break; 4431 } 4432 queue_work(nvme_wq, &ctrl->async_event_work); 4433 } 4434 EXPORT_SYMBOL_GPL(nvme_complete_async_event); 4435 4436 void nvme_stop_ctrl(struct nvme_ctrl *ctrl) 4437 { 4438 nvme_mpath_stop(ctrl); 4439 nvme_stop_keep_alive(ctrl); 4440 nvme_stop_failfast_work(ctrl); 4441 flush_work(&ctrl->async_event_work); 4442 cancel_work_sync(&ctrl->fw_act_work); 4443 } 4444 EXPORT_SYMBOL_GPL(nvme_stop_ctrl); 4445 4446 void nvme_start_ctrl(struct nvme_ctrl *ctrl) 4447 { 4448 nvme_start_keep_alive(ctrl); 4449 4450 nvme_enable_aen(ctrl); 4451 4452 if (ctrl->queue_count > 1) { 4453 nvme_queue_scan(ctrl); 4454 nvme_start_queues(ctrl); 4455 } 4456 } 4457 EXPORT_SYMBOL_GPL(nvme_start_ctrl); 4458 4459 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) 4460 { 4461 nvme_hwmon_exit(ctrl); 4462 nvme_fault_inject_fini(&ctrl->fault_inject); 4463 dev_pm_qos_hide_latency_tolerance(ctrl->device); 4464 cdev_device_del(&ctrl->cdev, ctrl->device); 4465 nvme_put_ctrl(ctrl); 4466 } 4467 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); 4468 4469 static void nvme_free_cels(struct nvme_ctrl *ctrl) 4470 { 4471 struct nvme_effects_log *cel; 4472 unsigned long i; 4473 4474 xa_for_each(&ctrl->cels, i, cel) { 4475 xa_erase(&ctrl->cels, i); 4476 kfree(cel); 4477 } 4478 4479 xa_destroy(&ctrl->cels); 4480 } 4481 4482 static void nvme_free_ctrl(struct device *dev) 4483 { 4484 struct nvme_ctrl *ctrl = 4485 container_of(dev, struct nvme_ctrl, ctrl_device); 4486 struct nvme_subsystem *subsys = ctrl->subsys; 4487 4488 if (!subsys || ctrl->instance != subsys->instance) 4489 ida_simple_remove(&nvme_instance_ida, ctrl->instance); 4490 4491 nvme_free_cels(ctrl); 4492 nvme_mpath_uninit(ctrl); 4493 __free_page(ctrl->discard_page); 4494 4495 if (subsys) { 4496 mutex_lock(&nvme_subsystems_lock); 4497 list_del(&ctrl->subsys_entry); 4498 sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device)); 4499 mutex_unlock(&nvme_subsystems_lock); 4500 } 4501 4502 ctrl->ops->free_ctrl(ctrl); 4503 4504 if (subsys) 4505 nvme_put_subsystem(subsys); 4506 } 4507 4508 /* 4509 
* Initialize an NVMe controller structure. This needs to be called during 4510 * the earliest initialization so that we have the initialized structures around 4511 * during probing. 4512 */ 4513 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, 4514 const struct nvme_ctrl_ops *ops, unsigned long quirks) 4515 { 4516 int ret; 4517 4518 ctrl->state = NVME_CTRL_NEW; 4519 clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); 4520 spin_lock_init(&ctrl->lock); 4521 mutex_init(&ctrl->scan_lock); 4522 INIT_LIST_HEAD(&ctrl->namespaces); 4523 xa_init(&ctrl->cels); 4524 init_rwsem(&ctrl->namespaces_rwsem); 4525 ctrl->dev = dev; 4526 ctrl->ops = ops; 4527 ctrl->quirks = quirks; 4528 ctrl->numa_node = NUMA_NO_NODE; 4529 INIT_WORK(&ctrl->scan_work, nvme_scan_work); 4530 INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); 4531 INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); 4532 INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); 4533 init_waitqueue_head(&ctrl->state_wq); 4534 4535 INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); 4536 INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work); 4537 memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); 4538 ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; 4539 4540 BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) > 4541 PAGE_SIZE); 4542 ctrl->discard_page = alloc_page(GFP_KERNEL); 4543 if (!ctrl->discard_page) { 4544 ret = -ENOMEM; 4545 goto out; 4546 } 4547 4548 ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); 4549 if (ret < 0) 4550 goto out; 4551 ctrl->instance = ret; 4552 4553 device_initialize(&ctrl->ctrl_device); 4554 ctrl->device = &ctrl->ctrl_device; 4555 ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt), 4556 ctrl->instance); 4557 ctrl->device->class = nvme_class; 4558 ctrl->device->parent = ctrl->dev; 4559 ctrl->device->groups = nvme_dev_attr_groups; 4560 ctrl->device->release = nvme_free_ctrl; 4561 dev_set_drvdata(ctrl->device, ctrl); 4562 ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); 4563 if (ret) 4564 goto out_release_instance; 4565 4566 nvme_get_ctrl(ctrl); 4567 cdev_init(&ctrl->cdev, &nvme_dev_fops); 4568 ctrl->cdev.owner = ops->module; 4569 ret = cdev_device_add(&ctrl->cdev, ctrl->device); 4570 if (ret) 4571 goto out_free_name; 4572 4573 /* 4574 * Initialize latency tolerance controls. The sysfs files won't 4575 * be visible to userspace unless the device actually supports APST. 4576 */ 4577 ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance; 4578 dev_pm_qos_update_user_latency_tolerance(ctrl->device, 4579 min(default_ps_max_latency_us, (unsigned long)S32_MAX)); 4580 4581 nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device)); 4582 4583 return 0; 4584 out_free_name: 4585 nvme_put_ctrl(ctrl); 4586 kfree_const(ctrl->device->kobj.name); 4587 out_release_instance: 4588 ida_simple_remove(&nvme_instance_ida, ctrl->instance); 4589 out: 4590 if (ctrl->discard_page) 4591 __free_page(ctrl->discard_page); 4592 return ret; 4593 } 4594 EXPORT_SYMBOL_GPL(nvme_init_ctrl); 4595 4596 /** 4597 * nvme_kill_queues() - Ends all namespace queues 4598 * @ctrl: the dead controller whose queues need to be ended 4599 * 4600 * Call this function when the driver determines it is unable to get the 4601 * controller in a state capable of servicing IO. 
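*
* This unquiesces the admin queue and marks every namespace queue dying, setting each disk's capacity to 0, so that blocked dispatchers can make forward progress and no further I/O is accepted.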
4602 */ 4603 void nvme_kill_queues(struct nvme_ctrl *ctrl) 4604 { 4605 struct nvme_ns *ns; 4606 4607 down_read(&ctrl->namespaces_rwsem); 4608 4609 /* Forcibly unquiesce queues to avoid blocking dispatch */ 4610 if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q)) 4611 blk_mq_unquiesce_queue(ctrl->admin_q); 4612 4613 list_for_each_entry(ns, &ctrl->namespaces, list) 4614 nvme_set_queue_dying(ns); 4615 4616 up_read(&ctrl->namespaces_rwsem); 4617 } 4618 EXPORT_SYMBOL_GPL(nvme_kill_queues); 4619 4620 void nvme_unfreeze(struct nvme_ctrl *ctrl) 4621 { 4622 struct nvme_ns *ns; 4623 4624 down_read(&ctrl->namespaces_rwsem); 4625 list_for_each_entry(ns, &ctrl->namespaces, list) 4626 blk_mq_unfreeze_queue(ns->queue); 4627 up_read(&ctrl->namespaces_rwsem); 4628 } 4629 EXPORT_SYMBOL_GPL(nvme_unfreeze); 4630 4631 int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) 4632 { 4633 struct nvme_ns *ns; 4634 4635 down_read(&ctrl->namespaces_rwsem); 4636 list_for_each_entry(ns, &ctrl->namespaces, list) { 4637 timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout); 4638 if (timeout <= 0) 4639 break; 4640 } 4641 up_read(&ctrl->namespaces_rwsem); 4642 return timeout; 4643 } 4644 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); 4645 4646 void nvme_wait_freeze(struct nvme_ctrl *ctrl) 4647 { 4648 struct nvme_ns *ns; 4649 4650 down_read(&ctrl->namespaces_rwsem); 4651 list_for_each_entry(ns, &ctrl->namespaces, list) 4652 blk_mq_freeze_queue_wait(ns->queue); 4653 up_read(&ctrl->namespaces_rwsem); 4654 } 4655 EXPORT_SYMBOL_GPL(nvme_wait_freeze); 4656 4657 void nvme_start_freeze(struct nvme_ctrl *ctrl) 4658 { 4659 struct nvme_ns *ns; 4660 4661 down_read(&ctrl->namespaces_rwsem); 4662 list_for_each_entry(ns, &ctrl->namespaces, list) 4663 blk_freeze_queue_start(ns->queue); 4664 up_read(&ctrl->namespaces_rwsem); 4665 } 4666 EXPORT_SYMBOL_GPL(nvme_start_freeze); 4667 4668 void nvme_stop_queues(struct nvme_ctrl *ctrl) 4669 { 4670 struct nvme_ns *ns; 4671 4672 down_read(&ctrl->namespaces_rwsem); 4673 list_for_each_entry(ns, &ctrl->namespaces, list) 4674 blk_mq_quiesce_queue(ns->queue); 4675 up_read(&ctrl->namespaces_rwsem); 4676 } 4677 EXPORT_SYMBOL_GPL(nvme_stop_queues); 4678 4679 void nvme_start_queues(struct nvme_ctrl *ctrl) 4680 { 4681 struct nvme_ns *ns; 4682 4683 down_read(&ctrl->namespaces_rwsem); 4684 list_for_each_entry(ns, &ctrl->namespaces, list) 4685 blk_mq_unquiesce_queue(ns->queue); 4686 up_read(&ctrl->namespaces_rwsem); 4687 } 4688 EXPORT_SYMBOL_GPL(nvme_start_queues); 4689 4690 void nvme_sync_io_queues(struct nvme_ctrl *ctrl) 4691 { 4692 struct nvme_ns *ns; 4693 4694 down_read(&ctrl->namespaces_rwsem); 4695 list_for_each_entry(ns, &ctrl->namespaces, list) 4696 blk_sync_queue(ns->queue); 4697 up_read(&ctrl->namespaces_rwsem); 4698 } 4699 EXPORT_SYMBOL_GPL(nvme_sync_io_queues); 4700 4701 void nvme_sync_queues(struct nvme_ctrl *ctrl) 4702 { 4703 nvme_sync_io_queues(ctrl); 4704 if (ctrl->admin_q) 4705 blk_sync_queue(ctrl->admin_q); 4706 } 4707 EXPORT_SYMBOL_GPL(nvme_sync_queues); 4708 4709 struct nvme_ctrl *nvme_ctrl_from_file(struct file *file) 4710 { 4711 if (file->f_op != &nvme_dev_fops) 4712 return NULL; 4713 return file->private_data; 4714 } 4715 EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU); 4716 4717 /* 4718 * Check we didn't inadvertently grow the command structure sizes: 4719 */ 4720 static inline void _nvme_check_size(void) 4721 { 4722 BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64); 4723 BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); 4724 
BUILD_BUG_ON(sizeof(struct nvme_identify) != 64); 4725 BUILD_BUG_ON(sizeof(struct nvme_features) != 64); 4726 BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64); 4727 BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); 4728 BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64); 4729 BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64); 4730 BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64); 4731 BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64); 4732 BUILD_BUG_ON(sizeof(struct nvme_command) != 64); 4733 BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE); 4734 BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); 4735 BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE); 4736 BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE); 4737 BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); 4738 BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); 4739 BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); 4740 BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64); 4741 } 4742 4743 4744 static int __init nvme_core_init(void) 4745 { 4746 int result = -ENOMEM; 4747 4748 _nvme_check_size(); 4749 4750 nvme_wq = alloc_workqueue("nvme-wq", 4751 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 4752 if (!nvme_wq) 4753 goto out; 4754 4755 nvme_reset_wq = alloc_workqueue("nvme-reset-wq", 4756 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 4757 if (!nvme_reset_wq) 4758 goto destroy_wq; 4759 4760 nvme_delete_wq = alloc_workqueue("nvme-delete-wq", 4761 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 4762 if (!nvme_delete_wq) 4763 goto destroy_reset_wq; 4764 4765 result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0, 4766 NVME_MINORS, "nvme"); 4767 if (result < 0) 4768 goto destroy_delete_wq; 4769 4770 nvme_class = class_create(THIS_MODULE, "nvme"); 4771 if (IS_ERR(nvme_class)) { 4772 result = PTR_ERR(nvme_class); 4773 goto unregister_chrdev; 4774 } 4775 nvme_class->dev_uevent = nvme_class_uevent; 4776 4777 nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem"); 4778 if (IS_ERR(nvme_subsys_class)) { 4779 result = PTR_ERR(nvme_subsys_class); 4780 goto destroy_class; 4781 } 4782 return 0; 4783 4784 destroy_class: 4785 class_destroy(nvme_class); 4786 unregister_chrdev: 4787 unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); 4788 destroy_delete_wq: 4789 destroy_workqueue(nvme_delete_wq); 4790 destroy_reset_wq: 4791 destroy_workqueue(nvme_reset_wq); 4792 destroy_wq: 4793 destroy_workqueue(nvme_wq); 4794 out: 4795 return result; 4796 } 4797 4798 static void __exit nvme_core_exit(void) 4799 { 4800 class_destroy(nvme_subsys_class); 4801 class_destroy(nvme_class); 4802 unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); 4803 destroy_workqueue(nvme_delete_wq); 4804 destroy_workqueue(nvme_reset_wq); 4805 destroy_workqueue(nvme_wq); 4806 ida_destroy(&nvme_instance_ida); 4807 } 4808 4809 MODULE_LICENSE("GPL"); 4810 MODULE_VERSION("1.0"); 4811 module_init(nvme_core_init); 4812 module_exit(nvme_core_exit); 4813
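/*
 * Hedged usage sketch (illustrative only, not compiled): one way a transport
 * reset path might pair the freeze/quiesce helpers exported above. The
 * function name nvme_example_reset_io_queues() and the exact ordering are
 * assumptions modeled on how the individual transport drivers typically use
 * these exports; see those drivers for the authoritative sequences.
 */
#if 0
static void nvme_example_reset_io_queues(struct nvme_ctrl *ctrl)
{
	/* Block new submissions and quiesce dispatch on all I/O queues. */
	nvme_start_freeze(ctrl);
	nvme_stop_queues(ctrl);
	nvme_sync_io_queues(ctrl);

	/* ... transport specific teardown and re-initialization here ... */

	/* Resume dispatch, wait for the freeze to settle, then release it. */
	nvme_start_queues(ctrl);
	nvme_wait_freeze(ctrl);
	nvme_unfreeze(ctrl);
}
#endif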