// SPDX-License-Identifier: GPL-2.0
/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 */

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hdreg.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/list_sort.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/pr.h>
#include <linux/ptrace.h>
#include <linux/nvme_ioctl.h>
#include <linux/t10-pi.h>
#include <linux/pm_qos.h>
#include <asm/unaligned.h>

#include "nvme.h"
#include "fabrics.h"

#define CREATE_TRACE_POINTS
#include "trace.h"

#define NVME_MINORS		(1U << MINORBITS)

unsigned int admin_timeout = 60;
module_param(admin_timeout, uint, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
EXPORT_SYMBOL_GPL(admin_timeout);

unsigned int nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
EXPORT_SYMBOL_GPL(nvme_io_timeout);

static unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");

static u8 nvme_max_retries = 5;
module_param_named(max_retries, nvme_max_retries, byte, 0644);
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");

static unsigned long default_ps_max_latency_us = 100000;
module_param(default_ps_max_latency_us, ulong, 0644);
MODULE_PARM_DESC(default_ps_max_latency_us,
		 "max power saving latency for new devices; use PM QOS to change per device");

static bool force_apst;
module_param(force_apst, bool, 0644);
MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");

static bool streams;
module_param(streams, bool, 0644);
MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
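
/*
 * For reference (example values, not from the original source): the
 * parameters above belong to the nvme-core module, so they can be tuned
 * at runtime via /sys/module/nvme_core/parameters/ or set on the kernel
 * command line, e.g.:
 *
 *	nvme_core.io_timeout=60 nvme_core.default_ps_max_latency_us=0
 */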

/*
 * nvme_wq - hosts nvme related works that are not reset or delete
 * nvme_reset_wq - hosts nvme reset works
 * nvme_delete_wq - hosts nvme delete works
 *
 * nvme_wq will host works such as scan, aen handling, fw activation,
 * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq
 * runs reset works which also flush works hosted on nvme_wq for
 * serialization purposes. nvme_delete_wq hosts controller deletion
 * works which flush reset works for serialization.
 */
struct workqueue_struct *nvme_wq;
EXPORT_SYMBOL_GPL(nvme_wq);

struct workqueue_struct *nvme_reset_wq;
EXPORT_SYMBOL_GPL(nvme_reset_wq);

struct workqueue_struct *nvme_delete_wq;
EXPORT_SYMBOL_GPL(nvme_delete_wq);

static LIST_HEAD(nvme_subsystems);
static DEFINE_MUTEX(nvme_subsystems_lock);

static DEFINE_IDA(nvme_instance_ida);
static dev_t nvme_chr_devt;
static struct class *nvme_class;
static struct class *nvme_subsys_class;

static int nvme_revalidate_disk(struct gendisk *disk);
static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
					   unsigned nsid);

static void nvme_set_queue_dying(struct nvme_ns *ns)
{
	/*
	 * Revalidating a dead namespace sets capacity to 0. This will end
	 * buffered writers dirtying pages that can't be synced.
	 */
	if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
		return;
	revalidate_disk(ns->disk);
	blk_set_queue_dying(ns->queue);
	/* Forcibly unquiesce queues to avoid blocking dispatch */
	blk_mq_unquiesce_queue(ns->queue);
}

static void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
	/*
	 * Only queue new scan work when the admin and IO queues are both alive.
	 */
	if (ctrl->state == NVME_CTRL_LIVE)
		queue_work(nvme_wq, &ctrl->scan_work);
}

int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
{
	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
		return -EBUSY;
	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
		return -EBUSY;
	return 0;
}
EXPORT_SYMBOL_GPL(nvme_reset_ctrl);

int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
{
	int ret;

	ret = nvme_reset_ctrl(ctrl);
	if (!ret) {
		flush_work(&ctrl->reset_work);
		if (ctrl->state != NVME_CTRL_LIVE &&
		    ctrl->state != NVME_CTRL_ADMIN_ONLY)
			ret = -ENETRESET;
	}

	return ret;
}
EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);

static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
{
	dev_info(ctrl->device,
		 "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn);

	flush_work(&ctrl->reset_work);
	nvme_stop_ctrl(ctrl);
	nvme_remove_namespaces(ctrl);
	ctrl->ops->delete_ctrl(ctrl);
	nvme_uninit_ctrl(ctrl);
	nvme_put_ctrl(ctrl);
}

static void nvme_delete_ctrl_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, delete_work);

	nvme_do_delete_ctrl(ctrl);
}

int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
{
	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
		return -EBUSY;
	if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
		return -EBUSY;
	return 0;
}
EXPORT_SYMBOL_GPL(nvme_delete_ctrl);

static int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
{
	int ret = 0;

	/*
	 * Keep a reference until nvme_do_delete_ctrl() completes,
	 * since ->delete_ctrl can free the controller.
184 */ 185 nvme_get_ctrl(ctrl); 186 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) 187 ret = -EBUSY; 188 if (!ret) 189 nvme_do_delete_ctrl(ctrl); 190 nvme_put_ctrl(ctrl); 191 return ret; 192 } 193 194 static inline bool nvme_ns_has_pi(struct nvme_ns *ns) 195 { 196 return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple); 197 } 198 199 static blk_status_t nvme_error_status(u16 status) 200 { 201 switch (status & 0x7ff) { 202 case NVME_SC_SUCCESS: 203 return BLK_STS_OK; 204 case NVME_SC_CAP_EXCEEDED: 205 return BLK_STS_NOSPC; 206 case NVME_SC_LBA_RANGE: 207 return BLK_STS_TARGET; 208 case NVME_SC_BAD_ATTRIBUTES: 209 case NVME_SC_ONCS_NOT_SUPPORTED: 210 case NVME_SC_INVALID_OPCODE: 211 case NVME_SC_INVALID_FIELD: 212 case NVME_SC_INVALID_NS: 213 return BLK_STS_NOTSUPP; 214 case NVME_SC_WRITE_FAULT: 215 case NVME_SC_READ_ERROR: 216 case NVME_SC_UNWRITTEN_BLOCK: 217 case NVME_SC_ACCESS_DENIED: 218 case NVME_SC_READ_ONLY: 219 case NVME_SC_COMPARE_FAILED: 220 return BLK_STS_MEDIUM; 221 case NVME_SC_GUARD_CHECK: 222 case NVME_SC_APPTAG_CHECK: 223 case NVME_SC_REFTAG_CHECK: 224 case NVME_SC_INVALID_PI: 225 return BLK_STS_PROTECTION; 226 case NVME_SC_RESERVATION_CONFLICT: 227 return BLK_STS_NEXUS; 228 case NVME_SC_HOST_PATH_ERROR: 229 return BLK_STS_TRANSPORT; 230 default: 231 return BLK_STS_IOERR; 232 } 233 } 234 235 static inline bool nvme_req_needs_retry(struct request *req) 236 { 237 if (blk_noretry_request(req)) 238 return false; 239 if (nvme_req(req)->status & NVME_SC_DNR) 240 return false; 241 if (nvme_req(req)->retries >= nvme_max_retries) 242 return false; 243 return true; 244 } 245 246 static void nvme_retry_req(struct request *req) 247 { 248 struct nvme_ns *ns = req->q->queuedata; 249 unsigned long delay = 0; 250 u16 crd; 251 252 /* The mask and shift result must be <= 3 */ 253 crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11; 254 if (ns && crd) 255 delay = ns->ctrl->crdt[crd - 1] * 100; 256 257 nvme_req(req)->retries++; 258 blk_mq_requeue_request(req, false); 259 blk_mq_delay_kick_requeue_list(req->q, delay); 260 } 261 262 void nvme_complete_rq(struct request *req) 263 { 264 blk_status_t status = nvme_error_status(nvme_req(req)->status); 265 266 trace_nvme_complete_rq(req); 267 268 if (nvme_req(req)->ctrl->kas) 269 nvme_req(req)->ctrl->comp_seen = true; 270 271 if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { 272 if ((req->cmd_flags & REQ_NVME_MPATH) && 273 blk_path_error(status)) { 274 nvme_failover_req(req); 275 return; 276 } 277 278 if (!blk_queue_dying(req->q)) { 279 nvme_retry_req(req); 280 return; 281 } 282 } 283 284 nvme_trace_bio_complete(req, status); 285 blk_mq_end_request(req, status); 286 } 287 EXPORT_SYMBOL_GPL(nvme_complete_rq); 288 289 bool nvme_cancel_request(struct request *req, void *data, bool reserved) 290 { 291 dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, 292 "Cancelling I/O %d", req->tag); 293 294 /* don't abort one completed request */ 295 if (blk_mq_request_completed(req)) 296 return true; 297 298 nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR; 299 blk_mq_complete_request(req); 300 return true; 301 } 302 EXPORT_SYMBOL_GPL(nvme_cancel_request); 303 304 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, 305 enum nvme_ctrl_state new_state) 306 { 307 enum nvme_ctrl_state old_state; 308 unsigned long flags; 309 bool changed = false; 310 311 spin_lock_irqsave(&ctrl->lock, flags); 312 313 old_state = ctrl->state; 314 switch (new_state) { 315 case NVME_CTRL_ADMIN_ONLY: 316 switch (old_state) { 317 case NVME_CTRL_CONNECTING: 318 
changed = true; 319 /* FALLTHRU */ 320 default: 321 break; 322 } 323 break; 324 case NVME_CTRL_LIVE: 325 switch (old_state) { 326 case NVME_CTRL_NEW: 327 case NVME_CTRL_RESETTING: 328 case NVME_CTRL_CONNECTING: 329 changed = true; 330 /* FALLTHRU */ 331 default: 332 break; 333 } 334 break; 335 case NVME_CTRL_RESETTING: 336 switch (old_state) { 337 case NVME_CTRL_NEW: 338 case NVME_CTRL_LIVE: 339 case NVME_CTRL_ADMIN_ONLY: 340 changed = true; 341 /* FALLTHRU */ 342 default: 343 break; 344 } 345 break; 346 case NVME_CTRL_CONNECTING: 347 switch (old_state) { 348 case NVME_CTRL_NEW: 349 case NVME_CTRL_RESETTING: 350 changed = true; 351 /* FALLTHRU */ 352 default: 353 break; 354 } 355 break; 356 case NVME_CTRL_DELETING: 357 switch (old_state) { 358 case NVME_CTRL_LIVE: 359 case NVME_CTRL_ADMIN_ONLY: 360 case NVME_CTRL_RESETTING: 361 case NVME_CTRL_CONNECTING: 362 changed = true; 363 /* FALLTHRU */ 364 default: 365 break; 366 } 367 break; 368 case NVME_CTRL_DEAD: 369 switch (old_state) { 370 case NVME_CTRL_DELETING: 371 changed = true; 372 /* FALLTHRU */ 373 default: 374 break; 375 } 376 break; 377 default: 378 break; 379 } 380 381 if (changed) 382 ctrl->state = new_state; 383 384 spin_unlock_irqrestore(&ctrl->lock, flags); 385 if (changed && ctrl->state == NVME_CTRL_LIVE) 386 nvme_kick_requeue_lists(ctrl); 387 return changed; 388 } 389 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); 390 391 static void nvme_free_ns_head(struct kref *ref) 392 { 393 struct nvme_ns_head *head = 394 container_of(ref, struct nvme_ns_head, ref); 395 396 nvme_mpath_remove_disk(head); 397 ida_simple_remove(&head->subsys->ns_ida, head->instance); 398 list_del_init(&head->entry); 399 cleanup_srcu_struct(&head->srcu); 400 nvme_put_subsystem(head->subsys); 401 kfree(head); 402 } 403 404 static void nvme_put_ns_head(struct nvme_ns_head *head) 405 { 406 kref_put(&head->ref, nvme_free_ns_head); 407 } 408 409 static void nvme_free_ns(struct kref *kref) 410 { 411 struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); 412 413 if (ns->ndev) 414 nvme_nvm_unregister(ns); 415 416 put_disk(ns->disk); 417 nvme_put_ns_head(ns->head); 418 nvme_put_ctrl(ns->ctrl); 419 kfree(ns); 420 } 421 422 static void nvme_put_ns(struct nvme_ns *ns) 423 { 424 kref_put(&ns->kref, nvme_free_ns); 425 } 426 427 static inline void nvme_clear_nvme_request(struct request *req) 428 { 429 if (!(req->rq_flags & RQF_DONTPREP)) { 430 nvme_req(req)->retries = 0; 431 nvme_req(req)->flags = 0; 432 req->rq_flags |= RQF_DONTPREP; 433 } 434 } 435 436 struct request *nvme_alloc_request(struct request_queue *q, 437 struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) 438 { 439 unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; 440 struct request *req; 441 442 if (qid == NVME_QID_ANY) { 443 req = blk_mq_alloc_request(q, op, flags); 444 } else { 445 req = blk_mq_alloc_request_hctx(q, op, flags, 446 qid ? 
qid - 1 : 0); 447 } 448 if (IS_ERR(req)) 449 return req; 450 451 req->cmd_flags |= REQ_FAILFAST_DRIVER; 452 nvme_clear_nvme_request(req); 453 nvme_req(req)->cmd = cmd; 454 455 return req; 456 } 457 EXPORT_SYMBOL_GPL(nvme_alloc_request); 458 459 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) 460 { 461 struct nvme_command c; 462 463 memset(&c, 0, sizeof(c)); 464 465 c.directive.opcode = nvme_admin_directive_send; 466 c.directive.nsid = cpu_to_le32(NVME_NSID_ALL); 467 c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE; 468 c.directive.dtype = NVME_DIR_IDENTIFY; 469 c.directive.tdtype = NVME_DIR_STREAMS; 470 c.directive.endir = enable ? NVME_DIR_ENDIR : 0; 471 472 return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0); 473 } 474 475 static int nvme_disable_streams(struct nvme_ctrl *ctrl) 476 { 477 return nvme_toggle_streams(ctrl, false); 478 } 479 480 static int nvme_enable_streams(struct nvme_ctrl *ctrl) 481 { 482 return nvme_toggle_streams(ctrl, true); 483 } 484 485 static int nvme_get_stream_params(struct nvme_ctrl *ctrl, 486 struct streams_directive_params *s, u32 nsid) 487 { 488 struct nvme_command c; 489 490 memset(&c, 0, sizeof(c)); 491 memset(s, 0, sizeof(*s)); 492 493 c.directive.opcode = nvme_admin_directive_recv; 494 c.directive.nsid = cpu_to_le32(nsid); 495 c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1); 496 c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM; 497 c.directive.dtype = NVME_DIR_STREAMS; 498 499 return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s)); 500 } 501 502 static int nvme_configure_directives(struct nvme_ctrl *ctrl) 503 { 504 struct streams_directive_params s; 505 int ret; 506 507 if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES)) 508 return 0; 509 if (!streams) 510 return 0; 511 512 ret = nvme_enable_streams(ctrl); 513 if (ret) 514 return ret; 515 516 ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL); 517 if (ret) 518 return ret; 519 520 ctrl->nssa = le16_to_cpu(s.nssa); 521 if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) { 522 dev_info(ctrl->device, "too few streams (%u) available\n", 523 ctrl->nssa); 524 nvme_disable_streams(ctrl); 525 return 0; 526 } 527 528 ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); 529 dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); 530 return 0; 531 } 532 533 /* 534 * Check if 'req' has a write hint associated with it. If it does, assign 535 * a valid namespace stream to the write. 
536 */ 537 static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, 538 struct request *req, u16 *control, 539 u32 *dsmgmt) 540 { 541 enum rw_hint streamid = req->write_hint; 542 543 if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE) 544 streamid = 0; 545 else { 546 streamid--; 547 if (WARN_ON_ONCE(streamid > ctrl->nr_streams)) 548 return; 549 550 *control |= NVME_RW_DTYPE_STREAMS; 551 *dsmgmt |= streamid << 16; 552 } 553 554 if (streamid < ARRAY_SIZE(req->q->write_hints)) 555 req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; 556 } 557 558 static inline void nvme_setup_flush(struct nvme_ns *ns, 559 struct nvme_command *cmnd) 560 { 561 cmnd->common.opcode = nvme_cmd_flush; 562 cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); 563 } 564 565 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, 566 struct nvme_command *cmnd) 567 { 568 unsigned short segments = blk_rq_nr_discard_segments(req), n = 0; 569 struct nvme_dsm_range *range; 570 struct bio *bio; 571 572 range = kmalloc_array(segments, sizeof(*range), 573 GFP_ATOMIC | __GFP_NOWARN); 574 if (!range) { 575 /* 576 * If we fail allocation our range, fallback to the controller 577 * discard page. If that's also busy, it's safe to return 578 * busy, as we know we can make progress once that's freed. 579 */ 580 if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy)) 581 return BLK_STS_RESOURCE; 582 583 range = page_address(ns->ctrl->discard_page); 584 } 585 586 __rq_for_each_bio(bio, req) { 587 u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector); 588 u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; 589 590 if (n < segments) { 591 range[n].cattr = cpu_to_le32(0); 592 range[n].nlb = cpu_to_le32(nlb); 593 range[n].slba = cpu_to_le64(slba); 594 } 595 n++; 596 } 597 598 if (WARN_ON_ONCE(n != segments)) { 599 if (virt_to_page(range) == ns->ctrl->discard_page) 600 clear_bit_unlock(0, &ns->ctrl->discard_page_busy); 601 else 602 kfree(range); 603 return BLK_STS_IOERR; 604 } 605 606 cmnd->dsm.opcode = nvme_cmd_dsm; 607 cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id); 608 cmnd->dsm.nr = cpu_to_le32(segments - 1); 609 cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); 610 611 req->special_vec.bv_page = virt_to_page(range); 612 req->special_vec.bv_offset = offset_in_page(range); 613 req->special_vec.bv_len = sizeof(*range) * segments; 614 req->rq_flags |= RQF_SPECIAL_PAYLOAD; 615 616 return BLK_STS_OK; 617 } 618 619 static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, 620 struct request *req, struct nvme_command *cmnd) 621 { 622 if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) 623 return nvme_setup_discard(ns, req, cmnd); 624 625 cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes; 626 cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id); 627 cmnd->write_zeroes.slba = 628 cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); 629 cmnd->write_zeroes.length = 630 cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); 631 cmnd->write_zeroes.control = 0; 632 return BLK_STS_OK; 633 } 634 635 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, 636 struct request *req, struct nvme_command *cmnd) 637 { 638 struct nvme_ctrl *ctrl = ns->ctrl; 639 u16 control = 0; 640 u32 dsmgmt = 0; 641 642 if (req->cmd_flags & REQ_FUA) 643 control |= NVME_RW_FUA; 644 if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) 645 control |= NVME_RW_LR; 646 647 if (req->cmd_flags & REQ_RAHEAD) 648 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; 649 650 cmnd->rw.opcode = (rq_data_dir(req) ? 
nvme_cmd_write : nvme_cmd_read); 651 cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); 652 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); 653 cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); 654 655 if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams) 656 nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); 657 658 if (ns->ms) { 659 /* 660 * If formated with metadata, the block layer always provides a 661 * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else 662 * we enable the PRACT bit for protection information or set the 663 * namespace capacity to zero to prevent any I/O. 664 */ 665 if (!blk_integrity_rq(req)) { 666 if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) 667 return BLK_STS_NOTSUPP; 668 control |= NVME_RW_PRINFO_PRACT; 669 } else if (req_op(req) == REQ_OP_WRITE) { 670 t10_pi_prepare(req, ns->pi_type); 671 } 672 673 switch (ns->pi_type) { 674 case NVME_NS_DPS_PI_TYPE3: 675 control |= NVME_RW_PRINFO_PRCHK_GUARD; 676 break; 677 case NVME_NS_DPS_PI_TYPE1: 678 case NVME_NS_DPS_PI_TYPE2: 679 control |= NVME_RW_PRINFO_PRCHK_GUARD | 680 NVME_RW_PRINFO_PRCHK_REF; 681 cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req)); 682 break; 683 } 684 } 685 686 cmnd->rw.control = cpu_to_le16(control); 687 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); 688 return 0; 689 } 690 691 void nvme_cleanup_cmd(struct request *req) 692 { 693 if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && 694 nvme_req(req)->status == 0) { 695 struct nvme_ns *ns = req->rq_disk->private_data; 696 697 t10_pi_complete(req, ns->pi_type, 698 blk_rq_bytes(req) >> ns->lba_shift); 699 } 700 if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { 701 struct nvme_ns *ns = req->rq_disk->private_data; 702 struct page *page = req->special_vec.bv_page; 703 704 if (page == ns->ctrl->discard_page) 705 clear_bit_unlock(0, &ns->ctrl->discard_page_busy); 706 else 707 kfree(page_address(page) + req->special_vec.bv_offset); 708 } 709 } 710 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); 711 712 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, 713 struct nvme_command *cmd) 714 { 715 blk_status_t ret = BLK_STS_OK; 716 717 nvme_clear_nvme_request(req); 718 719 memset(cmd, 0, sizeof(*cmd)); 720 switch (req_op(req)) { 721 case REQ_OP_DRV_IN: 722 case REQ_OP_DRV_OUT: 723 memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); 724 break; 725 case REQ_OP_FLUSH: 726 nvme_setup_flush(ns, cmd); 727 break; 728 case REQ_OP_WRITE_ZEROES: 729 ret = nvme_setup_write_zeroes(ns, req, cmd); 730 break; 731 case REQ_OP_DISCARD: 732 ret = nvme_setup_discard(ns, req, cmd); 733 break; 734 case REQ_OP_READ: 735 case REQ_OP_WRITE: 736 ret = nvme_setup_rw(ns, req, cmd); 737 break; 738 default: 739 WARN_ON_ONCE(1); 740 return BLK_STS_IOERR; 741 } 742 743 cmd->common.command_id = req->tag; 744 trace_nvme_setup_cmd(req, cmd); 745 return ret; 746 } 747 EXPORT_SYMBOL_GPL(nvme_setup_cmd); 748 749 static void nvme_end_sync_rq(struct request *rq, blk_status_t error) 750 { 751 struct completion *waiting = rq->end_io_data; 752 753 rq->end_io_data = NULL; 754 complete(waiting); 755 } 756 757 static void nvme_execute_rq_polled(struct request_queue *q, 758 struct gendisk *bd_disk, struct request *rq, int at_head) 759 { 760 DECLARE_COMPLETION_ONSTACK(wait); 761 762 WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)); 763 764 rq->cmd_flags |= REQ_HIPRI; 765 rq->end_io_data = &wait; 766 blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq); 767 768 while (!completion_done(&wait)) { 769 blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true); 770 
cond_resched(); 771 } 772 } 773 774 /* 775 * Returns 0 on success. If the result is negative, it's a Linux error code; 776 * if the result is positive, it's an NVM Express status code 777 */ 778 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 779 union nvme_result *result, void *buffer, unsigned bufflen, 780 unsigned timeout, int qid, int at_head, 781 blk_mq_req_flags_t flags, bool poll) 782 { 783 struct request *req; 784 int ret; 785 786 req = nvme_alloc_request(q, cmd, flags, qid); 787 if (IS_ERR(req)) 788 return PTR_ERR(req); 789 790 req->timeout = timeout ? timeout : ADMIN_TIMEOUT; 791 792 if (buffer && bufflen) { 793 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); 794 if (ret) 795 goto out; 796 } 797 798 if (poll) 799 nvme_execute_rq_polled(req->q, NULL, req, at_head); 800 else 801 blk_execute_rq(req->q, NULL, req, at_head); 802 if (result) 803 *result = nvme_req(req)->result; 804 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) 805 ret = -EINTR; 806 else 807 ret = nvme_req(req)->status; 808 out: 809 blk_mq_free_request(req); 810 return ret; 811 } 812 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd); 813 814 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 815 void *buffer, unsigned bufflen) 816 { 817 return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, 818 NVME_QID_ANY, 0, 0, false); 819 } 820 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); 821 822 static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, 823 unsigned len, u32 seed, bool write) 824 { 825 struct bio_integrity_payload *bip; 826 int ret = -ENOMEM; 827 void *buf; 828 829 buf = kmalloc(len, GFP_KERNEL); 830 if (!buf) 831 goto out; 832 833 ret = -EFAULT; 834 if (write && copy_from_user(buf, ubuf, len)) 835 goto out_free_meta; 836 837 bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); 838 if (IS_ERR(bip)) { 839 ret = PTR_ERR(bip); 840 goto out_free_meta; 841 } 842 843 bip->bip_iter.bi_size = len; 844 bip->bip_iter.bi_sector = seed; 845 ret = bio_integrity_add_page(bio, virt_to_page(buf), len, 846 offset_in_page(buf)); 847 if (ret == len) 848 return buf; 849 ret = -ENOMEM; 850 out_free_meta: 851 kfree(buf); 852 out: 853 return ERR_PTR(ret); 854 } 855 856 static int nvme_submit_user_cmd(struct request_queue *q, 857 struct nvme_command *cmd, void __user *ubuffer, 858 unsigned bufflen, void __user *meta_buffer, unsigned meta_len, 859 u32 meta_seed, u32 *result, unsigned timeout) 860 { 861 bool write = nvme_is_write(cmd); 862 struct nvme_ns *ns = q->queuedata; 863 struct gendisk *disk = ns ? ns->disk : NULL; 864 struct request *req; 865 struct bio *bio = NULL; 866 void *meta = NULL; 867 int ret; 868 869 req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY); 870 if (IS_ERR(req)) 871 return PTR_ERR(req); 872 873 req->timeout = timeout ? 
timeout : ADMIN_TIMEOUT; 874 nvme_req(req)->flags |= NVME_REQ_USERCMD; 875 876 if (ubuffer && bufflen) { 877 ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, 878 GFP_KERNEL); 879 if (ret) 880 goto out; 881 bio = req->bio; 882 bio->bi_disk = disk; 883 if (disk && meta_buffer && meta_len) { 884 meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, 885 meta_seed, write); 886 if (IS_ERR(meta)) { 887 ret = PTR_ERR(meta); 888 goto out_unmap; 889 } 890 req->cmd_flags |= REQ_INTEGRITY; 891 } 892 } 893 894 blk_execute_rq(req->q, disk, req, 0); 895 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) 896 ret = -EINTR; 897 else 898 ret = nvme_req(req)->status; 899 if (result) 900 *result = le32_to_cpu(nvme_req(req)->result.u32); 901 if (meta && !ret && !write) { 902 if (copy_to_user(meta_buffer, meta, meta_len)) 903 ret = -EFAULT; 904 } 905 kfree(meta); 906 out_unmap: 907 if (bio) 908 blk_rq_unmap_user(bio); 909 out: 910 blk_mq_free_request(req); 911 return ret; 912 } 913 914 static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) 915 { 916 struct nvme_ctrl *ctrl = rq->end_io_data; 917 unsigned long flags; 918 bool startka = false; 919 920 blk_mq_free_request(rq); 921 922 if (status) { 923 dev_err(ctrl->device, 924 "failed nvme_keep_alive_end_io error=%d\n", 925 status); 926 return; 927 } 928 929 ctrl->comp_seen = false; 930 spin_lock_irqsave(&ctrl->lock, flags); 931 if (ctrl->state == NVME_CTRL_LIVE || 932 ctrl->state == NVME_CTRL_CONNECTING) 933 startka = true; 934 spin_unlock_irqrestore(&ctrl->lock, flags); 935 if (startka) 936 schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); 937 } 938 939 static int nvme_keep_alive(struct nvme_ctrl *ctrl) 940 { 941 struct request *rq; 942 943 rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, BLK_MQ_REQ_RESERVED, 944 NVME_QID_ANY); 945 if (IS_ERR(rq)) 946 return PTR_ERR(rq); 947 948 rq->timeout = ctrl->kato * HZ; 949 rq->end_io_data = ctrl; 950 951 blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io); 952 953 return 0; 954 } 955 956 static void nvme_keep_alive_work(struct work_struct *work) 957 { 958 struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), 959 struct nvme_ctrl, ka_work); 960 bool comp_seen = ctrl->comp_seen; 961 962 if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) { 963 dev_dbg(ctrl->device, 964 "reschedule traffic based keep-alive timer\n"); 965 ctrl->comp_seen = false; 966 schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); 967 return; 968 } 969 970 if (nvme_keep_alive(ctrl)) { 971 /* allocation failure, reset the controller */ 972 dev_err(ctrl->device, "keep-alive failed\n"); 973 nvme_reset_ctrl(ctrl); 974 return; 975 } 976 } 977 978 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) 979 { 980 if (unlikely(ctrl->kato == 0)) 981 return; 982 983 schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); 984 } 985 986 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) 987 { 988 if (unlikely(ctrl->kato == 0)) 989 return; 990 991 cancel_delayed_work_sync(&ctrl->ka_work); 992 } 993 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive); 994 995 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) 996 { 997 struct nvme_command c = { }; 998 int error; 999 1000 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ 1001 c.identify.opcode = nvme_admin_identify; 1002 c.identify.cns = NVME_ID_CNS_CTRL; 1003 1004 *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); 1005 if (!*id) 1006 return -ENOMEM; 1007 1008 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, 
1009 sizeof(struct nvme_id_ctrl)); 1010 if (error) 1011 kfree(*id); 1012 return error; 1013 } 1014 1015 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, 1016 struct nvme_ns_ids *ids) 1017 { 1018 struct nvme_command c = { }; 1019 int status; 1020 void *data; 1021 int pos; 1022 int len; 1023 1024 c.identify.opcode = nvme_admin_identify; 1025 c.identify.nsid = cpu_to_le32(nsid); 1026 c.identify.cns = NVME_ID_CNS_NS_DESC_LIST; 1027 1028 data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); 1029 if (!data) 1030 return -ENOMEM; 1031 1032 status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data, 1033 NVME_IDENTIFY_DATA_SIZE); 1034 if (status) 1035 goto free_data; 1036 1037 for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) { 1038 struct nvme_ns_id_desc *cur = data + pos; 1039 1040 if (cur->nidl == 0) 1041 break; 1042 1043 switch (cur->nidt) { 1044 case NVME_NIDT_EUI64: 1045 if (cur->nidl != NVME_NIDT_EUI64_LEN) { 1046 dev_warn(ctrl->device, 1047 "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n", 1048 cur->nidl); 1049 goto free_data; 1050 } 1051 len = NVME_NIDT_EUI64_LEN; 1052 memcpy(ids->eui64, data + pos + sizeof(*cur), len); 1053 break; 1054 case NVME_NIDT_NGUID: 1055 if (cur->nidl != NVME_NIDT_NGUID_LEN) { 1056 dev_warn(ctrl->device, 1057 "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n", 1058 cur->nidl); 1059 goto free_data; 1060 } 1061 len = NVME_NIDT_NGUID_LEN; 1062 memcpy(ids->nguid, data + pos + sizeof(*cur), len); 1063 break; 1064 case NVME_NIDT_UUID: 1065 if (cur->nidl != NVME_NIDT_UUID_LEN) { 1066 dev_warn(ctrl->device, 1067 "ctrl returned bogus length: %d for NVME_NIDT_UUID\n", 1068 cur->nidl); 1069 goto free_data; 1070 } 1071 len = NVME_NIDT_UUID_LEN; 1072 uuid_copy(&ids->uuid, data + pos + sizeof(*cur)); 1073 break; 1074 default: 1075 /* Skip unknown types */ 1076 len = cur->nidl; 1077 break; 1078 } 1079 1080 len += sizeof(*cur); 1081 } 1082 free_data: 1083 kfree(data); 1084 return status; 1085 } 1086 1087 static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list) 1088 { 1089 struct nvme_command c = { }; 1090 1091 c.identify.opcode = nvme_admin_identify; 1092 c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST; 1093 c.identify.nsid = cpu_to_le32(nsid); 1094 return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 1095 NVME_IDENTIFY_DATA_SIZE); 1096 } 1097 1098 static int nvme_identify_ns(struct nvme_ctrl *ctrl, 1099 unsigned nsid, struct nvme_id_ns **id) 1100 { 1101 struct nvme_command c = { }; 1102 int error; 1103 1104 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ 1105 c.identify.opcode = nvme_admin_identify; 1106 c.identify.nsid = cpu_to_le32(nsid); 1107 c.identify.cns = NVME_ID_CNS_NS; 1108 1109 *id = kmalloc(sizeof(**id), GFP_KERNEL); 1110 if (!*id) 1111 return -ENOMEM; 1112 1113 error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id)); 1114 if (error) { 1115 dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); 1116 kfree(*id); 1117 } 1118 1119 return error; 1120 } 1121 1122 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, 1123 unsigned int dword11, void *buffer, size_t buflen, u32 *result) 1124 { 1125 struct nvme_command c; 1126 union nvme_result res; 1127 int ret; 1128 1129 memset(&c, 0, sizeof(c)); 1130 c.features.opcode = op; 1131 c.features.fid = cpu_to_le32(fid); 1132 c.features.dword11 = cpu_to_le32(dword11); 1133 1134 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, 1135 buffer, buflen, 0, NVME_QID_ANY, 0, 0, false); 1136 if (ret >= 0 && 
result) 1137 *result = le32_to_cpu(res.u32); 1138 return ret; 1139 } 1140 1141 int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid, 1142 unsigned int dword11, void *buffer, size_t buflen, 1143 u32 *result) 1144 { 1145 return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer, 1146 buflen, result); 1147 } 1148 EXPORT_SYMBOL_GPL(nvme_set_features); 1149 1150 int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, 1151 unsigned int dword11, void *buffer, size_t buflen, 1152 u32 *result) 1153 { 1154 return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer, 1155 buflen, result); 1156 } 1157 EXPORT_SYMBOL_GPL(nvme_get_features); 1158 1159 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) 1160 { 1161 u32 q_count = (*count - 1) | ((*count - 1) << 16); 1162 u32 result; 1163 int status, nr_io_queues; 1164 1165 status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0, 1166 &result); 1167 if (status < 0) 1168 return status; 1169 1170 /* 1171 * Degraded controllers might return an error when setting the queue 1172 * count. We still want to be able to bring them online and offer 1173 * access to the admin queue, as that might be only way to fix them up. 1174 */ 1175 if (status > 0) { 1176 dev_err(ctrl->device, "Could not set queue count (%d)\n", status); 1177 *count = 0; 1178 } else { 1179 nr_io_queues = min(result & 0xffff, result >> 16) + 1; 1180 *count = min(*count, nr_io_queues); 1181 } 1182 1183 return 0; 1184 } 1185 EXPORT_SYMBOL_GPL(nvme_set_queue_count); 1186 1187 #define NVME_AEN_SUPPORTED \ 1188 (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \ 1189 NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE) 1190 1191 static void nvme_enable_aen(struct nvme_ctrl *ctrl) 1192 { 1193 u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED; 1194 int status; 1195 1196 if (!supported_aens) 1197 return; 1198 1199 status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens, 1200 NULL, 0, &result); 1201 if (status) 1202 dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n", 1203 supported_aens); 1204 1205 queue_work(nvme_wq, &ctrl->async_event_work); 1206 } 1207 1208 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) 1209 { 1210 struct nvme_user_io io; 1211 struct nvme_command c; 1212 unsigned length, meta_len; 1213 void __user *metadata; 1214 1215 if (copy_from_user(&io, uio, sizeof(io))) 1216 return -EFAULT; 1217 if (io.flags) 1218 return -EINVAL; 1219 1220 switch (io.opcode) { 1221 case nvme_cmd_write: 1222 case nvme_cmd_read: 1223 case nvme_cmd_compare: 1224 break; 1225 default: 1226 return -EINVAL; 1227 } 1228 1229 length = (io.nblocks + 1) << ns->lba_shift; 1230 meta_len = (io.nblocks + 1) * ns->ms; 1231 metadata = (void __user *)(uintptr_t)io.metadata; 1232 1233 if (ns->ext) { 1234 length += meta_len; 1235 meta_len = 0; 1236 } else if (meta_len) { 1237 if ((io.metadata & 3) || !io.metadata) 1238 return -EINVAL; 1239 } 1240 1241 memset(&c, 0, sizeof(c)); 1242 c.rw.opcode = io.opcode; 1243 c.rw.flags = io.flags; 1244 c.rw.nsid = cpu_to_le32(ns->head->ns_id); 1245 c.rw.slba = cpu_to_le64(io.slba); 1246 c.rw.length = cpu_to_le16(io.nblocks); 1247 c.rw.control = cpu_to_le16(io.control); 1248 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); 1249 c.rw.reftag = cpu_to_le32(io.reftag); 1250 c.rw.apptag = cpu_to_le16(io.apptag); 1251 c.rw.appmask = cpu_to_le16(io.appmask); 1252 1253 return nvme_submit_user_cmd(ns->queue, &c, 1254 (void __user *)(uintptr_t)io.addr, length, 1255 metadata, meta_len, 
lower_32_bits(io.slba), NULL, 0); 1256 } 1257 1258 static u32 nvme_known_admin_effects(u8 opcode) 1259 { 1260 switch (opcode) { 1261 case nvme_admin_format_nvm: 1262 return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | 1263 NVME_CMD_EFFECTS_CSE_MASK; 1264 case nvme_admin_sanitize_nvm: 1265 return NVME_CMD_EFFECTS_CSE_MASK; 1266 default: 1267 break; 1268 } 1269 return 0; 1270 } 1271 1272 static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 1273 u8 opcode) 1274 { 1275 u32 effects = 0; 1276 1277 if (ns) { 1278 if (ctrl->effects) 1279 effects = le32_to_cpu(ctrl->effects->iocs[opcode]); 1280 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) 1281 dev_warn(ctrl->device, 1282 "IO command:%02x has unhandled effects:%08x\n", 1283 opcode, effects); 1284 return 0; 1285 } 1286 1287 if (ctrl->effects) 1288 effects = le32_to_cpu(ctrl->effects->acs[opcode]); 1289 effects |= nvme_known_admin_effects(opcode); 1290 1291 /* 1292 * For simplicity, IO to all namespaces is quiesced even if the command 1293 * effects say only one namespace is affected. 1294 */ 1295 if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { 1296 mutex_lock(&ctrl->scan_lock); 1297 mutex_lock(&ctrl->subsys->lock); 1298 nvme_mpath_start_freeze(ctrl->subsys); 1299 nvme_mpath_wait_freeze(ctrl->subsys); 1300 nvme_start_freeze(ctrl); 1301 nvme_wait_freeze(ctrl); 1302 } 1303 return effects; 1304 } 1305 1306 static void nvme_update_formats(struct nvme_ctrl *ctrl) 1307 { 1308 struct nvme_ns *ns; 1309 1310 down_read(&ctrl->namespaces_rwsem); 1311 list_for_each_entry(ns, &ctrl->namespaces, list) 1312 if (ns->disk && nvme_revalidate_disk(ns->disk)) 1313 nvme_set_queue_dying(ns); 1314 up_read(&ctrl->namespaces_rwsem); 1315 1316 nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); 1317 } 1318 1319 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) 1320 { 1321 /* 1322 * Revalidate LBA changes prior to unfreezing. This is necessary to 1323 * prevent memory corruption if a logical block size was changed by 1324 * this command. 
	 */
	if (effects & NVME_CMD_EFFECTS_LBCC)
		nvme_update_formats(ctrl);
	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
		nvme_unfreeze(ctrl);
		nvme_mpath_unfreeze(ctrl->subsys);
		mutex_unlock(&ctrl->subsys->lock);
		mutex_unlock(&ctrl->scan_lock);
	}
	if (effects & NVME_CMD_EFFECTS_CCC)
		nvme_init_identify(ctrl);
	if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC))
		nvme_queue_scan(ctrl);
}

static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
			struct nvme_passthru_cmd __user *ucmd)
{
	struct nvme_passthru_cmd cmd;
	struct nvme_command c;
	unsigned timeout = 0;
	u32 effects;
	int status;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;
	if (cmd.flags)
		return -EINVAL;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
	c.common.cdw15 = cpu_to_le32(cmd.cdw15);

	if (cmd.timeout_ms)
		timeout = msecs_to_jiffies(cmd.timeout_ms);

	effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
			(void __user *)(uintptr_t)cmd.addr, cmd.data_len,
			(void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len,
			0, &cmd.result, timeout);
	nvme_passthru_end(ctrl, effects);

	if (status >= 0) {
		if (put_user(cmd.result, &ucmd->result))
			return -EFAULT;
	}

	return status;
}

/*
 * Issue ioctl requests on the first available path. Note that unlike normal
 * block layer requests we will not retry a failed request on another
 * controller.
 */
static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
		struct nvme_ns_head **head, int *srcu_idx)
{
#ifdef CONFIG_NVME_MULTIPATH
	if (disk->fops == &nvme_ns_head_ops) {
		struct nvme_ns *ns;

		*head = disk->private_data;
		*srcu_idx = srcu_read_lock(&(*head)->srcu);
		ns = nvme_find_path(*head);
		if (!ns)
			srcu_read_unlock(&(*head)->srcu, *srcu_idx);
		return ns;
	}
#endif
	*head = NULL;
	*srcu_idx = -1;
	return disk->private_data;
}

static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
{
	if (head)
		srcu_read_unlock(&head->srcu, idx);
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
		unsigned int cmd, unsigned long arg)
{
	struct nvme_ns_head *head = NULL;
	void __user *argp = (void __user *)arg;
	struct nvme_ns *ns;
	int srcu_idx, ret;

	ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
	if (unlikely(!ns))
		return -EWOULDBLOCK;

	/*
	 * Handle ioctls that apply to the controller instead of the namespace
	 * separately and drop the ns SRCU reference early. This avoids a
	 * deadlock when deleting namespaces using the passthrough interface.
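	 * (For instance, a passthrough command that detaches or deletes this
	 * namespace would otherwise wait for the SRCU grace period while this
	 * path still holds the SRCU read lock.)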
1433 */ 1434 if (cmd == NVME_IOCTL_ADMIN_CMD || is_sed_ioctl(cmd)) { 1435 struct nvme_ctrl *ctrl = ns->ctrl; 1436 1437 nvme_get_ctrl(ns->ctrl); 1438 nvme_put_ns_from_disk(head, srcu_idx); 1439 1440 if (cmd == NVME_IOCTL_ADMIN_CMD) 1441 ret = nvme_user_cmd(ctrl, NULL, argp); 1442 else 1443 ret = sed_ioctl(ctrl->opal_dev, cmd, argp); 1444 1445 nvme_put_ctrl(ctrl); 1446 return ret; 1447 } 1448 1449 switch (cmd) { 1450 case NVME_IOCTL_ID: 1451 force_successful_syscall_return(); 1452 ret = ns->head->ns_id; 1453 break; 1454 case NVME_IOCTL_IO_CMD: 1455 ret = nvme_user_cmd(ns->ctrl, ns, argp); 1456 break; 1457 case NVME_IOCTL_SUBMIT_IO: 1458 ret = nvme_submit_io(ns, argp); 1459 break; 1460 default: 1461 if (ns->ndev) 1462 ret = nvme_nvm_ioctl(ns, cmd, arg); 1463 else 1464 ret = -ENOTTY; 1465 } 1466 1467 nvme_put_ns_from_disk(head, srcu_idx); 1468 return ret; 1469 } 1470 1471 static int nvme_open(struct block_device *bdev, fmode_t mode) 1472 { 1473 struct nvme_ns *ns = bdev->bd_disk->private_data; 1474 1475 #ifdef CONFIG_NVME_MULTIPATH 1476 /* should never be called due to GENHD_FL_HIDDEN */ 1477 if (WARN_ON_ONCE(ns->head->disk)) 1478 goto fail; 1479 #endif 1480 if (!kref_get_unless_zero(&ns->kref)) 1481 goto fail; 1482 if (!try_module_get(ns->ctrl->ops->module)) 1483 goto fail_put_ns; 1484 1485 return 0; 1486 1487 fail_put_ns: 1488 nvme_put_ns(ns); 1489 fail: 1490 return -ENXIO; 1491 } 1492 1493 static void nvme_release(struct gendisk *disk, fmode_t mode) 1494 { 1495 struct nvme_ns *ns = disk->private_data; 1496 1497 module_put(ns->ctrl->ops->module); 1498 nvme_put_ns(ns); 1499 } 1500 1501 static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) 1502 { 1503 /* some standard values */ 1504 geo->heads = 1 << 6; 1505 geo->sectors = 1 << 5; 1506 geo->cylinders = get_capacity(bdev->bd_disk) >> 11; 1507 return 0; 1508 } 1509 1510 #ifdef CONFIG_BLK_DEV_INTEGRITY 1511 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) 1512 { 1513 struct blk_integrity integrity; 1514 1515 memset(&integrity, 0, sizeof(integrity)); 1516 switch (pi_type) { 1517 case NVME_NS_DPS_PI_TYPE3: 1518 integrity.profile = &t10_pi_type3_crc; 1519 integrity.tag_size = sizeof(u16) + sizeof(u32); 1520 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; 1521 break; 1522 case NVME_NS_DPS_PI_TYPE1: 1523 case NVME_NS_DPS_PI_TYPE2: 1524 integrity.profile = &t10_pi_type1_crc; 1525 integrity.tag_size = sizeof(u16); 1526 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; 1527 break; 1528 default: 1529 integrity.profile = NULL; 1530 break; 1531 } 1532 integrity.tuple_size = ms; 1533 blk_integrity_register(disk, &integrity); 1534 blk_queue_max_integrity_segments(disk->queue, 1); 1535 } 1536 #else 1537 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) 1538 { 1539 } 1540 #endif /* CONFIG_BLK_DEV_INTEGRITY */ 1541 1542 static void nvme_set_chunk_size(struct nvme_ns *ns) 1543 { 1544 u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9)); 1545 blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); 1546 } 1547 1548 static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) 1549 { 1550 struct nvme_ctrl *ctrl = ns->ctrl; 1551 struct request_queue *queue = disk->queue; 1552 u32 size = queue_logical_block_size(queue); 1553 1554 if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) { 1555 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue); 1556 return; 1557 } 1558 1559 if (ctrl->nr_streams && ns->sws && ns->sgs) 1560 size *= ns->sws * ns->sgs; 1561 1562 BUILD_BUG_ON(PAGE_SIZE / 
		     sizeof(struct nvme_dsm_range) < NVME_DSM_MAX_RANGES);

	queue->limits.discard_alignment = 0;
	queue->limits.discard_granularity = size;

	/* If discard is already enabled, don't reset queue limits */
	if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue))
		return;

	blk_queue_max_discard_sectors(queue, UINT_MAX);
	blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);

	if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
		blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
}

static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
{
	u32 max_sectors;
	unsigned short bs = 1 << ns->lba_shift;

	if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) ||
	    (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
		return;
	/*
	 * Even though the NVMe spec explicitly states that MDTS is not
	 * applicable to Write Zeroes ("The restriction does not apply to
	 * commands that do not transfer data between the host and the
	 * controller (e.g., Write Uncorrectable or Write Zeroes command)."),
	 * be cautious and use the controller's max_hw_sectors value, which is
	 * derived from the controller's MDTS field in nvme_init_identify()
	 * if available, to configure the maximum sectors for Write Zeroes.
	 */
	if (ns->ctrl->max_hw_sectors == UINT_MAX)
		max_sectors = ((u32)(USHRT_MAX + 1) * bs) >> 9;
	else
		max_sectors = ((u32)(ns->ctrl->max_hw_sectors + 1) * bs) >> 9;

	blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors);
}

static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
		struct nvme_id_ns *id, struct nvme_ns_ids *ids)
{
	int ret = 0;

	memset(ids, 0, sizeof(*ids));

	if (ctrl->vs >= NVME_VS(1, 1, 0))
		memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
	if (ctrl->vs >= NVME_VS(1, 2, 0))
		memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
	if (ctrl->vs >= NVME_VS(1, 3, 0)) {
		/*
		 * Don't treat an error as fatal, as we potentially already
		 * have an NGUID or EUI-64.
		 */
		ret = nvme_identify_ns_descs(ctrl, nsid, ids);
		if (ret)
			dev_warn(ctrl->device,
				 "Identify Descriptors failed (%d)\n", ret);
	}
	return ret;
}

static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
{
	return !uuid_is_null(&ids->uuid) ||
		memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) ||
		memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
}

static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
{
	return uuid_equal(&a->uuid, &b->uuid) &&
		memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
		memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
}
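
/*
 * Worked example for nvme_config_write_zeroes() above (illustrative values,
 * not from the original source): with max_hw_sectors == 2048 (i.e. 1 MiB in
 * 512-byte units) and a 4096-byte LBA format (bs == 4096), the code computes
 *
 *	max_sectors = ((2048 + 1) * 4096) >> 9 = 16392
 *
 * 512-byte sectors, which is then passed to
 * blk_queue_max_write_zeroes_sectors().
 */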

static void nvme_update_disk_info(struct gendisk *disk,
		struct nvme_ns *ns, struct nvme_id_ns *id)
{
	sector_t capacity = le64_to_cpu(id->nsze) << (ns->lba_shift - 9);
	unsigned short bs = 1 << ns->lba_shift;
	u32 atomic_bs, phys_bs, io_opt;

	if (ns->lba_shift > PAGE_SHIFT) {
		/* unsupported block size, set capacity to 0 later */
		bs = (1 << 9);
	}
	blk_mq_freeze_queue(disk->queue);
	blk_integrity_unregister(disk);

	if (id->nabo == 0) {
		/*
		 * Bit 1 indicates whether NAWUPF is defined for this namespace
		 * and whether it should be used instead of AWUPF. If NAWUPF ==
		 * 0 then AWUPF must be used instead.
		 */
		if (id->nsfeat & (1 << 1) && id->nawupf)
			atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
		else
			atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
	} else {
		atomic_bs = bs;
	}
	phys_bs = bs;
	io_opt = bs;
	if (id->nsfeat & (1 << 4)) {
		/* NPWG = Namespace Preferred Write Granularity */
		phys_bs *= 1 + le16_to_cpu(id->npwg);
		/* NOWS = Namespace Optimal Write Size */
		io_opt *= 1 + le16_to_cpu(id->nows);
	}

	blk_queue_logical_block_size(disk->queue, bs);
	/*
	 * Linux filesystems assume writing a single physical block is
	 * an atomic operation. Hence limit the physical block size to the
	 * value of the Atomic Write Unit Power Fail parameter.
	 */
	blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
	blk_queue_io_min(disk->queue, phys_bs);
	blk_queue_io_opt(disk->queue, io_opt);

	if (ns->ms && !ns->ext &&
	    (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
		nvme_init_integrity(disk, ns->ms, ns->pi_type);
	if ((ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk)) ||
	    ns->lba_shift > PAGE_SHIFT)
		capacity = 0;

	set_capacity(disk, capacity);

	nvme_config_discard(disk, ns);
	nvme_config_write_zeroes(disk, ns);

	if (id->nsattr & (1 << 0))
		set_disk_ro(disk, true);
	else
		set_disk_ro(disk, false);

	blk_mq_unfreeze_queue(disk->queue);
}

static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
	struct nvme_ns *ns = disk->private_data;

	/*
	 * If identify namespace failed, use the default 512 byte block size so
	 * the block layer can use the disk before reads and writes fail for
	 * the zero capacity.
	 */
	ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
	if (ns->lba_shift == 0)
		ns->lba_shift = 9;
	ns->noiob = le16_to_cpu(id->noiob);
	ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
	ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
	/* the PI implementation requires metadata equal to the t10 pi tuple size */
	if (ns->ms == sizeof(struct t10_pi_tuple))
		ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
	else
		ns->pi_type = 0;

	if (ns->noiob)
		nvme_set_chunk_size(ns);
	nvme_update_disk_info(disk, ns, id);
#ifdef CONFIG_NVME_MULTIPATH
	if (ns->head->disk) {
		nvme_update_disk_info(ns->head->disk, ns, id);
		blk_queue_stack_limits(ns->head->disk->queue, ns->queue);
		revalidate_disk(ns->head->disk);
	}
#endif
}

static int nvme_revalidate_disk(struct gendisk *disk)
{
	struct nvme_ns *ns = disk->private_data;
	struct nvme_ctrl *ctrl = ns->ctrl;
	struct nvme_id_ns *id;
	struct nvme_ns_ids ids;
	int ret = 0;

	if (test_bit(NVME_NS_DEAD, &ns->flags)) {
		set_capacity(disk, 0);
		return -ENODEV;
	}

	ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id);
	if (ret)
		goto out;

	if (id->ncap == 0) {
		ret = -ENODEV;
		goto free_id;
	}

	__nvme_revalidate_disk(disk, id);
	ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
	if (ret)
		goto free_id;

	if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
		dev_err(ctrl->device,
			"identifiers changed for nsid %d\n", ns->head->ns_id);
		ret = -ENODEV;
	}

free_id:
	kfree(id);
out:
	/*
	 * Only fail the function if we got a fatal error back from
the 1778 * device, otherwise ignore the error and just move on. 1779 */ 1780 if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR))) 1781 ret = 0; 1782 else if (ret > 0) 1783 ret = blk_status_to_errno(nvme_error_status(ret)); 1784 return ret; 1785 } 1786 1787 static char nvme_pr_type(enum pr_type type) 1788 { 1789 switch (type) { 1790 case PR_WRITE_EXCLUSIVE: 1791 return 1; 1792 case PR_EXCLUSIVE_ACCESS: 1793 return 2; 1794 case PR_WRITE_EXCLUSIVE_REG_ONLY: 1795 return 3; 1796 case PR_EXCLUSIVE_ACCESS_REG_ONLY: 1797 return 4; 1798 case PR_WRITE_EXCLUSIVE_ALL_REGS: 1799 return 5; 1800 case PR_EXCLUSIVE_ACCESS_ALL_REGS: 1801 return 6; 1802 default: 1803 return 0; 1804 } 1805 }; 1806 1807 static int nvme_pr_command(struct block_device *bdev, u32 cdw10, 1808 u64 key, u64 sa_key, u8 op) 1809 { 1810 struct nvme_ns_head *head = NULL; 1811 struct nvme_ns *ns; 1812 struct nvme_command c; 1813 int srcu_idx, ret; 1814 u8 data[16] = { 0, }; 1815 1816 ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); 1817 if (unlikely(!ns)) 1818 return -EWOULDBLOCK; 1819 1820 put_unaligned_le64(key, &data[0]); 1821 put_unaligned_le64(sa_key, &data[8]); 1822 1823 memset(&c, 0, sizeof(c)); 1824 c.common.opcode = op; 1825 c.common.nsid = cpu_to_le32(ns->head->ns_id); 1826 c.common.cdw10 = cpu_to_le32(cdw10); 1827 1828 ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16); 1829 nvme_put_ns_from_disk(head, srcu_idx); 1830 return ret; 1831 } 1832 1833 static int nvme_pr_register(struct block_device *bdev, u64 old, 1834 u64 new, unsigned flags) 1835 { 1836 u32 cdw10; 1837 1838 if (flags & ~PR_FL_IGNORE_KEY) 1839 return -EOPNOTSUPP; 1840 1841 cdw10 = old ? 2 : 0; 1842 cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; 1843 cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ 1844 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); 1845 } 1846 1847 static int nvme_pr_reserve(struct block_device *bdev, u64 key, 1848 enum pr_type type, unsigned flags) 1849 { 1850 u32 cdw10; 1851 1852 if (flags & ~PR_FL_IGNORE_KEY) 1853 return -EOPNOTSUPP; 1854 1855 cdw10 = nvme_pr_type(type) << 8; 1856 cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); 1857 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); 1858 } 1859 1860 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, 1861 enum pr_type type, bool abort) 1862 { 1863 u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1); 1864 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); 1865 } 1866 1867 static int nvme_pr_clear(struct block_device *bdev, u64 key) 1868 { 1869 u32 cdw10 = 1 | (key ? 1 << 3 : 0); 1870 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); 1871 } 1872 1873 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) 1874 { 1875 u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 
1 << 3 : 0); 1876 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); 1877 } 1878 1879 static const struct pr_ops nvme_pr_ops = { 1880 .pr_register = nvme_pr_register, 1881 .pr_reserve = nvme_pr_reserve, 1882 .pr_release = nvme_pr_release, 1883 .pr_preempt = nvme_pr_preempt, 1884 .pr_clear = nvme_pr_clear, 1885 }; 1886 1887 #ifdef CONFIG_BLK_SED_OPAL 1888 int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, 1889 bool send) 1890 { 1891 struct nvme_ctrl *ctrl = data; 1892 struct nvme_command cmd; 1893 1894 memset(&cmd, 0, sizeof(cmd)); 1895 if (send) 1896 cmd.common.opcode = nvme_admin_security_send; 1897 else 1898 cmd.common.opcode = nvme_admin_security_recv; 1899 cmd.common.nsid = 0; 1900 cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); 1901 cmd.common.cdw11 = cpu_to_le32(len); 1902 1903 return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 1904 ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false); 1905 } 1906 EXPORT_SYMBOL_GPL(nvme_sec_submit); 1907 #endif /* CONFIG_BLK_SED_OPAL */ 1908 1909 static const struct block_device_operations nvme_fops = { 1910 .owner = THIS_MODULE, 1911 .ioctl = nvme_ioctl, 1912 .compat_ioctl = nvme_ioctl, 1913 .open = nvme_open, 1914 .release = nvme_release, 1915 .getgeo = nvme_getgeo, 1916 .revalidate_disk= nvme_revalidate_disk, 1917 .pr_ops = &nvme_pr_ops, 1918 }; 1919 1920 #ifdef CONFIG_NVME_MULTIPATH 1921 static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) 1922 { 1923 struct nvme_ns_head *head = bdev->bd_disk->private_data; 1924 1925 if (!kref_get_unless_zero(&head->ref)) 1926 return -ENXIO; 1927 return 0; 1928 } 1929 1930 static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) 1931 { 1932 nvme_put_ns_head(disk->private_data); 1933 } 1934 1935 const struct block_device_operations nvme_ns_head_ops = { 1936 .owner = THIS_MODULE, 1937 .open = nvme_ns_head_open, 1938 .release = nvme_ns_head_release, 1939 .ioctl = nvme_ioctl, 1940 .compat_ioctl = nvme_ioctl, 1941 .getgeo = nvme_getgeo, 1942 .pr_ops = &nvme_pr_ops, 1943 }; 1944 #endif /* CONFIG_NVME_MULTIPATH */ 1945 1946 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) 1947 { 1948 unsigned long timeout = 1949 ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; 1950 u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; 1951 int ret; 1952 1953 while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { 1954 if (csts == ~0) 1955 return -ENODEV; 1956 if ((csts & NVME_CSTS_RDY) == bit) 1957 break; 1958 1959 msleep(100); 1960 if (fatal_signal_pending(current)) 1961 return -EINTR; 1962 if (time_after(jiffies, timeout)) { 1963 dev_err(ctrl->device, 1964 "Device not ready; aborting %s\n", enabled ? 1965 "initialisation" : "reset"); 1966 return -ENODEV; 1967 } 1968 } 1969 1970 return ret; 1971 } 1972 1973 /* 1974 * If the device has been passed off to us in an enabled state, just clear 1975 * the enabled bit. The spec says we should set the 'shutdown notification 1976 * bits', but doing so may cause the device to complete commands to the 1977 * admin queue ... and we don't know what memory that might be pointing at! 
1978 */ 1979 int nvme_disable_ctrl(struct nvme_ctrl *ctrl) 1980 { 1981 int ret; 1982 1983 ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; 1984 ctrl->ctrl_config &= ~NVME_CC_ENABLE; 1985 1986 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); 1987 if (ret) 1988 return ret; 1989 1990 if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) 1991 msleep(NVME_QUIRK_DELAY_AMOUNT); 1992 1993 return nvme_wait_ready(ctrl, ctrl->cap, false); 1994 } 1995 EXPORT_SYMBOL_GPL(nvme_disable_ctrl); 1996 1997 int nvme_enable_ctrl(struct nvme_ctrl *ctrl) 1998 { 1999 /* 2000 * Default to a 4K page size, with the intention to update this 2001 * path in the future to accomodate architectures with differing 2002 * kernel and IO page sizes. 2003 */ 2004 unsigned dev_page_min, page_shift = 12; 2005 int ret; 2006 2007 ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); 2008 if (ret) { 2009 dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); 2010 return ret; 2011 } 2012 dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12; 2013 2014 if (page_shift < dev_page_min) { 2015 dev_err(ctrl->device, 2016 "Minimum device page size %u too large for host (%u)\n", 2017 1 << dev_page_min, 1 << page_shift); 2018 return -ENODEV; 2019 } 2020 2021 ctrl->page_size = 1 << page_shift; 2022 2023 ctrl->ctrl_config = NVME_CC_CSS_NVM; 2024 ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; 2025 ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; 2026 ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; 2027 ctrl->ctrl_config |= NVME_CC_ENABLE; 2028 2029 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); 2030 if (ret) 2031 return ret; 2032 return nvme_wait_ready(ctrl, ctrl->cap, true); 2033 } 2034 EXPORT_SYMBOL_GPL(nvme_enable_ctrl); 2035 2036 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) 2037 { 2038 unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ); 2039 u32 csts; 2040 int ret; 2041 2042 ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; 2043 ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; 2044 2045 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); 2046 if (ret) 2047 return ret; 2048 2049 while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { 2050 if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) 2051 break; 2052 2053 msleep(100); 2054 if (fatal_signal_pending(current)) 2055 return -EINTR; 2056 if (time_after(jiffies, timeout)) { 2057 dev_err(ctrl->device, 2058 "Device shutdown incomplete; abort shutdown\n"); 2059 return -ENODEV; 2060 } 2061 } 2062 2063 return ret; 2064 } 2065 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); 2066 2067 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, 2068 struct request_queue *q) 2069 { 2070 bool vwc = false; 2071 2072 if (ctrl->max_hw_sectors) { 2073 u32 max_segments = 2074 (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; 2075 2076 max_segments = min_not_zero(max_segments, ctrl->max_segments); 2077 blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); 2078 blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); 2079 } 2080 if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && 2081 is_power_of_2(ctrl->max_hw_sectors)) 2082 blk_queue_chunk_sectors(q, ctrl->max_hw_sectors); 2083 blk_queue_virt_boundary(q, ctrl->page_size - 1); 2084 if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) 2085 vwc = true; 2086 blk_queue_write_cache(q, vwc, vwc); 2087 } 2088 2089 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) 2090 { 2091 __le64 ts; 2092 int ret; 2093 2094 if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP)) 2095 return 0; 2096 2097 ts = 
cpu_to_le64(ktime_to_ms(ktime_get_real())); 2098 ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts), 2099 NULL); 2100 if (ret) 2101 dev_warn_once(ctrl->device, 2102 "could not set timestamp (%d)\n", ret); 2103 return ret; 2104 } 2105 2106 static int nvme_configure_acre(struct nvme_ctrl *ctrl) 2107 { 2108 struct nvme_feat_host_behavior *host; 2109 int ret; 2110 2111 /* Don't bother enabling the feature if retry delay is not reported */ 2112 if (!ctrl->crdt[0]) 2113 return 0; 2114 2115 host = kzalloc(sizeof(*host), GFP_KERNEL); 2116 if (!host) 2117 return 0; 2118 2119 host->acre = NVME_ENABLE_ACRE; 2120 ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0, 2121 host, sizeof(*host), NULL); 2122 kfree(host); 2123 return ret; 2124 } 2125 2126 static int nvme_configure_apst(struct nvme_ctrl *ctrl) 2127 { 2128 /* 2129 * APST (Autonomous Power State Transition) lets us program a 2130 * table of power state transitions that the controller will 2131 * perform automatically. We configure it with a simple 2132 * heuristic: we are willing to spend at most 2% of the time 2133 * transitioning between power states. Therefore, when running 2134 * in any given state, we will enter the next lower-power 2135 * non-operational state after waiting 50 * (enlat + exlat) 2136 * microseconds, as long as that state's exit latency is under 2137 * the requested maximum latency. 2138 * 2139 * We will not autonomously enter any non-operational state for 2140 * which the total latency exceeds ps_max_latency_us. Users 2141 * can set ps_max_latency_us to zero to turn off APST. 2142 */ 2143 2144 unsigned apste; 2145 struct nvme_feat_auto_pst *table; 2146 u64 max_lat_us = 0; 2147 int max_ps = -1; 2148 int ret; 2149 2150 /* 2151 * If APST isn't supported or if we haven't been initialized yet, 2152 * then don't do anything. 2153 */ 2154 if (!ctrl->apsta) 2155 return 0; 2156 2157 if (ctrl->npss > 31) { 2158 dev_warn(ctrl->device, "NPSS is invalid; not using APST\n"); 2159 return 0; 2160 } 2161 2162 table = kzalloc(sizeof(*table), GFP_KERNEL); 2163 if (!table) 2164 return 0; 2165 2166 if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) { 2167 /* Turn off APST. */ 2168 apste = 0; 2169 dev_dbg(ctrl->device, "APST disabled\n"); 2170 } else { 2171 __le64 target = cpu_to_le64(0); 2172 int state; 2173 2174 /* 2175 * Walk through all states from lowest- to highest-power. 2176 * According to the spec, lower-numbered states use more 2177 * power. NPSS, despite the name, is the index of the 2178 * lowest-power state, not the number of states. 2179 */ 2180 for (state = (int)ctrl->npss; state >= 0; state--) { 2181 u64 total_latency_us, exit_latency_us, transition_ms; 2182 2183 if (target) 2184 table->entries[state] = target; 2185 2186 /* 2187 * Don't allow transitions to the deepest state 2188 * if it's quirked off. 2189 */ 2190 if (state == ctrl->npss && 2191 (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) 2192 continue; 2193 2194 /* 2195 * Is this state a useful non-operational state for 2196 * higher-power states to autonomously transition to? 2197 */ 2198 if (!(ctrl->psd[state].flags & 2199 NVME_PS_FLAGS_NON_OP_STATE)) 2200 continue; 2201 2202 exit_latency_us = 2203 (u64)le32_to_cpu(ctrl->psd[state].exit_lat); 2204 if (exit_latency_us > ctrl->ps_max_latency_us) 2205 continue; 2206 2207 total_latency_us = 2208 exit_latency_us + 2209 le32_to_cpu(ctrl->psd[state].entry_lat); 2210 2211 /* 2212 * This state is good. Use it as the APST idle 2213 * target for higher power states. 
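			 *
			 * The value below is total_latency_us / 20, rounded up
			 * and clamped to the 24-bit field, i.e. the
			 * 50 * (enlat + exlat) microseconds from the comment at
			 * the top expressed in milliseconds.  For example,
			 * enlat + exlat = 10000 us programs a 500 ms idle time.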
2214 */ 2215 transition_ms = total_latency_us + 19; 2216 do_div(transition_ms, 20); 2217 if (transition_ms > (1 << 24) - 1) 2218 transition_ms = (1 << 24) - 1; 2219 2220 target = cpu_to_le64((state << 3) | 2221 (transition_ms << 8)); 2222 2223 if (max_ps == -1) 2224 max_ps = state; 2225 2226 if (total_latency_us > max_lat_us) 2227 max_lat_us = total_latency_us; 2228 } 2229 2230 apste = 1; 2231 2232 if (max_ps == -1) { 2233 dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n"); 2234 } else { 2235 dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n", 2236 max_ps, max_lat_us, (int)sizeof(*table), table); 2237 } 2238 } 2239 2240 ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste, 2241 table, sizeof(*table), NULL); 2242 if (ret) 2243 dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret); 2244 2245 kfree(table); 2246 return ret; 2247 } 2248 2249 static void nvme_set_latency_tolerance(struct device *dev, s32 val) 2250 { 2251 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 2252 u64 latency; 2253 2254 switch (val) { 2255 case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT: 2256 case PM_QOS_LATENCY_ANY: 2257 latency = U64_MAX; 2258 break; 2259 2260 default: 2261 latency = val; 2262 } 2263 2264 if (ctrl->ps_max_latency_us != latency) { 2265 ctrl->ps_max_latency_us = latency; 2266 nvme_configure_apst(ctrl); 2267 } 2268 } 2269 2270 struct nvme_core_quirk_entry { 2271 /* 2272 * NVMe model and firmware strings are padded with spaces. For 2273 * simplicity, strings in the quirk table are padded with NULLs 2274 * instead. 2275 */ 2276 u16 vid; 2277 const char *mn; 2278 const char *fr; 2279 unsigned long quirks; 2280 }; 2281 2282 static const struct nvme_core_quirk_entry core_quirks[] = { 2283 { 2284 /* 2285 * This Toshiba device seems to die using any APST states. See: 2286 * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11 2287 */ 2288 .vid = 0x1179, 2289 .mn = "THNSF5256GPUK TOSHIBA", 2290 .quirks = NVME_QUIRK_NO_APST, 2291 }, 2292 { 2293 /* 2294 * This LiteON CL1-3D*-Q11 firmware version has a race 2295 * condition associated with actions related to suspend to idle 2296 * LiteON has resolved the problem in future firmware 2297 */ 2298 .vid = 0x14a4, 2299 .fr = "22301111", 2300 .quirks = NVME_QUIRK_SIMPLE_SUSPEND, 2301 } 2302 }; 2303 2304 /* match is null-terminated but idstr is space-padded. 
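 * e.g. the quirk entry "THNSF5256GPUK TOSHIBA" above matches an Identify
 * model number of "THNSF5256GPUK TOSHIBA" followed by nothing but spaces
 * out to the end of the 40-byte field.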
*/ 2305 static bool string_matches(const char *idstr, const char *match, size_t len) 2306 { 2307 size_t matchlen; 2308 2309 if (!match) 2310 return true; 2311 2312 matchlen = strlen(match); 2313 WARN_ON_ONCE(matchlen > len); 2314 2315 if (memcmp(idstr, match, matchlen)) 2316 return false; 2317 2318 for (; matchlen < len; matchlen++) 2319 if (idstr[matchlen] != ' ') 2320 return false; 2321 2322 return true; 2323 } 2324 2325 static bool quirk_matches(const struct nvme_id_ctrl *id, 2326 const struct nvme_core_quirk_entry *q) 2327 { 2328 return q->vid == le16_to_cpu(id->vid) && 2329 string_matches(id->mn, q->mn, sizeof(id->mn)) && 2330 string_matches(id->fr, q->fr, sizeof(id->fr)); 2331 } 2332 2333 static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl, 2334 struct nvme_id_ctrl *id) 2335 { 2336 size_t nqnlen; 2337 int off; 2338 2339 if(!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) { 2340 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); 2341 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { 2342 strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); 2343 return; 2344 } 2345 2346 if (ctrl->vs >= NVME_VS(1, 2, 1)) 2347 dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); 2348 } 2349 2350 /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ 2351 off = snprintf(subsys->subnqn, NVMF_NQN_SIZE, 2352 "nqn.2014.08.org.nvmexpress:%04x%04x", 2353 le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); 2354 memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn)); 2355 off += sizeof(id->sn); 2356 memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn)); 2357 off += sizeof(id->mn); 2358 memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off); 2359 } 2360 2361 static void nvme_release_subsystem(struct device *dev) 2362 { 2363 struct nvme_subsystem *subsys = 2364 container_of(dev, struct nvme_subsystem, dev); 2365 2366 if (subsys->instance >= 0) 2367 ida_simple_remove(&nvme_instance_ida, subsys->instance); 2368 kfree(subsys); 2369 } 2370 2371 static void nvme_destroy_subsystem(struct kref *ref) 2372 { 2373 struct nvme_subsystem *subsys = 2374 container_of(ref, struct nvme_subsystem, ref); 2375 2376 mutex_lock(&nvme_subsystems_lock); 2377 list_del(&subsys->entry); 2378 mutex_unlock(&nvme_subsystems_lock); 2379 2380 ida_destroy(&subsys->ns_ida); 2381 device_del(&subsys->dev); 2382 put_device(&subsys->dev); 2383 } 2384 2385 static void nvme_put_subsystem(struct nvme_subsystem *subsys) 2386 { 2387 kref_put(&subsys->ref, nvme_destroy_subsystem); 2388 } 2389 2390 static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) 2391 { 2392 struct nvme_subsystem *subsys; 2393 2394 lockdep_assert_held(&nvme_subsystems_lock); 2395 2396 /* 2397 * Fail matches for discovery subsystems. This results 2398 * in each discovery controller bound to a unique subsystem. 2399 * This avoids issues with validating controller values 2400 * that can only be true when there is a single unique subsystem. 2401 * There may be multiple and completely independent entities 2402 * that provide discovery controllers. 
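	 *
	 * (All discovery controllers report the well-known NQN
	 * NVME_DISC_SUBSYS_NAME, "nqn.2014-08.org.nvmexpress.discovery",
	 * so matching on it would wrongly merge unrelated entities.)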
2403 */ 2404 if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME)) 2405 return NULL; 2406 2407 list_for_each_entry(subsys, &nvme_subsystems, entry) { 2408 if (strcmp(subsys->subnqn, subsysnqn)) 2409 continue; 2410 if (!kref_get_unless_zero(&subsys->ref)) 2411 continue; 2412 return subsys; 2413 } 2414 2415 return NULL; 2416 } 2417 2418 #define SUBSYS_ATTR_RO(_name, _mode, _show) \ 2419 struct device_attribute subsys_attr_##_name = \ 2420 __ATTR(_name, _mode, _show, NULL) 2421 2422 static ssize_t nvme_subsys_show_nqn(struct device *dev, 2423 struct device_attribute *attr, 2424 char *buf) 2425 { 2426 struct nvme_subsystem *subsys = 2427 container_of(dev, struct nvme_subsystem, dev); 2428 2429 return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn); 2430 } 2431 static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn); 2432 2433 #define nvme_subsys_show_str_function(field) \ 2434 static ssize_t subsys_##field##_show(struct device *dev, \ 2435 struct device_attribute *attr, char *buf) \ 2436 { \ 2437 struct nvme_subsystem *subsys = \ 2438 container_of(dev, struct nvme_subsystem, dev); \ 2439 return sprintf(buf, "%.*s\n", \ 2440 (int)sizeof(subsys->field), subsys->field); \ 2441 } \ 2442 static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); 2443 2444 nvme_subsys_show_str_function(model); 2445 nvme_subsys_show_str_function(serial); 2446 nvme_subsys_show_str_function(firmware_rev); 2447 2448 static struct attribute *nvme_subsys_attrs[] = { 2449 &subsys_attr_model.attr, 2450 &subsys_attr_serial.attr, 2451 &subsys_attr_firmware_rev.attr, 2452 &subsys_attr_subsysnqn.attr, 2453 #ifdef CONFIG_NVME_MULTIPATH 2454 &subsys_attr_iopolicy.attr, 2455 #endif 2456 NULL, 2457 }; 2458 2459 static struct attribute_group nvme_subsys_attrs_group = { 2460 .attrs = nvme_subsys_attrs, 2461 }; 2462 2463 static const struct attribute_group *nvme_subsys_attrs_groups[] = { 2464 &nvme_subsys_attrs_group, 2465 NULL, 2466 }; 2467 2468 static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, 2469 struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) 2470 { 2471 struct nvme_ctrl *tmp; 2472 2473 lockdep_assert_held(&nvme_subsystems_lock); 2474 2475 list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) { 2476 if (tmp->state == NVME_CTRL_DELETING || 2477 tmp->state == NVME_CTRL_DEAD) 2478 continue; 2479 2480 if (tmp->cntlid == ctrl->cntlid) { 2481 dev_err(ctrl->device, 2482 "Duplicate cntlid %u with %s, rejecting\n", 2483 ctrl->cntlid, dev_name(tmp->device)); 2484 return false; 2485 } 2486 2487 if ((id->cmic & (1 << 1)) || 2488 (ctrl->opts && ctrl->opts->discovery_nqn)) 2489 continue; 2490 2491 dev_err(ctrl->device, 2492 "Subsystem does not support multiple controllers\n"); 2493 return false; 2494 } 2495 2496 return true; 2497 } 2498 2499 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) 2500 { 2501 struct nvme_subsystem *subsys, *found; 2502 int ret; 2503 2504 subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); 2505 if (!subsys) 2506 return -ENOMEM; 2507 2508 subsys->instance = -1; 2509 mutex_init(&subsys->lock); 2510 kref_init(&subsys->ref); 2511 INIT_LIST_HEAD(&subsys->ctrls); 2512 INIT_LIST_HEAD(&subsys->nsheads); 2513 nvme_init_subnqn(subsys, ctrl, id); 2514 memcpy(subsys->serial, id->sn, sizeof(subsys->serial)); 2515 memcpy(subsys->model, id->mn, sizeof(subsys->model)); 2516 memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); 2517 subsys->vendor_id = le16_to_cpu(id->vid); 2518 subsys->cmic = id->cmic; 2519 subsys->awupf = le16_to_cpu(id->awupf); 2520 #ifdef 
CONFIG_NVME_MULTIPATH 2521 subsys->iopolicy = NVME_IOPOLICY_NUMA; 2522 #endif 2523 2524 subsys->dev.class = nvme_subsys_class; 2525 subsys->dev.release = nvme_release_subsystem; 2526 subsys->dev.groups = nvme_subsys_attrs_groups; 2527 dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance); 2528 device_initialize(&subsys->dev); 2529 2530 mutex_lock(&nvme_subsystems_lock); 2531 found = __nvme_find_get_subsystem(subsys->subnqn); 2532 if (found) { 2533 put_device(&subsys->dev); 2534 subsys = found; 2535 2536 if (!nvme_validate_cntlid(subsys, ctrl, id)) { 2537 ret = -EINVAL; 2538 goto out_put_subsystem; 2539 } 2540 } else { 2541 ret = device_add(&subsys->dev); 2542 if (ret) { 2543 dev_err(ctrl->device, 2544 "failed to register subsystem device.\n"); 2545 put_device(&subsys->dev); 2546 goto out_unlock; 2547 } 2548 ida_init(&subsys->ns_ida); 2549 list_add_tail(&subsys->entry, &nvme_subsystems); 2550 } 2551 2552 if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, 2553 dev_name(ctrl->device))) { 2554 dev_err(ctrl->device, 2555 "failed to create sysfs link from subsystem.\n"); 2556 goto out_put_subsystem; 2557 } 2558 2559 if (!found) 2560 subsys->instance = ctrl->instance; 2561 ctrl->subsys = subsys; 2562 list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); 2563 mutex_unlock(&nvme_subsystems_lock); 2564 return 0; 2565 2566 out_put_subsystem: 2567 nvme_put_subsystem(subsys); 2568 out_unlock: 2569 mutex_unlock(&nvme_subsystems_lock); 2570 return ret; 2571 } 2572 2573 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, 2574 void *log, size_t size, u64 offset) 2575 { 2576 struct nvme_command c = { }; 2577 unsigned long dwlen = size / 4 - 1; 2578 2579 c.get_log_page.opcode = nvme_admin_get_log_page; 2580 c.get_log_page.nsid = cpu_to_le32(nsid); 2581 c.get_log_page.lid = log_page; 2582 c.get_log_page.lsp = lsp; 2583 c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1)); 2584 c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); 2585 c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); 2586 c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset)); 2587 2588 return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); 2589 } 2590 2591 static int nvme_get_effects_log(struct nvme_ctrl *ctrl) 2592 { 2593 int ret; 2594 2595 if (!ctrl->effects) 2596 ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL); 2597 2598 if (!ctrl->effects) 2599 return 0; 2600 2601 ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, 2602 ctrl->effects, sizeof(*ctrl->effects), 0); 2603 if (ret) { 2604 kfree(ctrl->effects); 2605 ctrl->effects = NULL; 2606 } 2607 return ret; 2608 } 2609 2610 /* 2611 * Initialize the cached copies of the Identify data and various controller 2612 * register in our nvme_ctrl structure. This should be called as soon as 2613 * the admin queue is fully up and running. 
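 *
 * A transport's connect or reset path is expected to look roughly like the
 * following sketch (transport-specific steps elided):
 *
 *	nvme_enable_ctrl(ctrl);
 *	... set up the admin tagset and ctrl->admin_q ...
 *	nvme_init_identify(ctrl);
 *	... create the I/O queues ...
 *	nvme_start_ctrl(ctrl);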
2614 */ 2615 int nvme_init_identify(struct nvme_ctrl *ctrl) 2616 { 2617 struct nvme_id_ctrl *id; 2618 int ret, page_shift; 2619 u32 max_hw_sectors; 2620 bool prev_apst_enabled; 2621 2622 ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); 2623 if (ret) { 2624 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); 2625 return ret; 2626 } 2627 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; 2628 ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); 2629 2630 if (ctrl->vs >= NVME_VS(1, 1, 0)) 2631 ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); 2632 2633 ret = nvme_identify_ctrl(ctrl, &id); 2634 if (ret) { 2635 dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret); 2636 return -EIO; 2637 } 2638 2639 if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { 2640 ret = nvme_get_effects_log(ctrl); 2641 if (ret < 0) 2642 goto out_free; 2643 } 2644 2645 if (!(ctrl->ops->flags & NVME_F_FABRICS)) 2646 ctrl->cntlid = le16_to_cpu(id->cntlid); 2647 2648 if (!ctrl->identified) { 2649 int i; 2650 2651 ret = nvme_init_subsystem(ctrl, id); 2652 if (ret) 2653 goto out_free; 2654 2655 /* 2656 * Check for quirks. Quirk can depend on firmware version, 2657 * so, in principle, the set of quirks present can change 2658 * across a reset. As a possible future enhancement, we 2659 * could re-scan for quirks every time we reinitialize 2660 * the device, but we'd have to make sure that the driver 2661 * behaves intelligently if the quirks change. 2662 */ 2663 for (i = 0; i < ARRAY_SIZE(core_quirks); i++) { 2664 if (quirk_matches(id, &core_quirks[i])) 2665 ctrl->quirks |= core_quirks[i].quirks; 2666 } 2667 } 2668 2669 if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) { 2670 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n"); 2671 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS; 2672 } 2673 2674 ctrl->crdt[0] = le16_to_cpu(id->crdt1); 2675 ctrl->crdt[1] = le16_to_cpu(id->crdt2); 2676 ctrl->crdt[2] = le16_to_cpu(id->crdt3); 2677 2678 ctrl->oacs = le16_to_cpu(id->oacs); 2679 ctrl->oncs = le16_to_cpu(id->oncs); 2680 ctrl->mtfa = le16_to_cpu(id->mtfa); 2681 ctrl->oaes = le32_to_cpu(id->oaes); 2682 atomic_set(&ctrl->abort_limit, id->acl + 1); 2683 ctrl->vwc = id->vwc; 2684 if (id->mdts) 2685 max_hw_sectors = 1 << (id->mdts + page_shift - 9); 2686 else 2687 max_hw_sectors = UINT_MAX; 2688 ctrl->max_hw_sectors = 2689 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors); 2690 2691 nvme_set_queue_limits(ctrl, ctrl->admin_q); 2692 ctrl->sgls = le32_to_cpu(id->sgls); 2693 ctrl->kas = le16_to_cpu(id->kas); 2694 ctrl->max_namespaces = le32_to_cpu(id->mnan); 2695 ctrl->ctratt = le32_to_cpu(id->ctratt); 2696 2697 if (id->rtd3e) { 2698 /* us -> s */ 2699 u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000; 2700 2701 ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time, 2702 shutdown_timeout, 60); 2703 2704 if (ctrl->shutdown_timeout != shutdown_timeout) 2705 dev_info(ctrl->device, 2706 "Shutdown timeout set to %u seconds\n", 2707 ctrl->shutdown_timeout); 2708 } else 2709 ctrl->shutdown_timeout = shutdown_timeout; 2710 2711 ctrl->npss = id->npss; 2712 ctrl->apsta = id->apsta; 2713 prev_apst_enabled = ctrl->apst_enabled; 2714 if (ctrl->quirks & NVME_QUIRK_NO_APST) { 2715 if (force_apst && id->apsta) { 2716 dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n"); 2717 ctrl->apst_enabled = true; 2718 } else { 2719 ctrl->apst_enabled = false; 2720 } 2721 } else { 2722 ctrl->apst_enabled = id->apsta; 
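		/* a non-zero APSTA means the controller implements APST */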
2723 } 2724 memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); 2725 2726 if (ctrl->ops->flags & NVME_F_FABRICS) { 2727 ctrl->icdoff = le16_to_cpu(id->icdoff); 2728 ctrl->ioccsz = le32_to_cpu(id->ioccsz); 2729 ctrl->iorcsz = le32_to_cpu(id->iorcsz); 2730 ctrl->maxcmd = le16_to_cpu(id->maxcmd); 2731 2732 /* 2733 * In fabrics we need to verify the cntlid matches the 2734 * admin connect 2735 */ 2736 if (ctrl->cntlid != le16_to_cpu(id->cntlid)) { 2737 ret = -EINVAL; 2738 goto out_free; 2739 } 2740 2741 if (!ctrl->opts->discovery_nqn && !ctrl->kas) { 2742 dev_err(ctrl->device, 2743 "keep-alive support is mandatory for fabrics\n"); 2744 ret = -EINVAL; 2745 goto out_free; 2746 } 2747 } else { 2748 ctrl->hmpre = le32_to_cpu(id->hmpre); 2749 ctrl->hmmin = le32_to_cpu(id->hmmin); 2750 ctrl->hmminds = le32_to_cpu(id->hmminds); 2751 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); 2752 } 2753 2754 ret = nvme_mpath_init(ctrl, id); 2755 kfree(id); 2756 2757 if (ret < 0) 2758 return ret; 2759 2760 if (ctrl->apst_enabled && !prev_apst_enabled) 2761 dev_pm_qos_expose_latency_tolerance(ctrl->device); 2762 else if (!ctrl->apst_enabled && prev_apst_enabled) 2763 dev_pm_qos_hide_latency_tolerance(ctrl->device); 2764 2765 ret = nvme_configure_apst(ctrl); 2766 if (ret < 0) 2767 return ret; 2768 2769 ret = nvme_configure_timestamp(ctrl); 2770 if (ret < 0) 2771 return ret; 2772 2773 ret = nvme_configure_directives(ctrl); 2774 if (ret < 0) 2775 return ret; 2776 2777 ret = nvme_configure_acre(ctrl); 2778 if (ret < 0) 2779 return ret; 2780 2781 ctrl->identified = true; 2782 2783 return 0; 2784 2785 out_free: 2786 kfree(id); 2787 return ret; 2788 } 2789 EXPORT_SYMBOL_GPL(nvme_init_identify); 2790 2791 static int nvme_dev_open(struct inode *inode, struct file *file) 2792 { 2793 struct nvme_ctrl *ctrl = 2794 container_of(inode->i_cdev, struct nvme_ctrl, cdev); 2795 2796 switch (ctrl->state) { 2797 case NVME_CTRL_LIVE: 2798 case NVME_CTRL_ADMIN_ONLY: 2799 break; 2800 default: 2801 return -EWOULDBLOCK; 2802 } 2803 2804 file->private_data = ctrl; 2805 return 0; 2806 } 2807 2808 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) 2809 { 2810 struct nvme_ns *ns; 2811 int ret; 2812 2813 down_read(&ctrl->namespaces_rwsem); 2814 if (list_empty(&ctrl->namespaces)) { 2815 ret = -ENOTTY; 2816 goto out_unlock; 2817 } 2818 2819 ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); 2820 if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { 2821 dev_warn(ctrl->device, 2822 "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); 2823 ret = -EINVAL; 2824 goto out_unlock; 2825 } 2826 2827 dev_warn(ctrl->device, 2828 "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); 2829 kref_get(&ns->kref); 2830 up_read(&ctrl->namespaces_rwsem); 2831 2832 ret = nvme_user_cmd(ctrl, ns, argp); 2833 nvme_put_ns(ns); 2834 return ret; 2835 2836 out_unlock: 2837 up_read(&ctrl->namespaces_rwsem); 2838 return ret; 2839 } 2840 2841 static long nvme_dev_ioctl(struct file *file, unsigned int cmd, 2842 unsigned long arg) 2843 { 2844 struct nvme_ctrl *ctrl = file->private_data; 2845 void __user *argp = (void __user *)arg; 2846 2847 switch (cmd) { 2848 case NVME_IOCTL_ADMIN_CMD: 2849 return nvme_user_cmd(ctrl, NULL, argp); 2850 case NVME_IOCTL_IO_CMD: 2851 return nvme_dev_user_cmd(ctrl, argp); 2852 case NVME_IOCTL_RESET: 2853 dev_warn(ctrl->device, "resetting controller\n"); 2854 return nvme_reset_ctrl_sync(ctrl); 2855 case NVME_IOCTL_SUBSYS_RESET: 2856 return nvme_reset_subsystem(ctrl); 2857 case 
NVME_IOCTL_RESCAN: 2858 nvme_queue_scan(ctrl); 2859 return 0; 2860 default: 2861 return -ENOTTY; 2862 } 2863 } 2864 2865 static const struct file_operations nvme_dev_fops = { 2866 .owner = THIS_MODULE, 2867 .open = nvme_dev_open, 2868 .unlocked_ioctl = nvme_dev_ioctl, 2869 .compat_ioctl = nvme_dev_ioctl, 2870 }; 2871 2872 static ssize_t nvme_sysfs_reset(struct device *dev, 2873 struct device_attribute *attr, const char *buf, 2874 size_t count) 2875 { 2876 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 2877 int ret; 2878 2879 ret = nvme_reset_ctrl_sync(ctrl); 2880 if (ret < 0) 2881 return ret; 2882 return count; 2883 } 2884 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); 2885 2886 static ssize_t nvme_sysfs_rescan(struct device *dev, 2887 struct device_attribute *attr, const char *buf, 2888 size_t count) 2889 { 2890 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 2891 2892 nvme_queue_scan(ctrl); 2893 return count; 2894 } 2895 static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); 2896 2897 static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) 2898 { 2899 struct gendisk *disk = dev_to_disk(dev); 2900 2901 if (disk->fops == &nvme_fops) 2902 return nvme_get_ns_from_dev(dev)->head; 2903 else 2904 return disk->private_data; 2905 } 2906 2907 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, 2908 char *buf) 2909 { 2910 struct nvme_ns_head *head = dev_to_ns_head(dev); 2911 struct nvme_ns_ids *ids = &head->ids; 2912 struct nvme_subsystem *subsys = head->subsys; 2913 int serial_len = sizeof(subsys->serial); 2914 int model_len = sizeof(subsys->model); 2915 2916 if (!uuid_is_null(&ids->uuid)) 2917 return sprintf(buf, "uuid.%pU\n", &ids->uuid); 2918 2919 if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) 2920 return sprintf(buf, "eui.%16phN\n", ids->nguid); 2921 2922 if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) 2923 return sprintf(buf, "eui.%8phN\n", ids->eui64); 2924 2925 while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || 2926 subsys->serial[serial_len - 1] == '\0')) 2927 serial_len--; 2928 while (model_len > 0 && (subsys->model[model_len - 1] == ' ' || 2929 subsys->model[model_len - 1] == '\0')) 2930 model_len--; 2931 2932 return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, 2933 serial_len, subsys->serial, model_len, subsys->model, 2934 head->ns_id); 2935 } 2936 static DEVICE_ATTR_RO(wwid); 2937 2938 static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, 2939 char *buf) 2940 { 2941 return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); 2942 } 2943 static DEVICE_ATTR_RO(nguid); 2944 2945 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, 2946 char *buf) 2947 { 2948 struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; 2949 2950 /* For backward compatibility expose the NGUID to userspace if 2951 * we have no UUID set 2952 */ 2953 if (uuid_is_null(&ids->uuid)) { 2954 printk_ratelimited(KERN_WARNING 2955 "No UUID available providing old NGUID\n"); 2956 return sprintf(buf, "%pU\n", ids->nguid); 2957 } 2958 return sprintf(buf, "%pU\n", &ids->uuid); 2959 } 2960 static DEVICE_ATTR_RO(uuid); 2961 2962 static ssize_t eui_show(struct device *dev, struct device_attribute *attr, 2963 char *buf) 2964 { 2965 return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); 2966 } 2967 static DEVICE_ATTR_RO(eui); 2968 2969 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, 2970 char *buf) 2971 { 2972 return sprintf(buf, 
"%d\n", dev_to_ns_head(dev)->ns_id); 2973 } 2974 static DEVICE_ATTR_RO(nsid); 2975 2976 static struct attribute *nvme_ns_id_attrs[] = { 2977 &dev_attr_wwid.attr, 2978 &dev_attr_uuid.attr, 2979 &dev_attr_nguid.attr, 2980 &dev_attr_eui.attr, 2981 &dev_attr_nsid.attr, 2982 #ifdef CONFIG_NVME_MULTIPATH 2983 &dev_attr_ana_grpid.attr, 2984 &dev_attr_ana_state.attr, 2985 #endif 2986 NULL, 2987 }; 2988 2989 static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, 2990 struct attribute *a, int n) 2991 { 2992 struct device *dev = container_of(kobj, struct device, kobj); 2993 struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; 2994 2995 if (a == &dev_attr_uuid.attr) { 2996 if (uuid_is_null(&ids->uuid) && 2997 !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) 2998 return 0; 2999 } 3000 if (a == &dev_attr_nguid.attr) { 3001 if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) 3002 return 0; 3003 } 3004 if (a == &dev_attr_eui.attr) { 3005 if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) 3006 return 0; 3007 } 3008 #ifdef CONFIG_NVME_MULTIPATH 3009 if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) { 3010 if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */ 3011 return 0; 3012 if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) 3013 return 0; 3014 } 3015 #endif 3016 return a->mode; 3017 } 3018 3019 static const struct attribute_group nvme_ns_id_attr_group = { 3020 .attrs = nvme_ns_id_attrs, 3021 .is_visible = nvme_ns_id_attrs_are_visible, 3022 }; 3023 3024 const struct attribute_group *nvme_ns_id_attr_groups[] = { 3025 &nvme_ns_id_attr_group, 3026 #ifdef CONFIG_NVM 3027 &nvme_nvm_attr_group, 3028 #endif 3029 NULL, 3030 }; 3031 3032 #define nvme_show_str_function(field) \ 3033 static ssize_t field##_show(struct device *dev, \ 3034 struct device_attribute *attr, char *buf) \ 3035 { \ 3036 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ 3037 return sprintf(buf, "%.*s\n", \ 3038 (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \ 3039 } \ 3040 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); 3041 3042 nvme_show_str_function(model); 3043 nvme_show_str_function(serial); 3044 nvme_show_str_function(firmware_rev); 3045 3046 #define nvme_show_int_function(field) \ 3047 static ssize_t field##_show(struct device *dev, \ 3048 struct device_attribute *attr, char *buf) \ 3049 { \ 3050 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ 3051 return sprintf(buf, "%d\n", ctrl->field); \ 3052 } \ 3053 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); 3054 3055 nvme_show_int_function(cntlid); 3056 nvme_show_int_function(numa_node); 3057 3058 static ssize_t nvme_sysfs_delete(struct device *dev, 3059 struct device_attribute *attr, const char *buf, 3060 size_t count) 3061 { 3062 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3063 3064 if (device_remove_file_self(dev, attr)) 3065 nvme_delete_ctrl_sync(ctrl); 3066 return count; 3067 } 3068 static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); 3069 3070 static ssize_t nvme_sysfs_show_transport(struct device *dev, 3071 struct device_attribute *attr, 3072 char *buf) 3073 { 3074 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3075 3076 return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name); 3077 } 3078 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL); 3079 3080 static ssize_t nvme_sysfs_show_state(struct device *dev, 3081 struct device_attribute *attr, 3082 char *buf) 3083 { 3084 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3085 static const char *const state_name[] = { 3086 
[NVME_CTRL_NEW] = "new", 3087 [NVME_CTRL_LIVE] = "live", 3088 [NVME_CTRL_ADMIN_ONLY] = "only-admin", 3089 [NVME_CTRL_RESETTING] = "resetting", 3090 [NVME_CTRL_CONNECTING] = "connecting", 3091 [NVME_CTRL_DELETING] = "deleting", 3092 [NVME_CTRL_DEAD] = "dead", 3093 }; 3094 3095 if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) && 3096 state_name[ctrl->state]) 3097 return sprintf(buf, "%s\n", state_name[ctrl->state]); 3098 3099 return sprintf(buf, "unknown state\n"); 3100 } 3101 3102 static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL); 3103 3104 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, 3105 struct device_attribute *attr, 3106 char *buf) 3107 { 3108 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3109 3110 return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn); 3111 } 3112 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); 3113 3114 static ssize_t nvme_sysfs_show_address(struct device *dev, 3115 struct device_attribute *attr, 3116 char *buf) 3117 { 3118 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3119 3120 return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE); 3121 } 3122 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL); 3123 3124 static struct attribute *nvme_dev_attrs[] = { 3125 &dev_attr_reset_controller.attr, 3126 &dev_attr_rescan_controller.attr, 3127 &dev_attr_model.attr, 3128 &dev_attr_serial.attr, 3129 &dev_attr_firmware_rev.attr, 3130 &dev_attr_cntlid.attr, 3131 &dev_attr_delete_controller.attr, 3132 &dev_attr_transport.attr, 3133 &dev_attr_subsysnqn.attr, 3134 &dev_attr_address.attr, 3135 &dev_attr_state.attr, 3136 &dev_attr_numa_node.attr, 3137 NULL 3138 }; 3139 3140 static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, 3141 struct attribute *a, int n) 3142 { 3143 struct device *dev = container_of(kobj, struct device, kobj); 3144 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 3145 3146 if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl) 3147 return 0; 3148 if (a == &dev_attr_address.attr && !ctrl->ops->get_address) 3149 return 0; 3150 3151 return a->mode; 3152 } 3153 3154 static struct attribute_group nvme_dev_attrs_group = { 3155 .attrs = nvme_dev_attrs, 3156 .is_visible = nvme_dev_attrs_are_visible, 3157 }; 3158 3159 static const struct attribute_group *nvme_dev_attr_groups[] = { 3160 &nvme_dev_attrs_group, 3161 NULL, 3162 }; 3163 3164 static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys, 3165 unsigned nsid) 3166 { 3167 struct nvme_ns_head *h; 3168 3169 lockdep_assert_held(&subsys->lock); 3170 3171 list_for_each_entry(h, &subsys->nsheads, entry) { 3172 if (h->ns_id == nsid && kref_get_unless_zero(&h->ref)) 3173 return h; 3174 } 3175 3176 return NULL; 3177 } 3178 3179 static int __nvme_check_ids(struct nvme_subsystem *subsys, 3180 struct nvme_ns_head *new) 3181 { 3182 struct nvme_ns_head *h; 3183 3184 lockdep_assert_held(&subsys->lock); 3185 3186 list_for_each_entry(h, &subsys->nsheads, entry) { 3187 if (nvme_ns_ids_valid(&new->ids) && 3188 !list_empty(&h->list) && 3189 nvme_ns_ids_equal(&new->ids, &h->ids)) 3190 return -EINVAL; 3191 } 3192 3193 return 0; 3194 } 3195 3196 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, 3197 unsigned nsid, struct nvme_id_ns *id) 3198 { 3199 struct nvme_ns_head *head; 3200 size_t size = sizeof(*head); 3201 int ret = -ENOMEM; 3202 3203 #ifdef CONFIG_NVME_MULTIPATH 3204 size += num_possible_nodes() * sizeof(struct nvme_ns *); 3205 #endif 3206 3207 head = kzalloc(size, GFP_KERNEL); 3208 if (!head) 
3209 goto out; 3210 ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL); 3211 if (ret < 0) 3212 goto out_free_head; 3213 head->instance = ret; 3214 INIT_LIST_HEAD(&head->list); 3215 ret = init_srcu_struct(&head->srcu); 3216 if (ret) 3217 goto out_ida_remove; 3218 head->subsys = ctrl->subsys; 3219 head->ns_id = nsid; 3220 kref_init(&head->ref); 3221 3222 ret = nvme_report_ns_ids(ctrl, nsid, id, &head->ids); 3223 if (ret) 3224 goto out_cleanup_srcu; 3225 3226 ret = __nvme_check_ids(ctrl->subsys, head); 3227 if (ret) { 3228 dev_err(ctrl->device, 3229 "duplicate IDs for nsid %d\n", nsid); 3230 goto out_cleanup_srcu; 3231 } 3232 3233 ret = nvme_mpath_alloc_disk(ctrl, head); 3234 if (ret) 3235 goto out_cleanup_srcu; 3236 3237 list_add_tail(&head->entry, &ctrl->subsys->nsheads); 3238 3239 kref_get(&ctrl->subsys->ref); 3240 3241 return head; 3242 out_cleanup_srcu: 3243 cleanup_srcu_struct(&head->srcu); 3244 out_ida_remove: 3245 ida_simple_remove(&ctrl->subsys->ns_ida, head->instance); 3246 out_free_head: 3247 kfree(head); 3248 out: 3249 if (ret > 0) 3250 ret = blk_status_to_errno(nvme_error_status(ret)); 3251 return ERR_PTR(ret); 3252 } 3253 3254 static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, 3255 struct nvme_id_ns *id) 3256 { 3257 struct nvme_ctrl *ctrl = ns->ctrl; 3258 bool is_shared = id->nmic & (1 << 0); 3259 struct nvme_ns_head *head = NULL; 3260 int ret = 0; 3261 3262 mutex_lock(&ctrl->subsys->lock); 3263 if (is_shared) 3264 head = __nvme_find_ns_head(ctrl->subsys, nsid); 3265 if (!head) { 3266 head = nvme_alloc_ns_head(ctrl, nsid, id); 3267 if (IS_ERR(head)) { 3268 ret = PTR_ERR(head); 3269 goto out_unlock; 3270 } 3271 } else { 3272 struct nvme_ns_ids ids; 3273 3274 ret = nvme_report_ns_ids(ctrl, nsid, id, &ids); 3275 if (ret) 3276 goto out_unlock; 3277 3278 if (!nvme_ns_ids_equal(&head->ids, &ids)) { 3279 dev_err(ctrl->device, 3280 "IDs don't match for shared namespace %d\n", 3281 nsid); 3282 ret = -EINVAL; 3283 goto out_unlock; 3284 } 3285 } 3286 3287 list_add_tail(&ns->siblings, &head->list); 3288 ns->head = head; 3289 3290 out_unlock: 3291 mutex_unlock(&ctrl->subsys->lock); 3292 if (ret > 0) 3293 ret = blk_status_to_errno(nvme_error_status(ret)); 3294 return ret; 3295 } 3296 3297 static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) 3298 { 3299 struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); 3300 struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); 3301 3302 return nsa->head->ns_id - nsb->head->ns_id; 3303 } 3304 3305 static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) 3306 { 3307 struct nvme_ns *ns, *ret = NULL; 3308 3309 down_read(&ctrl->namespaces_rwsem); 3310 list_for_each_entry(ns, &ctrl->namespaces, list) { 3311 if (ns->head->ns_id == nsid) { 3312 if (!kref_get_unless_zero(&ns->kref)) 3313 continue; 3314 ret = ns; 3315 break; 3316 } 3317 if (ns->head->ns_id > nsid) 3318 break; 3319 } 3320 up_read(&ctrl->namespaces_rwsem); 3321 return ret; 3322 } 3323 3324 static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) 3325 { 3326 struct streams_directive_params s; 3327 int ret; 3328 3329 if (!ctrl->nr_streams) 3330 return 0; 3331 3332 ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id); 3333 if (ret) 3334 return ret; 3335 3336 ns->sws = le32_to_cpu(s.sws); 3337 ns->sgs = le16_to_cpu(s.sgs); 3338 3339 if (ns->sws) { 3340 unsigned int bs = 1 << ns->lba_shift; 3341 3342 blk_queue_io_min(ns->queue, bs * ns->sws); 3343 if (ns->sgs) 3344 blk_queue_io_opt(ns->queue, bs * ns->sws 
* ns->sgs); 3345 } 3346 3347 return 0; 3348 } 3349 3350 static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) 3351 { 3352 struct nvme_ns *ns; 3353 struct gendisk *disk; 3354 struct nvme_id_ns *id; 3355 char disk_name[DISK_NAME_LEN]; 3356 int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret; 3357 3358 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); 3359 if (!ns) 3360 return -ENOMEM; 3361 3362 ns->queue = blk_mq_init_queue(ctrl->tagset); 3363 if (IS_ERR(ns->queue)) { 3364 ret = PTR_ERR(ns->queue); 3365 goto out_free_ns; 3366 } 3367 3368 if (ctrl->opts && ctrl->opts->data_digest) 3369 ns->queue->backing_dev_info->capabilities 3370 |= BDI_CAP_STABLE_WRITES; 3371 3372 blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); 3373 if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) 3374 blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); 3375 3376 ns->queue->queuedata = ns; 3377 ns->ctrl = ctrl; 3378 3379 kref_init(&ns->kref); 3380 ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ 3381 3382 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); 3383 nvme_set_queue_limits(ctrl, ns->queue); 3384 3385 ret = nvme_identify_ns(ctrl, nsid, &id); 3386 if (ret) 3387 goto out_free_queue; 3388 3389 if (id->ncap == 0) { 3390 ret = -EINVAL; 3391 goto out_free_id; 3392 } 3393 3394 ret = nvme_init_ns_head(ns, nsid, id); 3395 if (ret) 3396 goto out_free_id; 3397 nvme_setup_streams_ns(ctrl, ns); 3398 nvme_set_disk_name(disk_name, ns, ctrl, &flags); 3399 3400 disk = alloc_disk_node(0, node); 3401 if (!disk) { 3402 ret = -ENOMEM; 3403 goto out_unlink_ns; 3404 } 3405 3406 disk->fops = &nvme_fops; 3407 disk->private_data = ns; 3408 disk->queue = ns->queue; 3409 disk->flags = flags; 3410 memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); 3411 ns->disk = disk; 3412 3413 __nvme_revalidate_disk(disk, id); 3414 3415 if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { 3416 ret = nvme_nvm_register(ns, disk_name, node); 3417 if (ret) { 3418 dev_warn(ctrl->device, "LightNVM init failure\n"); 3419 goto out_put_disk; 3420 } 3421 } 3422 3423 down_write(&ctrl->namespaces_rwsem); 3424 list_add_tail(&ns->list, &ctrl->namespaces); 3425 up_write(&ctrl->namespaces_rwsem); 3426 3427 nvme_get_ctrl(ctrl); 3428 3429 device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups); 3430 3431 nvme_mpath_add_disk(ns, id); 3432 nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name); 3433 kfree(id); 3434 3435 return 0; 3436 out_put_disk: 3437 put_disk(ns->disk); 3438 out_unlink_ns: 3439 mutex_lock(&ctrl->subsys->lock); 3440 list_del_rcu(&ns->siblings); 3441 mutex_unlock(&ctrl->subsys->lock); 3442 nvme_put_ns_head(ns->head); 3443 out_free_id: 3444 kfree(id); 3445 out_free_queue: 3446 blk_cleanup_queue(ns->queue); 3447 out_free_ns: 3448 kfree(ns); 3449 if (ret > 0) 3450 ret = blk_status_to_errno(nvme_error_status(ret)); 3451 return ret; 3452 } 3453 3454 static void nvme_ns_remove(struct nvme_ns *ns) 3455 { 3456 if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) 3457 return; 3458 3459 nvme_fault_inject_fini(&ns->fault_inject); 3460 3461 mutex_lock(&ns->ctrl->subsys->lock); 3462 list_del_rcu(&ns->siblings); 3463 mutex_unlock(&ns->ctrl->subsys->lock); 3464 synchronize_rcu(); /* guarantee not available in head->list */ 3465 nvme_mpath_clear_current_path(ns); 3466 synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ 3467 3468 if (ns->disk && ns->disk->flags & GENHD_FL_UP) { 3469 del_gendisk(ns->disk); 3470 blk_cleanup_queue(ns->queue); 3471 if (blk_get_integrity(ns->disk)) 
3472 blk_integrity_unregister(ns->disk); 3473 } 3474 3475 down_write(&ns->ctrl->namespaces_rwsem); 3476 list_del_init(&ns->list); 3477 up_write(&ns->ctrl->namespaces_rwsem); 3478 3479 nvme_mpath_check_last_path(ns); 3480 nvme_put_ns(ns); 3481 } 3482 3483 static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) 3484 { 3485 struct nvme_ns *ns; 3486 3487 ns = nvme_find_get_ns(ctrl, nsid); 3488 if (ns) { 3489 if (ns->disk && revalidate_disk(ns->disk)) 3490 nvme_ns_remove(ns); 3491 nvme_put_ns(ns); 3492 } else 3493 nvme_alloc_ns(ctrl, nsid); 3494 } 3495 3496 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, 3497 unsigned nsid) 3498 { 3499 struct nvme_ns *ns, *next; 3500 LIST_HEAD(rm_list); 3501 3502 down_write(&ctrl->namespaces_rwsem); 3503 list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { 3504 if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags)) 3505 list_move_tail(&ns->list, &rm_list); 3506 } 3507 up_write(&ctrl->namespaces_rwsem); 3508 3509 list_for_each_entry_safe(ns, next, &rm_list, list) 3510 nvme_ns_remove(ns); 3511 3512 } 3513 3514 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) 3515 { 3516 struct nvme_ns *ns; 3517 __le32 *ns_list; 3518 unsigned i, j, nsid, prev = 0; 3519 unsigned num_lists = DIV_ROUND_UP_ULL((u64)nn, 1024); 3520 int ret = 0; 3521 3522 ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); 3523 if (!ns_list) 3524 return -ENOMEM; 3525 3526 for (i = 0; i < num_lists; i++) { 3527 ret = nvme_identify_ns_list(ctrl, prev, ns_list); 3528 if (ret) 3529 goto free; 3530 3531 for (j = 0; j < min(nn, 1024U); j++) { 3532 nsid = le32_to_cpu(ns_list[j]); 3533 if (!nsid) 3534 goto out; 3535 3536 nvme_validate_ns(ctrl, nsid); 3537 3538 while (++prev < nsid) { 3539 ns = nvme_find_get_ns(ctrl, prev); 3540 if (ns) { 3541 nvme_ns_remove(ns); 3542 nvme_put_ns(ns); 3543 } 3544 } 3545 } 3546 nn -= j; 3547 } 3548 out: 3549 nvme_remove_invalid_namespaces(ctrl, prev); 3550 free: 3551 kfree(ns_list); 3552 return ret; 3553 } 3554 3555 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn) 3556 { 3557 unsigned i; 3558 3559 for (i = 1; i <= nn; i++) 3560 nvme_validate_ns(ctrl, i); 3561 3562 nvme_remove_invalid_namespaces(ctrl, nn); 3563 } 3564 3565 static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) 3566 { 3567 size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32); 3568 __le32 *log; 3569 int error; 3570 3571 log = kzalloc(log_size, GFP_KERNEL); 3572 if (!log) 3573 return; 3574 3575 /* 3576 * We need to read the log to clear the AEN, but we don't want to rely 3577 * on it for the changed namespace information as userspace could have 3578 * raced with us in reading the log page, which could cause us to miss 3579 * updates. 
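	 * The caller (nvme_scan_work()) therefore always performs a full
	 * rescan, and the buffer read here is simply discarded.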
3580 */ 3581 error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log, 3582 log_size, 0); 3583 if (error) 3584 dev_warn(ctrl->device, 3585 "reading changed ns log failed: %d\n", error); 3586 3587 kfree(log); 3588 } 3589 3590 static void nvme_scan_work(struct work_struct *work) 3591 { 3592 struct nvme_ctrl *ctrl = 3593 container_of(work, struct nvme_ctrl, scan_work); 3594 struct nvme_id_ctrl *id; 3595 unsigned nn; 3596 3597 if (ctrl->state != NVME_CTRL_LIVE) 3598 return; 3599 3600 WARN_ON_ONCE(!ctrl->tagset); 3601 3602 if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) { 3603 dev_info(ctrl->device, "rescanning namespaces.\n"); 3604 nvme_clear_changed_ns_log(ctrl); 3605 } 3606 3607 if (nvme_identify_ctrl(ctrl, &id)) 3608 return; 3609 3610 mutex_lock(&ctrl->scan_lock); 3611 nn = le32_to_cpu(id->nn); 3612 if (ctrl->vs >= NVME_VS(1, 1, 0) && 3613 !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) { 3614 if (!nvme_scan_ns_list(ctrl, nn)) 3615 goto out_free_id; 3616 } 3617 nvme_scan_ns_sequential(ctrl, nn); 3618 out_free_id: 3619 mutex_unlock(&ctrl->scan_lock); 3620 kfree(id); 3621 down_write(&ctrl->namespaces_rwsem); 3622 list_sort(NULL, &ctrl->namespaces, ns_cmp); 3623 up_write(&ctrl->namespaces_rwsem); 3624 } 3625 3626 /* 3627 * This function iterates the namespace list unlocked to allow recovery from 3628 * controller failure. It is up to the caller to ensure the namespace list is 3629 * not modified by scan work while this function is executing. 3630 */ 3631 void nvme_remove_namespaces(struct nvme_ctrl *ctrl) 3632 { 3633 struct nvme_ns *ns, *next; 3634 LIST_HEAD(ns_list); 3635 3636 /* 3637 * make sure to requeue I/O to all namespaces as these 3638 * might result from the scan itself and must complete 3639 * for the scan_work to make progress 3640 */ 3641 nvme_mpath_clear_ctrl_paths(ctrl); 3642 3643 /* prevent racing with ns scanning */ 3644 flush_work(&ctrl->scan_work); 3645 3646 /* 3647 * The dead states indicates the controller was not gracefully 3648 * disconnected. In that case, we won't be able to flush any data while 3649 * removing the namespaces' disks; fail all the queues now to avoid 3650 * potentially having to clean up the failed sync later. 
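	 * nvme_kill_queues() marks every namespace queue dying so that
	 * outstanding and future I/O fails instead of blocking forever.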
	 */
	if (ctrl->state == NVME_CTRL_DEAD)
		nvme_kill_queues(ctrl);

	down_write(&ctrl->namespaces_rwsem);
	list_splice_init(&ctrl->namespaces, &ns_list);
	up_write(&ctrl->namespaces_rwsem);

	list_for_each_entry_safe(ns, next, &ns_list, list)
		nvme_ns_remove(ns);
}
EXPORT_SYMBOL_GPL(nvme_remove_namespaces);

static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
{
	struct nvme_ctrl *ctrl =
		container_of(dev, struct nvme_ctrl, ctrl_device);
	struct nvmf_ctrl_options *opts = ctrl->opts;
	int ret;

	ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
	if (ret)
		return ret;

	if (opts) {
		ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
		if (ret)
			return ret;

		ret = add_uevent_var(env, "NVME_TRSVCID=%s",
				opts->trsvcid ?: "none");
		if (ret)
			return ret;

		ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
				opts->host_traddr ?: "none");
	}
	return ret;
}

static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
{
	char *envp[2] = { NULL, NULL };
	u32 aen_result = ctrl->aen_result;

	ctrl->aen_result = 0;
	if (!aen_result)
		return;

	envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
	if (!envp[0])
		return;
	kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
	kfree(envp[0]);
}

static void nvme_async_event_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, async_event_work);

	nvme_aen_uevent(ctrl);
	ctrl->ops->submit_async_event(ctrl);
}

static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
{
	u32 csts;

	if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
		return false;

	if (csts == ~0)
		return false;

	return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
}

static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
{
	struct nvme_fw_slot_info_log *log;

	log = kmalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return;

	if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log,
			sizeof(*log), 0))
		dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
	kfree(log);
}

static void nvme_fw_act_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work,
				struct nvme_ctrl, fw_act_work);
	unsigned long fw_act_timeout;

	if (ctrl->mtfa)
		fw_act_timeout = jiffies +
				msecs_to_jiffies(ctrl->mtfa * 100);
	else
		fw_act_timeout = jiffies +
				msecs_to_jiffies(admin_timeout * 1000);

	nvme_stop_queues(ctrl);
	while (nvme_ctrl_pp_status(ctrl)) {
		if (time_after(jiffies, fw_act_timeout)) {
			dev_warn(ctrl->device,
				"Fw activation timeout, reset controller\n");
			nvme_reset_ctrl(ctrl);
			break;
		}
		msleep(100);
	}

	if (ctrl->state != NVME_CTRL_LIVE)
		return;

	nvme_start_queues(ctrl);
	/* read FW slot information to clear the AER */
	nvme_get_fw_slot_info(ctrl);
}

static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
{
	u32 aer_notice_type = (result & 0xff00) >> 8;

	trace_nvme_async_event(ctrl, aer_notice_type);

	switch (aer_notice_type) {
	case NVME_AER_NOTICE_NS_CHANGED:
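		/*
		 * Remember the event so the scan worker knows to clear the
		 * Changed Namespace List log before rescanning.
		 */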
set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events); 3785 nvme_queue_scan(ctrl); 3786 break; 3787 case NVME_AER_NOTICE_FW_ACT_STARTING: 3788 queue_work(nvme_wq, &ctrl->fw_act_work); 3789 break; 3790 #ifdef CONFIG_NVME_MULTIPATH 3791 case NVME_AER_NOTICE_ANA: 3792 if (!ctrl->ana_log_buf) 3793 break; 3794 queue_work(nvme_wq, &ctrl->ana_work); 3795 break; 3796 #endif 3797 case NVME_AER_NOTICE_DISC_CHANGED: 3798 ctrl->aen_result = result; 3799 break; 3800 default: 3801 dev_warn(ctrl->device, "async event result %08x\n", result); 3802 } 3803 } 3804 3805 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, 3806 volatile union nvme_result *res) 3807 { 3808 u32 result = le32_to_cpu(res->u32); 3809 u32 aer_type = result & 0x07; 3810 3811 if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) 3812 return; 3813 3814 switch (aer_type) { 3815 case NVME_AER_NOTICE: 3816 nvme_handle_aen_notice(ctrl, result); 3817 break; 3818 case NVME_AER_ERROR: 3819 case NVME_AER_SMART: 3820 case NVME_AER_CSS: 3821 case NVME_AER_VS: 3822 trace_nvme_async_event(ctrl, aer_type); 3823 ctrl->aen_result = result; 3824 break; 3825 default: 3826 break; 3827 } 3828 queue_work(nvme_wq, &ctrl->async_event_work); 3829 } 3830 EXPORT_SYMBOL_GPL(nvme_complete_async_event); 3831 3832 void nvme_stop_ctrl(struct nvme_ctrl *ctrl) 3833 { 3834 nvme_mpath_stop(ctrl); 3835 nvme_stop_keep_alive(ctrl); 3836 flush_work(&ctrl->async_event_work); 3837 cancel_work_sync(&ctrl->fw_act_work); 3838 } 3839 EXPORT_SYMBOL_GPL(nvme_stop_ctrl); 3840 3841 void nvme_start_ctrl(struct nvme_ctrl *ctrl) 3842 { 3843 if (ctrl->kato) 3844 nvme_start_keep_alive(ctrl); 3845 3846 nvme_enable_aen(ctrl); 3847 3848 if (ctrl->queue_count > 1) { 3849 nvme_queue_scan(ctrl); 3850 nvme_start_queues(ctrl); 3851 } 3852 } 3853 EXPORT_SYMBOL_GPL(nvme_start_ctrl); 3854 3855 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) 3856 { 3857 nvme_fault_inject_fini(&ctrl->fault_inject); 3858 dev_pm_qos_hide_latency_tolerance(ctrl->device); 3859 cdev_device_del(&ctrl->cdev, ctrl->device); 3860 } 3861 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); 3862 3863 static void nvme_free_ctrl(struct device *dev) 3864 { 3865 struct nvme_ctrl *ctrl = 3866 container_of(dev, struct nvme_ctrl, ctrl_device); 3867 struct nvme_subsystem *subsys = ctrl->subsys; 3868 3869 if (subsys && ctrl->instance != subsys->instance) 3870 ida_simple_remove(&nvme_instance_ida, ctrl->instance); 3871 3872 kfree(ctrl->effects); 3873 nvme_mpath_uninit(ctrl); 3874 __free_page(ctrl->discard_page); 3875 3876 if (subsys) { 3877 mutex_lock(&nvme_subsystems_lock); 3878 list_del(&ctrl->subsys_entry); 3879 sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device)); 3880 mutex_unlock(&nvme_subsystems_lock); 3881 } 3882 3883 ctrl->ops->free_ctrl(ctrl); 3884 3885 if (subsys) 3886 nvme_put_subsystem(subsys); 3887 } 3888 3889 /* 3890 * Initialize a NVMe controller structures. This needs to be called during 3891 * earliest initialization so that we have the initialized structured around 3892 * during probing. 
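 *
 * A transport driver typically calls this from its probe routine before it
 * touches the hardware, e.g. (illustrative only; "mydev" and the ops
 * structure are placeholders, not symbols defined in this file):
 *
 *	ret = nvme_init_ctrl(&mydev->ctrl, &pdev->dev, &my_ctrl_ops, quirks);
 *	if (ret)
 *		return ret;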
3893 */ 3894 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, 3895 const struct nvme_ctrl_ops *ops, unsigned long quirks) 3896 { 3897 int ret; 3898 3899 ctrl->state = NVME_CTRL_NEW; 3900 spin_lock_init(&ctrl->lock); 3901 mutex_init(&ctrl->scan_lock); 3902 INIT_LIST_HEAD(&ctrl->namespaces); 3903 init_rwsem(&ctrl->namespaces_rwsem); 3904 ctrl->dev = dev; 3905 ctrl->ops = ops; 3906 ctrl->quirks = quirks; 3907 INIT_WORK(&ctrl->scan_work, nvme_scan_work); 3908 INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); 3909 INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); 3910 INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); 3911 3912 INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); 3913 memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); 3914 ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; 3915 3916 BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) > 3917 PAGE_SIZE); 3918 ctrl->discard_page = alloc_page(GFP_KERNEL); 3919 if (!ctrl->discard_page) { 3920 ret = -ENOMEM; 3921 goto out; 3922 } 3923 3924 ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); 3925 if (ret < 0) 3926 goto out; 3927 ctrl->instance = ret; 3928 3929 device_initialize(&ctrl->ctrl_device); 3930 ctrl->device = &ctrl->ctrl_device; 3931 ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance); 3932 ctrl->device->class = nvme_class; 3933 ctrl->device->parent = ctrl->dev; 3934 ctrl->device->groups = nvme_dev_attr_groups; 3935 ctrl->device->release = nvme_free_ctrl; 3936 dev_set_drvdata(ctrl->device, ctrl); 3937 ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); 3938 if (ret) 3939 goto out_release_instance; 3940 3941 cdev_init(&ctrl->cdev, &nvme_dev_fops); 3942 ctrl->cdev.owner = ops->module; 3943 ret = cdev_device_add(&ctrl->cdev, ctrl->device); 3944 if (ret) 3945 goto out_free_name; 3946 3947 /* 3948 * Initialize latency tolerance controls. The sysfs files won't 3949 * be visible to userspace unless the device actually supports APST. 3950 */ 3951 ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance; 3952 dev_pm_qos_update_user_latency_tolerance(ctrl->device, 3953 min(default_ps_max_latency_us, (unsigned long)S32_MAX)); 3954 3955 nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device)); 3956 3957 return 0; 3958 out_free_name: 3959 kfree_const(ctrl->device->kobj.name); 3960 out_release_instance: 3961 ida_simple_remove(&nvme_instance_ida, ctrl->instance); 3962 out: 3963 if (ctrl->discard_page) 3964 __free_page(ctrl->discard_page); 3965 return ret; 3966 } 3967 EXPORT_SYMBOL_GPL(nvme_init_ctrl); 3968 3969 /** 3970 * nvme_kill_queues(): Ends all namespace queues 3971 * @ctrl: the dead controller that needs to end 3972 * 3973 * Call this function when the driver determines it is unable to get the 3974 * controller in a state capable of servicing IO. 
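 *
 * This forcibly unquiesces the admin queue and marks every namespace
 * queue as dying, so blocked submitters see errors instead of hanging.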

/**
 * nvme_kill_queues(): Ends all namespace queues
 * @ctrl: the dead controller whose queues need to be ended
 *
 * Call this function when the driver determines it is unable to get the
 * controller in a state capable of servicing IO.
 */
void nvme_kill_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);

	/* Forcibly unquiesce queues to avoid blocking dispatch */
	if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
		blk_mq_unquiesce_queue(ctrl->admin_q);

	list_for_each_entry(ns, &ctrl->namespaces, list)
		nvme_set_queue_dying(ns);

	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_kill_queues);

void nvme_unfreeze(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_mq_unfreeze_queue(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_unfreeze);

void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
		if (timeout <= 0)
			break;
	}
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);

void nvme_wait_freeze(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_mq_freeze_queue_wait(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze);

void nvme_start_freeze(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_freeze_queue_start(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);

void nvme_stop_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_mq_quiesce_queue(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_stop_queues);

void nvme_start_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_mq_unquiesce_queue(ns->queue);
	up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_start_queues);

void nvme_sync_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_sync_queue(ns->queue);
	up_read(&ctrl->namespaces_rwsem);

	if (ctrl->admin_q)
		blk_sync_queue(ctrl->admin_q);
}
EXPORT_SYMBOL_GPL(nvme_sync_queues);
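
/*
 * Illustrative sketch, not part of this file: how a transport typically
 * pairs the freeze/quiesce helpers above across a controller reset that may
 * change the I/O queue configuration.  foo_reset() and foo_reinit_hardware()
 * are hypothetical; NVME_IO_TIMEOUT and the nvme_* helpers are real.  Kept
 * out of the build with #if 0.
 */
#if 0
static void foo_reset(struct nvme_ctrl *ctrl)
{
	/* Stop taking new requests and give in-flight I/O a chance to drain. */
	nvme_start_freeze(ctrl);
	nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT);
	nvme_stop_queues(ctrl);

	foo_reinit_hardware(ctrl);

	/* Let dispatch resume, wait for the freeze to settle, then unfreeze. */
	nvme_start_queues(ctrl);
	nvme_wait_freeze(ctrl);
	nvme_unfreeze(ctrl);
}
#endif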

/*
 * Check we didn't inadvertently grow the command structure sizes:
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
}

static int __init nvme_core_init(void)
{
	int result = -ENOMEM;

	_nvme_check_size();

	nvme_wq = alloc_workqueue("nvme-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!nvme_wq)
		goto out;

	nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!nvme_reset_wq)
		goto destroy_wq;

	nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!nvme_delete_wq)
		goto destroy_reset_wq;

	result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
	if (result < 0)
		goto destroy_delete_wq;

	nvme_class = class_create(THIS_MODULE, "nvme");
	if (IS_ERR(nvme_class)) {
		result = PTR_ERR(nvme_class);
		goto unregister_chrdev;
	}
	nvme_class->dev_uevent = nvme_class_uevent;

	nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
	if (IS_ERR(nvme_subsys_class)) {
		result = PTR_ERR(nvme_subsys_class);
		goto destroy_class;
	}
	return 0;

destroy_class:
	class_destroy(nvme_class);
unregister_chrdev:
	unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
destroy_delete_wq:
	destroy_workqueue(nvme_delete_wq);
destroy_reset_wq:
	destroy_workqueue(nvme_reset_wq);
destroy_wq:
	destroy_workqueue(nvme_wq);
out:
	return result;
}

static void __exit nvme_core_exit(void)
{
	class_destroy(nvme_subsys_class);
	class_destroy(nvme_class);
	unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
	destroy_workqueue(nvme_delete_wq);
	destroy_workqueue(nvme_reset_wq);
	destroy_workqueue(nvme_wq);
}

MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_core_init);
module_exit(nvme_core_exit);
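
/*
 * Illustrative sketch, not part of this file: the same compile-time
 * technique _nvme_check_size() uses, applied to a hypothetical new 64-byte
 * submission queue entry.  struct nvme_foo_cmd and foo_check_size() are
 * made-up names; BUILD_BUG_ON() simply fails the build if the on-the-wire
 * layout drifts from the expected size.  Kept out of the build with #if 0.
 */
#if 0
struct nvme_foo_cmd {
	__u8			opcode;
	__u8			flags;
	__u16			command_id;
	__le32			nsid;
	__u64			rsvd2[2];
	union nvme_data_ptr	dptr;
	__le32			cdw10[6];
};

static inline void foo_check_size(void)
{
	/* Submission queue entries must stay exactly 64 bytes. */
	BUILD_BUG_ON(sizeof(struct nvme_foo_cmd) != 64);
}
#endif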