/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/hdreg.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/list_sort.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
#include <linux/pr.h>
#include <scsi/sg.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <asm/unaligned.h>

#include <uapi/linux/nvme_ioctl.h>
#include "nvme.h"

#define NVME_MINORS		(1U << MINORBITS)
#define NVME_Q_DEPTH		1024
#define NVME_AQ_DEPTH		256
#define SQ_SIZE(depth)		((depth) * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		((depth) * sizeof(struct nvme_completion))
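/*
 * Illustrative sizing, assuming the 64-byte submission and 16-byte
 * completion entries that _nvme_check_size() below enforces: the default
 * admin queue (NVME_AQ_DEPTH == 256) needs SQ_SIZE(256) == 16KB of
 * submission ring and CQ_SIZE(256) == 4KB of completion ring, while a
 * full-depth I/O queue (NVME_Q_DEPTH == 1024) needs 64KB and 16KB
 * respectively, all allocated as physically contiguous DMA memory.
 */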
unsigned char admin_timeout = 60;
module_param(admin_timeout, byte, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");

unsigned char nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");

unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");

static int nvme_major;
module_param(nvme_major, int, 0);

static int nvme_char_major;
module_param(nvme_char_major, int, 0);

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0644);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;
static struct workqueue_struct *nvme_workq;
static wait_queue_head_t nvme_kthread_wait;

static struct class *nvme_class;

struct nvme_dev;
struct nvme_queue;
struct nvme_iod;

static int __nvme_reset(struct nvme_dev *dev);
static int nvme_reset(struct nvme_dev *dev);
static void nvme_process_cq(struct nvme_queue *nvmeq);
static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_iod *iod);
static void nvme_dead_ctrl(struct nvme_dev *dev);

struct async_cmd_info {
	struct kthread_work work;
	struct kthread_worker *worker;
	struct request *req;
	u32 result;
	int status;
	void *ctx;
};

/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct list_head node;
	struct nvme_queue **queues;
	struct blk_mq_tag_set tagset;
	struct blk_mq_tag_set admin_tagset;
	u32 __iomem *dbs;
	struct device *dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	unsigned queue_count;
	unsigned online_queues;
	unsigned max_qid;
	int q_depth;
	u32 db_stride;
	struct msix_entry *entry;
	void __iomem *bar;
	struct list_head namespaces;
	struct device *device;
	struct work_struct reset_work;
	struct work_struct probe_work;
	struct work_struct scan_work;
	bool subsystem;
	void __iomem *cmb;
	dma_addr_t cmb_dma_addr;
	u64 cmb_size;
	u32 cmbsz;

	struct nvme_ctrl ctrl;
};

static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_dev, ctrl);
}

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	struct nvme_dev *dev;
	char irqname[24];	/* nvme4294967295-65535\0 */
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	struct nvme_command __iomem *sq_cmds_io;
	volatile struct nvme_completion *cqes;
	struct blk_mq_tags **tags;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	u32 __iomem *q_db;
	u16 q_depth;
	s16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 qid;
	u8 cq_phase;
	u8 cqe_seen;
	struct async_cmd_info cmdinfo;
};

/*
 * The nvme_iod describes the data in an I/O, including the list of PRP
 * entries.  You can't see it in this data structure because C doesn't let
 * me express that.  Use nvme_alloc_iod to ensure there's enough space
 * allocated to store the PRP list.
 */
struct nvme_iod {
	unsigned long private;	/* For the use of the submitter of the I/O */
	int npages;		/* In the PRP list. 0 means small pool in use */
	int offset;		/* Of PRP list */
	int nents;		/* Used in scatterlist */
	int length;		/* Of data, in bytes */
	dma_addr_t first_dma;
	struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */
	struct scatterlist sg[0];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}

typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
						struct nvme_completion *);

struct nvme_cmd_info {
	nvme_completion_fn fn;
	void *ctx;
	int aborted;
	struct nvme_queue *nvmeq;
	struct nvme_iod iod[0];
};

/*
 * Max size of iod being embedded in the request payload
 */
#define NVME_INT_PAGES		2
#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->ctrl.page_size)
#define NVME_INT_MASK		0x01

/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size, struct nvme_dev *dev)
{
	unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size,
				      dev->ctrl.page_size);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

static unsigned int nvme_cmd_size(struct nvme_dev *dev)
{
	unsigned int ret = sizeof(struct nvme_cmd_info);

	ret += sizeof(struct nvme_iod);
	ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
	ret += sizeof(struct scatterlist) * NVME_INT_PAGES;

	return ret;
}
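/*
 * Worked example, assuming a 4KB controller page size and 4KB PAGE_SIZE:
 * NVME_INT_BYTES() is 8KB, so nvme_npages(8192, dev) computes
 * nprps = DIV_ROUND_UP(8192 + 4096, 4096) = 3 PRP entries (the
 * overestimate mentioned above) and returns one PRP-list page.  The
 * per-request PDU reserved by blk-mq is then nvme_cmd_info + nvme_iod +
 * one __le64 * + two scatterlist entries, enough to handle small I/O
 * without any further allocation.
 */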
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
				unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = dev->queues[0];

	WARN_ON(hctx_idx != 0);
	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
	WARN_ON(nvmeq->tags);

	hctx->driver_data = nvmeq;
	nvmeq->tags = &dev->admin_tagset.tags[0];
	return 0;
}

static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	nvmeq->tags = NULL;
}

static int nvme_admin_init_request(void *data, struct request *req,
				unsigned int hctx_idx, unsigned int rq_idx,
				unsigned int numa_node)
{
	struct nvme_dev *dev = data;
	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = dev->queues[0];

	BUG_ON(!nvmeq);
	cmd->nvmeq = nvmeq;
	return 0;
}

static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

	if (!nvmeq->tags)
		nvmeq->tags = &dev->tagset.tags[hctx_idx];

	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
	hctx->driver_data = nvmeq;
	return 0;
}

static int nvme_init_request(void *data, struct request *req,
				unsigned int hctx_idx, unsigned int rq_idx,
				unsigned int numa_node)
{
	struct nvme_dev *dev = data;
	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

	BUG_ON(!nvmeq);
	cmd->nvmeq = nvmeq;
	return 0;
}

static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
				nvme_completion_fn handler)
{
	cmd->fn = handler;
	cmd->ctx = ctx;
	cmd->aborted = 0;
	blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
}

static void *iod_get_private(struct nvme_iod *iod)
{
	return (void *) (iod->private & ~0x1UL);
}

/*
 * If bit 0 is set, the iod is embedded in the request payload.
 */
static bool iod_should_kfree(struct nvme_iod *iod)
{
	return (iod->private & NVME_INT_MASK) == 0;
}
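/*
 * Illustrative note on the tag bit above: struct request pointers are at
 * least 2-byte aligned, so bit 0 of iod->private is free for flagging.
 * An embedded iod stores (unsigned long)rq | NVME_INT_MASK and must never
 * be kfree()d; a heap-allocated iod stores the plain pointer, and
 * iod_get_private() masks the bit off before either use.
 */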
/* Special values must be less than 0x1000 */
#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)

static void special_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	if (ctx == CMD_CTX_CANCELLED)
		return;
	if (ctx == CMD_CTX_COMPLETED) {
		dev_warn(nvmeq->q_dmadev,
				"completed id %d twice on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}
	if (ctx == CMD_CTX_INVALID) {
		dev_warn(nvmeq->q_dmadev,
				"invalid id %d completed on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}
	dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
}

static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
{
	void *ctx;

	if (fn)
		*fn = cmd->fn;
	ctx = cmd->ctx;
	cmd->fn = special_completion;
	cmd->ctx = CMD_CTX_CANCELLED;
	return ctx;
}

static void async_req_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	u32 result = le32_to_cpup(&cqe->result);
	u16 status = le16_to_cpup(&cqe->status) >> 1;

	if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
		++nvmeq->dev->ctrl.event_limit;
	if (status != NVME_SC_SUCCESS)
		return;

	switch (result & 0xff07) {
	case NVME_AER_NOTICE_NS_CHANGED:
		dev_info(nvmeq->q_dmadev, "rescanning\n");
		schedule_work(&nvmeq->dev->scan_work);
		/* fall through to log the event */
	default:
		dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result);
	}
}

static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct request *req = ctx;

	u16 status = le16_to_cpup(&cqe->status) >> 1;
	u32 result = le32_to_cpup(&cqe->result);

	blk_mq_free_request(req);

	dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
	++nvmeq->dev->ctrl.abort_limit;
}

static void async_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct async_cmd_info *cmdinfo = ctx;
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
	blk_mq_free_request(cmdinfo->req);
}

static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
						     unsigned int tag)
{
	struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag);

	return blk_mq_rq_to_pdu(req);
}

/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
						nvme_completion_fn *fn)
{
	struct nvme_cmd_info *cmd;
	void *ctx;

	/* Validate the tag before using it to look up a request */
	if (tag >= nvmeq->q_depth) {
		*fn = special_completion;
		return CMD_CTX_INVALID;
	}

	cmd = get_cmd_from_tag(nvmeq, tag);
	if (fn)
		*fn = cmd->fn;
	ctx = cmd->ctx;
	cmd->fn = special_completion;
	cmd->ctx = CMD_CTX_COMPLETED;
	return ctx;
}

/**
 * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
						struct nvme_command *cmd)
{
	u16 tail = nvmeq->sq_tail;

	if (nvmeq->sq_cmds_io)
		memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
	else
		memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));

	if (++tail == nvmeq->q_depth)
		tail = 0;
	writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
}

static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	unsigned long flags;
	spin_lock_irqsave(&nvmeq->q_lock, flags);
	__nvme_submit_cmd(nvmeq, cmd);
	spin_unlock_irqrestore(&nvmeq->q_lock, flags);
}

static __le64 **iod_list(struct nvme_iod *iod)
{
	return ((void *)iod) + iod->offset;
}

static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
			    unsigned nseg, unsigned long private)
{
	iod->private = private;
	iod->offset = offsetof(struct nvme_iod, sg[nseg]);
	iod->npages = -1;
	iod->length = nbytes;
	iod->nents = 0;
}

static struct nvme_iod *
__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
		 unsigned long priv, gfp_t gfp)
{
	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
				sizeof(__le64 *) * nvme_npages(bytes, dev) +
				sizeof(struct scatterlist) * nseg, gfp);

	if (iod)
		iod_init(iod, bytes, nseg, priv);

	return iod;
}

static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
				       gfp_t gfp)
{
	unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
						sizeof(struct nvme_dsm_range);
	struct nvme_iod *iod;

	if (rq->nr_phys_segments <= NVME_INT_PAGES &&
	    size <= NVME_INT_BYTES(dev)) {
		struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);

		iod = cmd->iod;
		iod_init(iod, size, rq->nr_phys_segments,
				(unsigned long) rq | NVME_INT_MASK);
		return iod;
	}

	return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
				(unsigned long) rq, gfp);
}

static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
	const int last_prp = dev->ctrl.page_size / 8 - 1;
	int i;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma = iod->first_dma;

	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
	for (i = 0; i < iod->npages; i++) {
		__le64 *prp_list = list[i];
		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
		prp_dma = next_prp_dma;
	}

	if (iod_should_kfree(iod))
		kfree(iod);
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	if (be32_to_cpu(pi->ref_tag) == v)
		pi->ref_tag = cpu_to_be32(p);
}

static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	if (be32_to_cpu(pi->ref_tag) == p)
		pi->ref_tag = cpu_to_be32(v);
}

/**
 * nvme_dif_remap - remaps ref tags to bip seed and physical lba
 *
 * The virtual start sector is the one that was originally submitted by the
 * block layer.  Due to partitioning, MD/DM cloning, etc. the actual physical
 * start sector may be different.  Remap protection information to match the
 * physical LBA on writes, and back to the original seed on reads.
 *
 * Type 0 and 3 do not have a ref tag, so no remapping required.
 */
static void nvme_dif_remap(struct request *req,
			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
	struct nvme_ns *ns = req->rq_disk->private_data;
	struct bio_integrity_payload *bip;
	struct t10_pi_tuple *pi;
	void *p, *pmap;
	u32 i, nlb, ts, phys, virt;

	if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
		return;

	bip = bio_integrity(req->bio);
	if (!bip)
		return;

	pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;

	p = pmap;
	virt = bip_get_seed(bip);
	phys = nvme_block_nr(ns, blk_rq_pos(req));
	nlb = (blk_rq_bytes(req) >> ns->lba_shift);
	ts = ns->disk->queue->integrity.tuple_size;

	for (i = 0; i < nlb; i++, virt++, phys++) {
		pi = (struct t10_pi_tuple *)p;
		dif_swap(phys, virt, pi);
		p += ts;
	}
	kunmap_atomic(pmap);
}
#else /* CONFIG_BLK_DEV_INTEGRITY */
static void nvme_dif_remap(struct request *req,
			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
}
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
#endif
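/*
 * Worked example of the remap above (illustrative, not from the spec
 * text): a Type 1 write to partition sector 0 on a partition starting at
 * LBA 2048 arrives with ref tags seeded from the virtual sectors
 * (0, 1, 2, ...).  nvme_dif_remap(req, nvme_dif_prep) rewrites them to
 * the physical LBAs (2048, 2049, 2050, ...) before DMA; on a read,
 * nvme_dif_remap(req, nvme_dif_complete) maps them back so the block
 * layer verifies against the seed it generated.
 */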
static void req_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct nvme_iod *iod = ctx;
	struct request *req = iod_get_private(iod);
	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
	u16 status = le16_to_cpup(&cqe->status) >> 1;
	int error = 0;

	if (unlikely(status)) {
		if (!(status & NVME_SC_DNR || blk_noretry_request(req))
		    && (jiffies - req->start_time) < req->timeout) {
			unsigned long flags;

			nvme_unmap_data(nvmeq->dev, iod);

			blk_mq_requeue_request(req);
			spin_lock_irqsave(req->q->queue_lock, flags);
			if (!blk_queue_stopped(req->q))
				blk_mq_kick_requeue_list(req->q);
			spin_unlock_irqrestore(req->q->queue_lock, flags);
			return;
		}

		if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
			if (cmd_rq->ctx == CMD_CTX_CANCELLED)
				error = -EINTR;
			else
				error = status;
		} else {
			error = nvme_error_status(status);
		}
	}

	if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
		u32 result = le32_to_cpup(&cqe->result);
		req->special = (void *)(uintptr_t)result;
	}

	if (cmd_rq->aborted)
		dev_warn(nvmeq->dev->dev,
			"completing aborted command with status:%04x\n",
			error);

	nvme_unmap_data(nvmeq->dev, iod);
	blk_mq_complete_request(req, error);
}
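/*
 * PRP walkthrough for the setup below (illustrative, assuming a 4KB
 * controller page size): a page-aligned 4KB transfer needs only PRP1; a
 * page-aligned 8KB transfer uses PRP1 plus PRP2 as a second data pointer;
 * anything larger turns PRP2 into the DMA address of a PRP list, one
 * 8-byte entry per page.  When a list page fills up, its last slot is
 * rewritten to chain to the next list page, which is what nvme_free_iod()
 * above relies on when it walks list[i][last_prp] to free the chain.
 */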
static bool nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
		int total_len)
{
	struct dma_pool *pool;
	int length = total_len;
	struct scatterlist *sg = iod->sg;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	u32 page_size = dev->ctrl.page_size;
	int offset = dma_addr & (page_size - 1);
	__le64 *prp_list;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma;
	int nprps, i;

	length -= (page_size - offset);
	if (length <= 0)
		return true;

	dma_len -= (page_size - offset);
	if (dma_len) {
		dma_addr += (page_size - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= page_size) {
		iod->first_dma = dma_addr;
		return true;
	}

	nprps = DIV_ROUND_UP(length, page_size);
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
	if (!prp_list) {
		iod->first_dma = dma_addr;
		iod->npages = -1;
		return false;
	}
	list[0] = prp_list;
	iod->first_dma = prp_dma;
	i = 0;
	for (;;) {
		if (i == page_size >> 3) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
			if (!prp_list)
				return false;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= page_size;
		dma_addr += page_size;
		length -= page_size;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		BUG_ON(dma_len < 0);
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	return true;
}

static int nvme_map_data(struct nvme_dev *dev, struct nvme_iod *iod,
		struct nvme_command *cmnd)
{
	struct request *req = iod_get_private(iod);
	struct request_queue *q = req->q;
	enum dma_data_direction dma_dir = rq_data_dir(req) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE;
	int ret = BLK_MQ_RQ_QUEUE_ERROR;

	sg_init_table(iod->sg, req->nr_phys_segments);
	iod->nents = blk_rq_map_sg(q, req, iod->sg);
	if (!iod->nents)
		goto out;

	ret = BLK_MQ_RQ_QUEUE_BUSY;
	if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir))
		goto out;

	if (!nvme_setup_prps(dev, iod, blk_rq_bytes(req)))
		goto out_unmap;

	ret = BLK_MQ_RQ_QUEUE_ERROR;
	if (blk_integrity_rq(req)) {
		if (blk_rq_count_integrity_sg(q, req->bio) != 1)
			goto out_unmap;

		sg_init_table(iod->meta_sg, 1);
		if (blk_rq_map_integrity_sg(q, req->bio, iod->meta_sg) != 1)
			goto out_unmap;

		if (rq_data_dir(req))
			nvme_dif_remap(req, nvme_dif_prep);

		if (!dma_map_sg(dev->dev, iod->meta_sg, 1, dma_dir))
			goto out_unmap;
	}

	cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
	cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
	if (blk_integrity_rq(req))
		cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg));
	return BLK_MQ_RQ_QUEUE_OK;

out_unmap:
	dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
out:
	return ret;
}

static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_iod *iod)
{
	struct request *req = iod_get_private(iod);
	enum dma_data_direction dma_dir = rq_data_dir(req) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE;

	if (iod->nents) {
		dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
		if (blk_integrity_rq(req)) {
			if (!rq_data_dir(req))
				nvme_dif_remap(req, nvme_dif_complete);
			dma_unmap_sg(dev->dev, iod->meta_sg, 1, dma_dir);
		}
	}

	nvme_free_iod(dev, iod);
}

/*
 * We reuse the small pool to allocate the 16-byte range here as it is not
 * worth having a special pool for these or additional cases to handle freeing
 * the iod.
 */
static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
		struct nvme_iod *iod, struct nvme_command *cmnd)
{
	struct request *req = iod_get_private(iod);
	struct nvme_dsm_range *range;

	range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
						&iod->first_dma);
	if (!range)
		return BLK_MQ_RQ_QUEUE_BUSY;
	iod_list(iod)[0] = (__le64 *)range;
	iod->npages = 0;

	range->cattr = cpu_to_le32(0);
	range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
	range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->dsm.opcode = nvme_cmd_dsm;
	cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
	cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
	cmnd->dsm.nr = 0;
	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
	return BLK_MQ_RQ_QUEUE_OK;
}
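/*
 * Worked example for the discard path above (illustrative): a 1MB discard
 * on a 512-byte-LBA namespace becomes a single nvme_dsm_range with
 * nlb = 2048 and slba at the request's starting block.  dsm.nr = 0 means
 * "one range" (the field is zero-based), and NVME_DSMGMT_AD asks the
 * controller to deallocate the range rather than merely treat it as a
 * hint.
 */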
/*
 * NOTE: ns is NULL when called on the admin queue.
 */
static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
			 const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *req = bd->rq;
	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
	struct nvme_iod *iod;
	struct nvme_command cmnd;
	int ret = BLK_MQ_RQ_QUEUE_OK;

	/*
	 * If formatted with metadata, require the block layer provide a
	 * buffer unless this namespace is formatted such that the metadata
	 * can be stripped/generated by the controller with PRACT=1.
	 */
	if (ns && ns->ms && !blk_integrity_rq(req)) {
		if (!(ns->pi_type && ns->ms == 8) &&
		    req->cmd_type != REQ_TYPE_DRV_PRIV) {
			blk_mq_complete_request(req, -EFAULT);
			return BLK_MQ_RQ_QUEUE_OK;
		}
	}

	iod = nvme_alloc_iod(req, dev, GFP_ATOMIC);
	if (!iod)
		return BLK_MQ_RQ_QUEUE_BUSY;

	if (req->cmd_flags & REQ_DISCARD) {
		ret = nvme_setup_discard(nvmeq, ns, iod, &cmnd);
	} else {
		if (req->cmd_type == REQ_TYPE_DRV_PRIV)
			memcpy(&cmnd, req->cmd, sizeof(cmnd));
		else if (req->cmd_flags & REQ_FLUSH)
			nvme_setup_flush(ns, &cmnd);
		else
			nvme_setup_rw(ns, req, &cmnd);

		if (req->nr_phys_segments)
			ret = nvme_map_data(dev, iod, &cmnd);
	}

	if (ret)
		goto out;

	cmnd.common.command_id = req->tag;
	nvme_set_info(cmd, iod, req_completion);

	spin_lock_irq(&nvmeq->q_lock);
	__nvme_submit_cmd(nvmeq, &cmnd);
	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
	return BLK_MQ_RQ_QUEUE_OK;
out:
	nvme_free_iod(dev, iod);
	return ret;
}
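/*
 * Phase-bit walkthrough for the completion loop below (illustrative):
 * queues start with cq_phase == 1 and all CQE status words zeroed, so an
 * entry is "new" when its phase bit reads 1.  Once the head index wraps
 * past q_depth the expected phase flips to 0 while the controller starts
 * posting entries with the opposite tag, so a stale entry from the
 * previous lap can never match and the loop stops exactly at the newest
 * completion.
 */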
static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
{
	u16 head, phase;

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	for (;;) {
		void *ctx;
		nvme_completion_fn fn;
		struct nvme_completion cqe = nvmeq->cqes[head];
		if ((le16_to_cpu(cqe.status) & 1) != phase)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}
		if (tag && *tag == cqe.command_id)
			*tag = -1;
		ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
		fn(nvmeq, ctx, &cqe);
	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
		return;

	if (likely(nvmeq->cq_vector >= 0))
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
	nvmeq->cq_head = head;
	nvmeq->cq_phase = phase;

	nvmeq->cqe_seen = 1;
}

static void nvme_process_cq(struct nvme_queue *nvmeq)
{
	__nvme_process_cq(nvmeq, NULL);
}
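/*
 * Doorbell layout sketch, assuming the common db_stride of 1 (CAP.DSTRD
 * == 0): the doorbell region at bar + 4096 is an array of 32-bit
 * registers, [SQ0 tail][CQ0 head][SQ1 tail][CQ1 head]...  That is why
 * q_db points at dbs[qid * 2 * db_stride] and the CQ head doorbell above
 * is written at q_db + db_stride.
 */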
static irqreturn_t nvme_irq(int irq, void *data)
{
	irqreturn_t result;
	struct nvme_queue *nvmeq = data;
	spin_lock(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
	nvmeq->cqe_seen = 0;
	spin_unlock(&nvmeq->q_lock);
	return result;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
	if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
		return IRQ_NONE;
	return IRQ_WAKE_THREAD;
}

static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	if ((le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
	    nvmeq->cq_phase) {
		spin_lock_irq(&nvmeq->q_lock);
		__nvme_process_cq(nvmeq, &tag);
		spin_unlock_irq(&nvmeq->q_lock);

		if (tag == -1)
			return 1;
	}

	return 0;
}

static int nvme_submit_async_admin_req(struct nvme_dev *dev)
{
	struct nvme_queue *nvmeq = dev->queues[0];
	struct nvme_command c;
	struct nvme_cmd_info *cmd_info;
	struct request *req;

	req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE,
			BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->cmd_flags |= REQ_NO_TIMEOUT;
	cmd_info = blk_mq_rq_to_pdu(req);
	nvme_set_info(cmd_info, NULL, async_req_completion);

	memset(&c, 0, sizeof(c));
	c.common.opcode = nvme_admin_async_event;
	c.common.command_id = req->tag;

	blk_mq_free_request(req);
	__nvme_submit_cmd(nvmeq, &c);
	return 0;
}

static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
			struct nvme_command *cmd,
			struct async_cmd_info *cmdinfo, unsigned timeout)
{
	struct nvme_queue *nvmeq = dev->queues[0];
	struct request *req;
	struct nvme_cmd_info *cmd_rq;

	req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = timeout;
	cmd_rq = blk_mq_rq_to_pdu(req);
	cmdinfo->req = req;
	nvme_set_info(cmd_rq, cmdinfo, async_completion);
	cmdinfo->status = -EINTR;

	cmd->common.command_id = req->tag;

	nvme_submit_cmd(nvmeq, cmd);
	return 0;
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}
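/*
 * Note on the qsize encoding above (illustrative): NVMe queue sizes are
 * zero-based on the wire, so a 1024-entry queue is created with
 * qsize = 1023.  The SQ and its CQ share a qid here, and the CQ must
 * exist before the SQ that posts to it, which is the order
 * nvme_create_queue() below uses.
 */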
/**
 * nvme_abort_req - Attempt aborting a request
 *
 * Schedule controller reset if the command was already aborted once before and
 * still hasn't been returned to the driver, or if this is the admin queue.
 */
static void nvme_abort_req(struct request *req)
{
	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = cmd_rq->nvmeq;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *abort_req;
	struct nvme_cmd_info *abort_cmd;
	struct nvme_command cmd;

	if (!nvmeq->qid || cmd_rq->aborted) {
		spin_lock(&dev_list_lock);
		if (!__nvme_reset(dev)) {
			dev_warn(dev->dev,
				 "I/O %d QID %d timeout, reset controller\n",
				 req->tag, nvmeq->qid);
		}
		spin_unlock(&dev_list_lock);
		return;
	}

	if (!dev->ctrl.abort_limit)
		return;

	abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE,
			BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(abort_req))
		return;

	abort_cmd = blk_mq_rq_to_pdu(abort_req);
	nvme_set_info(abort_cmd, abort_req, abort_completion);

	memset(&cmd, 0, sizeof(cmd));
	cmd.abort.opcode = nvme_admin_abort_cmd;
	cmd.abort.cid = req->tag;
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
	cmd.abort.command_id = abort_req->tag;

	--dev->ctrl.abort_limit;
	cmd_rq->aborted = 1;

	dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
							nvmeq->qid);
	nvme_submit_cmd(dev->queues[0], &cmd);
}
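/*
 * Abort walkthrough (illustrative): if the command with tag 7 on I/O
 * queue 2 times out, the code above submits an admin Abort with cid = 7
 * and sqid = 2, spending one unit of the controller's advertised abort
 * budget (abort_limit).  The abort is advisory: the timed-out request is
 * still completed through its own CQE, and the timeout handler below
 * merely resets the timer while the abort is outstanding.
 */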
static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
{
	struct nvme_queue *nvmeq = data;
	void *ctx;
	nvme_completion_fn fn;
	struct nvme_cmd_info *cmd;
	struct nvme_completion cqe;

	if (!blk_mq_request_started(req))
		return;

	cmd = blk_mq_rq_to_pdu(req);

	if (cmd->ctx == CMD_CTX_CANCELLED)
		return;

	if (blk_queue_dying(req->q))
		cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
	else
		cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);

	dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n",
						req->tag, nvmeq->qid);
	ctx = cancel_cmd_info(cmd, &fn);
	fn(nvmeq, ctx, &cqe);
}

static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = cmd->nvmeq;

	dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
							nvmeq->qid);
	spin_lock_irq(&nvmeq->q_lock);
	nvme_abort_req(req);
	spin_unlock_irq(&nvmeq->q_lock);

	/*
	 * The aborted req will be completed on receiving the abort req.
	 * We enable the timer again.  If hit twice, it'll cause a device
	 * reset, as the device is then assumed to be in a faulty state.
	 */
	return BLK_EH_RESET_TIMER;
}

static void nvme_free_queue(struct nvme_queue *nvmeq)
{
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	if (nvmeq->sq_cmds)
		dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
}

static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
	int i;

	for (i = dev->queue_count - 1; i >= lowest; i--) {
		struct nvme_queue *nvmeq = dev->queues[i];
		dev->queue_count--;
		dev->queues[i] = NULL;
		nvme_free_queue(nvmeq);
	}
}

/**
 * nvme_suspend_queue - put queue into suspended state
 * @nvmeq - queue to suspend
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
	int vector;

	spin_lock_irq(&nvmeq->q_lock);
	if (nvmeq->cq_vector == -1) {
		spin_unlock_irq(&nvmeq->q_lock);
		return 1;
	}
	vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
	nvmeq->dev->online_queues--;
	nvmeq->cq_vector = -1;
	spin_unlock_irq(&nvmeq->q_lock);

	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
		blk_mq_freeze_queue_start(nvmeq->dev->ctrl.admin_q);

	irq_set_affinity_hint(vector, NULL);
	free_irq(vector, nvmeq);

	return 0;
}

static void nvme_clear_queue(struct nvme_queue *nvmeq)
{
	spin_lock_irq(&nvmeq->q_lock);
	if (nvmeq->tags && *nvmeq->tags)
		blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
}

static void nvme_disable_queue(struct nvme_dev *dev, int qid)
{
	struct nvme_queue *nvmeq = dev->queues[qid];

	if (!nvmeq)
		return;
	if (nvme_suspend_queue(nvmeq))
		return;

	/* Don't tell the adapter to delete the admin queue.
	 * Don't tell a removed adapter to delete IO queues.
	 */
	if (qid && readl(dev->bar + NVME_REG_CSTS) != -1) {
		adapter_delete_sq(dev, qid);
		adapter_delete_cq(dev, qid);
	}

	spin_lock_irq(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
}

static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
				int entry_size)
{
	int q_depth = dev->q_depth;
	unsigned q_size_aligned = roundup(q_depth * entry_size,
					  dev->ctrl.page_size);

	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
		mem_per_q = round_down(mem_per_q, dev->ctrl.page_size);
		q_depth = div_u64(mem_per_q, entry_size);

		/*
		 * Ensure the reduced q_depth is above some threshold where it
		 * would be better to map queues in system memory with the
		 * original depth
		 */
		if (q_depth < 64)
			return -ENOMEM;
	}

	return q_depth;
}
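/*
 * Worked example for nvme_cmb_qdepth() above (illustrative): with a 1MB
 * CMB, 4KB controller pages, 64-byte submission entries and 32 I/O
 * queues, full-depth rings of 1024 entries would need 32 * 64KB = 2MB.
 * That overcommits the CMB, so each queue is given 1MB / 32 = 32KB,
 * i.e. a depth of 512; anything that would drop below 64 entries falls
 * back to host memory at the original depth instead.
 */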
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
				int qid, int depth)
{
	if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
		unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
						      dev->ctrl.page_size);
		nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
		nvmeq->sq_cmds_io = dev->cmb + offset;
	} else {
		nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
		if (!nvmeq->sq_cmds)
			return -ENOMEM;
	}

	return 0;
}

static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth)
{
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
					  &nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;

	if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
		goto free_cqdma;

	nvmeq->q_dmadev = dev->dev;
	nvmeq->dev = dev;
	snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
			dev->ctrl.instance, qid);
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->q_depth = depth;
	nvmeq->qid = qid;
	nvmeq->cq_vector = -1;
	dev->queues[qid] = nvmeq;

	/* make sure queue descriptor is set before queue count, for kthread */
	mb();
	dev->queue_count++;

	return nvmeq;

 free_cqdma:
	dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}

static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
							const char *name)
{
	if (use_threaded_interrupts)
		return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
					nvme_irq_check, nvme_irq, IRQF_SHARED,
					name, nvmeq);
	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
				IRQF_SHARED, name, nvmeq);
}

static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	spin_lock_irq(&nvmeq->q_lock);
	nvmeq->sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
	dev->online_queues++;
	spin_unlock_irq(&nvmeq->q_lock);
}
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;

	nvmeq->cq_vector = qid - 1;
	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		return result;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
	if (result < 0)
		goto release_sq;

	nvme_init_queue(nvmeq, qid);
	return result;

 release_sq:
	adapter_delete_sq(dev, qid);
 release_cq:
	adapter_delete_cq(dev, qid);
	return result;
}

static struct blk_mq_ops nvme_mq_admin_ops = {
	.queue_rq	= nvme_queue_rq,
	.map_queue	= blk_mq_map_queue,
	.init_hctx	= nvme_admin_init_hctx,
	.exit_hctx	= nvme_admin_exit_hctx,
	.init_request	= nvme_admin_init_request,
	.timeout	= nvme_timeout,
};

static struct blk_mq_ops nvme_mq_ops = {
	.queue_rq	= nvme_queue_rq,
	.map_queue	= blk_mq_map_queue,
	.init_hctx	= nvme_init_hctx,
	.init_request	= nvme_init_request,
	.timeout	= nvme_timeout,
	.poll		= nvme_poll,
};

static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
		blk_cleanup_queue(dev->ctrl.admin_q);
		blk_mq_free_tag_set(&dev->admin_tagset);
	}
}

static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
	if (!dev->ctrl.admin_q) {
		dev->admin_tagset.ops = &nvme_mq_admin_ops;
		dev->admin_tagset.nr_hw_queues = 1;
		dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
		dev->admin_tagset.reserved_tags = 1;
		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
		dev->admin_tagset.numa_node = dev_to_node(dev->dev);
		dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
		dev->admin_tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
			return -ENOMEM;

		dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
		if (IS_ERR(dev->ctrl.admin_q)) {
			blk_mq_free_tag_set(&dev->admin_tagset);
			return -ENOMEM;
		}
		if (!blk_get_queue(dev->ctrl.admin_q)) {
			nvme_dev_remove_admin(dev);
			dev->ctrl.admin_q = NULL;
			return -ENODEV;
		}
	} else
		blk_mq_unfreeze_queue(dev->ctrl.admin_q);

	return 0;
}

static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
	struct nvme_queue *nvmeq;

	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ?
						NVME_CAP_NSSRC(cap) : 0;

	if (dev->subsystem &&
	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);

	result = nvme_disable_ctrl(&dev->ctrl, cap);
	if (result < 0)
		return result;

	nvmeq = dev->queues[0];
	if (!nvmeq) {
		nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
		if (!nvmeq)
			return -ENOMEM;
	}

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	writel(aqa, dev->bar + NVME_REG_AQA);
	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);

	result = nvme_enable_ctrl(&dev->ctrl, cap);
	if (result)
		goto free_nvmeq;

	nvmeq->cq_vector = 0;
	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
	if (result) {
		nvmeq->cq_vector = -1;
		goto free_nvmeq;
	}

	return result;

 free_nvmeq:
	nvme_free_queues(dev, 0);
	return result;
}
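/*
 * AQA packing example for the admin queue setup above (illustrative):
 * both halves of the register hold zero-based depths, the CQ size in the
 * upper 16 bits and the SQ size in the lower 16 bits.  With
 * NVME_AQ_DEPTH == 256, aqa becomes (255 << 16) | 255 == 0x00FF00FF,
 * i.e. matching 256-entry admin submission and completion queues.
 */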
static int nvme_subsys_reset(struct nvme_dev *dev)
{
	if (!dev->subsystem)
		return -ENOTTY;

	writel(0x4E564D65, dev->bar + NVME_REG_NSSR); /* "NVMe" */
	return 0;
}

static int nvme_kthread(void *data)
{
	struct nvme_dev *dev, *next;

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&dev_list_lock);
		list_for_each_entry_safe(dev, next, &dev_list, node) {
			int i;
			u32 csts = readl(dev->bar + NVME_REG_CSTS);

			if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
							csts & NVME_CSTS_CFS) {
				if (!__nvme_reset(dev)) {
					dev_warn(dev->dev,
						"Failed status: %x, reset controller\n",
						readl(dev->bar + NVME_REG_CSTS));
				}
				continue;
			}
			for (i = 0; i < dev->queue_count; i++) {
				struct nvme_queue *nvmeq = dev->queues[i];
				if (!nvmeq)
					continue;
				spin_lock_irq(&nvmeq->q_lock);
				nvme_process_cq(nvmeq);

				while (i == 0 && dev->ctrl.event_limit > 0) {
					if (nvme_submit_async_admin_req(dev))
						break;
					dev->ctrl.event_limit--;
				}
				spin_unlock_irq(&nvmeq->q_lock);
			}
		}
		spin_unlock(&dev_list_lock);
		schedule_timeout(round_jiffies_relative(HZ));
	}
	return 0;
}

static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	int node = dev_to_node(dev->dev);

	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
	if (!ns)
		return;

	ns->queue = blk_mq_init_queue(&dev->tagset);
	if (IS_ERR(ns->queue))
		goto out_free_ns;
	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
	ns->ctrl = &dev->ctrl;
	ns->queue->queuedata = ns;

	disk = alloc_disk_node(0, node);
	if (!disk)
		goto out_free_queue;

	kref_init(&ns->kref);
	ns->ns_id = nsid;
	ns->disk = disk;
	ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
	list_add_tail(&ns->list, &dev->namespaces);

	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
	if (dev->ctrl.max_hw_sectors) {
		blk_queue_max_hw_sectors(ns->queue, dev->ctrl.max_hw_sectors);
		blk_queue_max_segments(ns->queue,
			(dev->ctrl.max_hw_sectors / (dev->ctrl.page_size >> 9)) + 1);
	}
	if (dev->ctrl.stripe_size)
		blk_queue_chunk_sectors(ns->queue, dev->ctrl.stripe_size >> 9);
	if (dev->ctrl.vwc & NVME_CTRL_VWC_PRESENT)
		blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
	blk_queue_virt_boundary(ns->queue, dev->ctrl.page_size - 1);

	disk->major = nvme_major;
	disk->first_minor = 0;
	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	disk->driverfs_dev = dev->device;
	disk->flags = GENHD_FL_EXT_DEVT;
	sprintf(disk->disk_name, "nvme%dn%d", dev->ctrl.instance, nsid);

	/*
	 * Initialize capacity to 0 until we establish the namespace format and
	 * setup integrity extensions if necessary.  The revalidate_disk after
	 * add_disk allows the driver to register with integrity if the format
	 * requires it.
	 */
	set_capacity(disk, 0);
	if (nvme_revalidate_disk(ns->disk))
		goto out_free_disk;

	kref_get(&dev->ctrl.kref);
	if (ns->type != NVME_NS_LIGHTNVM) {
		add_disk(ns->disk);
		if (ns->ms) {
			struct block_device *bd = bdget_disk(ns->disk, 0);
			if (!bd)
				return;
			if (blkdev_get(bd, FMODE_READ, NULL)) {
				bdput(bd);
				return;
			}
			blkdev_reread_part(bd);
			blkdev_put(bd, FMODE_READ);
		}
	}
	return;
 out_free_disk:
	kfree(disk);
	list_del(&ns->list);
 out_free_queue:
	blk_cleanup_queue(ns->queue);
 out_free_ns:
	kfree(ns);
}

/*
 * Create I/O queues.  Failing to create an I/O queue is not an issue, we
 * can continue with less than the desired amount of queues, and even a
 * controller without I/O queues can still be used to issue admin commands.
 * This might be useful to upgrade a buggy firmware for example.
 */
static void nvme_create_io_queues(struct nvme_dev *dev)
{
	unsigned i;

	for (i = dev->queue_count; i <= dev->max_qid; i++)
		if (!nvme_alloc_queue(dev, i, dev->q_depth))
			break;

	for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
		if (nvme_create_queue(dev->queues[i], i)) {
			nvme_free_queues(dev, i);
			break;
		}
}

static int set_queue_count(struct nvme_dev *dev, int count)
{
	int status;
	u32 result;
	u32 q_count = (count - 1) | ((count - 1) << 16);

	status = nvme_set_features(&dev->ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
								&result);
	if (status < 0)
		return status;
	if (status > 0) {
		dev_err(dev->dev, "Could not set queue count (%d)\n", status);
		return 0;
	}
	return min(result & 0xffff, result >> 16) + 1;
}
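/*
 * Encoding example for set_queue_count() above (illustrative): asking for
 * 4 queues sends the Number of Queues feature with zero-based counts
 * packed as (3 << 16) | 3 for completion and submission queues.  If the
 * controller only grants, say, 2 SQs and 4 CQs, the returned dword
 * decodes to min(1, 3) + 1 == 2 usable I/O queue pairs.
 */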
static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
{
	u64 szu, size, offset;
	u32 cmbloc;
	resource_size_t bar_size;
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	void __iomem *cmb;
	dma_addr_t dma_addr;

	if (!use_cmb_sqes)
		return NULL;

	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
	if (!(NVME_CMB_SZ(dev->cmbsz)))
		return NULL;

	cmbloc = readl(dev->bar + NVME_REG_CMBLOC);

	szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
	size = szu * NVME_CMB_SZ(dev->cmbsz);
	offset = szu * NVME_CMB_OFST(cmbloc);
	bar_size = pci_resource_len(pdev, NVME_CMB_BIR(cmbloc));

	if (offset > bar_size)
		return NULL;

	/*
	 * Controllers may support a CMB size larger than their BAR, for
	 * example, due to being behind a bridge.  Reduce the CMB to the
	 * reported size of the BAR
	 */
	if (size > bar_size - offset)
		size = bar_size - offset;

	dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(cmbloc)) + offset;
	cmb = ioremap_wc(dma_addr, size);
	if (!cmb)
		return NULL;

	dev->cmb_dma_addr = dma_addr;
	dev->cmb_size = size;
	return cmb;
}
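/*
 * CMB sizing example for the mapping above (illustrative): CMBSZ.SZU is a
 * power-of-16 granularity starting at 4KB, so SZU == 0 means 4KB units
 * and SZU == 2 means 1MB units.  A controller reporting SZU = 0, SZ = 256
 * and OFST = 16 therefore exposes a 1MB CMB starting 64KB into the BAR
 * selected by CMBLOC.BIR.
 */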
static inline void nvme_release_cmb(struct nvme_dev *dev)
{
	if (dev->cmb) {
		iounmap(dev->cmb);
		dev->cmb = NULL;
	}
}

static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
}
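/*
 * db_bar_size() example (illustrative): with db_stride == 1, each of the
 * nr_io_queues + 1 queues (the +1 is the admin queue) owns an 8-byte
 * doorbell pair, so 7 I/O queues need 4096 + 8 * 8 = 4160 bytes of BAR.
 * The initial 8192-byte ioremap in nvme_dev_map() only covers the first
 * 512 such pairs; nvme_setup_io_queues() below remaps when it needs more.
 */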
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	struct nvme_queue *adminq = dev->queues[0];
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int result, i, vecs, nr_io_queues, size;

	nr_io_queues = num_possible_cpus();
	result = set_queue_count(dev, nr_io_queues);
	if (result <= 0)
		return result;
	if (result < nr_io_queues)
		nr_io_queues = result;

	if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
		result = nvme_cmb_qdepth(dev, nr_io_queues,
				sizeof(struct nvme_command));
		if (result > 0)
			dev->q_depth = result;
		else
			nvme_release_cmb(dev);
	}

	size = db_bar_size(dev, nr_io_queues);
	if (size > 8192) {
		iounmap(dev->bar);
		do {
			dev->bar = ioremap(pci_resource_start(pdev, 0), size);
			if (dev->bar)
				break;
			if (!--nr_io_queues)
				return -ENOMEM;
			size = db_bar_size(dev, nr_io_queues);
		} while (1);
		dev->dbs = dev->bar + 4096;
		adminq->q_db = dev->dbs;
	}

	/* Deregister the admin queue's interrupt */
	free_irq(dev->entry[0].vector, adminq);

	/*
	 * If we enabled MSI-X early because the device doesn't support
	 * INTx, disable it again before setting up the full range we need.
	 */
	if (!pdev->irq)
		pci_disable_msix(pdev);

	for (i = 0; i < nr_io_queues; i++)
		dev->entry[i].entry = i;
	vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);
	if (vecs < 0) {
		vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));
		if (vecs < 0) {
			vecs = 1;
		} else {
			for (i = 0; i < vecs; i++)
				dev->entry[i].vector = i + pdev->irq;
		}
	}

	/*
	 * Should investigate if there's a performance win from allocating
	 * more queues than interrupt vectors; it might allow the submission
	 * path to scale better, even if the receive path is limited by the
	 * number of interrupts.
	 */
	nr_io_queues = vecs;
	dev->max_qid = nr_io_queues;

	result = queue_request_irq(dev, adminq, adminq->irqname);
	if (result) {
		adminq->cq_vector = -1;
		goto free_queues;
	}

	/* Free previously allocated queues that are no longer usable */
	nvme_free_queues(dev, nr_io_queues + 1);
	nvme_create_io_queues(dev);

	return 0;

 free_queues:
	nvme_free_queues(dev, 1);
	return result;
}
static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);

	return nsa->ns_id - nsb->ns_id;
}

static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid)
{
	struct nvme_ns *ns;

	list_for_each_entry(ns, &dev->namespaces, list) {
		if (ns->ns_id == nsid)
			return ns;
		if (ns->ns_id > nsid)
			break;
	}
	return NULL;
}

static inline bool nvme_io_incapable(struct nvme_dev *dev)
{
	return (!dev->bar ||
		readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_CFS ||
		dev->online_queues < 2);
}

static void nvme_ns_remove(struct nvme_ns *ns)
{
	bool kill = nvme_io_incapable(to_nvme_dev(ns->ctrl)) &&
			!blk_queue_dying(ns->queue);

	if (kill)
		blk_set_queue_dying(ns->queue);
	if (ns->disk->flags & GENHD_FL_UP)
		del_gendisk(ns->disk);
	if (kill || !blk_queue_dying(ns->queue)) {
		blk_mq_abort_requeue_list(ns->queue);
		blk_cleanup_queue(ns->queue);
	}
	list_del_init(&ns->list);
	nvme_put_ns(ns);
}

static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn)
{
	struct nvme_ns *ns, *next;
	unsigned i;

	for (i = 1; i <= nn; i++) {
		ns = nvme_find_ns(dev, i);
		if (ns) {
			if (revalidate_disk(ns->disk))
				nvme_ns_remove(ns);
		} else
			nvme_alloc_ns(dev, i);
	}
	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		if (ns->ns_id > nn)
			nvme_ns_remove(ns);
	}
	list_sort(NULL, &dev->namespaces, ns_cmp);
}

static void nvme_set_irq_hints(struct nvme_dev *dev)
{
	struct nvme_queue *nvmeq;
	int i;

	for (i = 0; i < dev->online_queues; i++) {
		nvmeq = dev->queues[i];

		if (!nvmeq->tags || !(*nvmeq->tags))
			continue;

		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
					blk_mq_tags_cpumask(*nvmeq->tags));
	}
}

static void nvme_dev_scan(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work);
	struct nvme_id_ctrl *ctrl;

	if (!dev->tagset.tags)
		return;
	if (nvme_identify_ctrl(&dev->ctrl, &ctrl))
		return;
	nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn));
	kfree(ctrl);
	nvme_set_irq_hints(dev);
}

/*
 * Return: error value if an error occurred setting up the queues or calling
 * Identify Device.  0 if these succeeded, even if adding some of the
 * namespaces failed.  At the moment, these failures are silent.  TBD which
 * failures should be reported.
 */
static int nvme_dev_add(struct nvme_dev *dev)
{
	int res;

	res = nvme_init_identify(&dev->ctrl);
	if (res)
		return res;

	if (!dev->tagset.tags) {
		dev->tagset.ops = &nvme_mq_ops;
		dev->tagset.nr_hw_queues = dev->online_queues - 1;
		dev->tagset.timeout = NVME_IO_TIMEOUT;
		dev->tagset.numa_node = dev_to_node(dev->dev);
		dev->tagset.queue_depth =
				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
		dev->tagset.cmd_size = nvme_cmd_size(dev);
		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
		dev->tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->tagset))
			return 0;
	}
	schedule_work(&dev->scan_work);
	return 0;
}

static int nvme_dev_map(struct nvme_dev *dev)
{
	u64 cap;
	int bars, result = -ENOMEM;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_enable_device_mem(pdev))
		return result;

	dev->entry[0].vector = pdev->irq;
	pci_set_master(pdev);
	bars = pci_select_bars(pdev, IORESOURCE_MEM);
	if (!bars)
		goto disable_pci;

	if (pci_request_selected_regions(pdev, bars, "nvme"))
		goto disable_pci;

	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
		goto disable;

	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
	if (!dev->bar)
		goto disable;

	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
		result = -ENODEV;
		goto unmap;
	}

	/*
	 * Some devices don't advertise INTx interrupts, pre-enable a single
	 * MSIX vec for setup. We'll adjust this later.
	 */
	if (!pdev->irq) {
		result = pci_enable_msix(pdev, dev->entry, 1);
		if (result < 0)
			goto unmap;
	}

	cap = lo_hi_readq(dev->bar + NVME_REG_CAP);

	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
	dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
	dev->dbs = dev->bar + 4096;
	if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2))
		dev->cmb = nvme_map_cmb(dev);

	return 0;

 unmap:
	iounmap(dev->bar);
	dev->bar = NULL;
 disable:
	pci_release_regions(pdev);
 disable_pci:
	pci_disable_device(pdev);
	return result;
}
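/*
 * Register-window example for nvme_dev_map() above (illustrative): the
 * initial ioremap() covers 8KB, i.e. the 4KB controller register file
 * plus 4KB of doorbells, which is enough to bring up the admin queue.
 * dev->q_depth comes from CAP.MQES, which is zero-based: a controller
 * reporting MQES == 1023 supports the full NVME_Q_DEPTH of 1024 entries.
 */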
static void nvme_dev_unmap(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pdev->msi_enabled)
		pci_disable_msi(pdev);
	else if (pdev->msix_enabled)
		pci_disable_msix(pdev);

	if (dev->bar) {
		iounmap(dev->bar);
		dev->bar = NULL;
		pci_release_regions(pdev);
	}

	if (pci_is_enabled(pdev))
		pci_disable_device(pdev);
}

struct nvme_delq_ctx {
	struct task_struct *waiter;
	struct kthread_worker *worker;
	atomic_t refcount;
};

static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
{
	dq->waiter = current;
	mb();

	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (!atomic_read(&dq->refcount))
			break;
		if (!schedule_timeout(ADMIN_TIMEOUT) ||
					fatal_signal_pending(current)) {
			/*
			 * Disable the controller first since we can't trust it
			 * at this point, but leave the admin queue enabled
			 * until all queue deletion requests are flushed.
			 * FIXME: This may take a while if there are more h/w
			 * queues than admin tags.
			 */
			set_current_state(TASK_RUNNING);
			nvme_disable_ctrl(&dev->ctrl,
				lo_hi_readq(dev->bar + NVME_REG_CAP));
			nvme_clear_queue(dev->queues[0]);
			flush_kthread_worker(dq->worker);
			nvme_disable_queue(dev, 0);
			return;
		}
	}
	set_current_state(TASK_RUNNING);
}

static void nvme_put_dq(struct nvme_delq_ctx *dq)
{
	atomic_dec(&dq->refcount);
	if (dq->waiter)
		wake_up_process(dq->waiter);
}

static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq)
{
	atomic_inc(&dq->refcount);
	return dq;
}
struct nvme_delq_ctx {
	struct task_struct *waiter;
	struct kthread_worker *worker;
	atomic_t refcount;
};

static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
{
	dq->waiter = current;
	mb();

	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (!atomic_read(&dq->refcount))
			break;
		if (!schedule_timeout(ADMIN_TIMEOUT) ||
					fatal_signal_pending(current)) {
			/*
			 * Disable the controller first since we can't trust it
			 * at this point, but leave the admin queue enabled
			 * until all queue deletion requests are flushed.
			 * FIXME: This may take a while if there are more h/w
			 * queues than admin tags.
			 */
			set_current_state(TASK_RUNNING);
			nvme_disable_ctrl(&dev->ctrl,
				lo_hi_readq(dev->bar + NVME_REG_CAP));
			nvme_clear_queue(dev->queues[0]);
			flush_kthread_worker(dq->worker);
			nvme_disable_queue(dev, 0);
			return;
		}
	}
	set_current_state(TASK_RUNNING);
}

static void nvme_put_dq(struct nvme_delq_ctx *dq)
{
	atomic_dec(&dq->refcount);
	if (dq->waiter)
		wake_up_process(dq->waiter);
}

static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq)
{
	atomic_inc(&dq->refcount);
	return dq;
}

static void nvme_del_queue_end(struct nvme_queue *nvmeq)
{
	struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;
	nvme_put_dq(dq);

	spin_lock_irq(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
}

static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
						kthread_work_func_t fn)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(nvmeq->qid);

	init_kthread_work(&nvmeq->cmdinfo.work, fn);
	return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo,
							ADMIN_TIMEOUT);
}

static void nvme_del_cq_work_handler(struct kthread_work *work)
{
	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
							cmdinfo.work);
	nvme_del_queue_end(nvmeq);
}

static int nvme_delete_cq(struct nvme_queue *nvmeq)
{
	return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq,
						nvme_del_cq_work_handler);
}

static void nvme_del_sq_work_handler(struct kthread_work *work)
{
	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
							cmdinfo.work);
	int status = nvmeq->cmdinfo.status;

	if (!status)
		status = nvme_delete_cq(nvmeq);
	if (status)
		nvme_del_queue_end(nvmeq);
}

static int nvme_delete_sq(struct nvme_queue *nvmeq)
{
	return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq,
						nvme_del_sq_work_handler);
}

static void nvme_del_queue_start(struct kthread_work *work)
{
	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
							cmdinfo.work);
	if (nvme_delete_sq(nvmeq))
		nvme_del_queue_end(nvmeq);
}

static void nvme_disable_io_queues(struct nvme_dev *dev)
{
	int i;
	DEFINE_KTHREAD_WORKER_ONSTACK(worker);
	struct nvme_delq_ctx dq;
	struct task_struct *kworker_task = kthread_run(kthread_worker_fn,
					&worker, "nvme%d", dev->ctrl.instance);

	if (IS_ERR(kworker_task)) {
		dev_err(dev->dev,
			"Failed to create queue del task\n");
		for (i = dev->queue_count - 1; i > 0; i--)
			nvme_disable_queue(dev, i);
		return;
	}

	dq.waiter = NULL;
	atomic_set(&dq.refcount, 0);
	dq.worker = &worker;
	for (i = dev->queue_count - 1; i > 0; i--) {
		struct nvme_queue *nvmeq = dev->queues[i];

		if (nvme_suspend_queue(nvmeq))
			continue;
		nvmeq->cmdinfo.ctx = nvme_get_dq(&dq);
		nvmeq->cmdinfo.worker = dq.worker;
		init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start);
		queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work);
	}
	nvme_wait_dq(&dq, dev);
	kthread_stop(kworker_task);
}

/*
 * Remove the node from the device list and check
 * for whether or not we need to stop the nvme_thread.
 */
static void nvme_dev_list_remove(struct nvme_dev *dev)
{
	struct task_struct *tmp = NULL;

	spin_lock(&dev_list_lock);
	list_del_init(&dev->node);
	if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) {
		tmp = nvme_thread;
		nvme_thread = NULL;
	}
	spin_unlock(&dev_list_lock);

	if (tmp)
		kthread_stop(tmp);
}
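/*
 * Freeze and stop all namespace queues ahead of a controller shutdown or
 * reset: new submissions block on the frozen queue, requeued requests are
 * parked, and the hardware contexts are stopped so nothing else reaches
 * the device.  nvme_unfreeze_queues() undoes all of this once the
 * controller is back.
 */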
static void nvme_freeze_queues(struct nvme_dev *dev)
{
	struct nvme_ns *ns;

	list_for_each_entry(ns, &dev->namespaces, list) {
		blk_mq_freeze_queue_start(ns->queue);

		spin_lock_irq(ns->queue->queue_lock);
		queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
		spin_unlock_irq(ns->queue->queue_lock);

		blk_mq_cancel_requeue_work(ns->queue);
		blk_mq_stop_hw_queues(ns->queue);
	}
}

static void nvme_unfreeze_queues(struct nvme_dev *dev)
{
	struct nvme_ns *ns;

	list_for_each_entry(ns, &dev->namespaces, list) {
		queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
		blk_mq_unfreeze_queue(ns->queue);
		blk_mq_start_stopped_hw_queues(ns->queue, true);
		blk_mq_kick_requeue_list(ns->queue);
	}
}

static void nvme_dev_shutdown(struct nvme_dev *dev)
{
	int i;
	u32 csts = -1;

	nvme_dev_list_remove(dev);

	if (dev->bar) {
		nvme_freeze_queues(dev);
		csts = readl(dev->bar + NVME_REG_CSTS);
	}
	if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
		for (i = dev->queue_count - 1; i >= 0; i--) {
			struct nvme_queue *nvmeq = dev->queues[i];
			nvme_suspend_queue(nvmeq);
		}
	} else {
		nvme_disable_io_queues(dev);
		nvme_shutdown_ctrl(&dev->ctrl);
		nvme_disable_queue(dev, 0);
	}
	nvme_dev_unmap(dev);

	for (i = dev->queue_count - 1; i >= 0; i--)
		nvme_clear_queue(dev->queues[i]);
}

static void nvme_dev_remove(struct nvme_dev *dev)
{
	struct nvme_ns *ns, *next;

	list_for_each_entry_safe(ns, next, &dev->namespaces, list)
		nvme_ns_remove(ns);
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

static DEFINE_IDA(nvme_instance_ida);

static int nvme_set_instance(struct nvme_dev *dev)
{
	int instance, error;

	do {
		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
			return -ENODEV;

		spin_lock(&dev_list_lock);
		error = ida_get_new(&nvme_instance_ida, &instance);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);

	if (error)
		return -ENODEV;

	dev->ctrl.instance = instance;
	return 0;
}

static void nvme_release_instance(struct nvme_dev *dev)
{
	spin_lock(&dev_list_lock);
	ida_remove(&nvme_instance_ida, dev->ctrl.instance);
	spin_unlock(&dev_list_lock);
}
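/*
 * Final teardown, invoked through ctrl->ops->free_ctrl once the last
 * reference to the controller is dropped.  Releases everything that
 * nvme_probe() allocated, in roughly the reverse order.
 */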
static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	put_device(dev->dev);
	put_device(dev->device);
	nvme_release_instance(dev);
	if (dev->tagset.tags)
		blk_mq_free_tag_set(&dev->tagset);
	if (dev->ctrl.admin_q)
		blk_put_queue(dev->ctrl.admin_q);
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
}

static int nvme_dev_open(struct inode *inode, struct file *f)
{
	struct nvme_dev *dev;
	int instance = iminor(inode);
	int ret = -ENODEV;

	spin_lock(&dev_list_lock);
	list_for_each_entry(dev, &dev_list, node) {
		if (dev->ctrl.instance == instance) {
			if (!dev->ctrl.admin_q) {
				ret = -EWOULDBLOCK;
				break;
			}
			if (!kref_get_unless_zero(&dev->ctrl.kref))
				break;
			f->private_data = dev;
			ret = 0;
			break;
		}
	}
	spin_unlock(&dev_list_lock);

	return ret;
}

static int nvme_dev_release(struct inode *inode, struct file *f)
{
	struct nvme_dev *dev = f->private_data;

	nvme_put_ctrl(&dev->ctrl);
	return 0;
}

static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	struct nvme_dev *dev = f->private_data;
	struct nvme_ns *ns;

	switch (cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_cmd(&dev->ctrl, NULL, (void __user *)arg);
	case NVME_IOCTL_IO_CMD:
		if (list_empty(&dev->namespaces))
			return -ENOTTY;
		ns = list_first_entry(&dev->namespaces, struct nvme_ns, list);
		return nvme_user_cmd(&dev->ctrl, ns, (void __user *)arg);
	case NVME_IOCTL_RESET:
		dev_warn(dev->dev, "resetting controller\n");
		return nvme_reset(dev);
	case NVME_IOCTL_SUBSYS_RESET:
		return nvme_subsys_reset(dev);
	default:
		return -ENOTTY;
	}
}

static const struct file_operations nvme_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_dev_open,
	.release	= nvme_dev_release,
	.unlocked_ioctl	= nvme_dev_ioctl,
	.compat_ioctl	= nvme_dev_ioctl,
};
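/*
 * Bring the controller up: map its registers, configure the admin queue,
 * make sure the shared polling kthread is running (the first device to
 * probe starts it), then allocate admin tags and I/O queues before handing
 * off to namespace scanning.  If initialization fails and no reset is
 * already in flight, the controller is declared dead and scheduled for
 * removal.
 */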
static void nvme_probe_work(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
	bool start_thread = false;
	int result;

	result = nvme_dev_map(dev);
	if (result)
		goto out;

	result = nvme_configure_admin_queue(dev);
	if (result)
		goto unmap;

	spin_lock(&dev_list_lock);
	if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
		start_thread = true;
		nvme_thread = NULL;
	}
	list_add(&dev->node, &dev_list);
	spin_unlock(&dev_list_lock);

	if (start_thread) {
		nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
		wake_up_all(&nvme_kthread_wait);
	} else
		wait_event_killable(nvme_kthread_wait, nvme_thread);

	if (IS_ERR_OR_NULL(nvme_thread)) {
		result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
		goto disable;
	}

	nvme_init_queue(dev->queues[0], 0);
	result = nvme_alloc_admin_tags(dev);
	if (result)
		goto disable;

	result = nvme_setup_io_queues(dev);
	if (result)
		goto free_tags;

	dev->ctrl.event_limit = 1;

	/*
	 * Keep the controller around but remove all namespaces if we don't
	 * have any working I/O queue.
	 */
	if (dev->online_queues < 2) {
		dev_warn(dev->dev, "IO queues not created\n");
		nvme_dev_remove(dev);
	} else {
		nvme_unfreeze_queues(dev);
		nvme_dev_add(dev);
	}

	return;

 free_tags:
	nvme_dev_remove_admin(dev);
	blk_put_queue(dev->ctrl.admin_q);
	dev->ctrl.admin_q = NULL;
	dev->queues[0]->tags = NULL;
 disable:
	nvme_disable_queue(dev, 0);
	nvme_dev_list_remove(dev);
 unmap:
	nvme_dev_unmap(dev);
 out:
	if (!work_busy(&dev->reset_work))
		nvme_dead_ctrl(dev);
}

static int nvme_remove_dead_ctrl(void *arg)
{
	struct nvme_dev *dev = (struct nvme_dev *)arg;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_get_drvdata(pdev))
		pci_stop_and_remove_bus_device_locked(pdev);
	nvme_put_ctrl(&dev->ctrl);
	return 0;
}

static void nvme_dead_ctrl(struct nvme_dev *dev)
{
	dev_warn(dev->dev, "Device failed to resume\n");
	kref_get(&dev->ctrl.kref);
	if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
						dev->ctrl.instance))) {
		dev_err(dev->dev,
			"Failed to start controller remove task\n");
		nvme_put_ctrl(&dev->ctrl);
	}
}

static void nvme_reset_work(struct work_struct *ws)
{
	struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
	bool in_probe = work_busy(&dev->probe_work);

	nvme_dev_shutdown(dev);

	/*
	 * Synchronize with device probe so that work will see failure status
	 * and exit gracefully without trying to schedule another reset.
	 */
	flush_work(&dev->probe_work);

	/*
	 * Fail this device if reset occurred during probe to avoid
	 * infinite initialization loops.
	 */
	if (in_probe) {
		nvme_dead_ctrl(dev);
		return;
	}
	/*
	 * Schedule device resume asynchronously so the reset work is
	 * available to clean up errors that may occur during
	 * reinitialization.
	 */
	schedule_work(&dev->probe_work);
}

static int __nvme_reset(struct nvme_dev *dev)
{
	if (work_pending(&dev->reset_work))
		return -EBUSY;
	list_del_init(&dev->node);
	queue_work(nvme_workq, &dev->reset_work);
	return 0;
}

static int nvme_reset(struct nvme_dev *dev)
{
	int ret;

	if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
		return -ENODEV;

	spin_lock(&dev_list_lock);
	ret = __nvme_reset(dev);
	spin_unlock(&dev_list_lock);

	if (!ret) {
		flush_work(&dev->reset_work);
		flush_work(&dev->probe_work);
		return 0;
	}

	return ret;
}

static ssize_t nvme_sysfs_reset(struct device *dev,
				struct device_attribute *attr, const char *buf,
				size_t count)
{
	struct nvme_dev *ndev = dev_get_drvdata(dev);
	int ret;

	ret = nvme_reset(ndev);
	if (ret < 0)
		return ret;

	return count;
}
static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);

static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
	*val = readl(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
{
	writel(val, to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
	*val = readq(to_nvme_dev(ctrl)->bar + off);
	return 0;
}
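/*
 * Register accessors handed to the transport-independent core code.  On
 * PCIe these are simple MMIO reads and writes into the memory-mapped BAR.
 */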
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
	.reg_read32	= nvme_pci_reg_read32,
	.reg_write32	= nvme_pci_reg_write32,
	.reg_read64	= nvme_pci_reg_read64,
	.free_ctrl	= nvme_pci_free_ctrl,
};

static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int node, result = -ENOMEM;
	struct nvme_dev *dev;

	node = dev_to_node(&pdev->dev);
	if (node == NUMA_NO_NODE)
		set_dev_node(&pdev->dev, 0);

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
	if (!dev)
		return -ENOMEM;
	dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),
							GFP_KERNEL, node);
	if (!dev->entry)
		goto free;
	dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
							GFP_KERNEL, node);
	if (!dev->queues)
		goto free;

	INIT_LIST_HEAD(&dev->namespaces);
	INIT_WORK(&dev->reset_work, nvme_reset_work);
	dev->dev = get_device(&pdev->dev);
	pci_set_drvdata(pdev, dev);

	dev->ctrl.ops = &nvme_pci_ctrl_ops;
	dev->ctrl.dev = dev->dev;
	dev->ctrl.quirks = id->driver_data;

	result = nvme_set_instance(dev);
	if (result)
		goto put_pci;

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto release;

	kref_init(&dev->ctrl.kref);
	dev->device = device_create(nvme_class, &pdev->dev,
				MKDEV(nvme_char_major, dev->ctrl.instance),
				dev, "nvme%d", dev->ctrl.instance);
	if (IS_ERR(dev->device)) {
		result = PTR_ERR(dev->device);
		goto release_pools;
	}
	get_device(dev->device);
	dev_set_drvdata(dev->device, dev);

	result = device_create_file(dev->device, &dev_attr_reset_controller);
	if (result)
		goto put_dev;

	INIT_LIST_HEAD(&dev->node);
	INIT_WORK(&dev->scan_work, nvme_dev_scan);
	INIT_WORK(&dev->probe_work, nvme_probe_work);
	schedule_work(&dev->probe_work);
	return 0;

 put_dev:
	device_destroy(nvme_class, MKDEV(nvme_char_major, dev->ctrl.instance));
	put_device(dev->device);
 release_pools:
	nvme_release_prp_pools(dev);
 release:
	nvme_release_instance(dev);
 put_pci:
	put_device(dev->dev);
 free:
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
	return result;
}

static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	if (prepare)
		nvme_dev_shutdown(dev);
	else
		schedule_work(&dev->probe_work);
}

static void nvme_shutdown(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	nvme_dev_shutdown(dev);
}
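/*
 * Orderly removal: take the device off the polling list, detach the PCI
 * driver data, and flush any probe/reset/scan work still in progress
 * before tearing down namespaces, queues, and resources.  The final
 * nvme_put_ctrl() drops the reference taken at probe time.
 */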
static void nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	spin_lock(&dev_list_lock);
	list_del_init(&dev->node);
	spin_unlock(&dev_list_lock);

	pci_set_drvdata(pdev, NULL);
	flush_work(&dev->probe_work);
	flush_work(&dev->reset_work);
	flush_work(&dev->scan_work);
	device_remove_file(dev->device, &dev_attr_reset_controller);
	nvme_dev_remove(dev);
	nvme_dev_shutdown(dev);
	nvme_dev_remove_admin(dev);
	device_destroy(nvme_class, MKDEV(nvme_char_major, dev->ctrl.instance));
	nvme_free_queues(dev, 0);
	nvme_release_cmb(dev);
	nvme_release_prp_pools(dev);
	nvme_put_ctrl(&dev->ctrl);
}

/* These functions are yet to be implemented */
#define nvme_error_detected NULL
#define nvme_dump_registers NULL
#define nvme_link_reset NULL
#define nvme_slot_reset NULL
#define nvme_error_resume NULL

#ifdef CONFIG_PM_SLEEP
static int nvme_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	nvme_dev_shutdown(ndev);
	return 0;
}

static int nvme_resume(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	schedule_work(&ndev->probe_work);
	return 0;
}
#endif

static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);

static const struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.mmio_enabled	= nvme_dump_registers,
	.link_reset	= nvme_link_reset,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
	.reset_notify	= nvme_reset_notify,
};

/* Move to pci_ids.h later */
#define PCI_CLASS_STORAGE_EXPRESS	0x010802

static const struct pci_device_id nvme_id_table[] = {
	{ PCI_VDEVICE(INTEL, 0x0953),
		.driver_data = NVME_QUIRK_STRIPE_SIZE, },
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= nvme_remove,
	.shutdown	= nvme_shutdown,
	.driver		= {
		.pm	= &nvme_dev_pm_ops,
	},
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
	int result;

	init_waitqueue_head(&nvme_kthread_wait);

	nvme_workq = create_singlethread_workqueue("nvme");
	if (!nvme_workq)
		return -ENOMEM;

	result = register_blkdev(nvme_major, "nvme");
	if (result < 0)
		goto kill_workq;
	else if (result > 0)
		nvme_major = result;

	result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
							&nvme_dev_fops);
	if (result < 0)
		goto unregister_blkdev;
	else if (result > 0)
		nvme_char_major = result;

	nvme_class = class_create(THIS_MODULE, "nvme");
	if (IS_ERR(nvme_class)) {
		result = PTR_ERR(nvme_class);
		goto unregister_chrdev;
	}

	result = pci_register_driver(&nvme_driver);
	if (result)
		goto destroy_class;
	return 0;

 destroy_class:
	class_destroy(nvme_class);
 unregister_chrdev:
	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
 unregister_blkdev:
	unregister_blkdev(nvme_major, "nvme");
 kill_workq:
	destroy_workqueue(nvme_workq);
	return result;
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	unregister_blkdev(nvme_major, "nvme");
	destroy_workqueue(nvme_workq);
	class_destroy(nvme_class);
	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
	BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
	_nvme_check_size();
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_init);
module_exit(nvme_exit);