1 /* 2 * NVM Express device driver 3 * Copyright (c) 2011-2014, Intel Corporation. 4 * 5 * This program is free software; you can redistribute it and/or modify it 6 * under the terms and conditions of the GNU General Public License, 7 * version 2, as published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 * more details. 13 */ 14 15 #include <linux/bitops.h> 16 #include <linux/blkdev.h> 17 #include <linux/blk-mq.h> 18 #include <linux/cpu.h> 19 #include <linux/delay.h> 20 #include <linux/errno.h> 21 #include <linux/fs.h> 22 #include <linux/genhd.h> 23 #include <linux/hdreg.h> 24 #include <linux/idr.h> 25 #include <linux/init.h> 26 #include <linux/interrupt.h> 27 #include <linux/io.h> 28 #include <linux/kdev_t.h> 29 #include <linux/kthread.h> 30 #include <linux/kernel.h> 31 #include <linux/mm.h> 32 #include <linux/module.h> 33 #include <linux/moduleparam.h> 34 #include <linux/mutex.h> 35 #include <linux/pci.h> 36 #include <linux/poison.h> 37 #include <linux/ptrace.h> 38 #include <linux/sched.h> 39 #include <linux/slab.h> 40 #include <linux/t10-pi.h> 41 #include <linux/types.h> 42 #include <linux/io-64-nonatomic-lo-hi.h> 43 #include <asm/unaligned.h> 44 45 #include "nvme.h" 46 47 #define NVME_Q_DEPTH 1024 48 #define NVME_AQ_DEPTH 256 49 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) 50 #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) 51 52 unsigned char admin_timeout = 60; 53 module_param(admin_timeout, byte, 0644); 54 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); 55 56 unsigned char nvme_io_timeout = 30; 57 module_param_named(io_timeout, nvme_io_timeout, byte, 0644); 58 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); 59 60 unsigned char shutdown_timeout = 5; 61 module_param(shutdown_timeout, byte, 0644); 62 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); 63 64 static int use_threaded_interrupts; 65 module_param(use_threaded_interrupts, int, 0); 66 67 static bool use_cmb_sqes = true; 68 module_param(use_cmb_sqes, bool, 0644); 69 MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); 70 71 static LIST_HEAD(dev_list); 72 static struct task_struct *nvme_thread; 73 static struct workqueue_struct *nvme_workq; 74 static wait_queue_head_t nvme_kthread_wait; 75 76 struct nvme_dev; 77 struct nvme_queue; 78 struct nvme_iod; 79 80 static int nvme_reset(struct nvme_dev *dev); 81 static void nvme_process_cq(struct nvme_queue *nvmeq); 82 static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_iod *iod); 83 static void nvme_dead_ctrl(struct nvme_dev *dev); 84 static void nvme_dev_shutdown(struct nvme_dev *dev); 85 86 struct async_cmd_info { 87 struct kthread_work work; 88 struct kthread_worker *worker; 89 struct request *req; 90 u32 result; 91 int status; 92 void *ctx; 93 }; 94 95 /* 96 * Represents an NVM Express device. Each nvme_dev is a PCI function. 
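 * It owns the BAR mapping, the MSI-X entries, the admin and I/O blk-mq tag
 * sets, the PRP DMA pools and the per-queue nvme_queue structures.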
97 */ 98 struct nvme_dev { 99 struct list_head node; 100 struct nvme_queue **queues; 101 struct blk_mq_tag_set tagset; 102 struct blk_mq_tag_set admin_tagset; 103 u32 __iomem *dbs; 104 struct device *dev; 105 struct dma_pool *prp_page_pool; 106 struct dma_pool *prp_small_pool; 107 unsigned queue_count; 108 unsigned online_queues; 109 unsigned max_qid; 110 int q_depth; 111 u32 db_stride; 112 struct msix_entry *entry; 113 void __iomem *bar; 114 struct work_struct reset_work; 115 struct work_struct scan_work; 116 struct mutex shutdown_lock; 117 bool subsystem; 118 void __iomem *cmb; 119 dma_addr_t cmb_dma_addr; 120 u64 cmb_size; 121 u32 cmbsz; 122 unsigned long flags; 123 #define NVME_CTRL_RESETTING 0 124 125 struct nvme_ctrl ctrl; 126 }; 127 128 static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) 129 { 130 return container_of(ctrl, struct nvme_dev, ctrl); 131 } 132 133 /* 134 * An NVM Express queue. Each device has at least two (one for admin 135 * commands and one for I/O commands). 136 */ 137 struct nvme_queue { 138 struct device *q_dmadev; 139 struct nvme_dev *dev; 140 char irqname[24]; /* nvme4294967295-65535\0 */ 141 spinlock_t q_lock; 142 struct nvme_command *sq_cmds; 143 struct nvme_command __iomem *sq_cmds_io; 144 volatile struct nvme_completion *cqes; 145 struct blk_mq_tags **tags; 146 dma_addr_t sq_dma_addr; 147 dma_addr_t cq_dma_addr; 148 u32 __iomem *q_db; 149 u16 q_depth; 150 s16 cq_vector; 151 u16 sq_head; 152 u16 sq_tail; 153 u16 cq_head; 154 u16 qid; 155 u8 cq_phase; 156 u8 cqe_seen; 157 struct async_cmd_info cmdinfo; 158 }; 159 160 /* 161 * The nvme_iod describes the data in an I/O, including the list of PRP 162 * entries. You can't see it in this data structure because C doesn't let 163 * me express that. Use nvme_alloc_iod to ensure there's enough space 164 * allocated to store the PRP list. 165 */ 166 struct nvme_iod { 167 unsigned long private; /* For the use of the submitter of the I/O */ 168 int npages; /* In the PRP list. 
0 means small pool in use */
	int offset;		/* Of PRP list */
	int nents;		/* Used in scatterlist */
	int length;		/* Of data, in bytes */
	dma_addr_t first_dma;
	struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */
	struct scatterlist sg[0];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}

typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
						struct nvme_completion *);

struct nvme_cmd_info {
	nvme_completion_fn fn;
	void *ctx;
	int aborted;
	struct nvme_queue *nvmeq;
	struct nvme_iod iod[0];
};

/*
 * Max size of iod being embedded in the request payload
 */
#define NVME_INT_PAGES		2
#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->ctrl.page_size)
#define NVME_INT_MASK		0x01

/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size, struct nvme_dev *dev)
{
	unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size,
						dev->ctrl.page_size);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

static unsigned int nvme_cmd_size(struct nvme_dev *dev)
{
	unsigned int ret = sizeof(struct nvme_cmd_info);

	ret += sizeof(struct nvme_iod);
	ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
	ret += sizeof(struct scatterlist) * NVME_INT_PAGES;

	return ret;
}

static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
				unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = dev->queues[0];

	WARN_ON(hctx_idx != 0);
	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
	WARN_ON(nvmeq->tags);

	hctx->driver_data = nvmeq;
	nvmeq->tags = &dev->admin_tagset.tags[0];
	return 0;
}

static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	nvmeq->tags = NULL;
}

static int nvme_admin_init_request(void *data, struct request *req,
				unsigned int hctx_idx, unsigned int rq_idx,
				unsigned int numa_node)
{
	struct nvme_dev *dev = data;
	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = dev->queues[0];

	BUG_ON(!nvmeq);
	cmd->nvmeq = nvmeq;
	return 0;
}

static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

	if (!nvmeq->tags)
		nvmeq->tags = &dev->tagset.tags[hctx_idx];

	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
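	/* Cache the queue pointer; nvme_queue_rq() retrieves it via hctx->driver_data. */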
282 hctx->driver_data = nvmeq; 283 return 0; 284 } 285 286 static int nvme_init_request(void *data, struct request *req, 287 unsigned int hctx_idx, unsigned int rq_idx, 288 unsigned int numa_node) 289 { 290 struct nvme_dev *dev = data; 291 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); 292 struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; 293 294 BUG_ON(!nvmeq); 295 cmd->nvmeq = nvmeq; 296 return 0; 297 } 298 299 static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, 300 nvme_completion_fn handler) 301 { 302 cmd->fn = handler; 303 cmd->ctx = ctx; 304 cmd->aborted = 0; 305 blk_mq_start_request(blk_mq_rq_from_pdu(cmd)); 306 } 307 308 static void *iod_get_private(struct nvme_iod *iod) 309 { 310 return (void *) (iod->private & ~0x1UL); 311 } 312 313 /* 314 * If bit 0 is set, the iod is embedded in the request payload. 315 */ 316 static bool iod_should_kfree(struct nvme_iod *iod) 317 { 318 return (iod->private & NVME_INT_MASK) == 0; 319 } 320 321 /* Special values must be less than 0x1000 */ 322 #define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) 323 #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) 324 #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) 325 #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) 326 327 static void special_completion(struct nvme_queue *nvmeq, void *ctx, 328 struct nvme_completion *cqe) 329 { 330 if (ctx == CMD_CTX_CANCELLED) 331 return; 332 if (ctx == CMD_CTX_COMPLETED) { 333 dev_warn(nvmeq->q_dmadev, 334 "completed id %d twice on queue %d\n", 335 cqe->command_id, le16_to_cpup(&cqe->sq_id)); 336 return; 337 } 338 if (ctx == CMD_CTX_INVALID) { 339 dev_warn(nvmeq->q_dmadev, 340 "invalid id %d completed on queue %d\n", 341 cqe->command_id, le16_to_cpup(&cqe->sq_id)); 342 return; 343 } 344 dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); 345 } 346 347 static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn) 348 { 349 void *ctx; 350 351 if (fn) 352 *fn = cmd->fn; 353 ctx = cmd->ctx; 354 cmd->fn = special_completion; 355 cmd->ctx = CMD_CTX_CANCELLED; 356 return ctx; 357 } 358 359 static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, 360 struct nvme_completion *cqe) 361 { 362 u32 result = le32_to_cpup(&cqe->result); 363 u16 status = le16_to_cpup(&cqe->status) >> 1; 364 365 if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) 366 ++nvmeq->dev->ctrl.event_limit; 367 if (status != NVME_SC_SUCCESS) 368 return; 369 370 switch (result & 0xff07) { 371 case NVME_AER_NOTICE_NS_CHANGED: 372 dev_info(nvmeq->q_dmadev, "rescanning\n"); 373 schedule_work(&nvmeq->dev->scan_work); 374 default: 375 dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result); 376 } 377 } 378 379 static void abort_completion(struct nvme_queue *nvmeq, void *ctx, 380 struct nvme_completion *cqe) 381 { 382 struct request *req = ctx; 383 384 u16 status = le16_to_cpup(&cqe->status) >> 1; 385 u32 result = le32_to_cpup(&cqe->result); 386 387 blk_mq_free_request(req); 388 389 dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); 390 ++nvmeq->dev->ctrl.abort_limit; 391 } 392 393 static void async_completion(struct nvme_queue *nvmeq, void *ctx, 394 struct nvme_completion *cqe) 395 { 396 struct async_cmd_info *cmdinfo = ctx; 397 cmdinfo->result = le32_to_cpup(&cqe->result); 398 cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; 399 queue_kthread_work(cmdinfo->worker, &cmdinfo->work); 400 blk_mq_free_request(cmdinfo->req); 401 } 402 403 static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, 404 
unsigned int tag) 405 { 406 struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag); 407 408 return blk_mq_rq_to_pdu(req); 409 } 410 411 /* 412 * Called with local interrupts disabled and the q_lock held. May not sleep. 413 */ 414 static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag, 415 nvme_completion_fn *fn) 416 { 417 struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag); 418 void *ctx; 419 if (tag >= nvmeq->q_depth) { 420 *fn = special_completion; 421 return CMD_CTX_INVALID; 422 } 423 if (fn) 424 *fn = cmd->fn; 425 ctx = cmd->ctx; 426 cmd->fn = special_completion; 427 cmd->ctx = CMD_CTX_COMPLETED; 428 return ctx; 429 } 430 431 /** 432 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell 433 * @nvmeq: The queue to use 434 * @cmd: The command to send 435 * 436 * Safe to use from interrupt context 437 */ 438 static void __nvme_submit_cmd(struct nvme_queue *nvmeq, 439 struct nvme_command *cmd) 440 { 441 u16 tail = nvmeq->sq_tail; 442 443 if (nvmeq->sq_cmds_io) 444 memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd)); 445 else 446 memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); 447 448 if (++tail == nvmeq->q_depth) 449 tail = 0; 450 writel(tail, nvmeq->q_db); 451 nvmeq->sq_tail = tail; 452 } 453 454 static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) 455 { 456 unsigned long flags; 457 spin_lock_irqsave(&nvmeq->q_lock, flags); 458 __nvme_submit_cmd(nvmeq, cmd); 459 spin_unlock_irqrestore(&nvmeq->q_lock, flags); 460 } 461 462 static __le64 **iod_list(struct nvme_iod *iod) 463 { 464 return ((void *)iod) + iod->offset; 465 } 466 467 static inline void iod_init(struct nvme_iod *iod, unsigned nbytes, 468 unsigned nseg, unsigned long private) 469 { 470 iod->private = private; 471 iod->offset = offsetof(struct nvme_iod, sg[nseg]); 472 iod->npages = -1; 473 iod->length = nbytes; 474 iod->nents = 0; 475 } 476 477 static struct nvme_iod * 478 __nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev, 479 unsigned long priv, gfp_t gfp) 480 { 481 struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + 482 sizeof(__le64 *) * nvme_npages(bytes, dev) + 483 sizeof(struct scatterlist) * nseg, gfp); 484 485 if (iod) 486 iod_init(iod, bytes, nseg, priv); 487 488 return iod; 489 } 490 491 static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev, 492 gfp_t gfp) 493 { 494 unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? 
blk_rq_bytes(rq) : 495 sizeof(struct nvme_dsm_range); 496 struct nvme_iod *iod; 497 498 if (rq->nr_phys_segments <= NVME_INT_PAGES && 499 size <= NVME_INT_BYTES(dev)) { 500 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq); 501 502 iod = cmd->iod; 503 iod_init(iod, size, rq->nr_phys_segments, 504 (unsigned long) rq | NVME_INT_MASK); 505 return iod; 506 } 507 508 return __nvme_alloc_iod(rq->nr_phys_segments, size, dev, 509 (unsigned long) rq, gfp); 510 } 511 512 static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) 513 { 514 const int last_prp = dev->ctrl.page_size / 8 - 1; 515 int i; 516 __le64 **list = iod_list(iod); 517 dma_addr_t prp_dma = iod->first_dma; 518 519 if (iod->npages == 0) 520 dma_pool_free(dev->prp_small_pool, list[0], prp_dma); 521 for (i = 0; i < iod->npages; i++) { 522 __le64 *prp_list = list[i]; 523 dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); 524 dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); 525 prp_dma = next_prp_dma; 526 } 527 528 if (iod_should_kfree(iod)) 529 kfree(iod); 530 } 531 532 #ifdef CONFIG_BLK_DEV_INTEGRITY 533 static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) 534 { 535 if (be32_to_cpu(pi->ref_tag) == v) 536 pi->ref_tag = cpu_to_be32(p); 537 } 538 539 static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) 540 { 541 if (be32_to_cpu(pi->ref_tag) == p) 542 pi->ref_tag = cpu_to_be32(v); 543 } 544 545 /** 546 * nvme_dif_remap - remaps ref tags to bip seed and physical lba 547 * 548 * The virtual start sector is the one that was originally submitted by the 549 * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical 550 * start sector may be different. Remap protection information to match the 551 * physical LBA on writes, and back to the original seed on reads. 552 * 553 * Type 0 and 3 do not have a ref tag, so no remapping required. 
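 * The tuples are rewritten in place through a kmap_atomic() mapping of the
 * bio's integrity buffer, one tuple per logical block.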
554 */ 555 static void nvme_dif_remap(struct request *req, 556 void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) 557 { 558 struct nvme_ns *ns = req->rq_disk->private_data; 559 struct bio_integrity_payload *bip; 560 struct t10_pi_tuple *pi; 561 void *p, *pmap; 562 u32 i, nlb, ts, phys, virt; 563 564 if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3) 565 return; 566 567 bip = bio_integrity(req->bio); 568 if (!bip) 569 return; 570 571 pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; 572 573 p = pmap; 574 virt = bip_get_seed(bip); 575 phys = nvme_block_nr(ns, blk_rq_pos(req)); 576 nlb = (blk_rq_bytes(req) >> ns->lba_shift); 577 ts = ns->disk->queue->integrity.tuple_size; 578 579 for (i = 0; i < nlb; i++, virt++, phys++) { 580 pi = (struct t10_pi_tuple *)p; 581 dif_swap(phys, virt, pi); 582 p += ts; 583 } 584 kunmap_atomic(pmap); 585 } 586 #else /* CONFIG_BLK_DEV_INTEGRITY */ 587 static void nvme_dif_remap(struct request *req, 588 void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) 589 { 590 } 591 static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) 592 { 593 } 594 static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) 595 { 596 } 597 #endif 598 599 static void req_completion(struct nvme_queue *nvmeq, void *ctx, 600 struct nvme_completion *cqe) 601 { 602 struct nvme_iod *iod = ctx; 603 struct request *req = iod_get_private(iod); 604 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); 605 u16 status = le16_to_cpup(&cqe->status) >> 1; 606 int error = 0; 607 608 if (unlikely(status)) { 609 if (!(status & NVME_SC_DNR || blk_noretry_request(req)) 610 && (jiffies - req->start_time) < req->timeout) { 611 unsigned long flags; 612 613 nvme_unmap_data(nvmeq->dev, iod); 614 615 blk_mq_requeue_request(req); 616 spin_lock_irqsave(req->q->queue_lock, flags); 617 if (!blk_queue_stopped(req->q)) 618 blk_mq_kick_requeue_list(req->q); 619 spin_unlock_irqrestore(req->q->queue_lock, flags); 620 return; 621 } 622 623 if (req->cmd_type == REQ_TYPE_DRV_PRIV) { 624 if (cmd_rq->ctx == CMD_CTX_CANCELLED) 625 error = NVME_SC_CANCELLED; 626 else 627 error = status; 628 } else { 629 error = nvme_error_status(status); 630 } 631 } 632 633 if (req->cmd_type == REQ_TYPE_DRV_PRIV) { 634 u32 result = le32_to_cpup(&cqe->result); 635 req->special = (void *)(uintptr_t)result; 636 } 637 638 if (cmd_rq->aborted) 639 dev_warn(nvmeq->dev->dev, 640 "completing aborted command with status:%04x\n", 641 error); 642 643 nvme_unmap_data(nvmeq->dev, iod); 644 blk_mq_complete_request(req, error); 645 } 646 647 static bool nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, 648 int total_len) 649 { 650 struct dma_pool *pool; 651 int length = total_len; 652 struct scatterlist *sg = iod->sg; 653 int dma_len = sg_dma_len(sg); 654 u64 dma_addr = sg_dma_address(sg); 655 u32 page_size = dev->ctrl.page_size; 656 int offset = dma_addr & (page_size - 1); 657 __le64 *prp_list; 658 __le64 **list = iod_list(iod); 659 dma_addr_t prp_dma; 660 int nprps, i; 661 662 length -= (page_size - offset); 663 if (length <= 0) 664 return true; 665 666 dma_len -= (page_size - offset); 667 if (dma_len) { 668 dma_addr += (page_size - offset); 669 } else { 670 sg = sg_next(sg); 671 dma_addr = sg_dma_address(sg); 672 dma_len = sg_dma_len(sg); 673 } 674 675 if (length <= page_size) { 676 iod->first_dma = dma_addr; 677 return true; 678 } 679 680 nprps = DIV_ROUND_UP(length, page_size); 681 if (nprps <= (256 / 8)) { 682 pool = dev->prp_small_pool; 683 iod->npages = 0; 684 } else { 685 pool = 
dev->prp_page_pool; 686 iod->npages = 1; 687 } 688 689 prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); 690 if (!prp_list) { 691 iod->first_dma = dma_addr; 692 iod->npages = -1; 693 return false; 694 } 695 list[0] = prp_list; 696 iod->first_dma = prp_dma; 697 i = 0; 698 for (;;) { 699 if (i == page_size >> 3) { 700 __le64 *old_prp_list = prp_list; 701 prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); 702 if (!prp_list) 703 return false; 704 list[iod->npages++] = prp_list; 705 prp_list[0] = old_prp_list[i - 1]; 706 old_prp_list[i - 1] = cpu_to_le64(prp_dma); 707 i = 1; 708 } 709 prp_list[i++] = cpu_to_le64(dma_addr); 710 dma_len -= page_size; 711 dma_addr += page_size; 712 length -= page_size; 713 if (length <= 0) 714 break; 715 if (dma_len > 0) 716 continue; 717 BUG_ON(dma_len < 0); 718 sg = sg_next(sg); 719 dma_addr = sg_dma_address(sg); 720 dma_len = sg_dma_len(sg); 721 } 722 723 return true; 724 } 725 726 static int nvme_map_data(struct nvme_dev *dev, struct nvme_iod *iod, 727 struct nvme_command *cmnd) 728 { 729 struct request *req = iod_get_private(iod); 730 struct request_queue *q = req->q; 731 enum dma_data_direction dma_dir = rq_data_dir(req) ? 732 DMA_TO_DEVICE : DMA_FROM_DEVICE; 733 int ret = BLK_MQ_RQ_QUEUE_ERROR; 734 735 sg_init_table(iod->sg, req->nr_phys_segments); 736 iod->nents = blk_rq_map_sg(q, req, iod->sg); 737 if (!iod->nents) 738 goto out; 739 740 ret = BLK_MQ_RQ_QUEUE_BUSY; 741 if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir)) 742 goto out; 743 744 if (!nvme_setup_prps(dev, iod, blk_rq_bytes(req))) 745 goto out_unmap; 746 747 ret = BLK_MQ_RQ_QUEUE_ERROR; 748 if (blk_integrity_rq(req)) { 749 if (blk_rq_count_integrity_sg(q, req->bio) != 1) 750 goto out_unmap; 751 752 sg_init_table(iod->meta_sg, 1); 753 if (blk_rq_map_integrity_sg(q, req->bio, iod->meta_sg) != 1) 754 goto out_unmap; 755 756 if (rq_data_dir(req)) 757 nvme_dif_remap(req, nvme_dif_prep); 758 759 if (!dma_map_sg(dev->dev, iod->meta_sg, 1, dma_dir)) 760 goto out_unmap; 761 } 762 763 cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); 764 cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); 765 if (blk_integrity_rq(req)) 766 cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg)); 767 return BLK_MQ_RQ_QUEUE_OK; 768 769 out_unmap: 770 dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); 771 out: 772 return ret; 773 } 774 775 static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_iod *iod) 776 { 777 struct request *req = iod_get_private(iod); 778 enum dma_data_direction dma_dir = rq_data_dir(req) ? 779 DMA_TO_DEVICE : DMA_FROM_DEVICE; 780 781 if (iod->nents) { 782 dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); 783 if (blk_integrity_rq(req)) { 784 if (!rq_data_dir(req)) 785 nvme_dif_remap(req, nvme_dif_complete); 786 dma_unmap_sg(dev->dev, iod->meta_sg, 1, dma_dir); 787 } 788 } 789 790 nvme_free_iod(dev, iod); 791 } 792 793 /* 794 * We reuse the small pool to allocate the 16-byte range here as it is not 795 * worth having a special pool for these or additional cases to handle freeing 796 * the iod. 
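 * The range is referenced through iod->first_dma and is freed again by
 * nvme_free_iod() from the small pool, since iod->npages is set to zero.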
 */
static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
		struct nvme_iod *iod, struct nvme_command *cmnd)
{
	struct request *req = iod_get_private(iod);
	struct nvme_dsm_range *range;

	range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
						&iod->first_dma);
	if (!range)
		return BLK_MQ_RQ_QUEUE_BUSY;
	iod_list(iod)[0] = (__le64 *)range;
	iod->npages = 0;

	range->cattr = cpu_to_le32(0);
	range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
	range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->dsm.opcode = nvme_cmd_dsm;
	cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
	cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
	cmnd->dsm.nr = 0;
	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
	return BLK_MQ_RQ_QUEUE_OK;
}

/*
 * NOTE: ns is NULL when called on the admin queue.
 */
static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
			 const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *req = bd->rq;
	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
	struct nvme_iod *iod;
	struct nvme_command cmnd;
	int ret = BLK_MQ_RQ_QUEUE_OK;

	/*
	 * If formatted with metadata, require that the block layer provide a
	 * buffer unless this namespace is formatted such that the metadata
	 * can be stripped/generated by the controller with PRACT=1.
	 */
	if (ns && ns->ms && !blk_integrity_rq(req)) {
		if (!(ns->pi_type && ns->ms == 8) &&
					req->cmd_type != REQ_TYPE_DRV_PRIV) {
			blk_mq_complete_request(req, -EFAULT);
			return BLK_MQ_RQ_QUEUE_OK;
		}
	}

	iod = nvme_alloc_iod(req, dev, GFP_ATOMIC);
	if (!iod)
		return BLK_MQ_RQ_QUEUE_BUSY;

	if (req->cmd_flags & REQ_DISCARD) {
		ret = nvme_setup_discard(nvmeq, ns, iod, &cmnd);
	} else {
		if (req->cmd_type == REQ_TYPE_DRV_PRIV)
			memcpy(&cmnd, req->cmd, sizeof(cmnd));
		else if (req->cmd_flags & REQ_FLUSH)
			nvme_setup_flush(ns, &cmnd);
		else
			nvme_setup_rw(ns, req, &cmnd);

		if (req->nr_phys_segments)
			ret = nvme_map_data(dev, iod, &cmnd);
	}

	if (ret)
		goto out;

	cmnd.common.command_id = req->tag;
	nvme_set_info(cmd, iod, req_completion);

	spin_lock_irq(&nvmeq->q_lock);
	__nvme_submit_cmd(nvmeq, &cmnd);
	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
	return BLK_MQ_RQ_QUEUE_OK;
out:
	nvme_free_iod(dev, iod);
	return ret;
}

static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
{
	u16 head, phase;

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	for (;;) {
		void *ctx;
		nvme_completion_fn fn;
		struct nvme_completion cqe = nvmeq->cqes[head];
		if ((le16_to_cpu(cqe.status) & 1) != phase)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}
		if (tag && *tag == cqe.command_id)
			*tag = -1;
		ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
		fn(nvmeq, ctx, &cqe);
	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.
Linux only 913 * requires that 0.1% of your interrupts are handled, so this isn't 914 * a big problem. 915 */ 916 if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) 917 return; 918 919 if (likely(nvmeq->cq_vector >= 0)) 920 writel(head, nvmeq->q_db + nvmeq->dev->db_stride); 921 nvmeq->cq_head = head; 922 nvmeq->cq_phase = phase; 923 924 nvmeq->cqe_seen = 1; 925 } 926 927 static void nvme_process_cq(struct nvme_queue *nvmeq) 928 { 929 __nvme_process_cq(nvmeq, NULL); 930 } 931 932 static irqreturn_t nvme_irq(int irq, void *data) 933 { 934 irqreturn_t result; 935 struct nvme_queue *nvmeq = data; 936 spin_lock(&nvmeq->q_lock); 937 nvme_process_cq(nvmeq); 938 result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE; 939 nvmeq->cqe_seen = 0; 940 spin_unlock(&nvmeq->q_lock); 941 return result; 942 } 943 944 static irqreturn_t nvme_irq_check(int irq, void *data) 945 { 946 struct nvme_queue *nvmeq = data; 947 struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; 948 if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) 949 return IRQ_NONE; 950 return IRQ_WAKE_THREAD; 951 } 952 953 static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) 954 { 955 struct nvme_queue *nvmeq = hctx->driver_data; 956 957 if ((le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) == 958 nvmeq->cq_phase) { 959 spin_lock_irq(&nvmeq->q_lock); 960 __nvme_process_cq(nvmeq, &tag); 961 spin_unlock_irq(&nvmeq->q_lock); 962 963 if (tag == -1) 964 return 1; 965 } 966 967 return 0; 968 } 969 970 static int nvme_submit_async_admin_req(struct nvme_dev *dev) 971 { 972 struct nvme_queue *nvmeq = dev->queues[0]; 973 struct nvme_command c; 974 struct nvme_cmd_info *cmd_info; 975 struct request *req; 976 977 req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE, 978 BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED); 979 if (IS_ERR(req)) 980 return PTR_ERR(req); 981 982 req->cmd_flags |= REQ_NO_TIMEOUT; 983 cmd_info = blk_mq_rq_to_pdu(req); 984 nvme_set_info(cmd_info, NULL, async_req_completion); 985 986 memset(&c, 0, sizeof(c)); 987 c.common.opcode = nvme_admin_async_event; 988 c.common.command_id = req->tag; 989 990 blk_mq_free_request(req); 991 __nvme_submit_cmd(nvmeq, &c); 992 return 0; 993 } 994 995 static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, 996 struct nvme_command *cmd, 997 struct async_cmd_info *cmdinfo, unsigned timeout) 998 { 999 struct nvme_queue *nvmeq = dev->queues[0]; 1000 struct request *req; 1001 struct nvme_cmd_info *cmd_rq; 1002 1003 req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE, 0); 1004 if (IS_ERR(req)) 1005 return PTR_ERR(req); 1006 1007 req->timeout = timeout; 1008 cmd_rq = blk_mq_rq_to_pdu(req); 1009 cmdinfo->req = req; 1010 nvme_set_info(cmd_rq, cmdinfo, async_completion); 1011 cmdinfo->status = -EINTR; 1012 1013 cmd->common.command_id = req->tag; 1014 1015 nvme_submit_cmd(nvmeq, cmd); 1016 return 0; 1017 } 1018 1019 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) 1020 { 1021 struct nvme_command c; 1022 1023 memset(&c, 0, sizeof(c)); 1024 c.delete_queue.opcode = opcode; 1025 c.delete_queue.qid = cpu_to_le16(id); 1026 1027 return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); 1028 } 1029 1030 static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, 1031 struct nvme_queue *nvmeq) 1032 { 1033 struct nvme_command c; 1034 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; 1035 1036 /* 1037 * Note: we (ab)use the fact the the prp fields survive if no data 1038 * is attached to the request. 
1039 */ 1040 memset(&c, 0, sizeof(c)); 1041 c.create_cq.opcode = nvme_admin_create_cq; 1042 c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); 1043 c.create_cq.cqid = cpu_to_le16(qid); 1044 c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); 1045 c.create_cq.cq_flags = cpu_to_le16(flags); 1046 c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); 1047 1048 return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); 1049 } 1050 1051 static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, 1052 struct nvme_queue *nvmeq) 1053 { 1054 struct nvme_command c; 1055 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; 1056 1057 /* 1058 * Note: we (ab)use the fact the the prp fields survive if no data 1059 * is attached to the request. 1060 */ 1061 memset(&c, 0, sizeof(c)); 1062 c.create_sq.opcode = nvme_admin_create_sq; 1063 c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); 1064 c.create_sq.sqid = cpu_to_le16(qid); 1065 c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); 1066 c.create_sq.sq_flags = cpu_to_le16(flags); 1067 c.create_sq.cqid = cpu_to_le16(qid); 1068 1069 return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); 1070 } 1071 1072 static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) 1073 { 1074 return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); 1075 } 1076 1077 static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) 1078 { 1079 return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); 1080 } 1081 1082 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) 1083 { 1084 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); 1085 struct nvme_queue *nvmeq = cmd_rq->nvmeq; 1086 struct nvme_dev *dev = nvmeq->dev; 1087 struct request *abort_req; 1088 struct nvme_cmd_info *abort_cmd; 1089 struct nvme_command cmd; 1090 1091 /* 1092 * Shutdown immediately if controller times out while starting. The 1093 * reset work will see the pci device disabled when it gets the forced 1094 * cancellation error. All outstanding requests are completed on 1095 * shutdown, so we return BLK_EH_HANDLED. 1096 */ 1097 if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) { 1098 dev_warn(dev->dev, 1099 "I/O %d QID %d timeout, disable controller\n", 1100 req->tag, nvmeq->qid); 1101 nvme_dev_shutdown(dev); 1102 req->errors = NVME_SC_CANCELLED; 1103 return BLK_EH_HANDLED; 1104 } 1105 1106 /* 1107 * Shutdown the controller immediately and schedule a reset if the 1108 * command was already aborted once before and still hasn't been 1109 * returned to the driver, or if this is the admin queue. 1110 */ 1111 if (!nvmeq->qid || cmd_rq->aborted) { 1112 dev_warn(dev->dev, 1113 "I/O %d QID %d timeout, reset controller\n", 1114 req->tag, nvmeq->qid); 1115 nvme_dev_shutdown(dev); 1116 queue_work(nvme_workq, &dev->reset_work); 1117 1118 /* 1119 * Mark the request as handled, since the inline shutdown 1120 * forces all outstanding requests to complete. 
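	 * The reset work queued above then brings the controller back up.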
1121 */ 1122 req->errors = NVME_SC_CANCELLED; 1123 return BLK_EH_HANDLED; 1124 } 1125 1126 if (!dev->ctrl.abort_limit) 1127 return BLK_EH_RESET_TIMER; 1128 1129 abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE, 1130 BLK_MQ_REQ_NOWAIT); 1131 if (IS_ERR(abort_req)) 1132 return BLK_EH_RESET_TIMER; 1133 1134 abort_cmd = blk_mq_rq_to_pdu(abort_req); 1135 nvme_set_info(abort_cmd, abort_req, abort_completion); 1136 1137 memset(&cmd, 0, sizeof(cmd)); 1138 cmd.abort.opcode = nvme_admin_abort_cmd; 1139 cmd.abort.cid = req->tag; 1140 cmd.abort.sqid = cpu_to_le16(nvmeq->qid); 1141 cmd.abort.command_id = abort_req->tag; 1142 1143 --dev->ctrl.abort_limit; 1144 cmd_rq->aborted = 1; 1145 1146 dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n", 1147 req->tag, nvmeq->qid); 1148 nvme_submit_cmd(dev->queues[0], &cmd); 1149 1150 /* 1151 * The aborted req will be completed on receiving the abort req. 1152 * We enable the timer again. If hit twice, it'll cause a device reset, 1153 * as the device then is in a faulty state. 1154 */ 1155 return BLK_EH_RESET_TIMER; 1156 } 1157 1158 static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved) 1159 { 1160 struct nvme_queue *nvmeq = data; 1161 void *ctx; 1162 nvme_completion_fn fn; 1163 struct nvme_cmd_info *cmd; 1164 struct nvme_completion cqe; 1165 1166 if (!blk_mq_request_started(req)) 1167 return; 1168 1169 cmd = blk_mq_rq_to_pdu(req); 1170 1171 if (cmd->ctx == CMD_CTX_CANCELLED) 1172 return; 1173 1174 if (blk_queue_dying(req->q)) 1175 cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); 1176 else 1177 cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); 1178 1179 1180 dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", 1181 req->tag, nvmeq->qid); 1182 ctx = cancel_cmd_info(cmd, &fn); 1183 fn(nvmeq, ctx, &cqe); 1184 } 1185 1186 static void nvme_free_queue(struct nvme_queue *nvmeq) 1187 { 1188 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), 1189 (void *)nvmeq->cqes, nvmeq->cq_dma_addr); 1190 if (nvmeq->sq_cmds) 1191 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), 1192 nvmeq->sq_cmds, nvmeq->sq_dma_addr); 1193 kfree(nvmeq); 1194 } 1195 1196 static void nvme_free_queues(struct nvme_dev *dev, int lowest) 1197 { 1198 int i; 1199 1200 for (i = dev->queue_count - 1; i >= lowest; i--) { 1201 struct nvme_queue *nvmeq = dev->queues[i]; 1202 dev->queue_count--; 1203 dev->queues[i] = NULL; 1204 nvme_free_queue(nvmeq); 1205 } 1206 } 1207 1208 /** 1209 * nvme_suspend_queue - put queue into suspended state 1210 * @nvmeq - queue to suspend 1211 */ 1212 static int nvme_suspend_queue(struct nvme_queue *nvmeq) 1213 { 1214 int vector; 1215 1216 spin_lock_irq(&nvmeq->q_lock); 1217 if (nvmeq->cq_vector == -1) { 1218 spin_unlock_irq(&nvmeq->q_lock); 1219 return 1; 1220 } 1221 vector = nvmeq->dev->entry[nvmeq->cq_vector].vector; 1222 nvmeq->dev->online_queues--; 1223 nvmeq->cq_vector = -1; 1224 spin_unlock_irq(&nvmeq->q_lock); 1225 1226 if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) 1227 blk_mq_freeze_queue_start(nvmeq->dev->ctrl.admin_q); 1228 1229 irq_set_affinity_hint(vector, NULL); 1230 free_irq(vector, nvmeq); 1231 1232 return 0; 1233 } 1234 1235 static void nvme_clear_queue(struct nvme_queue *nvmeq) 1236 { 1237 spin_lock_irq(&nvmeq->q_lock); 1238 if (nvmeq->tags && *nvmeq->tags) 1239 blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq); 1240 spin_unlock_irq(&nvmeq->q_lock); 1241 } 1242 1243 static void nvme_disable_queue(struct nvme_dev *dev, int qid) 1244 { 1245 struct nvme_queue *nvmeq = 
dev->queues[qid]; 1246 1247 if (!nvmeq) 1248 return; 1249 if (nvme_suspend_queue(nvmeq)) 1250 return; 1251 1252 /* Don't tell the adapter to delete the admin queue. 1253 * Don't tell a removed adapter to delete IO queues. */ 1254 if (qid && readl(dev->bar + NVME_REG_CSTS) != -1) { 1255 adapter_delete_sq(dev, qid); 1256 adapter_delete_cq(dev, qid); 1257 } 1258 1259 spin_lock_irq(&nvmeq->q_lock); 1260 nvme_process_cq(nvmeq); 1261 spin_unlock_irq(&nvmeq->q_lock); 1262 } 1263 1264 static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, 1265 int entry_size) 1266 { 1267 int q_depth = dev->q_depth; 1268 unsigned q_size_aligned = roundup(q_depth * entry_size, 1269 dev->ctrl.page_size); 1270 1271 if (q_size_aligned * nr_io_queues > dev->cmb_size) { 1272 u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); 1273 mem_per_q = round_down(mem_per_q, dev->ctrl.page_size); 1274 q_depth = div_u64(mem_per_q, entry_size); 1275 1276 /* 1277 * Ensure the reduced q_depth is above some threshold where it 1278 * would be better to map queues in system memory with the 1279 * original depth 1280 */ 1281 if (q_depth < 64) 1282 return -ENOMEM; 1283 } 1284 1285 return q_depth; 1286 } 1287 1288 static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, 1289 int qid, int depth) 1290 { 1291 if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { 1292 unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), 1293 dev->ctrl.page_size); 1294 nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset; 1295 nvmeq->sq_cmds_io = dev->cmb + offset; 1296 } else { 1297 nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), 1298 &nvmeq->sq_dma_addr, GFP_KERNEL); 1299 if (!nvmeq->sq_cmds) 1300 return -ENOMEM; 1301 } 1302 1303 return 0; 1304 } 1305 1306 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, 1307 int depth) 1308 { 1309 struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); 1310 if (!nvmeq) 1311 return NULL; 1312 1313 nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), 1314 &nvmeq->cq_dma_addr, GFP_KERNEL); 1315 if (!nvmeq->cqes) 1316 goto free_nvmeq; 1317 1318 if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth)) 1319 goto free_cqdma; 1320 1321 nvmeq->q_dmadev = dev->dev; 1322 nvmeq->dev = dev; 1323 snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", 1324 dev->ctrl.instance, qid); 1325 spin_lock_init(&nvmeq->q_lock); 1326 nvmeq->cq_head = 0; 1327 nvmeq->cq_phase = 1; 1328 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; 1329 nvmeq->q_depth = depth; 1330 nvmeq->qid = qid; 1331 nvmeq->cq_vector = -1; 1332 dev->queues[qid] = nvmeq; 1333 1334 /* make sure queue descriptor is set before queue count, for kthread */ 1335 mb(); 1336 dev->queue_count++; 1337 1338 return nvmeq; 1339 1340 free_cqdma: 1341 dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, 1342 nvmeq->cq_dma_addr); 1343 free_nvmeq: 1344 kfree(nvmeq); 1345 return NULL; 1346 } 1347 1348 static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, 1349 const char *name) 1350 { 1351 if (use_threaded_interrupts) 1352 return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, 1353 nvme_irq_check, nvme_irq, IRQF_SHARED, 1354 name, nvmeq); 1355 return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, 1356 IRQF_SHARED, name, nvmeq); 1357 } 1358 1359 static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) 1360 { 1361 struct nvme_dev *dev = nvmeq->dev; 1362 1363 spin_lock_irq(&nvmeq->q_lock); 1364 nvmeq->sq_tail = 0; 1365 nvmeq->cq_head = 0; 1366 
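	/* The CQ memory is zeroed below, so the controller's first valid entries carry phase 1. */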
nvmeq->cq_phase = 1; 1367 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; 1368 memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); 1369 dev->online_queues++; 1370 spin_unlock_irq(&nvmeq->q_lock); 1371 } 1372 1373 static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) 1374 { 1375 struct nvme_dev *dev = nvmeq->dev; 1376 int result; 1377 1378 nvmeq->cq_vector = qid - 1; 1379 result = adapter_alloc_cq(dev, qid, nvmeq); 1380 if (result < 0) 1381 return result; 1382 1383 result = adapter_alloc_sq(dev, qid, nvmeq); 1384 if (result < 0) 1385 goto release_cq; 1386 1387 result = queue_request_irq(dev, nvmeq, nvmeq->irqname); 1388 if (result < 0) 1389 goto release_sq; 1390 1391 nvme_init_queue(nvmeq, qid); 1392 return result; 1393 1394 release_sq: 1395 adapter_delete_sq(dev, qid); 1396 release_cq: 1397 adapter_delete_cq(dev, qid); 1398 return result; 1399 } 1400 1401 static struct blk_mq_ops nvme_mq_admin_ops = { 1402 .queue_rq = nvme_queue_rq, 1403 .map_queue = blk_mq_map_queue, 1404 .init_hctx = nvme_admin_init_hctx, 1405 .exit_hctx = nvme_admin_exit_hctx, 1406 .init_request = nvme_admin_init_request, 1407 .timeout = nvme_timeout, 1408 }; 1409 1410 static struct blk_mq_ops nvme_mq_ops = { 1411 .queue_rq = nvme_queue_rq, 1412 .map_queue = blk_mq_map_queue, 1413 .init_hctx = nvme_init_hctx, 1414 .init_request = nvme_init_request, 1415 .timeout = nvme_timeout, 1416 .poll = nvme_poll, 1417 }; 1418 1419 static void nvme_dev_remove_admin(struct nvme_dev *dev) 1420 { 1421 if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { 1422 blk_cleanup_queue(dev->ctrl.admin_q); 1423 blk_mq_free_tag_set(&dev->admin_tagset); 1424 } 1425 } 1426 1427 static int nvme_alloc_admin_tags(struct nvme_dev *dev) 1428 { 1429 if (!dev->ctrl.admin_q) { 1430 dev->admin_tagset.ops = &nvme_mq_admin_ops; 1431 dev->admin_tagset.nr_hw_queues = 1; 1432 dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; 1433 dev->admin_tagset.reserved_tags = 1; 1434 dev->admin_tagset.timeout = ADMIN_TIMEOUT; 1435 dev->admin_tagset.numa_node = dev_to_node(dev->dev); 1436 dev->admin_tagset.cmd_size = nvme_cmd_size(dev); 1437 dev->admin_tagset.driver_data = dev; 1438 1439 if (blk_mq_alloc_tag_set(&dev->admin_tagset)) 1440 return -ENOMEM; 1441 1442 dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset); 1443 if (IS_ERR(dev->ctrl.admin_q)) { 1444 blk_mq_free_tag_set(&dev->admin_tagset); 1445 return -ENOMEM; 1446 } 1447 if (!blk_get_queue(dev->ctrl.admin_q)) { 1448 nvme_dev_remove_admin(dev); 1449 dev->ctrl.admin_q = NULL; 1450 return -ENODEV; 1451 } 1452 } else 1453 blk_mq_unfreeze_queue(dev->ctrl.admin_q); 1454 1455 return 0; 1456 } 1457 1458 static int nvme_configure_admin_queue(struct nvme_dev *dev) 1459 { 1460 int result; 1461 u32 aqa; 1462 u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP); 1463 struct nvme_queue *nvmeq; 1464 1465 dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ? 
					NVME_CAP_NSSRC(cap) : 0;

	if (dev->subsystem &&
	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);

	result = nvme_disable_ctrl(&dev->ctrl, cap);
	if (result < 0)
		return result;

	nvmeq = dev->queues[0];
	if (!nvmeq) {
		nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
		if (!nvmeq)
			return -ENOMEM;
	}

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	writel(aqa, dev->bar + NVME_REG_AQA);
	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);

	result = nvme_enable_ctrl(&dev->ctrl, cap);
	if (result)
		goto free_nvmeq;

	nvmeq->cq_vector = 0;
	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
	if (result) {
		nvmeq->cq_vector = -1;
		goto free_nvmeq;
	}

	return result;

 free_nvmeq:
	nvme_free_queues(dev, 0);
	return result;
}

static int nvme_kthread(void *data)
{
	struct nvme_dev *dev, *next;

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&dev_list_lock);
		list_for_each_entry_safe(dev, next, &dev_list, node) {
			int i;
			u32 csts = readl(dev->bar + NVME_REG_CSTS);

			/*
			 * Skip controllers currently under reset.
			 */
			if (work_pending(&dev->reset_work) || work_busy(&dev->reset_work))
				continue;

			if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
							csts & NVME_CSTS_CFS) {
				if (queue_work(nvme_workq, &dev->reset_work)) {
					dev_warn(dev->dev,
						"Failed status: %x, reset controller\n",
						readl(dev->bar + NVME_REG_CSTS));
				}
				continue;
			}
			for (i = 0; i < dev->queue_count; i++) {
				struct nvme_queue *nvmeq = dev->queues[i];
				if (!nvmeq)
					continue;
				spin_lock_irq(&nvmeq->q_lock);
				nvme_process_cq(nvmeq);

				while (i == 0 && dev->ctrl.event_limit > 0) {
					if (nvme_submit_async_admin_req(dev))
						break;
					dev->ctrl.event_limit--;
				}
				spin_unlock_irq(&nvmeq->q_lock);
			}
		}
		spin_unlock(&dev_list_lock);
		schedule_timeout(round_jiffies_relative(HZ));
	}
	return 0;
}

static int nvme_create_io_queues(struct nvme_dev *dev)
{
	unsigned i;
	int ret = 0;

	for (i = dev->queue_count; i <= dev->max_qid; i++) {
		if (!nvme_alloc_queue(dev, i, dev->q_depth)) {
			ret = -ENOMEM;
			break;
		}
	}

	for (i = dev->online_queues; i <= dev->queue_count - 1; i++) {
		ret = nvme_create_queue(dev->queues[i], i);
		if (ret) {
			nvme_free_queues(dev, i);
			break;
		}
	}

	/*
	 * Ignore failing Create SQ/CQ commands, we can continue with less
	 * than the desired amount of queues, and even a controller without
	 * I/O queues can still be used to issue admin commands.  This might
	 * be useful to upgrade a buggy firmware for example.
	 */
	return ret >= 0 ?
0 : ret; 1582 } 1583 1584 static void __iomem *nvme_map_cmb(struct nvme_dev *dev) 1585 { 1586 u64 szu, size, offset; 1587 u32 cmbloc; 1588 resource_size_t bar_size; 1589 struct pci_dev *pdev = to_pci_dev(dev->dev); 1590 void __iomem *cmb; 1591 dma_addr_t dma_addr; 1592 1593 if (!use_cmb_sqes) 1594 return NULL; 1595 1596 dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); 1597 if (!(NVME_CMB_SZ(dev->cmbsz))) 1598 return NULL; 1599 1600 cmbloc = readl(dev->bar + NVME_REG_CMBLOC); 1601 1602 szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); 1603 size = szu * NVME_CMB_SZ(dev->cmbsz); 1604 offset = szu * NVME_CMB_OFST(cmbloc); 1605 bar_size = pci_resource_len(pdev, NVME_CMB_BIR(cmbloc)); 1606 1607 if (offset > bar_size) 1608 return NULL; 1609 1610 /* 1611 * Controllers may support a CMB size larger than their BAR, 1612 * for example, due to being behind a bridge. Reduce the CMB to 1613 * the reported size of the BAR 1614 */ 1615 if (size > bar_size - offset) 1616 size = bar_size - offset; 1617 1618 dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(cmbloc)) + offset; 1619 cmb = ioremap_wc(dma_addr, size); 1620 if (!cmb) 1621 return NULL; 1622 1623 dev->cmb_dma_addr = dma_addr; 1624 dev->cmb_size = size; 1625 return cmb; 1626 } 1627 1628 static inline void nvme_release_cmb(struct nvme_dev *dev) 1629 { 1630 if (dev->cmb) { 1631 iounmap(dev->cmb); 1632 dev->cmb = NULL; 1633 } 1634 } 1635 1636 static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) 1637 { 1638 return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); 1639 } 1640 1641 static int nvme_setup_io_queues(struct nvme_dev *dev) 1642 { 1643 struct nvme_queue *adminq = dev->queues[0]; 1644 struct pci_dev *pdev = to_pci_dev(dev->dev); 1645 int result, i, vecs, nr_io_queues, size; 1646 1647 nr_io_queues = num_possible_cpus(); 1648 result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); 1649 if (result < 0) 1650 return result; 1651 1652 /* 1653 * Degraded controllers might return an error when setting the queue 1654 * count. We still want to be able to bring them online and offer 1655 * access to the admin queue, as that might be only way to fix them up. 1656 */ 1657 if (result > 0) { 1658 dev_err(dev->dev, "Could not set queue count (%d)\n", result); 1659 nr_io_queues = 0; 1660 result = 0; 1661 } 1662 1663 if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { 1664 result = nvme_cmb_qdepth(dev, nr_io_queues, 1665 sizeof(struct nvme_command)); 1666 if (result > 0) 1667 dev->q_depth = result; 1668 else 1669 nvme_release_cmb(dev); 1670 } 1671 1672 size = db_bar_size(dev, nr_io_queues); 1673 if (size > 8192) { 1674 iounmap(dev->bar); 1675 do { 1676 dev->bar = ioremap(pci_resource_start(pdev, 0), size); 1677 if (dev->bar) 1678 break; 1679 if (!--nr_io_queues) 1680 return -ENOMEM; 1681 size = db_bar_size(dev, nr_io_queues); 1682 } while (1); 1683 dev->dbs = dev->bar + 4096; 1684 adminq->q_db = dev->dbs; 1685 } 1686 1687 /* Deregister the admin queue's interrupt */ 1688 free_irq(dev->entry[0].vector, adminq); 1689 1690 /* 1691 * If we enable msix early due to not intx, disable it again before 1692 * setting up the full range we need. 
1693 */ 1694 if (!pdev->irq) 1695 pci_disable_msix(pdev); 1696 1697 for (i = 0; i < nr_io_queues; i++) 1698 dev->entry[i].entry = i; 1699 vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues); 1700 if (vecs < 0) { 1701 vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32)); 1702 if (vecs < 0) { 1703 vecs = 1; 1704 } else { 1705 for (i = 0; i < vecs; i++) 1706 dev->entry[i].vector = i + pdev->irq; 1707 } 1708 } 1709 1710 /* 1711 * Should investigate if there's a performance win from allocating 1712 * more queues than interrupt vectors; it might allow the submission 1713 * path to scale better, even if the receive path is limited by the 1714 * number of interrupts. 1715 */ 1716 nr_io_queues = vecs; 1717 dev->max_qid = nr_io_queues; 1718 1719 result = queue_request_irq(dev, adminq, adminq->irqname); 1720 if (result) { 1721 adminq->cq_vector = -1; 1722 goto free_queues; 1723 } 1724 1725 /* Free previously allocated queues that are no longer usable */ 1726 nvme_free_queues(dev, nr_io_queues + 1); 1727 return nvme_create_io_queues(dev); 1728 1729 free_queues: 1730 nvme_free_queues(dev, 1); 1731 return result; 1732 } 1733 1734 static void nvme_set_irq_hints(struct nvme_dev *dev) 1735 { 1736 struct nvme_queue *nvmeq; 1737 int i; 1738 1739 for (i = 0; i < dev->online_queues; i++) { 1740 nvmeq = dev->queues[i]; 1741 1742 if (!nvmeq->tags || !(*nvmeq->tags)) 1743 continue; 1744 1745 irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, 1746 blk_mq_tags_cpumask(*nvmeq->tags)); 1747 } 1748 } 1749 1750 static void nvme_dev_scan(struct work_struct *work) 1751 { 1752 struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work); 1753 1754 if (!dev->tagset.tags) 1755 return; 1756 nvme_scan_namespaces(&dev->ctrl); 1757 nvme_set_irq_hints(dev); 1758 } 1759 1760 /* 1761 * Return: error value if an error occurred setting up the queues or calling 1762 * Identify Device. 0 if these succeeded, even if adding some of the 1763 * namespaces failed. At the moment, these failures are silent. TBD which 1764 * failures should be reported. 
 */
static int nvme_dev_add(struct nvme_dev *dev)
{
	if (!dev->ctrl.tagset) {
		dev->tagset.ops = &nvme_mq_ops;
		dev->tagset.nr_hw_queues = dev->online_queues - 1;
		dev->tagset.timeout = NVME_IO_TIMEOUT;
		dev->tagset.numa_node = dev_to_node(dev->dev);
		dev->tagset.queue_depth =
				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
		dev->tagset.cmd_size = nvme_cmd_size(dev);
		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
		dev->tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->tagset))
			return 0;
		dev->ctrl.tagset = &dev->tagset;
	}
	schedule_work(&dev->scan_work);
	return 0;
}

static int nvme_dev_map(struct nvme_dev *dev)
{
	u64 cap;
	int bars, result = -ENOMEM;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_enable_device_mem(pdev))
		return result;

	dev->entry[0].vector = pdev->irq;
	pci_set_master(pdev);
	bars = pci_select_bars(pdev, IORESOURCE_MEM);
	if (!bars)
		goto disable_pci;

	if (pci_request_selected_regions(pdev, bars, "nvme"))
		goto disable_pci;

	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
		goto disable;

	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
	if (!dev->bar)
		goto disable;

	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
		result = -ENODEV;
		goto unmap;
	}

	/*
	 * Some devices don't advertise INTx interrupts, pre-enable a single
	 * MSIX vec for setup. We'll adjust this later.
	 */
	if (!pdev->irq) {
		result = pci_enable_msix(pdev, dev->entry, 1);
		if (result < 0)
			goto unmap;
	}

	cap = lo_hi_readq(dev->bar + NVME_REG_CAP);

	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
	dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
	dev->dbs = dev->bar + 4096;
	if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2))
		dev->cmb = nvme_map_cmb(dev);

	return 0;

 unmap:
	iounmap(dev->bar);
	dev->bar = NULL;
 disable:
	pci_release_regions(pdev);
 disable_pci:
	pci_disable_device(pdev);
	return result;
}

static void nvme_dev_unmap(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pdev->msi_enabled)
		pci_disable_msi(pdev);
	else if (pdev->msix_enabled)
		pci_disable_msix(pdev);

	if (dev->bar) {
		iounmap(dev->bar);
		dev->bar = NULL;
		pci_release_regions(pdev);
	}

	if (pci_is_enabled(pdev))
		pci_disable_device(pdev);
}

struct nvme_delq_ctx {
	struct task_struct *waiter;
	struct kthread_worker *worker;
	atomic_t refcount;
};

static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
{
	dq->waiter = current;
	mb();

	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (!atomic_read(&dq->refcount))
			break;
		if (!schedule_timeout(ADMIN_TIMEOUT) ||
				fatal_signal_pending(current)) {
			/*
			 * Disable the controller first since we can't trust it
			 * at this point, but leave the admin queue enabled
			 * until all queue deletion requests are flushed.
			 * FIXME: This may take a while if there are more h/w
			 * queues than admin tags.
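			 * (each delete command must first obtain one of the
			 * limited admin tags).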
1890 */ 1891 set_current_state(TASK_RUNNING); 1892 nvme_disable_ctrl(&dev->ctrl, 1893 lo_hi_readq(dev->bar + NVME_REG_CAP)); 1894 nvme_clear_queue(dev->queues[0]); 1895 flush_kthread_worker(dq->worker); 1896 nvme_disable_queue(dev, 0); 1897 return; 1898 } 1899 } 1900 set_current_state(TASK_RUNNING); 1901 } 1902 1903 static void nvme_put_dq(struct nvme_delq_ctx *dq) 1904 { 1905 atomic_dec(&dq->refcount); 1906 if (dq->waiter) 1907 wake_up_process(dq->waiter); 1908 } 1909 1910 static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq) 1911 { 1912 atomic_inc(&dq->refcount); 1913 return dq; 1914 } 1915 1916 static void nvme_del_queue_end(struct nvme_queue *nvmeq) 1917 { 1918 struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx; 1919 nvme_put_dq(dq); 1920 1921 spin_lock_irq(&nvmeq->q_lock); 1922 nvme_process_cq(nvmeq); 1923 spin_unlock_irq(&nvmeq->q_lock); 1924 } 1925 1926 static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, 1927 kthread_work_func_t fn) 1928 { 1929 struct nvme_command c; 1930 1931 memset(&c, 0, sizeof(c)); 1932 c.delete_queue.opcode = opcode; 1933 c.delete_queue.qid = cpu_to_le16(nvmeq->qid); 1934 1935 init_kthread_work(&nvmeq->cmdinfo.work, fn); 1936 return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo, 1937 ADMIN_TIMEOUT); 1938 } 1939 1940 static void nvme_del_cq_work_handler(struct kthread_work *work) 1941 { 1942 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, 1943 cmdinfo.work); 1944 nvme_del_queue_end(nvmeq); 1945 } 1946 1947 static int nvme_delete_cq(struct nvme_queue *nvmeq) 1948 { 1949 return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq, 1950 nvme_del_cq_work_handler); 1951 } 1952 1953 static void nvme_del_sq_work_handler(struct kthread_work *work) 1954 { 1955 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, 1956 cmdinfo.work); 1957 int status = nvmeq->cmdinfo.status; 1958 1959 if (!status) 1960 status = nvme_delete_cq(nvmeq); 1961 if (status) 1962 nvme_del_queue_end(nvmeq); 1963 } 1964 1965 static int nvme_delete_sq(struct nvme_queue *nvmeq) 1966 { 1967 return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq, 1968 nvme_del_sq_work_handler); 1969 } 1970 1971 static void nvme_del_queue_start(struct kthread_work *work) 1972 { 1973 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, 1974 cmdinfo.work); 1975 if (nvme_delete_sq(nvmeq)) 1976 nvme_del_queue_end(nvmeq); 1977 } 1978 1979 static void nvme_disable_io_queues(struct nvme_dev *dev) 1980 { 1981 int i; 1982 DEFINE_KTHREAD_WORKER_ONSTACK(worker); 1983 struct nvme_delq_ctx dq; 1984 struct task_struct *kworker_task = kthread_run(kthread_worker_fn, 1985 &worker, "nvme%d", dev->ctrl.instance); 1986 1987 if (IS_ERR(kworker_task)) { 1988 dev_err(dev->dev, 1989 "Failed to create queue del task\n"); 1990 for (i = dev->queue_count - 1; i > 0; i--) 1991 nvme_disable_queue(dev, i); 1992 return; 1993 } 1994 1995 dq.waiter = NULL; 1996 atomic_set(&dq.refcount, 0); 1997 dq.worker = &worker; 1998 for (i = dev->queue_count - 1; i > 0; i--) { 1999 struct nvme_queue *nvmeq = dev->queues[i]; 2000 2001 if (nvme_suspend_queue(nvmeq)) 2002 continue; 2003 nvmeq->cmdinfo.ctx = nvme_get_dq(&dq); 2004 nvmeq->cmdinfo.worker = dq.worker; 2005 init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start); 2006 queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work); 2007 } 2008 nvme_wait_dq(&dq, dev); 2009 kthread_stop(kworker_task); 2010 } 2011 2012 static int nvme_dev_list_add(struct nvme_dev *dev) 2013 { 2014 bool start_thread = false; 2015 2016 
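	/* The first device added to an empty list starts the shared nvme_kthread. */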

static int nvme_dev_list_add(struct nvme_dev *dev)
{
        bool start_thread = false;

        spin_lock(&dev_list_lock);
        if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
                start_thread = true;
                nvme_thread = NULL;
        }
        list_add(&dev->node, &dev_list);
        spin_unlock(&dev_list_lock);

        if (start_thread) {
                nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
                wake_up_all(&nvme_kthread_wait);
        } else
                wait_event_killable(nvme_kthread_wait, nvme_thread);

        if (IS_ERR_OR_NULL(nvme_thread))
                return nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;

        return 0;
}

/*
 * Remove the node from the device list and check whether we need to stop
 * the nvme_thread.
 */
static void nvme_dev_list_remove(struct nvme_dev *dev)
{
        struct task_struct *tmp = NULL;

        spin_lock(&dev_list_lock);
        list_del_init(&dev->node);
        if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) {
                tmp = nvme_thread;
                nvme_thread = NULL;
        }
        spin_unlock(&dev_list_lock);

        if (tmp)
                kthread_stop(tmp);
}

static void nvme_freeze_queues(struct nvme_dev *dev)
{
        struct nvme_ns *ns;

        list_for_each_entry(ns, &dev->ctrl.namespaces, list) {
                blk_mq_freeze_queue_start(ns->queue);

                spin_lock_irq(ns->queue->queue_lock);
                queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
                spin_unlock_irq(ns->queue->queue_lock);

                blk_mq_cancel_requeue_work(ns->queue);
                blk_mq_stop_hw_queues(ns->queue);
        }
}

static void nvme_unfreeze_queues(struct nvme_dev *dev)
{
        struct nvme_ns *ns;

        list_for_each_entry(ns, &dev->ctrl.namespaces, list) {
                queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
                blk_mq_unfreeze_queue(ns->queue);
                blk_mq_start_stopped_hw_queues(ns->queue, true);
                blk_mq_kick_requeue_list(ns->queue);
        }
}

/* Quiesce I/O, shut down or disable the controller, and unmap its registers. */
static void nvme_dev_shutdown(struct nvme_dev *dev)
{
        int i;
        u32 csts = -1;

        nvme_dev_list_remove(dev);

        mutex_lock(&dev->shutdown_lock);
        if (dev->bar) {
                nvme_freeze_queues(dev);
                csts = readl(dev->bar + NVME_REG_CSTS);
        }
        if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
                for (i = dev->queue_count - 1; i >= 0; i--) {
                        struct nvme_queue *nvmeq = dev->queues[i];
                        nvme_suspend_queue(nvmeq);
                }
        } else {
                nvme_disable_io_queues(dev);
                nvme_shutdown_ctrl(&dev->ctrl);
                nvme_disable_queue(dev, 0);
        }
        nvme_dev_unmap(dev);

        for (i = dev->queue_count - 1; i >= 0; i--)
                nvme_clear_queue(dev->queues[i]);
        mutex_unlock(&dev->shutdown_lock);
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
        dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
                                                PAGE_SIZE, PAGE_SIZE, 0);
        if (!dev->prp_page_pool)
                return -ENOMEM;

        /* Optimisation for I/Os between 4k and 128k */
        dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
                                                256, 256, 0);
        if (!dev->prp_small_pool) {
                dma_pool_destroy(dev->prp_page_pool);
                return -ENOMEM;
        }
        return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
        dma_pool_destroy(dev->prp_page_pool);
        dma_pool_destroy(dev->prp_small_pool);
}
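
/*
 * Sizing note for the pools above (illustrative arithmetic, not code from
 * this driver): a PRP entry is 8 bytes, so the 256-byte "small" pool holds
 * 32 entries, enough to describe a transfer of up to 32 * 4k = 128k with
 * 4k pages; that is why the comment above calls it an optimisation for
 * I/Os between 4k and 128k.  Larger transfers use the PAGE_SIZE pool,
 * whose lists can be chained.
 */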

static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
        struct nvme_dev *dev = to_nvme_dev(ctrl);

        put_device(dev->dev);
        if (dev->tagset.tags)
                blk_mq_free_tag_set(&dev->tagset);
        if (dev->ctrl.admin_q)
                blk_put_queue(dev->ctrl.admin_q);
        kfree(dev->queues);
        kfree(dev->entry);
        kfree(dev);
}

static void nvme_reset_work(struct work_struct *work)
{
        struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
        int result;

        if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags)))
                goto out;

        /*
         * If we're called to reset a live controller, first shut it down
         * before moving on.
         */
        if (dev->bar)
                nvme_dev_shutdown(dev);

        set_bit(NVME_CTRL_RESETTING, &dev->flags);

        result = nvme_dev_map(dev);
        if (result)
                goto out;

        result = nvme_configure_admin_queue(dev);
        if (result)
                goto unmap;

        nvme_init_queue(dev->queues[0], 0);
        result = nvme_alloc_admin_tags(dev);
        if (result)
                goto disable;

        result = nvme_init_identify(&dev->ctrl);
        if (result)
                goto free_tags;

        result = nvme_setup_io_queues(dev);
        if (result)
                goto free_tags;

        dev->ctrl.event_limit = 1;

        result = nvme_dev_list_add(dev);
        if (result)
                goto remove;

        /*
         * Keep the controller around but remove all namespaces if we don't
         * have any working I/O queue.
         */
        if (dev->online_queues < 2) {
                dev_warn(dev->dev, "IO queues not created\n");
                nvme_remove_namespaces(&dev->ctrl);
        } else {
                nvme_unfreeze_queues(dev);
                nvme_dev_add(dev);
        }

        clear_bit(NVME_CTRL_RESETTING, &dev->flags);
        return;

 remove:
        nvme_dev_list_remove(dev);
 free_tags:
        nvme_dev_remove_admin(dev);
        blk_put_queue(dev->ctrl.admin_q);
        dev->ctrl.admin_q = NULL;
        dev->queues[0]->tags = NULL;
 disable:
        nvme_disable_queue(dev, 0);
 unmap:
        nvme_dev_unmap(dev);
 out:
        if (!work_pending(&dev->reset_work))
                nvme_dead_ctrl(dev);
}

static int nvme_remove_dead_ctrl(void *arg)
{
        struct nvme_dev *dev = (struct nvme_dev *)arg;
        struct pci_dev *pdev = to_pci_dev(dev->dev);

        if (pci_get_drvdata(pdev))
                pci_stop_and_remove_bus_device_locked(pdev);
        nvme_put_ctrl(&dev->ctrl);
        return 0;
}

static void nvme_dead_ctrl(struct nvme_dev *dev)
{
        dev_warn(dev->dev, "Device failed to resume\n");
        kref_get(&dev->ctrl.kref);
        if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
                                                dev->ctrl.instance))) {
                dev_err(dev->dev,
                        "Failed to start controller remove task\n");
                nvme_put_ctrl(&dev->ctrl);
        }
}

static int nvme_reset(struct nvme_dev *dev)
{
        if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
                return -ENODEV;

        if (!queue_work(nvme_workq, &dev->reset_work))
                return -EBUSY;

        flush_work(&dev->reset_work);
        return 0;
}

static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
        *val = readl(to_nvme_dev(ctrl)->bar + off);
        return 0;
}

static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
{
        writel(val, to_nvme_dev(ctrl)->bar + off);
        return 0;
}

static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
        *val = readq(to_nvme_dev(ctrl)->bar + off);
        return 0;
}
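
/*
 * The three register accessors above are the MMIO backend that the core
 * driver reaches through the nvme_ctrl_ops table below.  A caller holding
 * a struct nvme_ctrl would read the controller status roughly like this
 * (illustrative sketch only, not code used in this file):
 *
 *      u32 csts;
 *
 *      if (!ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
 *              ready = csts & NVME_CSTS_RDY;
 */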

static bool nvme_pci_io_incapable(struct nvme_ctrl *ctrl)
{
        struct nvme_dev *dev = to_nvme_dev(ctrl);

        return !dev->bar || dev->online_queues < 2;
}

static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl)
{
        return nvme_reset(to_nvme_dev(ctrl));
}

static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
        .reg_read32 = nvme_pci_reg_read32,
        .reg_write32 = nvme_pci_reg_write32,
        .reg_read64 = nvme_pci_reg_read64,
        .io_incapable = nvme_pci_io_incapable,
        .reset_ctrl = nvme_pci_reset_ctrl,
        .free_ctrl = nvme_pci_free_ctrl,
};

static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
        int node, result = -ENOMEM;
        struct nvme_dev *dev;

        node = dev_to_node(&pdev->dev);
        if (node == NUMA_NO_NODE)
                set_dev_node(&pdev->dev, 0);

        dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
        if (!dev)
                return -ENOMEM;
        dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),
                                                        GFP_KERNEL, node);
        if (!dev->entry)
                goto free;
        dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
                                                        GFP_KERNEL, node);
        if (!dev->queues)
                goto free;

        dev->dev = get_device(&pdev->dev);
        pci_set_drvdata(pdev, dev);

        INIT_LIST_HEAD(&dev->node);
        INIT_WORK(&dev->scan_work, nvme_dev_scan);
        INIT_WORK(&dev->reset_work, nvme_reset_work);
        mutex_init(&dev->shutdown_lock);

        result = nvme_setup_prp_pools(dev);
        if (result)
                goto put_pci;

        result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
                        id->driver_data);
        if (result)
                goto release_pools;

        schedule_work(&dev->reset_work);
        return 0;

 release_pools:
        nvme_release_prp_pools(dev);
 put_pci:
        put_device(dev->dev);
 free:
        kfree(dev->queues);
        kfree(dev->entry);
        kfree(dev);
        return result;
}

static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
{
        struct nvme_dev *dev = pci_get_drvdata(pdev);

        if (prepare)
                nvme_dev_shutdown(dev);
        else
                schedule_work(&dev->reset_work);
}

static void nvme_shutdown(struct pci_dev *pdev)
{
        struct nvme_dev *dev = pci_get_drvdata(pdev);
        nvme_dev_shutdown(dev);
}

static void nvme_remove(struct pci_dev *pdev)
{
        struct nvme_dev *dev = pci_get_drvdata(pdev);

        spin_lock(&dev_list_lock);
        list_del_init(&dev->node);
        spin_unlock(&dev_list_lock);

        pci_set_drvdata(pdev, NULL);
        flush_work(&dev->reset_work);
        flush_work(&dev->scan_work);
        nvme_remove_namespaces(&dev->ctrl);
        nvme_dev_shutdown(dev);
        nvme_dev_remove_admin(dev);
        nvme_free_queues(dev, 0);
        nvme_release_cmb(dev);
        nvme_release_prp_pools(dev);
        nvme_put_ctrl(&dev->ctrl);
}

/* These functions are yet to be implemented */
#define nvme_error_detected NULL
#define nvme_dump_registers NULL
#define nvme_link_reset NULL
#define nvme_slot_reset NULL
#define nvme_error_resume NULL

#ifdef CONFIG_PM_SLEEP
static int nvme_suspend(struct device *dev)
{
        struct pci_dev *pdev = to_pci_dev(dev);
        struct nvme_dev *ndev = pci_get_drvdata(pdev);

        nvme_dev_shutdown(ndev);
        return 0;
}

static int nvme_resume(struct device *dev)
{
        struct pci_dev *pdev = to_pci_dev(dev);
        struct nvme_dev *ndev = pci_get_drvdata(pdev);

        schedule_work(&ndev->reset_work);
        return 0;
}
#endif
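
/*
 * Power-management note (descriptive): system suspend reuses the normal
 * controller shutdown path via nvme_dev_shutdown(), and resume simply
 * queues reset_work, so the controller comes back through the same
 * initialisation sequence used after probe or a controller reset.
 */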

static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);

static const struct pci_error_handlers nvme_err_handler = {
        .error_detected = nvme_error_detected,
        .mmio_enabled = nvme_dump_registers,
        .link_reset = nvme_link_reset,
        .slot_reset = nvme_slot_reset,
        .resume = nvme_error_resume,
        .reset_notify = nvme_reset_notify,
};

/* Move to pci_ids.h later */
#define PCI_CLASS_STORAGE_EXPRESS 0x010802

static const struct pci_device_id nvme_id_table[] = {
        { PCI_VDEVICE(INTEL, 0x0953),
                .driver_data = NVME_QUIRK_STRIPE_SIZE, },
        { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
        { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
        { 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
        .name = "nvme",
        .id_table = nvme_id_table,
        .probe = nvme_probe,
        .remove = nvme_remove,
        .shutdown = nvme_shutdown,
        .driver = {
                .pm = &nvme_dev_pm_ops,
        },
        .err_handler = &nvme_err_handler,
};

static int __init nvme_init(void)
{
        int result;

        init_waitqueue_head(&nvme_kthread_wait);

        nvme_workq = create_singlethread_workqueue("nvme");
        if (!nvme_workq)
                return -ENOMEM;

        result = nvme_core_init();
        if (result < 0)
                goto kill_workq;

        result = pci_register_driver(&nvme_driver);
        if (result)
                goto core_exit;
        return 0;

 core_exit:
        nvme_core_exit();
 kill_workq:
        destroy_workqueue(nvme_workq);
        return result;
}

static void __exit nvme_exit(void)
{
        pci_unregister_driver(&nvme_driver);
        nvme_core_exit();
        destroy_workqueue(nvme_workq);
        BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
        _nvme_check_size();
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_init);
module_exit(nvme_exit);