/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/hdreg.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/list_sort.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
#include <scsi/sg.h>
#include <asm-generic/io-64-nonatomic-lo-hi.h>

#include <uapi/linux/nvme_ioctl.h>
#include "nvme.h"

#define NVME_MINORS		(1U << MINORBITS)
#define NVME_Q_DEPTH		1024
#define NVME_AQ_DEPTH		256
#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
#define ADMIN_TIMEOUT		(admin_timeout * HZ)
#define SHUTDOWN_TIMEOUT	(shutdown_timeout * HZ)

static unsigned char admin_timeout = 60;
module_param(admin_timeout, byte, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");

unsigned char nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");

static unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");

static int nvme_major;
module_param(nvme_major, int, 0);

static int nvme_char_major;
module_param(nvme_char_major, int, 0);

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0644);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;
static struct workqueue_struct *nvme_workq;
static wait_queue_head_t nvme_kthread_wait;

static struct class *nvme_class;

static int __nvme_reset(struct nvme_dev *dev);
static int nvme_reset(struct nvme_dev *dev);
static int nvme_process_cq(struct nvme_queue *nvmeq);
static void nvme_dead_ctrl(struct nvme_dev *dev);
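/*
 * Context for admin commands completed via a kthread worker: the completion
 * handler (async_completion() below) stashes the CQE result and status here
 * and queues @work on @worker so the waiter can finish in process context.
 */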
struct async_cmd_info {
	struct kthread_work work;
	struct kthread_worker *worker;
	struct request *req;
	u32 result;
	int status;
	void *ctx;
};

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	struct nvme_dev *dev;
	char irqname[24];	/* nvme4294967295-65535\0 */
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	struct nvme_command __iomem *sq_cmds_io;
	volatile struct nvme_completion *cqes;
	struct blk_mq_tags **tags;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	u32 __iomem *q_db;
	u16 q_depth;
	s16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 qid;
	u8 cq_phase;
	u8 cqe_seen;
	struct async_cmd_info cmdinfo;
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}

typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
						struct nvme_completion *);

struct nvme_cmd_info {
	nvme_completion_fn fn;
	void *ctx;
	int aborted;
	struct nvme_queue *nvmeq;
	struct nvme_iod iod[0];
};

/*
 * Max size of iod being embedded in the request payload
 */
#define NVME_INT_PAGES		2
#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->page_size)
#define NVME_INT_MASK		0x01
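/*
 * Worked example (assuming a 4k device page size): NVME_INT_BYTES is 8k, so
 * a request with at most two physical segments and at most 8k of data is
 * served by the iod embedded in the request pdu (see nvme_alloc_iod()) and
 * needs no separate kmalloc in the hot path.
 */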
/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size, struct nvme_dev *dev)
{
	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

static unsigned int nvme_cmd_size(struct nvme_dev *dev)
{
	unsigned int ret = sizeof(struct nvme_cmd_info);

	ret += sizeof(struct nvme_iod);
	ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
	ret += sizeof(struct scatterlist) * NVME_INT_PAGES;

	return ret;
}
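/*
 * Sizing sketch (assuming 4k pages): nvme_npages(NVME_INT_BYTES(dev), dev)
 * evaluates to DIV_ROUND_UP(8 * 3, 4088) = 1, so every request pdu reserves
 * room for the nvme_cmd_info, one inline nvme_iod, one PRP list pointer and
 * two scatterlist entries; small transfers never allocate per-I/O memory.
 */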
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
				unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = dev->queues[0];

	WARN_ON(hctx_idx != 0);
	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
	WARN_ON(nvmeq->tags);

	hctx->driver_data = nvmeq;
	nvmeq->tags = &dev->admin_tagset.tags[0];
	return 0;
}

static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	nvmeq->tags = NULL;
}

static int nvme_admin_init_request(void *data, struct request *req,
				unsigned int hctx_idx, unsigned int rq_idx,
				unsigned int numa_node)
{
	struct nvme_dev *dev = data;
	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = dev->queues[0];

	BUG_ON(!nvmeq);
	cmd->nvmeq = nvmeq;
	return 0;
}

static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

	if (!nvmeq->tags)
		nvmeq->tags = &dev->tagset.tags[hctx_idx];

	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
	hctx->driver_data = nvmeq;
	return 0;
}

static int nvme_init_request(void *data, struct request *req,
				unsigned int hctx_idx, unsigned int rq_idx,
				unsigned int numa_node)
{
	struct nvme_dev *dev = data;
	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

	BUG_ON(!nvmeq);
	cmd->nvmeq = nvmeq;
	return 0;
}

static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
				nvme_completion_fn handler)
{
	cmd->fn = handler;
	cmd->ctx = ctx;
	cmd->aborted = 0;
	blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
}

static void *iod_get_private(struct nvme_iod *iod)
{
	return (void *) (iod->private & ~0x1UL);
}

/*
 * If bit 0 is set, the iod is embedded in the request payload.
 */
static bool iod_should_kfree(struct nvme_iod *iod)
{
	return (iod->private & NVME_INT_MASK) == 0;
}

/* Special values must be less than 0x1000 */
#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)

static void special_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	if (ctx == CMD_CTX_CANCELLED)
		return;
	if (ctx == CMD_CTX_COMPLETED) {
		dev_warn(nvmeq->q_dmadev,
				"completed id %d twice on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}
	if (ctx == CMD_CTX_INVALID) {
		dev_warn(nvmeq->q_dmadev,
				"invalid id %d completed on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}
	dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
}

static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
{
	void *ctx;

	if (fn)
		*fn = cmd->fn;
	ctx = cmd->ctx;
	cmd->fn = special_completion;
	cmd->ctx = CMD_CTX_CANCELLED;
	return ctx;
}
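/*
 * Async Event Request completion.  Per the NVMe spec, the completion's
 * dword0 packs the event type in bits 2:0 and the event information in bits
 * 15:8; masking with 0xff07 below keeps exactly those two fields, so a
 * namespace attribute change matches NVME_AER_NOTICE_NS_CHANGED and kicks
 * off a rescan.
 */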
static void async_req_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	u32 result = le32_to_cpup(&cqe->result);
	u16 status = le16_to_cpup(&cqe->status) >> 1;

	if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
		++nvmeq->dev->event_limit;
	if (status != NVME_SC_SUCCESS)
		return;

	switch (result & 0xff07) {
	case NVME_AER_NOTICE_NS_CHANGED:
		dev_info(nvmeq->q_dmadev, "rescanning\n");
		schedule_work(&nvmeq->dev->scan_work);
	default:
		dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result);
	}
}

static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct request *req = ctx;

	u16 status = le16_to_cpup(&cqe->status) >> 1;
	u32 result = le32_to_cpup(&cqe->result);

	blk_mq_free_request(req);

	dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
	++nvmeq->dev->abort_limit;
}

static void async_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct async_cmd_info *cmdinfo = ctx;
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
	blk_mq_free_request(cmdinfo->req);
}

static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
				  unsigned int tag)
{
	struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag);

	return blk_mq_rq_to_pdu(req);
}

/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
						nvme_completion_fn *fn)
{
	struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag);
	void *ctx;
	if (tag >= nvmeq->q_depth) {
		*fn = special_completion;
		return CMD_CTX_INVALID;
	}
	if (fn)
		*fn = cmd->fn;
	ctx = cmd->ctx;
	cmd->fn = special_completion;
	cmd->ctx = CMD_CTX_COMPLETED;
	return ctx;
}

/**
 * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
						struct nvme_command *cmd)
{
	u16 tail = nvmeq->sq_tail;

	if (nvmeq->sq_cmds_io)
		memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
	else
		memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));

	if (++tail == nvmeq->q_depth)
		tail = 0;
	writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
}

static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	unsigned long flags;
	spin_lock_irqsave(&nvmeq->q_lock, flags);
	__nvme_submit_cmd(nvmeq, cmd);
	spin_unlock_irqrestore(&nvmeq->q_lock, flags);
}
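/*
 * Submission above is a plain ring-buffer write: the 64-byte command is
 * copied into slot sq_tail (host memory or the controller memory buffer),
 * the tail is advanced modulo q_depth, and the new tail value is written to
 * the SQ doorbell.  E.g. with q_depth 1024, a command placed in slot 1023
 * wraps the tail back to 0 before the doorbell write.
 */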
static __le64 **iod_list(struct nvme_iod *iod)
{
	return ((void *)iod) + iod->offset;
}

static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
			    unsigned nseg, unsigned long private)
{
	iod->private = private;
	iod->offset = offsetof(struct nvme_iod, sg[nseg]);
	iod->npages = -1;
	iod->length = nbytes;
	iod->nents = 0;
}

static struct nvme_iod *
__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
		 unsigned long priv, gfp_t gfp)
{
	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
				sizeof(__le64 *) * nvme_npages(bytes, dev) +
				sizeof(struct scatterlist) * nseg, gfp);

	if (iod)
		iod_init(iod, bytes, nseg, priv);

	return iod;
}

static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
				       gfp_t gfp)
{
	unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
						sizeof(struct nvme_dsm_range);
	struct nvme_iod *iod;

	if (rq->nr_phys_segments <= NVME_INT_PAGES &&
	    size <= NVME_INT_BYTES(dev)) {
		struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);

		iod = cmd->iod;
		iod_init(iod, size, rq->nr_phys_segments,
				(unsigned long) rq | NVME_INT_MASK);
		return iod;
	}

	return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
				(unsigned long) rq, gfp);
}

static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
	const int last_prp = dev->page_size / 8 - 1;
	int i;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma = iod->first_dma;

	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
	for (i = 0; i < iod->npages; i++) {
		__le64 *prp_list = list[i];
		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
		prp_dma = next_prp_dma;
	}

	if (iod_should_kfree(iod))
		kfree(iod);
}
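/*
 * A note on the PRP lists freed above: when a transfer needs more than one
 * PRP page, nvme_setup_prps() chains the pages by storing the DMA address of
 * the next page in the final 8-byte slot of the current one (last_prp).
 * Freeing therefore reads that link before returning each page to the pool.
 */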
static int nvme_error_status(u16 status)
{
	switch (status & 0x7ff) {
	case NVME_SC_SUCCESS:
		return 0;
	case NVME_SC_CAP_EXCEEDED:
		return -ENOSPC;
	default:
		return -EIO;
	}
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	if (be32_to_cpu(pi->ref_tag) == v)
		pi->ref_tag = cpu_to_be32(p);
}

static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	if (be32_to_cpu(pi->ref_tag) == p)
		pi->ref_tag = cpu_to_be32(v);
}

/**
 * nvme_dif_remap - remaps ref tags to bip seed and physical lba
 *
 * The virtual start sector is the one that was originally submitted by the
 * block layer.  Due to partitioning, MD/DM cloning, etc. the actual physical
 * start sector may be different.  Remap protection information to match the
 * physical LBA on writes, and back to the original seed on reads.
 *
 * Type 0 and 3 do not have a ref tag, so no remapping required.
 */
static void nvme_dif_remap(struct request *req,
			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
	struct nvme_ns *ns = req->rq_disk->private_data;
	struct bio_integrity_payload *bip;
	struct t10_pi_tuple *pi;
	void *p, *pmap;
	u32 i, nlb, ts, phys, virt;

	if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
		return;

	bip = bio_integrity(req->bio);
	if (!bip)
		return;

	pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;

	p = pmap;
	virt = bip_get_seed(bip);
	phys = nvme_block_nr(ns, blk_rq_pos(req));
	nlb = (blk_rq_bytes(req) >> ns->lba_shift);
	ts = ns->disk->integrity->tuple_size;

	for (i = 0; i < nlb; i++, virt++, phys++) {
		pi = (struct t10_pi_tuple *)p;
		dif_swap(phys, virt, pi);
		p += ts;
	}
	kunmap_atomic(pmap);
}
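/*
 * Remap example: for a bio targeting a partition that starts at LBA 2048,
 * the integrity seed (virt) might be 0 while the device-visible LBA (phys)
 * is 2048.  On a write, nvme_dif_prep() rewrites each tuple's ref tag from
 * virt to phys so the controller's Type 1/2 check passes; on read completion
 * nvme_dif_complete() maps it back before the block layer verifies it.
 */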
static int nvme_noop_verify(struct blk_integrity_iter *iter)
{
	return 0;
}

static int nvme_noop_generate(struct blk_integrity_iter *iter)
{
	return 0;
}

struct blk_integrity nvme_meta_noop = {
	.name			= "NVME_META_NOOP",
	.generate_fn		= nvme_noop_generate,
	.verify_fn		= nvme_noop_verify,
};

static void nvme_init_integrity(struct nvme_ns *ns)
{
	struct blk_integrity integrity;

	switch (ns->pi_type) {
	case NVME_NS_DPS_PI_TYPE3:
		integrity = t10_pi_type3_crc;
		break;
	case NVME_NS_DPS_PI_TYPE1:
	case NVME_NS_DPS_PI_TYPE2:
		integrity = t10_pi_type1_crc;
		break;
	default:
		integrity = nvme_meta_noop;
		break;
	}
	integrity.tuple_size = ns->ms;
	blk_integrity_register(ns->disk, &integrity);
	blk_queue_max_integrity_segments(ns->queue, 1);
}
#else /* CONFIG_BLK_DEV_INTEGRITY */
static void nvme_dif_remap(struct request *req,
			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
}
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
static void nvme_init_integrity(struct nvme_ns *ns)
{
}
#endif

static void req_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct nvme_iod *iod = ctx;
	struct request *req = iod_get_private(iod);
	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
	u16 status = le16_to_cpup(&cqe->status) >> 1;
	int error = 0;

	if (unlikely(status)) {
		if (!(status & NVME_SC_DNR || blk_noretry_request(req))
		    && (jiffies - req->start_time) < req->timeout) {
			unsigned long flags;

			blk_mq_requeue_request(req);
			spin_lock_irqsave(req->q->queue_lock, flags);
			if (!blk_queue_stopped(req->q))
				blk_mq_kick_requeue_list(req->q);
			spin_unlock_irqrestore(req->q->queue_lock, flags);
			return;
		}

		if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
			if (cmd_rq->ctx == CMD_CTX_CANCELLED)
				error = -EINTR;
			else
				error = status;
		} else {
			error = nvme_error_status(status);
		}
	}

	if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
		u32 result = le32_to_cpup(&cqe->result);
		req->special = (void *)(uintptr_t)result;
	}

	if (cmd_rq->aborted)
		dev_warn(nvmeq->dev->dev,
			"completing aborted command with status:%04x\n",
			error);

	if (iod->nents) {
		dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
			rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
		if (blk_integrity_rq(req)) {
			if (!rq_data_dir(req))
				nvme_dif_remap(req, nvme_dif_complete);
			dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1,
				rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
		}
	}
	nvme_free_iod(nvmeq->dev, iod);

	blk_mq_complete_request(req, error);
}

/* length is in bytes.  gfp flags indicate whether we may sleep. */
static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
		int total_len, gfp_t gfp)
{
	struct dma_pool *pool;
	int length = total_len;
	struct scatterlist *sg = iod->sg;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	u32 page_size = dev->page_size;
	int offset = dma_addr & (page_size - 1);
	__le64 *prp_list;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma;
	int nprps, i;

	length -= (page_size - offset);
	if (length <= 0)
		return total_len;

	dma_len -= (page_size - offset);
	if (dma_len) {
		dma_addr += (page_size - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= page_size) {
		iod->first_dma = dma_addr;
		return total_len;
	}

	nprps = DIV_ROUND_UP(length, page_size);
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
	if (!prp_list) {
		iod->first_dma = dma_addr;
		iod->npages = -1;
		return (total_len - length) + page_size;
	}
	list[0] = prp_list;
	iod->first_dma = prp_dma;
	i = 0;
	for (;;) {
		if (i == page_size >> 3) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
			if (!prp_list)
				return total_len - length;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= page_size;
		dma_addr += page_size;
		length -= page_size;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		BUG_ON(dma_len < 0);
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	return total_len;
}
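/*
 * PRP layout recap (assuming 4k pages): PRP1 always holds the address of the
 * first data page.  A transfer that fits in two pages puts the second page's
 * address directly in PRP2 (the "length <= page_size" early return above).
 * Anything larger makes PRP2 point at a PRP list; transfers needing at most
 * 32 list entries use the 256-byte small pool, bigger ones use full pages.
 */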
static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req,
		struct nvme_iod *iod)
{
	struct nvme_command cmnd;

	memcpy(&cmnd, req->cmd, sizeof(cmnd));
	cmnd.rw.command_id = req->tag;
	if (req->nr_phys_segments) {
		cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
		cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
	}

	__nvme_submit_cmd(nvmeq, &cmnd);
}

/*
 * We reuse the small pool to allocate the 16-byte range here as it is not
 * worth having a special pool for these or additional cases to handle freeing
 * the iod.
 */
static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
		struct request *req, struct nvme_iod *iod)
{
	struct nvme_dsm_range *range =
				(struct nvme_dsm_range *)iod_list(iod)[0];
	struct nvme_command cmnd;

	range->cattr = cpu_to_le32(0);
	range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
	range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));

	memset(&cmnd, 0, sizeof(cmnd));
	cmnd.dsm.opcode = nvme_cmd_dsm;
	cmnd.dsm.command_id = req->tag;
	cmnd.dsm.nsid = cpu_to_le32(ns->ns_id);
	cmnd.dsm.prp1 = cpu_to_le64(iod->first_dma);
	cmnd.dsm.nr = 0;
	cmnd.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);

	__nvme_submit_cmd(nvmeq, &cmnd);
}

static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								int cmdid)
{
	struct nvme_command cmnd;

	memset(&cmnd, 0, sizeof(cmnd));
	cmnd.common.opcode = nvme_cmd_flush;
	cmnd.common.command_id = cmdid;
	cmnd.common.nsid = cpu_to_le32(ns->ns_id);

	__nvme_submit_cmd(nvmeq, &cmnd);
}

static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
							struct nvme_ns *ns)
{
	struct request *req = iod_get_private(iod);
	struct nvme_command cmnd;
	u16 control = 0;
	u32 dsmgmt = 0;

	if (req->cmd_flags & REQ_FUA)
		control |= NVME_RW_FUA;
	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	if (req->cmd_flags & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	memset(&cmnd, 0, sizeof(cmnd));
	cmnd.rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
	cmnd.rw.command_id = req->tag;
	cmnd.rw.nsid = cpu_to_le32(ns->ns_id);
	cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
	cmnd.rw.prp2 = cpu_to_le64(iod->first_dma);
	cmnd.rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
	cmnd.rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);

	if (ns->ms) {
		switch (ns->pi_type) {
		case NVME_NS_DPS_PI_TYPE3:
			control |= NVME_RW_PRINFO_PRCHK_GUARD;
			break;
		case NVME_NS_DPS_PI_TYPE1:
		case NVME_NS_DPS_PI_TYPE2:
			control |= NVME_RW_PRINFO_PRCHK_GUARD |
					NVME_RW_PRINFO_PRCHK_REF;
			cmnd.rw.reftag = cpu_to_le32(
					nvme_block_nr(ns, blk_rq_pos(req)));
			break;
		}
		if (blk_integrity_rq(req))
			cmnd.rw.metadata =
				cpu_to_le64(sg_dma_address(iod->meta_sg));
		else
			control |= NVME_RW_PRINFO_PRACT;
	}

	cmnd.rw.control = cpu_to_le16(control);
	cmnd.rw.dsmgmt = cpu_to_le32(dsmgmt);

	__nvme_submit_cmd(nvmeq, &cmnd);

	return 0;
}
/*
 * NOTE: ns is NULL when called on the admin queue.
 */
static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
			 const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *req = bd->rq;
	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
	struct nvme_iod *iod;
	enum dma_data_direction dma_dir;

	/*
	 * If formatted with metadata, require the block layer provide a
	 * buffer unless this namespace is formatted such that the metadata
	 * can be stripped/generated by the controller with PRACT=1.
	 */
	if (ns && ns->ms && !blk_integrity_rq(req)) {
		if (!(ns->pi_type && ns->ms == 8) &&
					req->cmd_type != REQ_TYPE_DRV_PRIV) {
			blk_mq_complete_request(req, -EFAULT);
			return BLK_MQ_RQ_QUEUE_OK;
		}
	}

	iod = nvme_alloc_iod(req, dev, GFP_ATOMIC);
	if (!iod)
		return BLK_MQ_RQ_QUEUE_BUSY;

	if (req->cmd_flags & REQ_DISCARD) {
		void *range;
		/*
		 * We reuse the small pool to allocate the 16-byte range here
		 * as it is not worth having a special pool for these or
		 * additional cases to handle freeing the iod.
		 */
		range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC,
						&iod->first_dma);
		if (!range)
			goto retry_cmd;
		iod_list(iod)[0] = (__le64 *)range;
		iod->npages = 0;
	} else if (req->nr_phys_segments) {
		dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;

		sg_init_table(iod->sg, req->nr_phys_segments);
		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
		if (!iod->nents)
			goto error_cmd;

		if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
			goto retry_cmd;

		if (blk_rq_bytes(req) !=
		    nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
			dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
			goto retry_cmd;
		}
		if (blk_integrity_rq(req)) {
			if (blk_rq_count_integrity_sg(req->q, req->bio) != 1)
				goto error_cmd;

			sg_init_table(iod->meta_sg, 1);
			if (blk_rq_map_integrity_sg(
					req->q, req->bio, iod->meta_sg) != 1)
				goto error_cmd;

			if (rq_data_dir(req))
				nvme_dif_remap(req, nvme_dif_prep);

			if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir))
				goto error_cmd;
		}
	}

	nvme_set_info(cmd, iod, req_completion);
	spin_lock_irq(&nvmeq->q_lock);
	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
		nvme_submit_priv(nvmeq, req, iod);
	else if (req->cmd_flags & REQ_DISCARD)
		nvme_submit_discard(nvmeq, ns, req, iod);
	else if (req->cmd_flags & REQ_FLUSH)
		nvme_submit_flush(nvmeq, ns, req->tag);
	else
		nvme_submit_iod(nvmeq, iod, ns);

	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
	return BLK_MQ_RQ_QUEUE_OK;

 error_cmd:
	nvme_free_iod(dev, iod);
	return BLK_MQ_RQ_QUEUE_ERROR;
 retry_cmd:
	nvme_free_iod(dev, iod);
	return BLK_MQ_RQ_QUEUE_BUSY;
}
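/*
 * Note the two failure labels above: retry_cmd returns BLK_MQ_RQ_QUEUE_BUSY
 * for transient conditions (atomic allocation or DMA mapping failed), so the
 * block layer will resubmit the request later, while error_cmd returns
 * BLK_MQ_RQ_QUEUE_ERROR for requests that can never succeed as formed.
 */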
static int nvme_process_cq(struct nvme_queue *nvmeq)
{
	u16 head, phase;

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	for (;;) {
		void *ctx;
		nvme_completion_fn fn;
		struct nvme_completion cqe = nvmeq->cqes[head];
		if ((le16_to_cpu(cqe.status) & 1) != phase)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}
		ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
		fn(nvmeq, ctx, &cqe);
	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
		return 0;

	writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
	nvmeq->cq_head = head;
	nvmeq->cq_phase = phase;

	nvmeq->cqe_seen = 1;
	return 1;
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	irqreturn_t result;
	struct nvme_queue *nvmeq = data;
	spin_lock(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
	nvmeq->cqe_seen = 0;
	spin_unlock(&nvmeq->q_lock);
	return result;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
	if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
		return IRQ_NONE;
	return IRQ_WAKE_THREAD;
}
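/*
 * Phase-bit walkthrough: the CQ memory starts zeroed and cq_phase starts at
 * 1 (see nvme_init_queue()).  The controller writes the current pass's phase
 * value into bit 0 of each new entry's status, so an entry is new exactly
 * when that bit equals cq_phase.  Each time the head wraps past q_depth the
 * expected phase flips, which is how stale entries from the previous pass
 * are told apart without a separate valid flag.
 */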
/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
		void *buffer, void __user *ubuffer, unsigned bufflen,
		u32 *result, unsigned timeout)
{
	bool write = cmd->common.opcode & 1;
	struct bio *bio = NULL;
	struct request *req;
	int ret;

	req = blk_mq_alloc_request(q, write, GFP_KERNEL, false);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->cmd_type = REQ_TYPE_DRV_PRIV;
	req->cmd_flags |= REQ_FAILFAST_DRIVER;
	req->__data_len = 0;
	req->__sector = (sector_t) -1;
	req->bio = req->biotail = NULL;

	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;

	req->cmd = (unsigned char *)cmd;
	req->cmd_len = sizeof(struct nvme_command);
	req->special = (void *)0;

	if (buffer && bufflen) {
		ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT);
		if (ret)
			goto out;
	} else if (ubuffer && bufflen) {
		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT);
		if (ret)
			goto out;
		bio = req->bio;
	}

	blk_execute_rq(req->q, NULL, req, 0);
	if (bio)
		blk_rq_unmap_user(bio);
	if (result)
		*result = (u32)(uintptr_t)req->special;
	ret = req->errors;
 out:
	blk_mq_free_request(req);
	return ret;
}

int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
		void *buffer, unsigned bufflen)
{
	return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0);
}

static int nvme_submit_async_admin_req(struct nvme_dev *dev)
{
	struct nvme_queue *nvmeq = dev->queues[0];
	struct nvme_command c;
	struct nvme_cmd_info *cmd_info;
	struct request *req;

	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, true);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->cmd_flags |= REQ_NO_TIMEOUT;
	cmd_info = blk_mq_rq_to_pdu(req);
	nvme_set_info(cmd_info, NULL, async_req_completion);

	memset(&c, 0, sizeof(c));
	c.common.opcode = nvme_admin_async_event;
	c.common.command_id = req->tag;

	blk_mq_free_request(req);
	__nvme_submit_cmd(nvmeq, &c);
	return 0;
}

static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
			struct nvme_command *cmd,
			struct async_cmd_info *cmdinfo, unsigned timeout)
{
	struct nvme_queue *nvmeq = dev->queues[0];
	struct request *req;
	struct nvme_cmd_info *cmd_rq;

	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = timeout;
	cmd_rq = blk_mq_rq_to_pdu(req);
	cmdinfo->req = req;
	nvme_set_info(cmd_rq, cmdinfo, async_completion);
	cmdinfo->status = -EINTR;

	cmd->common.command_id = req->tag;

	nvme_submit_cmd(nvmeq, cmd);
	return 0;
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}
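/*
 * Ordering note: a submission queue must name an existing completion queue,
 * so queue pairs are always created CQ first, then SQ, and torn down in the
 * reverse order; nvme_create_queue() below follows exactly that sequence,
 * with sqid == cqid for a 1:1 pairing.
 */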
int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id)
{
	struct nvme_command c = { };
	int error;

	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
	c.identify.opcode = nvme_admin_identify;
	c.identify.cns = cpu_to_le32(1);

	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
	if (!*id)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
			sizeof(struct nvme_id_ctrl));
	if (error)
		kfree(*id);
	return error;
}

int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid,
		struct nvme_id_ns **id)
{
	struct nvme_command c = { };
	int error;

	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cpu_to_le32(nsid);

	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
	if (!*id)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
			sizeof(struct nvme_id_ns));
	if (error)
		kfree(*id);
	return error;
}

int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
					dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.nsid = cpu_to_le32(nsid);
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);

	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
			result, 0);
}

int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
					dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_set_features;
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);
	c.features.dword11 = cpu_to_le32(dword11);

	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
			result, 0);
}

int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log)
{
	struct nvme_command c = { };
	int error;

	c.common.opcode = nvme_admin_get_log_page;
	c.common.nsid = cpu_to_le32(0xFFFFFFFF);
	c.common.cdw10[0] = cpu_to_le32(
			(((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
			 NVME_LOG_SMART);

	*log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
	if (!*log)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
			sizeof(struct nvme_smart_log));
	if (error)
		kfree(*log);
	return error;
}
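/*
 * cdw10 encoding above: Get Log Page takes the number of dwords minus one in
 * bits 27:16 and the log identifier in bits 7:0.  The 512-byte SMART log is
 * 128 dwords, so the field is built as (128 - 1) << 16 | NVME_LOG_SMART.
 */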
/**
 * nvme_abort_req - Attempt aborting a request
 *
 * Schedule controller reset if the command was already aborted once before and
 * still hasn't been returned to the driver, or if this is the admin queue.
 */
static void nvme_abort_req(struct request *req)
{
	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = cmd_rq->nvmeq;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *abort_req;
	struct nvme_cmd_info *abort_cmd;
	struct nvme_command cmd;

	if (!nvmeq->qid || cmd_rq->aborted) {
		spin_lock(&dev_list_lock);
		if (!__nvme_reset(dev)) {
			dev_warn(dev->dev,
				 "I/O %d QID %d timeout, reset controller\n",
				 req->tag, nvmeq->qid);
		}
		spin_unlock(&dev_list_lock);
		return;
	}

	if (!dev->abort_limit)
		return;

	abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC,
									false);
	if (IS_ERR(abort_req))
		return;

	abort_cmd = blk_mq_rq_to_pdu(abort_req);
	nvme_set_info(abort_cmd, abort_req, abort_completion);

	memset(&cmd, 0, sizeof(cmd));
	cmd.abort.opcode = nvme_admin_abort_cmd;
	cmd.abort.cid = req->tag;
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
	cmd.abort.command_id = abort_req->tag;

	--dev->abort_limit;
	cmd_rq->aborted = 1;

	dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
							nvmeq->qid);
	nvme_submit_cmd(dev->queues[0], &cmd);
}
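/*
 * dev->abort_limit mirrors the controller's advertised abort command limit:
 * it is decremented for each abort sent above and incremented again in
 * abort_completion(), so the driver never has more aborts in flight than the
 * device claims to support.
 */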
static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
{
	struct nvme_queue *nvmeq = data;
	void *ctx;
	nvme_completion_fn fn;
	struct nvme_cmd_info *cmd;
	struct nvme_completion cqe;

	if (!blk_mq_request_started(req))
		return;

	cmd = blk_mq_rq_to_pdu(req);

	if (cmd->ctx == CMD_CTX_CANCELLED)
		return;

	if (blk_queue_dying(req->q))
		cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
	else
		cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);

	dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n",
						req->tag, nvmeq->qid);
	ctx = cancel_cmd_info(cmd, &fn);
	fn(nvmeq, ctx, &cqe);
}

static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = cmd->nvmeq;

	dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
							nvmeq->qid);
	spin_lock_irq(&nvmeq->q_lock);
	nvme_abort_req(req);
	spin_unlock_irq(&nvmeq->q_lock);

	/*
	 * The aborted request will be completed when the abort completes.
	 * Reset the timer in the meantime; if it fires a second time the
	 * device is in a faulty state and nvme_abort_req() schedules a
	 * controller reset.
	 */
	return BLK_EH_RESET_TIMER;
}

static void nvme_free_queue(struct nvme_queue *nvmeq)
{
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	if (nvmeq->sq_cmds)
		dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
}

static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
	int i;

	for (i = dev->queue_count - 1; i >= lowest; i--) {
		struct nvme_queue *nvmeq = dev->queues[i];
		dev->queue_count--;
		dev->queues[i] = NULL;
		nvme_free_queue(nvmeq);
	}
}

/**
 * nvme_suspend_queue - put queue into suspended state
 * @nvmeq - queue to suspend
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
	int vector;

	spin_lock_irq(&nvmeq->q_lock);
	if (nvmeq->cq_vector == -1) {
		spin_unlock_irq(&nvmeq->q_lock);
		return 1;
	}
	vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
	nvmeq->dev->online_queues--;
	nvmeq->cq_vector = -1;
	spin_unlock_irq(&nvmeq->q_lock);

	if (!nvmeq->qid && nvmeq->dev->admin_q)
		blk_mq_freeze_queue_start(nvmeq->dev->admin_q);

	irq_set_affinity_hint(vector, NULL);
	free_irq(vector, nvmeq);

	return 0;
}

static void nvme_clear_queue(struct nvme_queue *nvmeq)
{
	spin_lock_irq(&nvmeq->q_lock);
	if (nvmeq->tags && *nvmeq->tags)
		blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
}
static void nvme_disable_queue(struct nvme_dev *dev, int qid)
{
	struct nvme_queue *nvmeq = dev->queues[qid];

	if (!nvmeq)
		return;
	if (nvme_suspend_queue(nvmeq))
		return;

	/* Don't tell the adapter to delete the admin queue.
	 * Don't tell a removed adapter to delete IO queues. */
	if (qid && readl(&dev->bar->csts) != -1) {
		adapter_delete_sq(dev, qid);
		adapter_delete_cq(dev, qid);
	}

	spin_lock_irq(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
}

static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
				int entry_size)
{
	int q_depth = dev->q_depth;
	unsigned q_size_aligned = roundup(q_depth * entry_size, dev->page_size);

	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
		mem_per_q = round_down(mem_per_q, dev->page_size);
		q_depth = div_u64(mem_per_q, entry_size);

		/*
		 * Ensure the reduced q_depth is above some threshold where it
		 * would be better to map queues in system memory with the
		 * original depth
		 */
		if (q_depth < 64)
			return -ENOMEM;
	}

	return q_depth;
}
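/*
 * Example of the depth reduction above (illustrative numbers): with a 1 MiB
 * CMB, eight I/O queues and 64-byte SQ entries, each queue may use 128 KiB,
 * giving a depth of 2048; the same CMB hosting 64 queues leaves 16 KiB per
 * queue, i.e. depth 256.  Anything that would drop below 64 entries falls
 * back to host memory at the original depth.
 */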
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
				int qid, int depth)
{
	if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
		unsigned offset = (qid - 1) *
					roundup(SQ_SIZE(depth), dev->page_size);
		nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
		nvmeq->sq_cmds_io = dev->cmb + offset;
	} else {
		nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
		if (!nvmeq->sq_cmds)
			return -ENOMEM;
	}

	return 0;
}

static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth)
{
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
					  &nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;

	if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
		goto free_cqdma;

	nvmeq->q_dmadev = dev->dev;
	nvmeq->dev = dev;
	snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
			dev->instance, qid);
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->q_depth = depth;
	nvmeq->qid = qid;
	nvmeq->cq_vector = -1;
	dev->queues[qid] = nvmeq;

	/* make sure queue descriptor is set before queue count, for kthread */
	mb();
	dev->queue_count++;

	return nvmeq;

 free_cqdma:
	dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}

static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
							const char *name)
{
	if (use_threaded_interrupts)
		return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
					nvme_irq_check, nvme_irq, IRQF_SHARED,
					name, nvmeq);
	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
				IRQF_SHARED, name, nvmeq);
}

static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	spin_lock_irq(&nvmeq->q_lock);
	nvmeq->sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
	dev->online_queues++;
	spin_unlock_irq(&nvmeq->q_lock);
}

static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;

	nvmeq->cq_vector = qid - 1;
	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		return result;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
	if (result < 0)
		goto release_sq;

	nvme_init_queue(nvmeq, qid);
	return result;

 release_sq:
	adapter_delete_sq(dev, qid);
 release_cq:
	adapter_delete_cq(dev, qid);
	return result;
}

static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
{
	unsigned long timeout;
	u32 bit = enabled ? NVME_CSTS_RDY : 0;

	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;

	while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(dev->dev,
				"Device not ready; aborting %s\n", enabled ?
						"initialisation" : "reset");
			return -ENODEV;
		}
	}

	return 0;
}
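/*
 * CAP.TO is expressed in 500 ms units, which is where the timeout above
 * comes from: (NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2 jiffies.  A controller
 * reporting TO = 20, for instance, is given 10.5 seconds to flip CSTS.RDY
 * after CC.EN changes.
 */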
/*
 * If the device has been passed off to us in an enabled state, just clear
 * the enabled bit.  The spec says we should set the 'shutdown notification
 * bits', but doing so may cause the device to complete commands to the
 * admin queue ... and we don't know what memory that might be pointing at!
 */
static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
{
	dev->ctrl_config &= ~NVME_CC_SHN_MASK;
	dev->ctrl_config &= ~NVME_CC_ENABLE;
	writel(dev->ctrl_config, &dev->bar->cc);

	return nvme_wait_ready(dev, cap, false);
}

static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
{
	dev->ctrl_config &= ~NVME_CC_SHN_MASK;
	dev->ctrl_config |= NVME_CC_ENABLE;
	writel(dev->ctrl_config, &dev->bar->cc);

	return nvme_wait_ready(dev, cap, true);
}

static int nvme_shutdown_ctrl(struct nvme_dev *dev)
{
	unsigned long timeout;

	dev->ctrl_config &= ~NVME_CC_SHN_MASK;
	dev->ctrl_config |= NVME_CC_SHN_NORMAL;

	writel(dev->ctrl_config, &dev->bar->cc);

	timeout = SHUTDOWN_TIMEOUT + jiffies;
	while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) !=
							NVME_CSTS_SHST_CMPLT) {
		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(dev->dev,
				"Device shutdown incomplete; abort shutdown\n");
			return -ENODEV;
		}
	}

	return 0;
}

static struct blk_mq_ops nvme_mq_admin_ops = {
	.queue_rq	= nvme_queue_rq,
	.map_queue	= blk_mq_map_queue,
	.init_hctx	= nvme_admin_init_hctx,
	.exit_hctx	= nvme_admin_exit_hctx,
	.init_request	= nvme_admin_init_request,
	.timeout	= nvme_timeout,
};

static struct blk_mq_ops nvme_mq_ops = {
	.queue_rq	= nvme_queue_rq,
	.map_queue	= blk_mq_map_queue,
	.init_hctx	= nvme_init_hctx,
	.init_request	= nvme_init_request,
	.timeout	= nvme_timeout,
};

static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
	if (dev->admin_q && !blk_queue_dying(dev->admin_q)) {
		blk_cleanup_queue(dev->admin_q);
		blk_mq_free_tag_set(&dev->admin_tagset);
	}
}

static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
	if (!dev->admin_q) {
		dev->admin_tagset.ops = &nvme_mq_admin_ops;
		dev->admin_tagset.nr_hw_queues = 1;
		dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
		dev->admin_tagset.reserved_tags = 1;
		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
		dev->admin_tagset.numa_node = dev_to_node(dev->dev);
		dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
		dev->admin_tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
			return -ENOMEM;

		dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);
		if (IS_ERR(dev->admin_q)) {
			blk_mq_free_tag_set(&dev->admin_tagset);
			return -ENOMEM;
		}
		if (!blk_get_queue(dev->admin_q)) {
			nvme_dev_remove_admin(dev);
			dev->admin_q = NULL;
			return -ENODEV;
		}
	} else
		blk_mq_unfreeze_queue(dev->admin_q);

	return 0;
}

static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	u64 cap = readq(&dev->bar->cap);
	struct nvme_queue *nvmeq;
	unsigned page_shift = PAGE_SHIFT;
	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12;
	unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12;

	if (page_shift < dev_page_min) {
		dev_err(dev->dev,
				"Minimum device page size (%u) too large for "
				"host (%u)\n", 1 << dev_page_min,
				1 << page_shift);
		return -ENODEV;
	}
	if (page_shift > dev_page_max) {
		dev_info(dev->dev,
				"Device maximum page size (%u) smaller than "
				"host (%u); enabling work-around\n",
				1 << dev_page_max, 1 << page_shift);
		page_shift = dev_page_max;
	}

	dev->subsystem = readl(&dev->bar->vs) >= NVME_VS(1, 1) ?
						NVME_CAP_NSSRC(cap) : 0;

	if (dev->subsystem && (readl(&dev->bar->csts) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, &dev->bar->csts);

	result = nvme_disable_ctrl(dev, cap);
	if (result < 0)
		return result;

	nvmeq = dev->queues[0];
	if (!nvmeq) {
		nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
		if (!nvmeq)
			return -ENOMEM;
	}

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	dev->page_size = 1 << page_shift;

	dev->ctrl_config = NVME_CC_CSS_NVM;
	dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
	dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;

	writel(aqa, &dev->bar->aqa);
	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);

	result = nvme_enable_ctrl(dev, cap);
	if (result)
		goto free_nvmeq;

	nvmeq->cq_vector = 0;
	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
	if (result) {
		nvmeq->cq_vector = -1;
		goto free_nvmeq;
	}

	return result;

 free_nvmeq:
	nvme_free_queues(dev, 0);
	return result;
}
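/*
 * AQA above packs the admin SQ size in its low half and the admin CQ size in
 * its high half, both as zero-based values.  With NVME_AQ_DEPTH = 256 the
 * register is written as 0x00ff00ff, i.e. a maximum index of 255 for both
 * queues, which share one depth here.
 */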
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
	struct nvme_dev *dev = ns->dev;
	struct nvme_user_io io;
	struct nvme_command c;
	unsigned length, meta_len;
	int status, write;
	dma_addr_t meta_dma = 0;
	void *meta = NULL;
	void __user *metadata;

	if (copy_from_user(&io, uio, sizeof(io)))
		return -EFAULT;

	switch (io.opcode) {
	case nvme_cmd_write:
	case nvme_cmd_read:
	case nvme_cmd_compare:
		break;
	default:
		return -EINVAL;
	}

	length = (io.nblocks + 1) << ns->lba_shift;
	meta_len = (io.nblocks + 1) * ns->ms;
	metadata = (void __user *)(uintptr_t)io.metadata;
	write = io.opcode & 1;

	if (ns->ext) {
		length += meta_len;
		meta_len = 0;
	}
	if (meta_len) {
		if (((io.metadata & 3) || !io.metadata) && !ns->ext)
			return -EINVAL;

		meta = dma_alloc_coherent(dev->dev, meta_len,
						&meta_dma, GFP_KERNEL);

		if (!meta) {
			status = -ENOMEM;
			goto unmap;
		}
		if (write) {
			if (copy_from_user(meta, metadata, meta_len)) {
				status = -EFAULT;
				goto unmap;
			}
		}
	}

	memset(&c, 0, sizeof(c));
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.nsid = cpu_to_le32(ns->ns_id);
	c.rw.slba = cpu_to_le64(io.slba);
	c.rw.length = cpu_to_le16(io.nblocks);
	c.rw.control = cpu_to_le16(io.control);
	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
	c.rw.reftag = cpu_to_le32(io.reftag);
	c.rw.apptag = cpu_to_le16(io.apptag);
	c.rw.appmask = cpu_to_le16(io.appmask);
	c.rw.metadata = cpu_to_le64(meta_dma);

	status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
			(void __user *)(uintptr_t)io.addr, length, NULL, 0);
 unmap:
	if (meta) {
		if (status == NVME_SC_SUCCESS && !write) {
			if (copy_to_user(metadata, meta, meta_len))
				status = -EFAULT;
		}
		dma_free_coherent(dev->dev, meta_len, meta, meta_dma);
	}
	return status;
}
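/*
 * Metadata handling sketch for the ioctl above: with an extended-LBA format
 * (ns->ext) the metadata is interleaved with the data, so it is folded into
 * the data length and transferred through the regular user buffer.  With
 * separate metadata the driver bounces it through a coherent DMA buffer,
 * copying it in before a write and back out after a successful read.
 */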
static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
			struct nvme_passthru_cmd __user *ucmd)
{
	struct nvme_passthru_cmd cmd;
	struct nvme_command c;
	unsigned timeout = 0;
	int status;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);

	if (cmd.timeout_ms)
		timeout = msecs_to_jiffies(cmd.timeout_ms);

	status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
			NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
			&cmd.result, timeout);
	if (status >= 0) {
		if (put_user(cmd.result, &ucmd->result))
			return -EFAULT;
	}

	return status;
}

static int nvme_subsys_reset(struct nvme_dev *dev)
{
	if (!dev->subsystem)
		return -ENOTTY;

	writel(0x4E564D65, &dev->bar->nssr); /* "NVMe" */
	return 0;
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
							unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case NVME_IOCTL_ID:
		force_successful_syscall_return();
		return ns->ns_id;
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_cmd(ns->dev, NULL, (void __user *)arg);
	case NVME_IOCTL_IO_CMD:
		return nvme_user_cmd(ns->dev, ns, (void __user *)arg);
	case NVME_IOCTL_SUBMIT_IO:
		return nvme_submit_io(ns, (void __user *)arg);
	case SG_GET_VERSION_NUM:
		return nvme_sg_get_version_num((void __user *)arg);
	case SG_IO:
		return nvme_sg_io(ns, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}

#ifdef CONFIG_COMPAT
static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
					unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case SG_IO:
		return -ENOIOCTLCMD;
	}
	return nvme_ioctl(bdev, mode, cmd, arg);
}
#else
#define nvme_compat_ioctl	NULL
#endif

static void nvme_free_dev(struct kref *kref);
static void nvme_free_ns(struct kref *kref)
{
	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);

	spin_lock(&dev_list_lock);
	ns->disk->private_data = NULL;
	spin_unlock(&dev_list_lock);

	kref_put(&ns->dev->kref, nvme_free_dev);
	put_disk(ns->disk);
	kfree(ns);
}

static int nvme_open(struct block_device *bdev, fmode_t mode)
{
	int ret = 0;
	struct nvme_ns *ns;

	spin_lock(&dev_list_lock);
	ns = bdev->bd_disk->private_data;
	if (!ns)
		ret = -ENXIO;
	else if (!kref_get_unless_zero(&ns->kref))
		ret = -ENXIO;
	spin_unlock(&dev_list_lock);

	return ret;
}

static void nvme_release(struct gendisk *disk, fmode_t mode)
{
	struct nvme_ns *ns = disk->private_data;
	kref_put(&ns->kref, nvme_free_ns);
}

static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
	/* some standard values */
	geo->heads = 1 << 6;
	geo->sectors = 1 << 5;
	geo->cylinders = get_capacity(bd->bd_disk) >> 11;
	return 0;
}

static void nvme_config_discard(struct nvme_ns *ns)
{
	u32 logical_block_size = queue_logical_block_size(ns->queue);
	ns->queue->limits.discard_zeroes_data = 0;
	ns->queue->limits.discard_alignment = logical_block_size;
	ns->queue->limits.discard_granularity = logical_block_size;
	blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
}
2013 return -ENODEV; 2014 } 2015 if (id->ncap == 0) { 2016 kfree(id); 2017 return -ENODEV; 2018 } 2019 2020 old_ms = ns->ms; 2021 lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; 2022 ns->lba_shift = id->lbaf[lbaf].ds; 2023 ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); 2024 ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); 2025 2026 /* 2027 * If identify reported no LBA data size, fall back to the default 512 2028 * byte block size so the block layer can use the disk before failing reads/writes for 0 capacity. 2029 */ 2030 if (ns->lba_shift == 0) 2031 ns->lba_shift = 9; 2032 bs = 1 << ns->lba_shift; 2033 2034 /* XXX: PI implementation requires metadata size equal to the T10 PI tuple size */ 2035 pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? 2036 id->dps & NVME_NS_DPS_PI_MASK : 0; 2037 2038 if (blk_get_integrity(disk) && (ns->pi_type != pi_type || 2039 ns->ms != old_ms || 2040 bs != queue_logical_block_size(disk->queue) || 2041 (ns->ms && ns->ext))) 2042 blk_integrity_unregister(disk); 2043 2044 ns->pi_type = pi_type; 2045 blk_queue_logical_block_size(ns->queue, bs); 2046 2047 if (ns->ms && !blk_get_integrity(disk) && (disk->flags & GENHD_FL_UP) && 2048 !ns->ext) 2049 nvme_init_integrity(ns); 2050 2051 if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) 2052 set_capacity(disk, 0); 2053 else 2054 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); 2055 2056 if (dev->oncs & NVME_CTRL_ONCS_DSM) 2057 nvme_config_discard(ns); 2058 2059 kfree(id); 2060 return 0; 2061 } 2062 2063 static const struct block_device_operations nvme_fops = { 2064 .owner = THIS_MODULE, 2065 .ioctl = nvme_ioctl, 2066 .compat_ioctl = nvme_compat_ioctl, 2067 .open = nvme_open, 2068 .release = nvme_release, 2069 .getgeo = nvme_getgeo, 2070 .revalidate_disk = nvme_revalidate_disk, 2071 }; 2072 2073 static int nvme_kthread(void *data) 2074 { 2075 struct nvme_dev *dev, *next; 2076 2077 while (!kthread_should_stop()) { 2078 set_current_state(TASK_INTERRUPTIBLE); 2079 spin_lock(&dev_list_lock); 2080 list_for_each_entry_safe(dev, next, &dev_list, node) { 2081 int i; 2082 u32 csts = readl(&dev->bar->csts); 2083 2084 if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) || 2085 csts & NVME_CSTS_CFS) { 2086 if (!__nvme_reset(dev)) { 2087 dev_warn(dev->dev, 2088 "Failed status: %x, reset controller\n", 2089 readl(&dev->bar->csts)); 2090 } 2091 continue; 2092 } 2093 for (i = 0; i < dev->queue_count; i++) { 2094 struct nvme_queue *nvmeq = dev->queues[i]; 2095 if (!nvmeq) 2096 continue; 2097 spin_lock_irq(&nvmeq->q_lock); 2098 nvme_process_cq(nvmeq); 2099 2100 while ((i == 0) && (dev->event_limit > 0)) { 2101 if (nvme_submit_async_admin_req(dev)) 2102 break; 2103 dev->event_limit--; 2104 } 2105 spin_unlock_irq(&nvmeq->q_lock); 2106 } 2107 } 2108 spin_unlock(&dev_list_lock); 2109 schedule_timeout(round_jiffies_relative(HZ)); 2110 } 2111 return 0; 2112 } 2113 2114 static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) 2115 { 2116 struct nvme_ns *ns; 2117 struct gendisk *disk; 2118 int node = dev_to_node(dev->dev); 2119 2120 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); 2121 if (!ns) 2122 return; 2123 2124 ns->queue = blk_mq_init_queue(&dev->tagset); 2125 if (IS_ERR(ns->queue)) 2126 goto out_free_ns; 2127 queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); 2128 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); 2129 ns->dev = dev; 2130 ns->queue->queuedata = ns; 2131 2132 disk = alloc_disk_node(0, node); 2133 if (!disk) 2134 goto out_free_queue; 2135 2136 kref_init(&ns->kref); 2137 ns->ns_id = nsid; 2138 ns->disk = disk; 2139
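	/*
	 * The logical block size the block layer sees is always 1 << lba_shift.
	 * A minimal sketch of the mapping done by nvme_revalidate_disk() above
	 * (assuming a hypothetical namespace formatted with 4K LBAs, i.e.
	 * id->lbaf[lbaf].ds == 12):
	 *
	 *	ns->lba_shift = id->lbaf[lbaf].ds;	// 12
	 *	bs = 1 << ns->lba_shift;		// 4096 bytes
	 *	blk_queue_logical_block_size(ns->queue, bs);
	 *
	 * Until the disk is revalidated, lba_shift defaults to 9 (512 bytes)
	 * below.
	 */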
ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ 2140 list_add_tail(&ns->list, &dev->namespaces); 2141 2142 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); 2143 if (dev->max_hw_sectors) { 2144 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); 2145 blk_queue_max_segments(ns->queue, 2146 ((dev->max_hw_sectors << 9) / dev->page_size) + 1); 2147 } 2148 if (dev->stripe_size) 2149 blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); 2150 if (dev->vwc & NVME_CTRL_VWC_PRESENT) 2151 blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); 2152 blk_queue_virt_boundary(ns->queue, dev->page_size - 1); 2153 2154 disk->major = nvme_major; 2155 disk->first_minor = 0; 2156 disk->fops = &nvme_fops; 2157 disk->private_data = ns; 2158 disk->queue = ns->queue; 2159 disk->driverfs_dev = dev->device; 2160 disk->flags = GENHD_FL_EXT_DEVT; 2161 sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); 2162 2163 /* 2164 * Initialize capacity to 0 until we establish the namespace format and 2165 * set up integrity extensions if necessary. The revalidate_disk after 2166 * add_disk allows the driver to register with integrity if the format 2167 * requires it. 2168 */ 2169 set_capacity(disk, 0); 2170 if (nvme_revalidate_disk(ns->disk)) 2171 goto out_free_disk; 2172 2173 kref_get(&dev->kref); 2174 add_disk(ns->disk); 2175 if (ns->ms) { 2176 struct block_device *bd = bdget_disk(ns->disk, 0); 2177 if (!bd) 2178 return; 2179 if (blkdev_get(bd, FMODE_READ, NULL)) { 2180 bdput(bd); 2181 return; 2182 } 2183 blkdev_reread_part(bd); 2184 blkdev_put(bd, FMODE_READ); 2185 } 2186 return; 2187 out_free_disk: 2188 kfree(disk); 2189 list_del(&ns->list); 2190 out_free_queue: 2191 blk_cleanup_queue(ns->queue); 2192 out_free_ns: 2193 kfree(ns); 2194 } 2195 2196 /* 2197 * Create I/O queues. Failing to create an I/O queue is not an issue; 2198 * we can continue with fewer than the desired number of queues, and 2199 * even a controller without I/O queues can still be used to issue 2200 * admin commands. This might be useful, for example, to upgrade 2201 * buggy firmware.
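 *
 * As a hedged illustration (a user-space sketch, not code from this
 * driver; fw_buf and fw_len are hypothetical), such an admin-only
 * controller is still reachable through the char device ioctls defined
 * in this file:
 *
 *	int fd = open("/dev/nvme0", O_RDONLY);
 *	struct nvme_admin_cmd cmd = {
 *		.opcode   = nvme_admin_download_fw,
 *		.addr     = (__u64)(uintptr_t)fw_buf,
 *		.data_len = fw_len,
 *	};
 *	(cdw10/cdw11 must also carry NUMD/OFST per the NVMe spec)
 *	ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);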
2202 */ 2203 static void nvme_create_io_queues(struct nvme_dev *dev) 2204 { 2205 unsigned i; 2206 2207 for (i = dev->queue_count; i <= dev->max_qid; i++) 2208 if (!nvme_alloc_queue(dev, i, dev->q_depth)) 2209 break; 2210 2211 for (i = dev->online_queues; i <= dev->queue_count - 1; i++) 2212 if (nvme_create_queue(dev->queues[i], i)) { 2213 nvme_free_queues(dev, i); 2214 break; 2215 } 2216 } 2217 2218 static int set_queue_count(struct nvme_dev *dev, int count) 2219 { 2220 int status; 2221 u32 result; 2222 u32 q_count = (count - 1) | ((count - 1) << 16); 2223 2224 status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, 2225 &result); 2226 if (status < 0) 2227 return status; 2228 if (status > 0) { 2229 dev_err(dev->dev, "Could not set queue count (%d)\n", status); 2230 return 0; 2231 } 2232 return min(result & 0xffff, result >> 16) + 1; 2233 } 2234 2235 static void __iomem *nvme_map_cmb(struct nvme_dev *dev) 2236 { 2237 u64 szu, size, offset; 2238 u32 cmbloc; 2239 resource_size_t bar_size; 2240 struct pci_dev *pdev = to_pci_dev(dev->dev); 2241 void __iomem *cmb; 2242 dma_addr_t dma_addr; 2243 2244 if (!use_cmb_sqes) 2245 return NULL; 2246 2247 dev->cmbsz = readl(&dev->bar->cmbsz); 2248 if (!(NVME_CMB_SZ(dev->cmbsz))) 2249 return NULL; 2250 2251 cmbloc = readl(&dev->bar->cmbloc); 2252 2253 szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); 2254 size = szu * NVME_CMB_SZ(dev->cmbsz); 2255 offset = szu * NVME_CMB_OFST(cmbloc); 2256 bar_size = pci_resource_len(pdev, NVME_CMB_BIR(cmbloc)); 2257 2258 if (offset > bar_size) 2259 return NULL; 2260 2261 /* 2262 * Controllers may support a CMB size larger than their BAR, 2263 * for example, due to being behind a bridge. Reduce the CMB to 2264 * the reported size of the BAR 2265 */ 2266 if (size > bar_size - offset) 2267 size = bar_size - offset; 2268 2269 dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(cmbloc)) + offset; 2270 cmb = ioremap_wc(dma_addr, size); 2271 if (!cmb) 2272 return NULL; 2273 2274 dev->cmb_dma_addr = dma_addr; 2275 dev->cmb_size = size; 2276 return cmb; 2277 } 2278 2279 static inline void nvme_release_cmb(struct nvme_dev *dev) 2280 { 2281 if (dev->cmb) { 2282 iounmap(dev->cmb); 2283 dev->cmb = NULL; 2284 } 2285 } 2286 2287 static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) 2288 { 2289 return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); 2290 } 2291 2292 static int nvme_setup_io_queues(struct nvme_dev *dev) 2293 { 2294 struct nvme_queue *adminq = dev->queues[0]; 2295 struct pci_dev *pdev = to_pci_dev(dev->dev); 2296 int result, i, vecs, nr_io_queues, size; 2297 2298 nr_io_queues = num_possible_cpus(); 2299 result = set_queue_count(dev, nr_io_queues); 2300 if (result <= 0) 2301 return result; 2302 if (result < nr_io_queues) 2303 nr_io_queues = result; 2304 2305 if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { 2306 result = nvme_cmb_qdepth(dev, nr_io_queues, 2307 sizeof(struct nvme_command)); 2308 if (result > 0) 2309 dev->q_depth = result; 2310 else 2311 nvme_release_cmb(dev); 2312 } 2313 2314 size = db_bar_size(dev, nr_io_queues); 2315 if (size > 8192) { 2316 iounmap(dev->bar); 2317 do { 2318 dev->bar = ioremap(pci_resource_start(pdev, 0), size); 2319 if (dev->bar) 2320 break; 2321 if (!--nr_io_queues) 2322 return -ENOMEM; 2323 size = db_bar_size(dev, nr_io_queues); 2324 } while (1); 2325 dev->dbs = ((void __iomem *)dev->bar) + 4096; 2326 adminq->q_db = dev->dbs; 2327 } 2328 2329 /* Deregister the admin queue's interrupt */ 2330 free_irq(dev->entry[0].vector, adminq); 2331 2332 /* 2333 * If we enable 
MSI-X early because the device does not support INTx, disable it again 2334 * before setting up the full range we need. 2335 */ 2336 if (!pdev->irq) 2337 pci_disable_msix(pdev); 2338 2339 for (i = 0; i < nr_io_queues; i++) 2340 dev->entry[i].entry = i; 2341 vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues); 2342 if (vecs < 0) { 2343 vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32)); 2344 if (vecs < 0) { 2345 vecs = 1; 2346 } else { 2347 for (i = 0; i < vecs; i++) 2348 dev->entry[i].vector = i + pdev->irq; 2349 } 2350 } 2351 2352 /* 2353 * Should investigate if there's a performance win from allocating 2354 * more queues than interrupt vectors; it might allow the submission 2355 * path to scale better, even if the receive path is limited by the 2356 * number of interrupts. 2357 */ 2358 nr_io_queues = vecs; 2359 dev->max_qid = nr_io_queues; 2360 2361 result = queue_request_irq(dev, adminq, adminq->irqname); 2362 if (result) { 2363 adminq->cq_vector = -1; 2364 goto free_queues; 2365 } 2366 2367 /* Free previously allocated queues that are no longer usable */ 2368 nvme_free_queues(dev, nr_io_queues + 1); 2369 nvme_create_io_queues(dev); 2370 2371 return 0; 2372 2373 free_queues: 2374 nvme_free_queues(dev, 1); 2375 return result; 2376 } 2377 2378 static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) 2379 { 2380 struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); 2381 struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); 2382 2383 return nsa->ns_id - nsb->ns_id; 2384 } 2385 2386 static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid) 2387 { 2388 struct nvme_ns *ns; 2389 2390 list_for_each_entry(ns, &dev->namespaces, list) { 2391 if (ns->ns_id == nsid) 2392 return ns; 2393 if (ns->ns_id > nsid) 2394 break; 2395 } 2396 return NULL; 2397 } 2398 2399 static inline bool nvme_io_incapable(struct nvme_dev *dev) 2400 { 2401 return (!dev->bar || readl(&dev->bar->csts) & NVME_CSTS_CFS || 2402 dev->online_queues < 2); 2403 } 2404 2405 static void nvme_ns_remove(struct nvme_ns *ns) 2406 { 2407 bool kill = nvme_io_incapable(ns->dev) && !blk_queue_dying(ns->queue); 2408 2409 if (kill) 2410 blk_set_queue_dying(ns->queue); 2411 if (ns->disk->flags & GENHD_FL_UP) { 2412 if (blk_get_integrity(ns->disk)) 2413 blk_integrity_unregister(ns->disk); 2414 del_gendisk(ns->disk); 2415 } 2416 if (kill || !blk_queue_dying(ns->queue)) { 2417 blk_mq_abort_requeue_list(ns->queue); 2418 blk_cleanup_queue(ns->queue); 2419 } 2420 list_del_init(&ns->list); 2421 kref_put(&ns->kref, nvme_free_ns); 2422 } 2423 2424 static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn) 2425 { 2426 struct nvme_ns *ns, *next; 2427 unsigned i; 2428 2429 for (i = 1; i <= nn; i++) { 2430 ns = nvme_find_ns(dev, i); 2431 if (ns) { 2432 if (revalidate_disk(ns->disk)) 2433 nvme_ns_remove(ns); 2434 } else 2435 nvme_alloc_ns(dev, i); 2436 } 2437 list_for_each_entry_safe(ns, next, &dev->namespaces, list) { 2438 if (ns->ns_id > nn) 2439 nvme_ns_remove(ns); 2440 } 2441 list_sort(NULL, &dev->namespaces, ns_cmp); 2442 } 2443 2444 static void nvme_set_irq_hints(struct nvme_dev *dev) 2445 { 2446 struct nvme_queue *nvmeq; 2447 int i; 2448 2449 for (i = 0; i < dev->online_queues; i++) { 2450 nvmeq = dev->queues[i]; 2451 2452 if (!nvmeq->tags || !(*nvmeq->tags)) 2453 continue; 2454 2455 irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, 2456 blk_mq_tags_cpumask(*nvmeq->tags)); 2457 } 2458 } 2459 2460 static void nvme_dev_scan(struct work_struct *work) 2461 { 2462 struct nvme_dev *dev =
container_of(work, struct nvme_dev, scan_work); 2463 struct nvme_id_ctrl *ctrl; 2464 2465 if (!dev->tagset.tags) 2466 return; 2467 if (nvme_identify_ctrl(dev, &ctrl)) 2468 return; 2469 nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn)); 2470 kfree(ctrl); 2471 nvme_set_irq_hints(dev); 2472 } 2473 2474 /* 2475 * Return: error value if an error occurred setting up the queues or calling 2476 * Identify Device. 0 if these succeeded, even if adding some of the 2477 * namespaces failed. At the moment, these failures are silent. TBD which 2478 * failures should be reported. 2479 */ 2480 static int nvme_dev_add(struct nvme_dev *dev) 2481 { 2482 struct pci_dev *pdev = to_pci_dev(dev->dev); 2483 int res; 2484 struct nvme_id_ctrl *ctrl; 2485 int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; 2486 2487 res = nvme_identify_ctrl(dev, &ctrl); 2488 if (res) { 2489 dev_err(dev->dev, "Identify Controller failed (%d)\n", res); 2490 return -EIO; 2491 } 2492 2493 dev->oncs = le16_to_cpup(&ctrl->oncs); 2494 dev->abort_limit = ctrl->acl + 1; 2495 dev->vwc = ctrl->vwc; 2496 memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); 2497 memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); 2498 memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); 2499 if (ctrl->mdts) 2500 dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); 2501 if ((pdev->vendor == PCI_VENDOR_ID_INTEL) && 2502 (pdev->device == 0x0953) && ctrl->vs[3]) { 2503 unsigned int max_hw_sectors; 2504 2505 dev->stripe_size = 1 << (ctrl->vs[3] + shift); 2506 max_hw_sectors = dev->stripe_size >> (shift - 9); 2507 if (dev->max_hw_sectors) { 2508 dev->max_hw_sectors = min(max_hw_sectors, 2509 dev->max_hw_sectors); 2510 } else 2511 dev->max_hw_sectors = max_hw_sectors; 2512 } 2513 kfree(ctrl); 2514 2515 if (!dev->tagset.tags) { 2516 dev->tagset.ops = &nvme_mq_ops; 2517 dev->tagset.nr_hw_queues = dev->online_queues - 1; 2518 dev->tagset.timeout = NVME_IO_TIMEOUT; 2519 dev->tagset.numa_node = dev_to_node(dev->dev); 2520 dev->tagset.queue_depth = 2521 min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; 2522 dev->tagset.cmd_size = nvme_cmd_size(dev); 2523 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; 2524 dev->tagset.driver_data = dev; 2525 2526 if (blk_mq_alloc_tag_set(&dev->tagset)) 2527 return 0; 2528 } 2529 schedule_work(&dev->scan_work); 2530 return 0; 2531 } 2532 2533 static int nvme_dev_map(struct nvme_dev *dev) 2534 { 2535 u64 cap; 2536 int bars, result = -ENOMEM; 2537 struct pci_dev *pdev = to_pci_dev(dev->dev); 2538 2539 if (pci_enable_device_mem(pdev)) 2540 return result; 2541 2542 dev->entry[0].vector = pdev->irq; 2543 pci_set_master(pdev); 2544 bars = pci_select_bars(pdev, IORESOURCE_MEM); 2545 if (!bars) 2546 goto disable_pci; 2547 2548 if (pci_request_selected_regions(pdev, bars, "nvme")) 2549 goto disable_pci; 2550 2551 if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && 2552 dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) 2553 goto disable; 2554 2555 dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); 2556 if (!dev->bar) 2557 goto disable; 2558 2559 if (readl(&dev->bar->csts) == -1) { 2560 result = -ENODEV; 2561 goto unmap; 2562 } 2563 2564 /* 2565 * Some devices don't advertise INTx interrupts; pre-enable a single 2566 * MSI-X vector for setup. We'll adjust this later.
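 * (The single pre-filled entry gives dev->entry[0].vector a valid value
 * for requesting the admin queue IRQ; nvme_setup_io_queues() later
 * frees that IRQ, disables MSI-X, and re-enables the full vector range.)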
2567 */ 2568 if (!pdev->irq) { 2569 result = pci_enable_msix(pdev, dev->entry, 1); 2570 if (result < 0) 2571 goto unmap; 2572 } 2573 2574 cap = readq(&dev->bar->cap); 2575 dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); 2576 dev->db_stride = 1 << NVME_CAP_STRIDE(cap); 2577 dev->dbs = ((void __iomem *)dev->bar) + 4096; 2578 if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) 2579 dev->cmb = nvme_map_cmb(dev); 2580 2581 return 0; 2582 2583 unmap: 2584 iounmap(dev->bar); 2585 dev->bar = NULL; 2586 disable: 2587 pci_release_regions(pdev); 2588 disable_pci: 2589 pci_disable_device(pdev); 2590 return result; 2591 } 2592 2593 static void nvme_dev_unmap(struct nvme_dev *dev) 2594 { 2595 struct pci_dev *pdev = to_pci_dev(dev->dev); 2596 2597 if (pdev->msi_enabled) 2598 pci_disable_msi(pdev); 2599 else if (pdev->msix_enabled) 2600 pci_disable_msix(pdev); 2601 2602 if (dev->bar) { 2603 iounmap(dev->bar); 2604 dev->bar = NULL; 2605 pci_release_regions(pdev); 2606 } 2607 2608 if (pci_is_enabled(pdev)) 2609 pci_disable_device(pdev); 2610 } 2611 2612 struct nvme_delq_ctx { 2613 struct task_struct *waiter; 2614 struct kthread_worker *worker; 2615 atomic_t refcount; 2616 }; 2617 2618 static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev) 2619 { 2620 dq->waiter = current; 2621 mb(); 2622 2623 for (;;) { 2624 set_current_state(TASK_KILLABLE); 2625 if (!atomic_read(&dq->refcount)) 2626 break; 2627 if (!schedule_timeout(ADMIN_TIMEOUT) || 2628 fatal_signal_pending(current)) { 2629 /* 2630 * Disable the controller first since we can't trust it 2631 * at this point, but leave the admin queue enabled 2632 * until all queue deletion requests are flushed. 2633 * FIXME: This may take a while if there are more h/w 2634 * queues than admin tags. 2635 */ 2636 set_current_state(TASK_RUNNING); 2637 nvme_disable_ctrl(dev, readq(&dev->bar->cap)); 2638 nvme_clear_queue(dev->queues[0]); 2639 flush_kthread_worker(dq->worker); 2640 nvme_disable_queue(dev, 0); 2641 return; 2642 } 2643 } 2644 set_current_state(TASK_RUNNING); 2645 } 2646 2647 static void nvme_put_dq(struct nvme_delq_ctx *dq) 2648 { 2649 atomic_dec(&dq->refcount); 2650 if (dq->waiter) 2651 wake_up_process(dq->waiter); 2652 } 2653 2654 static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq) 2655 { 2656 atomic_inc(&dq->refcount); 2657 return dq; 2658 } 2659 2660 static void nvme_del_queue_end(struct nvme_queue *nvmeq) 2661 { 2662 struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx; 2663 nvme_put_dq(dq); 2664 } 2665 2666 static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, 2667 kthread_work_func_t fn) 2668 { 2669 struct nvme_command c; 2670 2671 memset(&c, 0, sizeof(c)); 2672 c.delete_queue.opcode = opcode; 2673 c.delete_queue.qid = cpu_to_le16(nvmeq->qid); 2674 2675 init_kthread_work(&nvmeq->cmdinfo.work, fn); 2676 return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo, 2677 ADMIN_TIMEOUT); 2678 } 2679 2680 static void nvme_del_cq_work_handler(struct kthread_work *work) 2681 { 2682 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, 2683 cmdinfo.work); 2684 nvme_del_queue_end(nvmeq); 2685 } 2686 2687 static int nvme_delete_cq(struct nvme_queue *nvmeq) 2688 { 2689 return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq, 2690 nvme_del_cq_work_handler); 2691 } 2692 2693 static void nvme_del_sq_work_handler(struct kthread_work *work) 2694 { 2695 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, 2696 cmdinfo.work); 2697 int status = nvmeq->cmdinfo.status; 2698 2699 if (!status) 
2700 status = nvme_delete_cq(nvmeq); 2701 if (status) 2702 nvme_del_queue_end(nvmeq); 2703 } 2704 2705 static int nvme_delete_sq(struct nvme_queue *nvmeq) 2706 { 2707 return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq, 2708 nvme_del_sq_work_handler); 2709 } 2710 2711 static void nvme_del_queue_start(struct kthread_work *work) 2712 { 2713 struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, 2714 cmdinfo.work); 2715 if (nvme_delete_sq(nvmeq)) 2716 nvme_del_queue_end(nvmeq); 2717 } 2718 2719 static void nvme_disable_io_queues(struct nvme_dev *dev) 2720 { 2721 int i; 2722 DEFINE_KTHREAD_WORKER_ONSTACK(worker); 2723 struct nvme_delq_ctx dq; 2724 struct task_struct *kworker_task = kthread_run(kthread_worker_fn, 2725 &worker, "nvme%d", dev->instance); 2726 2727 if (IS_ERR(kworker_task)) { 2728 dev_err(dev->dev, 2729 "Failed to create queue del task\n"); 2730 for (i = dev->queue_count - 1; i > 0; i--) 2731 nvme_disable_queue(dev, i); 2732 return; 2733 } 2734 2735 dq.waiter = NULL; 2736 atomic_set(&dq.refcount, 0); 2737 dq.worker = &worker; 2738 for (i = dev->queue_count - 1; i > 0; i--) { 2739 struct nvme_queue *nvmeq = dev->queues[i]; 2740 2741 if (nvme_suspend_queue(nvmeq)) 2742 continue; 2743 nvmeq->cmdinfo.ctx = nvme_get_dq(&dq); 2744 nvmeq->cmdinfo.worker = dq.worker; 2745 init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start); 2746 queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work); 2747 } 2748 nvme_wait_dq(&dq, dev); 2749 kthread_stop(kworker_task); 2750 } 2751 2752 /* 2753 * Remove the node from the device list and check 2754 * whether we need to stop the nvme_thread. 2755 */ 2756 static void nvme_dev_list_remove(struct nvme_dev *dev) 2757 { 2758 struct task_struct *tmp = NULL; 2759 2760 spin_lock(&dev_list_lock); 2761 list_del_init(&dev->node); 2762 if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) { 2763 tmp = nvme_thread; 2764 nvme_thread = NULL; 2765 } 2766 spin_unlock(&dev_list_lock); 2767 2768 if (tmp) 2769 kthread_stop(tmp); 2770 } 2771 2772 static void nvme_freeze_queues(struct nvme_dev *dev) 2773 { 2774 struct nvme_ns *ns; 2775 2776 list_for_each_entry(ns, &dev->namespaces, list) { 2777 blk_mq_freeze_queue_start(ns->queue); 2778 2779 spin_lock_irq(ns->queue->queue_lock); 2780 queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); 2781 spin_unlock_irq(ns->queue->queue_lock); 2782 2783 blk_mq_cancel_requeue_work(ns->queue); 2784 blk_mq_stop_hw_queues(ns->queue); 2785 } 2786 } 2787 2788 static void nvme_unfreeze_queues(struct nvme_dev *dev) 2789 { 2790 struct nvme_ns *ns; 2791 2792 list_for_each_entry(ns, &dev->namespaces, list) { 2793 queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); 2794 blk_mq_unfreeze_queue(ns->queue); 2795 blk_mq_start_stopped_hw_queues(ns->queue, true); 2796 blk_mq_kick_requeue_list(ns->queue); 2797 } 2798 } 2799 2800 static void nvme_dev_shutdown(struct nvme_dev *dev) 2801 { 2802 int i; 2803 u32 csts = -1; 2804 2805 nvme_dev_list_remove(dev); 2806 2807 if (dev->bar) { 2808 nvme_freeze_queues(dev); 2809 csts = readl(&dev->bar->csts); 2810 } 2811 if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { 2812 for (i = dev->queue_count - 1; i >= 0; i--) { 2813 struct nvme_queue *nvmeq = dev->queues[i]; 2814 nvme_suspend_queue(nvmeq); 2815 } 2816 } else { 2817 nvme_disable_io_queues(dev); 2818 nvme_shutdown_ctrl(dev); 2819 nvme_disable_queue(dev, 0); 2820 } 2821 nvme_dev_unmap(dev); 2822 2823 for (i = dev->queue_count - 1; i >= 0; i--) 2824 nvme_clear_queue(dev->queues[i]); 2825 } 2826 2827 static void
nvme_dev_remove(struct nvme_dev *dev) 2828 { 2829 struct nvme_ns *ns, *next; 2830 2831 list_for_each_entry_safe(ns, next, &dev->namespaces, list) 2832 nvme_ns_remove(ns); 2833 } 2834 2835 static int nvme_setup_prp_pools(struct nvme_dev *dev) 2836 { 2837 dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, 2838 PAGE_SIZE, PAGE_SIZE, 0); 2839 if (!dev->prp_page_pool) 2840 return -ENOMEM; 2841 2842 /* Optimisation for I/Os between 4k and 128k */ 2843 dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev, 2844 256, 256, 0); 2845 if (!dev->prp_small_pool) { 2846 dma_pool_destroy(dev->prp_page_pool); 2847 return -ENOMEM; 2848 } 2849 return 0; 2850 } 2851 2852 static void nvme_release_prp_pools(struct nvme_dev *dev) 2853 { 2854 dma_pool_destroy(dev->prp_page_pool); 2855 dma_pool_destroy(dev->prp_small_pool); 2856 } 2857 2858 static DEFINE_IDA(nvme_instance_ida); 2859 2860 static int nvme_set_instance(struct nvme_dev *dev) 2861 { 2862 int instance, error; 2863 2864 do { 2865 if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) 2866 return -ENODEV; 2867 2868 spin_lock(&dev_list_lock); 2869 error = ida_get_new(&nvme_instance_ida, &instance); 2870 spin_unlock(&dev_list_lock); 2871 } while (error == -EAGAIN); 2872 2873 if (error) 2874 return -ENODEV; 2875 2876 dev->instance = instance; 2877 return 0; 2878 } 2879 2880 static void nvme_release_instance(struct nvme_dev *dev) 2881 { 2882 spin_lock(&dev_list_lock); 2883 ida_remove(&nvme_instance_ida, dev->instance); 2884 spin_unlock(&dev_list_lock); 2885 } 2886 2887 static void nvme_free_dev(struct kref *kref) 2888 { 2889 struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); 2890 2891 put_device(dev->dev); 2892 put_device(dev->device); 2893 nvme_release_instance(dev); 2894 if (dev->tagset.tags) 2895 blk_mq_free_tag_set(&dev->tagset); 2896 if (dev->admin_q) 2897 blk_put_queue(dev->admin_q); 2898 kfree(dev->queues); 2899 kfree(dev->entry); 2900 kfree(dev); 2901 } 2902 2903 static int nvme_dev_open(struct inode *inode, struct file *f) 2904 { 2905 struct nvme_dev *dev; 2906 int instance = iminor(inode); 2907 int ret = -ENODEV; 2908 2909 spin_lock(&dev_list_lock); 2910 list_for_each_entry(dev, &dev_list, node) { 2911 if (dev->instance == instance) { 2912 if (!dev->admin_q) { 2913 ret = -EWOULDBLOCK; 2914 break; 2915 } 2916 if (!kref_get_unless_zero(&dev->kref)) 2917 break; 2918 f->private_data = dev; 2919 ret = 0; 2920 break; 2921 } 2922 } 2923 spin_unlock(&dev_list_lock); 2924 2925 return ret; 2926 } 2927 2928 static int nvme_dev_release(struct inode *inode, struct file *f) 2929 { 2930 struct nvme_dev *dev = f->private_data; 2931 kref_put(&dev->kref, nvme_free_dev); 2932 return 0; 2933 } 2934 2935 static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) 2936 { 2937 struct nvme_dev *dev = f->private_data; 2938 struct nvme_ns *ns; 2939 2940 switch (cmd) { 2941 case NVME_IOCTL_ADMIN_CMD: 2942 return nvme_user_cmd(dev, NULL, (void __user *)arg); 2943 case NVME_IOCTL_IO_CMD: 2944 if (list_empty(&dev->namespaces)) 2945 return -ENOTTY; 2946 ns = list_first_entry(&dev->namespaces, struct nvme_ns, list); 2947 return nvme_user_cmd(dev, ns, (void __user *)arg); 2948 case NVME_IOCTL_RESET: 2949 dev_warn(dev->dev, "resetting controller\n"); 2950 return nvme_reset(dev); 2951 case NVME_IOCTL_SUBSYS_RESET: 2952 return nvme_subsys_reset(dev); 2953 default: 2954 return -ENOTTY; 2955 } 2956 } 2957 2958 static const struct file_operations nvme_dev_fops = { 2959 .owner = THIS_MODULE, 2960 .open = nvme_dev_open, 2961 .release = 
nvme_dev_release, 2962 .unlocked_ioctl = nvme_dev_ioctl, 2963 .compat_ioctl = nvme_dev_ioctl, 2964 }; 2965 2966 static void nvme_probe_work(struct work_struct *work) 2967 { 2968 struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work); 2969 bool start_thread = false; 2970 int result; 2971 2972 result = nvme_dev_map(dev); 2973 if (result) 2974 goto out; 2975 2976 result = nvme_configure_admin_queue(dev); 2977 if (result) 2978 goto unmap; 2979 2980 spin_lock(&dev_list_lock); 2981 if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { 2982 start_thread = true; 2983 nvme_thread = NULL; 2984 } 2985 list_add(&dev->node, &dev_list); 2986 spin_unlock(&dev_list_lock); 2987 2988 if (start_thread) { 2989 nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); 2990 wake_up_all(&nvme_kthread_wait); 2991 } else 2992 wait_event_killable(nvme_kthread_wait, nvme_thread); 2993 2994 if (IS_ERR_OR_NULL(nvme_thread)) { 2995 result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; 2996 goto disable; 2997 } 2998 2999 nvme_init_queue(dev->queues[0], 0); 3000 result = nvme_alloc_admin_tags(dev); 3001 if (result) 3002 goto disable; 3003 3004 result = nvme_setup_io_queues(dev); 3005 if (result) 3006 goto free_tags; 3007 3008 dev->event_limit = 1; 3009 3010 /* 3011 * Keep the controller around but remove all namespaces if we don't have 3012 * any working I/O queue. 3013 */ 3014 if (dev->online_queues < 2) { 3015 dev_warn(dev->dev, "IO queues not created\n"); 3016 nvme_dev_remove(dev); 3017 } else { 3018 nvme_unfreeze_queues(dev); 3019 nvme_dev_add(dev); 3020 } 3021 3022 return; 3023 3024 free_tags: 3025 nvme_dev_remove_admin(dev); 3026 blk_put_queue(dev->admin_q); 3027 dev->admin_q = NULL; 3028 dev->queues[0]->tags = NULL; 3029 disable: 3030 nvme_disable_queue(dev, 0); 3031 nvme_dev_list_remove(dev); 3032 unmap: 3033 nvme_dev_unmap(dev); 3034 out: 3035 if (!work_busy(&dev->reset_work)) 3036 nvme_dead_ctrl(dev); 3037 } 3038 3039 static int nvme_remove_dead_ctrl(void *arg) 3040 { 3041 struct nvme_dev *dev = (struct nvme_dev *)arg; 3042 struct pci_dev *pdev = to_pci_dev(dev->dev); 3043 3044 if (pci_get_drvdata(pdev)) 3045 pci_stop_and_remove_bus_device_locked(pdev); 3046 kref_put(&dev->kref, nvme_free_dev); 3047 return 0; 3048 } 3049 3050 static void nvme_dead_ctrl(struct nvme_dev *dev) 3051 { 3052 dev_warn(dev->dev, "Device failed to resume\n"); 3053 kref_get(&dev->kref); 3054 if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", 3055 dev->instance))) { 3056 dev_err(dev->dev, 3057 "Failed to start controller remove task\n"); 3058 kref_put(&dev->kref, nvme_free_dev); 3059 } 3060 } 3061 3062 static void nvme_reset_work(struct work_struct *ws) 3063 { 3064 struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); 3065 bool in_probe = work_busy(&dev->probe_work); 3066 3067 nvme_dev_shutdown(dev); 3068 3069 /* Synchronize with device probe so that the probe work will see the 3070 * failure status and exit gracefully without trying to schedule another reset */ 3071 flush_work(&dev->probe_work); 3072 3073 /* Fail this device if a reset occurred during probe to avoid 3074 * infinite initialization loops.
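 * (in_probe was sampled with work_busy() before the shutdown above, so
 * it is true only when this reset raced with an in-flight
 * nvme_probe_work().)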
*/ 3075 if (in_probe) { 3076 nvme_dead_ctrl(dev); 3077 return; 3078 } 3079 /* Schedule device resume asynchronously so the reset work is available 3080 * to clean up errors that may occur during reinitialization */ 3081 schedule_work(&dev->probe_work); 3082 } 3083 3084 static int __nvme_reset(struct nvme_dev *dev) 3085 { 3086 if (work_pending(&dev->reset_work)) 3087 return -EBUSY; 3088 list_del_init(&dev->node); 3089 queue_work(nvme_workq, &dev->reset_work); 3090 return 0; 3091 } 3092 3093 static int nvme_reset(struct nvme_dev *dev) 3094 { 3095 int ret; 3096 3097 if (!dev->admin_q || blk_queue_dying(dev->admin_q)) 3098 return -ENODEV; 3099 3100 spin_lock(&dev_list_lock); 3101 ret = __nvme_reset(dev); 3102 spin_unlock(&dev_list_lock); 3103 3104 if (!ret) { 3105 flush_work(&dev->reset_work); 3106 flush_work(&dev->probe_work); 3107 return 0; 3108 } 3109 3110 return ret; 3111 } 3112 3113 static ssize_t nvme_sysfs_reset(struct device *dev, 3114 struct device_attribute *attr, const char *buf, 3115 size_t count) 3116 { 3117 struct nvme_dev *ndev = dev_get_drvdata(dev); 3118 int ret; 3119 3120 ret = nvme_reset(ndev); 3121 if (ret < 0) 3122 return ret; 3123 3124 return count; 3125 } 3126 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); 3127 3128 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) 3129 { 3130 int node, result = -ENOMEM; 3131 struct nvme_dev *dev; 3132 3133 node = dev_to_node(&pdev->dev); 3134 if (node == NUMA_NO_NODE) 3135 set_dev_node(&pdev->dev, 0); 3136 3137 dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); 3138 if (!dev) 3139 return -ENOMEM; 3140 dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry), 3141 GFP_KERNEL, node); 3142 if (!dev->entry) 3143 goto free; 3144 dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), 3145 GFP_KERNEL, node); 3146 if (!dev->queues) 3147 goto free; 3148 3149 INIT_LIST_HEAD(&dev->namespaces); 3150 INIT_WORK(&dev->reset_work, nvme_reset_work); 3151 dev->dev = get_device(&pdev->dev); 3152 pci_set_drvdata(pdev, dev); 3153 result = nvme_set_instance(dev); 3154 if (result) 3155 goto put_pci; 3156 3157 result = nvme_setup_prp_pools(dev); 3158 if (result) 3159 goto release; 3160 3161 kref_init(&dev->kref); 3162 dev->device = device_create(nvme_class, &pdev->dev, 3163 MKDEV(nvme_char_major, dev->instance), 3164 dev, "nvme%d", dev->instance); 3165 if (IS_ERR(dev->device)) { 3166 result = PTR_ERR(dev->device); 3167 goto release_pools; 3168 } 3169 get_device(dev->device); 3170 dev_set_drvdata(dev->device, dev); 3171 3172 result = device_create_file(dev->device, &dev_attr_reset_controller); 3173 if (result) 3174 goto put_dev; 3175 3176 INIT_LIST_HEAD(&dev->node); 3177 INIT_WORK(&dev->scan_work, nvme_dev_scan); 3178 INIT_WORK(&dev->probe_work, nvme_probe_work); 3179 schedule_work(&dev->probe_work); 3180 return 0; 3181 3182 put_dev: 3183 device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); 3184 put_device(dev->device); 3185 release_pools: 3186 nvme_release_prp_pools(dev); 3187 release: 3188 nvme_release_instance(dev); 3189 put_pci: 3190 put_device(dev->dev); 3191 free: 3192 kfree(dev->queues); 3193 kfree(dev->entry); 3194 kfree(dev); 3195 return result; 3196 } 3197 3198 static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) 3199 { 3200 struct nvme_dev *dev = pci_get_drvdata(pdev); 3201 3202 if (prepare) 3203 nvme_dev_shutdown(dev); 3204 else 3205 schedule_work(&dev->probe_work); 3206 } 3207 3208 static void nvme_shutdown(struct pci_dev *pdev) 3209 { 3210
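	/* PCI ->shutdown hook: nvme_dev_shutdown() disables the I/O queues and, when the controller is still healthy, performs an orderly shutdown via nvme_shutdown_ctrl() before power-off. */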
struct nvme_dev *dev = pci_get_drvdata(pdev); 3211 nvme_dev_shutdown(dev); 3212 } 3213 3214 static void nvme_remove(struct pci_dev *pdev) 3215 { 3216 struct nvme_dev *dev = pci_get_drvdata(pdev); 3217 3218 spin_lock(&dev_list_lock); 3219 list_del_init(&dev->node); 3220 spin_unlock(&dev_list_lock); 3221 3222 pci_set_drvdata(pdev, NULL); 3223 flush_work(&dev->probe_work); 3224 flush_work(&dev->reset_work); 3225 flush_work(&dev->scan_work); 3226 device_remove_file(dev->device, &dev_attr_reset_controller); 3227 nvme_dev_remove(dev); 3228 nvme_dev_shutdown(dev); 3229 nvme_dev_remove_admin(dev); 3230 device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); 3231 nvme_free_queues(dev, 0); 3232 nvme_release_cmb(dev); 3233 nvme_release_prp_pools(dev); 3234 kref_put(&dev->kref, nvme_free_dev); 3235 } 3236 3237 /* These functions are yet to be implemented */ 3238 #define nvme_error_detected NULL 3239 #define nvme_dump_registers NULL 3240 #define nvme_link_reset NULL 3241 #define nvme_slot_reset NULL 3242 #define nvme_error_resume NULL 3243 3244 #ifdef CONFIG_PM_SLEEP 3245 static int nvme_suspend(struct device *dev) 3246 { 3247 struct pci_dev *pdev = to_pci_dev(dev); 3248 struct nvme_dev *ndev = pci_get_drvdata(pdev); 3249 3250 nvme_dev_shutdown(ndev); 3251 return 0; 3252 } 3253 3254 static int nvme_resume(struct device *dev) 3255 { 3256 struct pci_dev *pdev = to_pci_dev(dev); 3257 struct nvme_dev *ndev = pci_get_drvdata(pdev); 3258 3259 schedule_work(&ndev->probe_work); 3260 return 0; 3261 } 3262 #endif 3263 3264 static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); 3265 3266 static const struct pci_error_handlers nvme_err_handler = { 3267 .error_detected = nvme_error_detected, 3268 .mmio_enabled = nvme_dump_registers, 3269 .link_reset = nvme_link_reset, 3270 .slot_reset = nvme_slot_reset, 3271 .resume = nvme_error_resume, 3272 .reset_notify = nvme_reset_notify, 3273 }; 3274 3275 /* Move to pci_ids.h later */ 3276 #define PCI_CLASS_STORAGE_EXPRESS 0x010802 3277 3278 static const struct pci_device_id nvme_id_table[] = { 3279 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, 3280 { 0, } 3281 }; 3282 MODULE_DEVICE_TABLE(pci, nvme_id_table); 3283 3284 static struct pci_driver nvme_driver = { 3285 .name = "nvme", 3286 .id_table = nvme_id_table, 3287 .probe = nvme_probe, 3288 .remove = nvme_remove, 3289 .shutdown = nvme_shutdown, 3290 .driver = { 3291 .pm = &nvme_dev_pm_ops, 3292 }, 3293 .err_handler = &nvme_err_handler, 3294 }; 3295 3296 static int __init nvme_init(void) 3297 { 3298 int result; 3299 3300 init_waitqueue_head(&nvme_kthread_wait); 3301 3302 nvme_workq = create_singlethread_workqueue("nvme"); 3303 if (!nvme_workq) 3304 return -ENOMEM; 3305 3306 result = register_blkdev(nvme_major, "nvme"); 3307 if (result < 0) 3308 goto kill_workq; 3309 else if (result > 0) 3310 nvme_major = result; 3311 3312 result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", 3313 &nvme_dev_fops); 3314 if (result < 0) 3315 goto unregister_blkdev; 3316 else if (result > 0) 3317 nvme_char_major = result; 3318 3319 nvme_class = class_create(THIS_MODULE, "nvme"); 3320 if (IS_ERR(nvme_class)) { 3321 result = PTR_ERR(nvme_class); 3322 goto unregister_chrdev; 3323 } 3324 3325 result = pci_register_driver(&nvme_driver); 3326 if (result) 3327 goto destroy_class; 3328 return 0; 3329 3330 destroy_class: 3331 class_destroy(nvme_class); 3332 unregister_chrdev: 3333 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); 3334 unregister_blkdev: 3335 unregister_blkdev(nvme_major, 
"nvme"); 3336 kill_workq: 3337 destroy_workqueue(nvme_workq); 3338 return result; 3339 } 3340 3341 static void __exit nvme_exit(void) 3342 { 3343 pci_unregister_driver(&nvme_driver); 3344 unregister_blkdev(nvme_major, "nvme"); 3345 destroy_workqueue(nvme_workq); 3346 class_destroy(nvme_class); 3347 __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); 3348 BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); 3349 _nvme_check_size(); 3350 } 3351 3352 MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>"); 3353 MODULE_LICENSE("GPL"); 3354 MODULE_VERSION("1.0"); 3355 module_init(nvme_init); 3356 module_exit(nvme_exit); 3357