1 /* 2 * NVM Express device driver 3 * Copyright (c) 2011-2014, Intel Corporation. 4 * 5 * This program is free software; you can redistribute it and/or modify it 6 * under the terms and conditions of the GNU General Public License, 7 * version 2, as published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 * more details. 13 */ 14 15 #include <linux/aer.h> 16 #include <linux/async.h> 17 #include <linux/blkdev.h> 18 #include <linux/blk-mq.h> 19 #include <linux/blk-mq-pci.h> 20 #include <linux/dmi.h> 21 #include <linux/init.h> 22 #include <linux/interrupt.h> 23 #include <linux/io.h> 24 #include <linux/mm.h> 25 #include <linux/module.h> 26 #include <linux/mutex.h> 27 #include <linux/once.h> 28 #include <linux/pci.h> 29 #include <linux/t10-pi.h> 30 #include <linux/types.h> 31 #include <linux/io-64-nonatomic-lo-hi.h> 32 #include <linux/sed-opal.h> 33 34 #include "nvme.h" 35 36 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) 37 #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) 38 39 #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) 40 41 /* 42 * These can be higher, but we need to ensure that any command doesn't 43 * require an sg allocation that needs more than a page of data. 44 */ 45 #define NVME_MAX_KB_SZ 4096 46 #define NVME_MAX_SEGS 127 47 48 static int use_threaded_interrupts; 49 module_param(use_threaded_interrupts, int, 0); 50 51 static bool use_cmb_sqes = true; 52 module_param(use_cmb_sqes, bool, 0444); 53 MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); 54 55 static unsigned int max_host_mem_size_mb = 128; 56 module_param(max_host_mem_size_mb, uint, 0444); 57 MODULE_PARM_DESC(max_host_mem_size_mb, 58 "Maximum Host Memory Buffer (HMB) size per controller (in MiB)"); 59 60 static unsigned int sgl_threshold = SZ_32K; 61 module_param(sgl_threshold, uint, 0644); 62 MODULE_PARM_DESC(sgl_threshold, 63 "Use SGLs when average request segment size is larger or equal to " 64 "this size. Use 0 to disable SGLs."); 65 66 static int io_queue_depth_set(const char *val, const struct kernel_param *kp); 67 static const struct kernel_param_ops io_queue_depth_ops = { 68 .set = io_queue_depth_set, 69 .get = param_get_int, 70 }; 71 72 static int io_queue_depth = 1024; 73 module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); 74 MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); 75 76 struct nvme_dev; 77 struct nvme_queue; 78 79 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); 80 81 /* 82 * Represents an NVM Express device. Each nvme_dev is a PCI function. 
83 */ 84 struct nvme_dev { 85 struct nvme_queue *queues; 86 struct blk_mq_tag_set tagset; 87 struct blk_mq_tag_set admin_tagset; 88 u32 __iomem *dbs; 89 struct device *dev; 90 struct dma_pool *prp_page_pool; 91 struct dma_pool *prp_small_pool; 92 unsigned online_queues; 93 unsigned max_qid; 94 unsigned int num_vecs; 95 int q_depth; 96 u32 db_stride; 97 void __iomem *bar; 98 unsigned long bar_mapped_size; 99 struct work_struct remove_work; 100 struct mutex shutdown_lock; 101 bool subsystem; 102 void __iomem *cmb; 103 pci_bus_addr_t cmb_bus_addr; 104 u64 cmb_size; 105 u32 cmbsz; 106 u32 cmbloc; 107 struct nvme_ctrl ctrl; 108 struct completion ioq_wait; 109 110 mempool_t *iod_mempool; 111 112 /* shadow doorbell buffer support: */ 113 u32 *dbbuf_dbs; 114 dma_addr_t dbbuf_dbs_dma_addr; 115 u32 *dbbuf_eis; 116 dma_addr_t dbbuf_eis_dma_addr; 117 118 /* host memory buffer support: */ 119 u64 host_mem_size; 120 u32 nr_host_mem_descs; 121 dma_addr_t host_mem_descs_dma; 122 struct nvme_host_mem_buf_desc *host_mem_descs; 123 void **host_mem_desc_bufs; 124 }; 125 126 static int io_queue_depth_set(const char *val, const struct kernel_param *kp) 127 { 128 int n = 0, ret; 129 130 ret = kstrtoint(val, 10, &n); 131 if (ret != 0 || n < 2) 132 return -EINVAL; 133 134 return param_set_int(val, kp); 135 } 136 137 static inline unsigned int sq_idx(unsigned int qid, u32 stride) 138 { 139 return qid * 2 * stride; 140 } 141 142 static inline unsigned int cq_idx(unsigned int qid, u32 stride) 143 { 144 return (qid * 2 + 1) * stride; 145 } 146 147 static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) 148 { 149 return container_of(ctrl, struct nvme_dev, ctrl); 150 } 151 152 /* 153 * An NVM Express queue. Each device has at least two (one for admin 154 * commands and one for I/O commands). 155 */ 156 struct nvme_queue { 157 struct device *q_dmadev; 158 struct nvme_dev *dev; 159 spinlock_t sq_lock; 160 struct nvme_command *sq_cmds; 161 struct nvme_command __iomem *sq_cmds_io; 162 spinlock_t cq_lock ____cacheline_aligned_in_smp; 163 volatile struct nvme_completion *cqes; 164 struct blk_mq_tags **tags; 165 dma_addr_t sq_dma_addr; 166 dma_addr_t cq_dma_addr; 167 u32 __iomem *q_db; 168 u16 q_depth; 169 s16 cq_vector; 170 u16 sq_tail; 171 u16 cq_head; 172 u16 last_cq_head; 173 u16 qid; 174 u8 cq_phase; 175 u32 *dbbuf_sq_db; 176 u32 *dbbuf_cq_db; 177 u32 *dbbuf_sq_ei; 178 u32 *dbbuf_cq_ei; 179 }; 180 181 /* 182 * The nvme_iod describes the data in an I/O, including the list of PRP 183 * entries. You can't see it in this data structure because C doesn't let 184 * me express that. Use nvme_init_iod to ensure there's enough space 185 * allocated to store the PRP list. 186 */ 187 struct nvme_iod { 188 struct nvme_request req; 189 struct nvme_queue *nvmeq; 190 bool use_sgl; 191 int aborted; 192 int npages; /* In the PRP list. 
0 means small pool in use */ 193 int nents; /* Used in scatterlist */ 194 int length; /* Of data, in bytes */ 195 dma_addr_t first_dma; 196 struct scatterlist meta_sg; /* metadata requires single contiguous buffer */ 197 struct scatterlist *sg; 198 struct scatterlist inline_sg[0]; 199 }; 200 201 /* 202 * Check we didin't inadvertently grow the command struct 203 */ 204 static inline void _nvme_check_size(void) 205 { 206 BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); 207 BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); 208 BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); 209 BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); 210 BUILD_BUG_ON(sizeof(struct nvme_features) != 64); 211 BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); 212 BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64); 213 BUILD_BUG_ON(sizeof(struct nvme_command) != 64); 214 BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE); 215 BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); 216 BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); 217 BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); 218 BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); 219 } 220 221 static inline unsigned int nvme_dbbuf_size(u32 stride) 222 { 223 return ((num_possible_cpus() + 1) * 8 * stride); 224 } 225 226 static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) 227 { 228 unsigned int mem_size = nvme_dbbuf_size(dev->db_stride); 229 230 if (dev->dbbuf_dbs) 231 return 0; 232 233 dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size, 234 &dev->dbbuf_dbs_dma_addr, 235 GFP_KERNEL); 236 if (!dev->dbbuf_dbs) 237 return -ENOMEM; 238 dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size, 239 &dev->dbbuf_eis_dma_addr, 240 GFP_KERNEL); 241 if (!dev->dbbuf_eis) { 242 dma_free_coherent(dev->dev, mem_size, 243 dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr); 244 dev->dbbuf_dbs = NULL; 245 return -ENOMEM; 246 } 247 248 return 0; 249 } 250 251 static void nvme_dbbuf_dma_free(struct nvme_dev *dev) 252 { 253 unsigned int mem_size = nvme_dbbuf_size(dev->db_stride); 254 255 if (dev->dbbuf_dbs) { 256 dma_free_coherent(dev->dev, mem_size, 257 dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr); 258 dev->dbbuf_dbs = NULL; 259 } 260 if (dev->dbbuf_eis) { 261 dma_free_coherent(dev->dev, mem_size, 262 dev->dbbuf_eis, dev->dbbuf_eis_dma_addr); 263 dev->dbbuf_eis = NULL; 264 } 265 } 266 267 static void nvme_dbbuf_init(struct nvme_dev *dev, 268 struct nvme_queue *nvmeq, int qid) 269 { 270 if (!dev->dbbuf_dbs || !qid) 271 return; 272 273 nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)]; 274 nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)]; 275 nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)]; 276 nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)]; 277 } 278 279 static void nvme_dbbuf_set(struct nvme_dev *dev) 280 { 281 struct nvme_command c; 282 283 if (!dev->dbbuf_dbs) 284 return; 285 286 memset(&c, 0, sizeof(c)); 287 c.dbbuf.opcode = nvme_admin_dbbuf; 288 c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr); 289 c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr); 290 291 if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) { 292 dev_warn(dev->ctrl.device, "unable to set dbbuf\n"); 293 /* Free memory and continue on */ 294 nvme_dbbuf_dma_free(dev); 295 } 296 } 297 298 static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old) 299 { 300 return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old); 301 } 302 303 /* Update dbbuf and return true if an MMIO is 
required */ 304 static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, 305 volatile u32 *dbbuf_ei) 306 { 307 if (dbbuf_db) { 308 u16 old_value; 309 310 /* 311 * Ensure that the queue is written before updating 312 * the doorbell in memory 313 */ 314 wmb(); 315 316 old_value = *dbbuf_db; 317 *dbbuf_db = value; 318 319 if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value)) 320 return false; 321 } 322 323 return true; 324 } 325 326 /* 327 * Max size of iod being embedded in the request payload 328 */ 329 #define NVME_INT_PAGES 2 330 #define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->ctrl.page_size) 331 332 /* 333 * Will slightly overestimate the number of pages needed. This is OK 334 * as it only leads to a small amount of wasted memory for the lifetime of 335 * the I/O. 336 */ 337 static int nvme_npages(unsigned size, struct nvme_dev *dev) 338 { 339 unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size, 340 dev->ctrl.page_size); 341 return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); 342 } 343 344 /* 345 * Calculates the number of pages needed for the SGL segments. For example a 4k 346 * page can accommodate 256 SGL descriptors. 347 */ 348 static int nvme_pci_npages_sgl(unsigned int num_seg) 349 { 350 return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE); 351 } 352 353 static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev, 354 unsigned int size, unsigned int nseg, bool use_sgl) 355 { 356 size_t alloc_size; 357 358 if (use_sgl) 359 alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg); 360 else 361 alloc_size = sizeof(__le64 *) * nvme_npages(size, dev); 362 363 return alloc_size + sizeof(struct scatterlist) * nseg; 364 } 365 366 static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl) 367 { 368 unsigned int alloc_size = nvme_pci_iod_alloc_size(dev, 369 NVME_INT_BYTES(dev), NVME_INT_PAGES, 370 use_sgl); 371 372 return sizeof(struct nvme_iod) + alloc_size; 373 } 374 375 static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 376 unsigned int hctx_idx) 377 { 378 struct nvme_dev *dev = data; 379 struct nvme_queue *nvmeq = &dev->queues[0]; 380 381 WARN_ON(hctx_idx != 0); 382 WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); 383 WARN_ON(nvmeq->tags); 384 385 hctx->driver_data = nvmeq; 386 nvmeq->tags = &dev->admin_tagset.tags[0]; 387 return 0; 388 } 389 390 static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 391 { 392 struct nvme_queue *nvmeq = hctx->driver_data; 393 394 nvmeq->tags = NULL; 395 } 396 397 static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 398 unsigned int hctx_idx) 399 { 400 struct nvme_dev *dev = data; 401 struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; 402 403 if (!nvmeq->tags) 404 nvmeq->tags = &dev->tagset.tags[hctx_idx]; 405 406 WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags); 407 hctx->driver_data = nvmeq; 408 return 0; 409 } 410 411 static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, 412 unsigned int hctx_idx, unsigned int numa_node) 413 { 414 struct nvme_dev *dev = set->driver_data; 415 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 416 int queue_idx = (set == &dev->tagset) ? 
hctx_idx + 1 : 0; 417 struct nvme_queue *nvmeq = &dev->queues[queue_idx]; 418 419 BUG_ON(!nvmeq); 420 iod->nvmeq = nvmeq; 421 422 nvme_req(req)->ctrl = &dev->ctrl; 423 return 0; 424 } 425 426 static int nvme_pci_map_queues(struct blk_mq_tag_set *set) 427 { 428 struct nvme_dev *dev = set->driver_data; 429 430 return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev), 431 dev->num_vecs > 1 ? 1 /* admin queue */ : 0); 432 } 433 434 /** 435 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell 436 * @nvmeq: The queue to use 437 * @cmd: The command to send 438 */ 439 static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) 440 { 441 spin_lock(&nvmeq->sq_lock); 442 if (nvmeq->sq_cmds_io) 443 memcpy_toio(&nvmeq->sq_cmds_io[nvmeq->sq_tail], cmd, 444 sizeof(*cmd)); 445 else 446 memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd)); 447 448 if (++nvmeq->sq_tail == nvmeq->q_depth) 449 nvmeq->sq_tail = 0; 450 if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail, 451 nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) 452 writel(nvmeq->sq_tail, nvmeq->q_db); 453 spin_unlock(&nvmeq->sq_lock); 454 } 455 456 static void **nvme_pci_iod_list(struct request *req) 457 { 458 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 459 return (void **)(iod->sg + blk_rq_nr_phys_segments(req)); 460 } 461 462 static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) 463 { 464 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 465 int nseg = blk_rq_nr_phys_segments(req); 466 unsigned int avg_seg_size; 467 468 if (nseg == 0) 469 return false; 470 471 avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); 472 473 if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) 474 return false; 475 if (!iod->nvmeq->qid) 476 return false; 477 if (!sgl_threshold || avg_seg_size < sgl_threshold) 478 return false; 479 return true; 480 } 481 482 static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) 483 { 484 struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); 485 int nseg = blk_rq_nr_phys_segments(rq); 486 unsigned int size = blk_rq_payload_bytes(rq); 487 488 iod->use_sgl = nvme_pci_use_sgls(dev, rq); 489 490 if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { 491 iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); 492 if (!iod->sg) 493 return BLK_STS_RESOURCE; 494 } else { 495 iod->sg = iod->inline_sg; 496 } 497 498 iod->aborted = 0; 499 iod->npages = -1; 500 iod->nents = 0; 501 iod->length = size; 502 503 return BLK_STS_OK; 504 } 505 506 static void nvme_free_iod(struct nvme_dev *dev, struct request *req) 507 { 508 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 509 const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1; 510 dma_addr_t dma_addr = iod->first_dma, next_dma_addr; 511 512 int i; 513 514 if (iod->npages == 0) 515 dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], 516 dma_addr); 517 518 for (i = 0; i < iod->npages; i++) { 519 void *addr = nvme_pci_iod_list(req)[i]; 520 521 if (iod->use_sgl) { 522 struct nvme_sgl_desc *sg_list = addr; 523 524 next_dma_addr = 525 le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr); 526 } else { 527 __le64 *prp_list = addr; 528 529 next_dma_addr = le64_to_cpu(prp_list[last_prp]); 530 } 531 532 dma_pool_free(dev->prp_page_pool, addr, dma_addr); 533 dma_addr = next_dma_addr; 534 } 535 536 if (iod->sg != iod->inline_sg) 537 mempool_free(iod->sg, dev->iod_mempool); 538 } 539 540 static void nvme_print_sgl(struct scatterlist *sgl, int nents) 541 { 542 int i; 543 struct scatterlist *sg; 544 545 for_each_sg(sgl, sg, nents, 
i) { 546 dma_addr_t phys = sg_phys(sg); 547 pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d " 548 "dma_address:%pad dma_length:%d\n", 549 i, &phys, sg->offset, sg->length, &sg_dma_address(sg), 550 sg_dma_len(sg)); 551 } 552 } 553 554 static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, 555 struct request *req, struct nvme_rw_command *cmnd) 556 { 557 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 558 struct dma_pool *pool; 559 int length = blk_rq_payload_bytes(req); 560 struct scatterlist *sg = iod->sg; 561 int dma_len = sg_dma_len(sg); 562 u64 dma_addr = sg_dma_address(sg); 563 u32 page_size = dev->ctrl.page_size; 564 int offset = dma_addr & (page_size - 1); 565 __le64 *prp_list; 566 void **list = nvme_pci_iod_list(req); 567 dma_addr_t prp_dma; 568 int nprps, i; 569 570 length -= (page_size - offset); 571 if (length <= 0) { 572 iod->first_dma = 0; 573 goto done; 574 } 575 576 dma_len -= (page_size - offset); 577 if (dma_len) { 578 dma_addr += (page_size - offset); 579 } else { 580 sg = sg_next(sg); 581 dma_addr = sg_dma_address(sg); 582 dma_len = sg_dma_len(sg); 583 } 584 585 if (length <= page_size) { 586 iod->first_dma = dma_addr; 587 goto done; 588 } 589 590 nprps = DIV_ROUND_UP(length, page_size); 591 if (nprps <= (256 / 8)) { 592 pool = dev->prp_small_pool; 593 iod->npages = 0; 594 } else { 595 pool = dev->prp_page_pool; 596 iod->npages = 1; 597 } 598 599 prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); 600 if (!prp_list) { 601 iod->first_dma = dma_addr; 602 iod->npages = -1; 603 return BLK_STS_RESOURCE; 604 } 605 list[0] = prp_list; 606 iod->first_dma = prp_dma; 607 i = 0; 608 for (;;) { 609 if (i == page_size >> 3) { 610 __le64 *old_prp_list = prp_list; 611 prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); 612 if (!prp_list) 613 return BLK_STS_RESOURCE; 614 list[iod->npages++] = prp_list; 615 prp_list[0] = old_prp_list[i - 1]; 616 old_prp_list[i - 1] = cpu_to_le64(prp_dma); 617 i = 1; 618 } 619 prp_list[i++] = cpu_to_le64(dma_addr); 620 dma_len -= page_size; 621 dma_addr += page_size; 622 length -= page_size; 623 if (length <= 0) 624 break; 625 if (dma_len > 0) 626 continue; 627 if (unlikely(dma_len < 0)) 628 goto bad_sgl; 629 sg = sg_next(sg); 630 dma_addr = sg_dma_address(sg); 631 dma_len = sg_dma_len(sg); 632 } 633 634 done: 635 cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); 636 cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); 637 638 return BLK_STS_OK; 639 640 bad_sgl: 641 WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents), 642 "Invalid SGL for payload:%d nents:%d\n", 643 blk_rq_payload_bytes(req), iod->nents); 644 return BLK_STS_IOERR; 645 } 646 647 static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, 648 struct scatterlist *sg) 649 { 650 sge->addr = cpu_to_le64(sg_dma_address(sg)); 651 sge->length = cpu_to_le32(sg_dma_len(sg)); 652 sge->type = NVME_SGL_FMT_DATA_DESC << 4; 653 } 654 655 static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, 656 dma_addr_t dma_addr, int entries) 657 { 658 sge->addr = cpu_to_le64(dma_addr); 659 if (entries < SGES_PER_PAGE) { 660 sge->length = cpu_to_le32(entries * sizeof(*sge)); 661 sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; 662 } else { 663 sge->length = cpu_to_le32(PAGE_SIZE); 664 sge->type = NVME_SGL_FMT_SEG_DESC << 4; 665 } 666 } 667 668 static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, 669 struct request *req, struct nvme_rw_command *cmd, int entries) 670 { 671 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 672 struct dma_pool *pool; 673 struct nvme_sgl_desc *sg_list; 674 struct 
scatterlist *sg = iod->sg; 675 dma_addr_t sgl_dma; 676 int i = 0; 677 678 /* setting the transfer type as SGL */ 679 cmd->flags = NVME_CMD_SGL_METABUF; 680 681 if (entries == 1) { 682 nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg); 683 return BLK_STS_OK; 684 } 685 686 if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { 687 pool = dev->prp_small_pool; 688 iod->npages = 0; 689 } else { 690 pool = dev->prp_page_pool; 691 iod->npages = 1; 692 } 693 694 sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); 695 if (!sg_list) { 696 iod->npages = -1; 697 return BLK_STS_RESOURCE; 698 } 699 700 nvme_pci_iod_list(req)[0] = sg_list; 701 iod->first_dma = sgl_dma; 702 703 nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); 704 705 do { 706 if (i == SGES_PER_PAGE) { 707 struct nvme_sgl_desc *old_sg_desc = sg_list; 708 struct nvme_sgl_desc *link = &old_sg_desc[i - 1]; 709 710 sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); 711 if (!sg_list) 712 return BLK_STS_RESOURCE; 713 714 i = 0; 715 nvme_pci_iod_list(req)[iod->npages++] = sg_list; 716 sg_list[i++] = *link; 717 nvme_pci_sgl_set_seg(link, sgl_dma, entries); 718 } 719 720 nvme_pci_sgl_set_data(&sg_list[i++], sg); 721 sg = sg_next(sg); 722 } while (--entries > 0); 723 724 return BLK_STS_OK; 725 } 726 727 static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, 728 struct nvme_command *cmnd) 729 { 730 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 731 struct request_queue *q = req->q; 732 enum dma_data_direction dma_dir = rq_data_dir(req) ? 733 DMA_TO_DEVICE : DMA_FROM_DEVICE; 734 blk_status_t ret = BLK_STS_IOERR; 735 int nr_mapped; 736 737 sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); 738 iod->nents = blk_rq_map_sg(q, req, iod->sg); 739 if (!iod->nents) 740 goto out; 741 742 ret = BLK_STS_RESOURCE; 743 nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir, 744 DMA_ATTR_NO_WARN); 745 if (!nr_mapped) 746 goto out; 747 748 if (iod->use_sgl) 749 ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped); 750 else 751 ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); 752 753 if (ret != BLK_STS_OK) 754 goto out_unmap; 755 756 ret = BLK_STS_IOERR; 757 if (blk_integrity_rq(req)) { 758 if (blk_rq_count_integrity_sg(q, req->bio) != 1) 759 goto out_unmap; 760 761 sg_init_table(&iod->meta_sg, 1); 762 if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1) 763 goto out_unmap; 764 765 if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir)) 766 goto out_unmap; 767 } 768 769 if (blk_integrity_rq(req)) 770 cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); 771 return BLK_STS_OK; 772 773 out_unmap: 774 dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); 775 out: 776 return ret; 777 } 778 779 static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) 780 { 781 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 782 enum dma_data_direction dma_dir = rq_data_dir(req) ? 783 DMA_TO_DEVICE : DMA_FROM_DEVICE; 784 785 if (iod->nents) { 786 dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); 787 if (blk_integrity_rq(req)) 788 dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); 789 } 790 791 nvme_cleanup_cmd(req); 792 nvme_free_iod(dev, req); 793 } 794 795 /* 796 * NOTE: ns is NULL when called on the admin queue. 
797 */ 798 static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, 799 const struct blk_mq_queue_data *bd) 800 { 801 struct nvme_ns *ns = hctx->queue->queuedata; 802 struct nvme_queue *nvmeq = hctx->driver_data; 803 struct nvme_dev *dev = nvmeq->dev; 804 struct request *req = bd->rq; 805 struct nvme_command cmnd; 806 blk_status_t ret; 807 808 /* 809 * We should not need to do this, but we're still using this to 810 * ensure we can drain requests on a dying queue. 811 */ 812 if (unlikely(nvmeq->cq_vector < 0)) 813 return BLK_STS_IOERR; 814 815 ret = nvme_setup_cmd(ns, req, &cmnd); 816 if (ret) 817 return ret; 818 819 ret = nvme_init_iod(req, dev); 820 if (ret) 821 goto out_free_cmd; 822 823 if (blk_rq_nr_phys_segments(req)) { 824 ret = nvme_map_data(dev, req, &cmnd); 825 if (ret) 826 goto out_cleanup_iod; 827 } 828 829 blk_mq_start_request(req); 830 nvme_submit_cmd(nvmeq, &cmnd); 831 return BLK_STS_OK; 832 out_cleanup_iod: 833 nvme_free_iod(dev, req); 834 out_free_cmd: 835 nvme_cleanup_cmd(req); 836 return ret; 837 } 838 839 static void nvme_pci_complete_rq(struct request *req) 840 { 841 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 842 843 nvme_unmap_data(iod->nvmeq->dev, req); 844 nvme_complete_rq(req); 845 } 846 847 /* We read the CQE phase first to check if the rest of the entry is valid */ 848 static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq) 849 { 850 return (le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) == 851 nvmeq->cq_phase; 852 } 853 854 static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq) 855 { 856 u16 head = nvmeq->cq_head; 857 858 if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db, 859 nvmeq->dbbuf_cq_ei)) 860 writel(head, nvmeq->q_db + nvmeq->dev->db_stride); 861 } 862 863 static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) 864 { 865 volatile struct nvme_completion *cqe = &nvmeq->cqes[idx]; 866 struct request *req; 867 868 if (unlikely(cqe->command_id >= nvmeq->q_depth)) { 869 dev_warn(nvmeq->dev->ctrl.device, 870 "invalid id %d completed on queue %d\n", 871 cqe->command_id, le16_to_cpu(cqe->sq_id)); 872 return; 873 } 874 875 /* 876 * AEN requests are special as they don't time out and can 877 * survive any kind of queue freeze and often don't respond to 878 * aborts. We don't even bother to allocate a struct request 879 * for them but rather special case them here. 
880 */ 881 if (unlikely(nvmeq->qid == 0 && 882 cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) { 883 nvme_complete_async_event(&nvmeq->dev->ctrl, 884 cqe->status, &cqe->result); 885 return; 886 } 887 888 req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id); 889 nvme_end_request(req, cqe->status, cqe->result); 890 } 891 892 static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end) 893 { 894 while (start != end) { 895 nvme_handle_cqe(nvmeq, start); 896 if (++start == nvmeq->q_depth) 897 start = 0; 898 } 899 } 900 901 static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) 902 { 903 if (++nvmeq->cq_head == nvmeq->q_depth) { 904 nvmeq->cq_head = 0; 905 nvmeq->cq_phase = !nvmeq->cq_phase; 906 } 907 } 908 909 static inline bool nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, 910 u16 *end, int tag) 911 { 912 bool found = false; 913 914 *start = nvmeq->cq_head; 915 while (!found && nvme_cqe_pending(nvmeq)) { 916 if (nvmeq->cqes[nvmeq->cq_head].command_id == tag) 917 found = true; 918 nvme_update_cq_head(nvmeq); 919 } 920 *end = nvmeq->cq_head; 921 922 if (*start != *end) 923 nvme_ring_cq_doorbell(nvmeq); 924 return found; 925 } 926 927 static irqreturn_t nvme_irq(int irq, void *data) 928 { 929 struct nvme_queue *nvmeq = data; 930 irqreturn_t ret = IRQ_NONE; 931 u16 start, end; 932 933 spin_lock(&nvmeq->cq_lock); 934 if (nvmeq->cq_head != nvmeq->last_cq_head) 935 ret = IRQ_HANDLED; 936 nvme_process_cq(nvmeq, &start, &end, -1); 937 nvmeq->last_cq_head = nvmeq->cq_head; 938 spin_unlock(&nvmeq->cq_lock); 939 940 if (start != end) { 941 nvme_complete_cqes(nvmeq, start, end); 942 return IRQ_HANDLED; 943 } 944 945 return ret; 946 } 947 948 static irqreturn_t nvme_irq_check(int irq, void *data) 949 { 950 struct nvme_queue *nvmeq = data; 951 if (nvme_cqe_pending(nvmeq)) 952 return IRQ_WAKE_THREAD; 953 return IRQ_NONE; 954 } 955 956 static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) 957 { 958 u16 start, end; 959 bool found; 960 961 if (!nvme_cqe_pending(nvmeq)) 962 return 0; 963 964 spin_lock_irq(&nvmeq->cq_lock); 965 found = nvme_process_cq(nvmeq, &start, &end, tag); 966 spin_unlock_irq(&nvmeq->cq_lock); 967 968 nvme_complete_cqes(nvmeq, start, end); 969 return found; 970 } 971 972 static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) 973 { 974 struct nvme_queue *nvmeq = hctx->driver_data; 975 976 return __nvme_poll(nvmeq, tag); 977 } 978 979 static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) 980 { 981 struct nvme_dev *dev = to_nvme_dev(ctrl); 982 struct nvme_queue *nvmeq = &dev->queues[0]; 983 struct nvme_command c; 984 985 memset(&c, 0, sizeof(c)); 986 c.common.opcode = nvme_admin_async_event; 987 c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; 988 nvme_submit_cmd(nvmeq, &c); 989 } 990 991 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) 992 { 993 struct nvme_command c; 994 995 memset(&c, 0, sizeof(c)); 996 c.delete_queue.opcode = opcode; 997 c.delete_queue.qid = cpu_to_le16(id); 998 999 return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); 1000 } 1001 1002 static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, 1003 struct nvme_queue *nvmeq, s16 vector) 1004 { 1005 struct nvme_command c; 1006 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; 1007 1008 /* 1009 * Note: we (ab)use the fact that the prp fields survive if no data 1010 * is attached to the request. 
1011 */ 1012 memset(&c, 0, sizeof(c)); 1013 c.create_cq.opcode = nvme_admin_create_cq; 1014 c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); 1015 c.create_cq.cqid = cpu_to_le16(qid); 1016 c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); 1017 c.create_cq.cq_flags = cpu_to_le16(flags); 1018 c.create_cq.irq_vector = cpu_to_le16(vector); 1019 1020 return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); 1021 } 1022 1023 static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, 1024 struct nvme_queue *nvmeq) 1025 { 1026 struct nvme_ctrl *ctrl = &dev->ctrl; 1027 struct nvme_command c; 1028 int flags = NVME_QUEUE_PHYS_CONTIG; 1029 1030 /* 1031 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't 1032 * set. Since URGENT priority is zeroes, it makes all queues 1033 * URGENT. 1034 */ 1035 if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ) 1036 flags |= NVME_SQ_PRIO_MEDIUM; 1037 1038 /* 1039 * Note: we (ab)use the fact that the prp fields survive if no data 1040 * is attached to the request. 1041 */ 1042 memset(&c, 0, sizeof(c)); 1043 c.create_sq.opcode = nvme_admin_create_sq; 1044 c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); 1045 c.create_sq.sqid = cpu_to_le16(qid); 1046 c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); 1047 c.create_sq.sq_flags = cpu_to_le16(flags); 1048 c.create_sq.cqid = cpu_to_le16(qid); 1049 1050 return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); 1051 } 1052 1053 static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) 1054 { 1055 return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); 1056 } 1057 1058 static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) 1059 { 1060 return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); 1061 } 1062 1063 static void abort_endio(struct request *req, blk_status_t error) 1064 { 1065 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 1066 struct nvme_queue *nvmeq = iod->nvmeq; 1067 1068 dev_warn(nvmeq->dev->ctrl.device, 1069 "Abort status: 0x%x", nvme_req(req)->status); 1070 atomic_inc(&nvmeq->dev->ctrl.abort_limit); 1071 blk_mq_free_request(req); 1072 } 1073 1074 static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) 1075 { 1076 1077 /* If true, indicates loss of adapter communication, possibly by a 1078 * NVMe Subsystem reset. 1079 */ 1080 bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); 1081 1082 /* If there is a reset/reinit ongoing, we shouldn't reset again. */ 1083 switch (dev->ctrl.state) { 1084 case NVME_CTRL_RESETTING: 1085 case NVME_CTRL_CONNECTING: 1086 return false; 1087 default: 1088 break; 1089 } 1090 1091 /* We shouldn't reset unless the controller is on fatal error state 1092 * _or_ if we lost the communication with it. 1093 */ 1094 if (!(csts & NVME_CSTS_CFS) && !nssro) 1095 return false; 1096 1097 return true; 1098 } 1099 1100 static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) 1101 { 1102 /* Read a config register to help see what died. 
*/ 1103 u16 pci_status; 1104 int result; 1105 1106 result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS, 1107 &pci_status); 1108 if (result == PCIBIOS_SUCCESSFUL) 1109 dev_warn(dev->ctrl.device, 1110 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n", 1111 csts, pci_status); 1112 else 1113 dev_warn(dev->ctrl.device, 1114 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n", 1115 csts, result); 1116 } 1117 1118 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) 1119 { 1120 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 1121 struct nvme_queue *nvmeq = iod->nvmeq; 1122 struct nvme_dev *dev = nvmeq->dev; 1123 struct request *abort_req; 1124 struct nvme_command cmd; 1125 u32 csts = readl(dev->bar + NVME_REG_CSTS); 1126 1127 /* If PCI error recovery process is happening, we cannot reset or 1128 * the recovery mechanism will surely fail. 1129 */ 1130 mb(); 1131 if (pci_channel_offline(to_pci_dev(dev->dev))) 1132 return BLK_EH_RESET_TIMER; 1133 1134 /* 1135 * Reset immediately if the controller is failed 1136 */ 1137 if (nvme_should_reset(dev, csts)) { 1138 nvme_warn_reset(dev, csts); 1139 nvme_dev_disable(dev, false); 1140 nvme_reset_ctrl(&dev->ctrl); 1141 return BLK_EH_DONE; 1142 } 1143 1144 /* 1145 * Did we miss an interrupt? 1146 */ 1147 if (__nvme_poll(nvmeq, req->tag)) { 1148 dev_warn(dev->ctrl.device, 1149 "I/O %d QID %d timeout, completion polled\n", 1150 req->tag, nvmeq->qid); 1151 return BLK_EH_DONE; 1152 } 1153 1154 /* 1155 * Shutdown immediately if controller times out while starting. The 1156 * reset work will see the pci device disabled when it gets the forced 1157 * cancellation error. All outstanding requests are completed on 1158 * shutdown, so we return BLK_EH_DONE. 1159 */ 1160 switch (dev->ctrl.state) { 1161 case NVME_CTRL_CONNECTING: 1162 case NVME_CTRL_RESETTING: 1163 dev_warn_ratelimited(dev->ctrl.device, 1164 "I/O %d QID %d timeout, disable controller\n", 1165 req->tag, nvmeq->qid); 1166 nvme_dev_disable(dev, false); 1167 nvme_req(req)->flags |= NVME_REQ_CANCELLED; 1168 return BLK_EH_DONE; 1169 default: 1170 break; 1171 } 1172 1173 /* 1174 * Shutdown the controller immediately and schedule a reset if the 1175 * command was already aborted once before and still hasn't been 1176 * returned to the driver, or if this is the admin queue. 
1177 */ 1178 if (!nvmeq->qid || iod->aborted) { 1179 dev_warn(dev->ctrl.device, 1180 "I/O %d QID %d timeout, reset controller\n", 1181 req->tag, nvmeq->qid); 1182 nvme_dev_disable(dev, false); 1183 nvme_reset_ctrl(&dev->ctrl); 1184 1185 nvme_req(req)->flags |= NVME_REQ_CANCELLED; 1186 return BLK_EH_DONE; 1187 } 1188 1189 if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { 1190 atomic_inc(&dev->ctrl.abort_limit); 1191 return BLK_EH_RESET_TIMER; 1192 } 1193 iod->aborted = 1; 1194 1195 memset(&cmd, 0, sizeof(cmd)); 1196 cmd.abort.opcode = nvme_admin_abort_cmd; 1197 cmd.abort.cid = req->tag; 1198 cmd.abort.sqid = cpu_to_le16(nvmeq->qid); 1199 1200 dev_warn(nvmeq->dev->ctrl.device, 1201 "I/O %d QID %d timeout, aborting\n", 1202 req->tag, nvmeq->qid); 1203 1204 abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, 1205 BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); 1206 if (IS_ERR(abort_req)) { 1207 atomic_inc(&dev->ctrl.abort_limit); 1208 return BLK_EH_RESET_TIMER; 1209 } 1210 1211 abort_req->timeout = ADMIN_TIMEOUT; 1212 abort_req->end_io_data = NULL; 1213 blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio); 1214 1215 /* 1216 * The aborted req will be completed on receiving the abort req. 1217 * We enable the timer again. If hit twice, it'll cause a device reset, 1218 * as the device then is in a faulty state. 1219 */ 1220 return BLK_EH_RESET_TIMER; 1221 } 1222 1223 static void nvme_free_queue(struct nvme_queue *nvmeq) 1224 { 1225 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), 1226 (void *)nvmeq->cqes, nvmeq->cq_dma_addr); 1227 if (nvmeq->sq_cmds) 1228 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), 1229 nvmeq->sq_cmds, nvmeq->sq_dma_addr); 1230 } 1231 1232 static void nvme_free_queues(struct nvme_dev *dev, int lowest) 1233 { 1234 int i; 1235 1236 for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) { 1237 dev->ctrl.queue_count--; 1238 nvme_free_queue(&dev->queues[i]); 1239 } 1240 } 1241 1242 /** 1243 * nvme_suspend_queue - put queue into suspended state 1244 * @nvmeq - queue to suspend 1245 */ 1246 static int nvme_suspend_queue(struct nvme_queue *nvmeq) 1247 { 1248 int vector; 1249 1250 spin_lock_irq(&nvmeq->cq_lock); 1251 if (nvmeq->cq_vector == -1) { 1252 spin_unlock_irq(&nvmeq->cq_lock); 1253 return 1; 1254 } 1255 vector = nvmeq->cq_vector; 1256 nvmeq->dev->online_queues--; 1257 nvmeq->cq_vector = -1; 1258 spin_unlock_irq(&nvmeq->cq_lock); 1259 1260 /* 1261 * Ensure that nvme_queue_rq() sees it ->cq_vector == -1 without 1262 * having to grab the lock. 
1263 */ 1264 mb(); 1265 1266 if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) 1267 blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); 1268 1269 pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq); 1270 1271 return 0; 1272 } 1273 1274 static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) 1275 { 1276 struct nvme_queue *nvmeq = &dev->queues[0]; 1277 u16 start, end; 1278 1279 if (shutdown) 1280 nvme_shutdown_ctrl(&dev->ctrl); 1281 else 1282 nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); 1283 1284 spin_lock_irq(&nvmeq->cq_lock); 1285 nvme_process_cq(nvmeq, &start, &end, -1); 1286 spin_unlock_irq(&nvmeq->cq_lock); 1287 1288 nvme_complete_cqes(nvmeq, start, end); 1289 } 1290 1291 static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, 1292 int entry_size) 1293 { 1294 int q_depth = dev->q_depth; 1295 unsigned q_size_aligned = roundup(q_depth * entry_size, 1296 dev->ctrl.page_size); 1297 1298 if (q_size_aligned * nr_io_queues > dev->cmb_size) { 1299 u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); 1300 mem_per_q = round_down(mem_per_q, dev->ctrl.page_size); 1301 q_depth = div_u64(mem_per_q, entry_size); 1302 1303 /* 1304 * Ensure the reduced q_depth is above some threshold where it 1305 * would be better to map queues in system memory with the 1306 * original depth 1307 */ 1308 if (q_depth < 64) 1309 return -ENOMEM; 1310 } 1311 1312 return q_depth; 1313 } 1314 1315 static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, 1316 int qid, int depth) 1317 { 1318 /* CMB SQEs will be mapped before creation */ 1319 if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) 1320 return 0; 1321 1322 nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), 1323 &nvmeq->sq_dma_addr, GFP_KERNEL); 1324 if (!nvmeq->sq_cmds) 1325 return -ENOMEM; 1326 return 0; 1327 } 1328 1329 static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) 1330 { 1331 struct nvme_queue *nvmeq = &dev->queues[qid]; 1332 1333 if (dev->ctrl.queue_count > qid) 1334 return 0; 1335 1336 nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), 1337 &nvmeq->cq_dma_addr, GFP_KERNEL); 1338 if (!nvmeq->cqes) 1339 goto free_nvmeq; 1340 1341 if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth)) 1342 goto free_cqdma; 1343 1344 nvmeq->q_dmadev = dev->dev; 1345 nvmeq->dev = dev; 1346 spin_lock_init(&nvmeq->sq_lock); 1347 spin_lock_init(&nvmeq->cq_lock); 1348 nvmeq->cq_head = 0; 1349 nvmeq->cq_phase = 1; 1350 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; 1351 nvmeq->q_depth = depth; 1352 nvmeq->qid = qid; 1353 nvmeq->cq_vector = -1; 1354 dev->ctrl.queue_count++; 1355 1356 return 0; 1357 1358 free_cqdma: 1359 dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, 1360 nvmeq->cq_dma_addr); 1361 free_nvmeq: 1362 return -ENOMEM; 1363 } 1364 1365 static int queue_request_irq(struct nvme_queue *nvmeq) 1366 { 1367 struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); 1368 int nr = nvmeq->dev->ctrl.instance; 1369 1370 if (use_threaded_interrupts) { 1371 return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check, 1372 nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid); 1373 } else { 1374 return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq, 1375 NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid); 1376 } 1377 } 1378 1379 static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) 1380 { 1381 struct nvme_dev *dev = nvmeq->dev; 1382 1383 spin_lock_irq(&nvmeq->cq_lock); 1384 nvmeq->sq_tail = 0; 1385 nvmeq->cq_head = 0; 1386 nvmeq->cq_phase = 1; 1387 nvmeq->q_db = &dev->dbs[qid * 2 
* dev->db_stride]; 1388 memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); 1389 nvme_dbbuf_init(dev, nvmeq, qid); 1390 dev->online_queues++; 1391 spin_unlock_irq(&nvmeq->cq_lock); 1392 } 1393 1394 static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) 1395 { 1396 struct nvme_dev *dev = nvmeq->dev; 1397 int result; 1398 s16 vector; 1399 1400 if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { 1401 unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth), 1402 dev->ctrl.page_size); 1403 nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; 1404 nvmeq->sq_cmds_io = dev->cmb + offset; 1405 } 1406 1407 /* 1408 * A queue's vector matches the queue identifier unless the controller 1409 * has only one vector available. 1410 */ 1411 vector = dev->num_vecs == 1 ? 0 : qid; 1412 result = adapter_alloc_cq(dev, qid, nvmeq, vector); 1413 if (result) 1414 return result; 1415 1416 result = adapter_alloc_sq(dev, qid, nvmeq); 1417 if (result < 0) 1418 return result; 1419 else if (result) 1420 goto release_cq; 1421 1422 /* 1423 * Set cq_vector after alloc cq/sq, otherwise nvme_suspend_queue will 1424 * invoke free_irq for it and cause a 'Trying to free already-free IRQ 1425 * xxx' warning if the create CQ/SQ command times out. 1426 */ 1427 nvmeq->cq_vector = vector; 1428 nvme_init_queue(nvmeq, qid); 1429 result = queue_request_irq(nvmeq); 1430 if (result < 0) 1431 goto release_sq; 1432 1433 return result; 1434 1435 release_sq: 1436 nvmeq->cq_vector = -1; 1437 dev->online_queues--; 1438 adapter_delete_sq(dev, qid); 1439 release_cq: 1440 adapter_delete_cq(dev, qid); 1441 return result; 1442 } 1443 1444 static const struct blk_mq_ops nvme_mq_admin_ops = { 1445 .queue_rq = nvme_queue_rq, 1446 .complete = nvme_pci_complete_rq, 1447 .init_hctx = nvme_admin_init_hctx, 1448 .exit_hctx = nvme_admin_exit_hctx, 1449 .init_request = nvme_init_request, 1450 .timeout = nvme_timeout, 1451 }; 1452 1453 static const struct blk_mq_ops nvme_mq_ops = { 1454 .queue_rq = nvme_queue_rq, 1455 .complete = nvme_pci_complete_rq, 1456 .init_hctx = nvme_init_hctx, 1457 .init_request = nvme_init_request, 1458 .map_queues = nvme_pci_map_queues, 1459 .timeout = nvme_timeout, 1460 .poll = nvme_poll, 1461 }; 1462 1463 static void nvme_dev_remove_admin(struct nvme_dev *dev) 1464 { 1465 if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { 1466 /* 1467 * If the controller was reset during removal, it's possible 1468 * user requests may be waiting on a stopped queue. Start the 1469 * queue to flush these to completion. 
1470 */ 1471 blk_mq_unquiesce_queue(dev->ctrl.admin_q); 1472 blk_cleanup_queue(dev->ctrl.admin_q); 1473 blk_mq_free_tag_set(&dev->admin_tagset); 1474 } 1475 } 1476 1477 static int nvme_alloc_admin_tags(struct nvme_dev *dev) 1478 { 1479 if (!dev->ctrl.admin_q) { 1480 dev->admin_tagset.ops = &nvme_mq_admin_ops; 1481 dev->admin_tagset.nr_hw_queues = 1; 1482 1483 dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH; 1484 dev->admin_tagset.timeout = ADMIN_TIMEOUT; 1485 dev->admin_tagset.numa_node = dev_to_node(dev->dev); 1486 dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false); 1487 dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; 1488 dev->admin_tagset.driver_data = dev; 1489 1490 if (blk_mq_alloc_tag_set(&dev->admin_tagset)) 1491 return -ENOMEM; 1492 dev->ctrl.admin_tagset = &dev->admin_tagset; 1493 1494 dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset); 1495 if (IS_ERR(dev->ctrl.admin_q)) { 1496 blk_mq_free_tag_set(&dev->admin_tagset); 1497 return -ENOMEM; 1498 } 1499 if (!blk_get_queue(dev->ctrl.admin_q)) { 1500 nvme_dev_remove_admin(dev); 1501 dev->ctrl.admin_q = NULL; 1502 return -ENODEV; 1503 } 1504 } else 1505 blk_mq_unquiesce_queue(dev->ctrl.admin_q); 1506 1507 return 0; 1508 } 1509 1510 static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) 1511 { 1512 return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride); 1513 } 1514 1515 static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size) 1516 { 1517 struct pci_dev *pdev = to_pci_dev(dev->dev); 1518 1519 if (size <= dev->bar_mapped_size) 1520 return 0; 1521 if (size > pci_resource_len(pdev, 0)) 1522 return -ENOMEM; 1523 if (dev->bar) 1524 iounmap(dev->bar); 1525 dev->bar = ioremap(pci_resource_start(pdev, 0), size); 1526 if (!dev->bar) { 1527 dev->bar_mapped_size = 0; 1528 return -ENOMEM; 1529 } 1530 dev->bar_mapped_size = size; 1531 dev->dbs = dev->bar + NVME_REG_DBS; 1532 1533 return 0; 1534 } 1535 1536 static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) 1537 { 1538 int result; 1539 u32 aqa; 1540 struct nvme_queue *nvmeq; 1541 1542 result = nvme_remap_bar(dev, db_bar_size(dev, 0)); 1543 if (result < 0) 1544 return result; 1545 1546 dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ? 
1547 NVME_CAP_NSSRC(dev->ctrl.cap) : 0; 1548 1549 if (dev->subsystem && 1550 (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) 1551 writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); 1552 1553 result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); 1554 if (result < 0) 1555 return result; 1556 1557 result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); 1558 if (result) 1559 return result; 1560 1561 nvmeq = &dev->queues[0]; 1562 aqa = nvmeq->q_depth - 1; 1563 aqa |= aqa << 16; 1564 1565 writel(aqa, dev->bar + NVME_REG_AQA); 1566 lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); 1567 lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); 1568 1569 result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap); 1570 if (result) 1571 return result; 1572 1573 nvmeq->cq_vector = 0; 1574 nvme_init_queue(nvmeq, 0); 1575 result = queue_request_irq(nvmeq); 1576 if (result) { 1577 nvmeq->cq_vector = -1; 1578 return result; 1579 } 1580 1581 return result; 1582 } 1583 1584 static int nvme_create_io_queues(struct nvme_dev *dev) 1585 { 1586 unsigned i, max; 1587 int ret = 0; 1588 1589 for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { 1590 if (nvme_alloc_queue(dev, i, dev->q_depth)) { 1591 ret = -ENOMEM; 1592 break; 1593 } 1594 } 1595 1596 max = min(dev->max_qid, dev->ctrl.queue_count - 1); 1597 for (i = dev->online_queues; i <= max; i++) { 1598 ret = nvme_create_queue(&dev->queues[i], i); 1599 if (ret) 1600 break; 1601 } 1602 1603 /* 1604 * Ignore failing Create SQ/CQ commands, we can continue with less 1605 * than the desired amount of queues, and even a controller without 1606 * I/O queues can still be used to issue admin commands. This might 1607 * be useful to upgrade a buggy firmware for example. 1608 */ 1609 return ret >= 0 ? 0 : ret; 1610 } 1611 1612 static ssize_t nvme_cmb_show(struct device *dev, 1613 struct device_attribute *attr, 1614 char *buf) 1615 { 1616 struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); 1617 1618 return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", 1619 ndev->cmbloc, ndev->cmbsz); 1620 } 1621 static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); 1622 1623 static u64 nvme_cmb_size_unit(struct nvme_dev *dev) 1624 { 1625 u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK; 1626 1627 return 1ULL << (12 + 4 * szu); 1628 } 1629 1630 static u32 nvme_cmb_size(struct nvme_dev *dev) 1631 { 1632 return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK; 1633 } 1634 1635 static void nvme_map_cmb(struct nvme_dev *dev) 1636 { 1637 u64 size, offset; 1638 resource_size_t bar_size; 1639 struct pci_dev *pdev = to_pci_dev(dev->dev); 1640 int bar; 1641 1642 dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); 1643 if (!dev->cmbsz) 1644 return; 1645 dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); 1646 1647 if (!use_cmb_sqes) 1648 return; 1649 1650 size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); 1651 offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); 1652 bar = NVME_CMB_BIR(dev->cmbloc); 1653 bar_size = pci_resource_len(pdev, bar); 1654 1655 if (offset > bar_size) 1656 return; 1657 1658 /* 1659 * Controllers may support a CMB size larger than their BAR, 1660 * for example, due to being behind a bridge. 
Reduce the CMB to 1661 * the reported size of the BAR 1662 */ 1663 if (size > bar_size - offset) 1664 size = bar_size - offset; 1665 1666 dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); 1667 if (!dev->cmb) 1668 return; 1669 dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; 1670 dev->cmb_size = size; 1671 1672 if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, 1673 &dev_attr_cmb.attr, NULL)) 1674 dev_warn(dev->ctrl.device, 1675 "failed to add sysfs attribute for CMB\n"); 1676 } 1677 1678 static inline void nvme_release_cmb(struct nvme_dev *dev) 1679 { 1680 if (dev->cmb) { 1681 iounmap(dev->cmb); 1682 dev->cmb = NULL; 1683 sysfs_remove_file_from_group(&dev->ctrl.device->kobj, 1684 &dev_attr_cmb.attr, NULL); 1685 dev->cmbsz = 0; 1686 } 1687 } 1688 1689 static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) 1690 { 1691 u64 dma_addr = dev->host_mem_descs_dma; 1692 struct nvme_command c; 1693 int ret; 1694 1695 memset(&c, 0, sizeof(c)); 1696 c.features.opcode = nvme_admin_set_features; 1697 c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF); 1698 c.features.dword11 = cpu_to_le32(bits); 1699 c.features.dword12 = cpu_to_le32(dev->host_mem_size >> 1700 ilog2(dev->ctrl.page_size)); 1701 c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr)); 1702 c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr)); 1703 c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs); 1704 1705 ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); 1706 if (ret) { 1707 dev_warn(dev->ctrl.device, 1708 "failed to set host mem (err %d, flags %#x).\n", 1709 ret, bits); 1710 } 1711 return ret; 1712 } 1713 1714 static void nvme_free_host_mem(struct nvme_dev *dev) 1715 { 1716 int i; 1717 1718 for (i = 0; i < dev->nr_host_mem_descs; i++) { 1719 struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i]; 1720 size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size; 1721 1722 dma_free_coherent(dev->dev, size, dev->host_mem_desc_bufs[i], 1723 le64_to_cpu(desc->addr)); 1724 } 1725 1726 kfree(dev->host_mem_desc_bufs); 1727 dev->host_mem_desc_bufs = NULL; 1728 dma_free_coherent(dev->dev, 1729 dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs), 1730 dev->host_mem_descs, dev->host_mem_descs_dma); 1731 dev->host_mem_descs = NULL; 1732 dev->nr_host_mem_descs = 0; 1733 } 1734 1735 static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, 1736 u32 chunk_size) 1737 { 1738 struct nvme_host_mem_buf_desc *descs; 1739 u32 max_entries, len; 1740 dma_addr_t descs_dma; 1741 int i = 0; 1742 void **bufs; 1743 u64 size, tmp; 1744 1745 tmp = (preferred + chunk_size - 1); 1746 do_div(tmp, chunk_size); 1747 max_entries = tmp; 1748 1749 if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries) 1750 max_entries = dev->ctrl.hmmaxd; 1751 1752 descs = dma_zalloc_coherent(dev->dev, max_entries * sizeof(*descs), 1753 &descs_dma, GFP_KERNEL); 1754 if (!descs) 1755 goto out; 1756 1757 bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL); 1758 if (!bufs) 1759 goto out_free_descs; 1760 1761 for (size = 0; size < preferred && i < max_entries; size += len) { 1762 dma_addr_t dma_addr; 1763 1764 len = min_t(u64, chunk_size, preferred - size); 1765 bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL, 1766 DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); 1767 if (!bufs[i]) 1768 break; 1769 1770 descs[i].addr = cpu_to_le64(dma_addr); 1771 descs[i].size = cpu_to_le32(len / dev->ctrl.page_size); 1772 i++; 1773 } 1774 1775 if (!size) 1776 goto out_free_bufs; 1777 1778 dev->nr_host_mem_descs = i; 
1779 dev->host_mem_size = size; 1780 dev->host_mem_descs = descs; 1781 dev->host_mem_descs_dma = descs_dma; 1782 dev->host_mem_desc_bufs = bufs; 1783 return 0; 1784 1785 out_free_bufs: 1786 while (--i >= 0) { 1787 size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size; 1788 1789 dma_free_coherent(dev->dev, size, bufs[i], 1790 le64_to_cpu(descs[i].addr)); 1791 } 1792 1793 kfree(bufs); 1794 out_free_descs: 1795 dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs, 1796 descs_dma); 1797 out: 1798 dev->host_mem_descs = NULL; 1799 return -ENOMEM; 1800 } 1801 1802 static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) 1803 { 1804 u32 chunk_size; 1805 1806 /* start big and work our way down */ 1807 for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); 1808 chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); 1809 chunk_size /= 2) { 1810 if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) { 1811 if (!min || dev->host_mem_size >= min) 1812 return 0; 1813 nvme_free_host_mem(dev); 1814 } 1815 } 1816 1817 return -ENOMEM; 1818 } 1819 1820 static int nvme_setup_host_mem(struct nvme_dev *dev) 1821 { 1822 u64 max = (u64)max_host_mem_size_mb * SZ_1M; 1823 u64 preferred = (u64)dev->ctrl.hmpre * 4096; 1824 u64 min = (u64)dev->ctrl.hmmin * 4096; 1825 u32 enable_bits = NVME_HOST_MEM_ENABLE; 1826 int ret; 1827 1828 preferred = min(preferred, max); 1829 if (min > max) { 1830 dev_warn(dev->ctrl.device, 1831 "min host memory (%lld MiB) above limit (%d MiB).\n", 1832 min >> ilog2(SZ_1M), max_host_mem_size_mb); 1833 nvme_free_host_mem(dev); 1834 return 0; 1835 } 1836 1837 /* 1838 * If we already have a buffer allocated check if we can reuse it. 1839 */ 1840 if (dev->host_mem_descs) { 1841 if (dev->host_mem_size >= min) 1842 enable_bits |= NVME_HOST_MEM_RETURN; 1843 else 1844 nvme_free_host_mem(dev); 1845 } 1846 1847 if (!dev->host_mem_descs) { 1848 if (nvme_alloc_host_mem(dev, min, preferred)) { 1849 dev_warn(dev->ctrl.device, 1850 "failed to allocate host memory buffer.\n"); 1851 return 0; /* controller must work without HMB */ 1852 } 1853 1854 dev_info(dev->ctrl.device, 1855 "allocated %lld MiB host memory buffer.\n", 1856 dev->host_mem_size >> ilog2(SZ_1M)); 1857 } 1858 1859 ret = nvme_set_host_mem(dev, enable_bits); 1860 if (ret) 1861 nvme_free_host_mem(dev); 1862 return ret; 1863 } 1864 1865 static int nvme_setup_io_queues(struct nvme_dev *dev) 1866 { 1867 struct nvme_queue *adminq = &dev->queues[0]; 1868 struct pci_dev *pdev = to_pci_dev(dev->dev); 1869 int result, nr_io_queues; 1870 unsigned long size; 1871 1872 struct irq_affinity affd = { 1873 .pre_vectors = 1 1874 }; 1875 1876 nr_io_queues = num_possible_cpus(); 1877 result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); 1878 if (result < 0) 1879 return result; 1880 1881 if (nr_io_queues == 0) 1882 return 0; 1883 1884 if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) { 1885 result = nvme_cmb_qdepth(dev, nr_io_queues, 1886 sizeof(struct nvme_command)); 1887 if (result > 0) 1888 dev->q_depth = result; 1889 else 1890 nvme_release_cmb(dev); 1891 } 1892 1893 do { 1894 size = db_bar_size(dev, nr_io_queues); 1895 result = nvme_remap_bar(dev, size); 1896 if (!result) 1897 break; 1898 if (!--nr_io_queues) 1899 return -ENOMEM; 1900 } while (1); 1901 adminq->q_db = dev->dbs; 1902 1903 /* Deregister the admin queue's interrupt */ 1904 pci_free_irq(pdev, 0, adminq); 1905 1906 /* 1907 * If we enable msix early due to not intx, disable it again before 1908 * setting up the full range we need. 
1909 */ 1910 pci_free_irq_vectors(pdev); 1911 result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1, 1912 PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); 1913 if (result <= 0) 1914 return -EIO; 1915 dev->num_vecs = result; 1916 dev->max_qid = max(result - 1, 1); 1917 1918 /* 1919 * Should investigate if there's a performance win from allocating 1920 * more queues than interrupt vectors; it might allow the submission 1921 * path to scale better, even if the receive path is limited by the 1922 * number of interrupts. 1923 */ 1924 1925 result = queue_request_irq(adminq); 1926 if (result) { 1927 adminq->cq_vector = -1; 1928 return result; 1929 } 1930 return nvme_create_io_queues(dev); 1931 } 1932 1933 static void nvme_del_queue_end(struct request *req, blk_status_t error) 1934 { 1935 struct nvme_queue *nvmeq = req->end_io_data; 1936 1937 blk_mq_free_request(req); 1938 complete(&nvmeq->dev->ioq_wait); 1939 } 1940 1941 static void nvme_del_cq_end(struct request *req, blk_status_t error) 1942 { 1943 struct nvme_queue *nvmeq = req->end_io_data; 1944 u16 start, end; 1945 1946 if (!error) { 1947 unsigned long flags; 1948 1949 spin_lock_irqsave(&nvmeq->cq_lock, flags); 1950 nvme_process_cq(nvmeq, &start, &end, -1); 1951 spin_unlock_irqrestore(&nvmeq->cq_lock, flags); 1952 1953 nvme_complete_cqes(nvmeq, start, end); 1954 } 1955 1956 nvme_del_queue_end(req, error); 1957 } 1958 1959 static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) 1960 { 1961 struct request_queue *q = nvmeq->dev->ctrl.admin_q; 1962 struct request *req; 1963 struct nvme_command cmd; 1964 1965 memset(&cmd, 0, sizeof(cmd)); 1966 cmd.delete_queue.opcode = opcode; 1967 cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); 1968 1969 req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); 1970 if (IS_ERR(req)) 1971 return PTR_ERR(req); 1972 1973 req->timeout = ADMIN_TIMEOUT; 1974 req->end_io_data = nvmeq; 1975 1976 blk_execute_rq_nowait(q, NULL, req, false, 1977 opcode == nvme_admin_delete_cq ? 
static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
{
	struct request_queue *q = nvmeq->dev->ctrl.admin_q;
	struct request *req;
	struct nvme_command cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.delete_queue.opcode = opcode;
	cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);

	req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = ADMIN_TIMEOUT;
	req->end_io_data = nvmeq;

	blk_execute_rq_nowait(q, NULL, req, false,
			opcode == nvme_admin_delete_cq ?
				nvme_del_cq_end : nvme_del_queue_end);
	return 0;
}

static void nvme_disable_io_queues(struct nvme_dev *dev)
{
	int pass, queues = dev->online_queues - 1;
	unsigned long timeout;
	u8 opcode = nvme_admin_delete_sq;

	for (pass = 0; pass < 2; pass++) {
		int sent = 0, i = queues;

		reinit_completion(&dev->ioq_wait);
retry:
		timeout = ADMIN_TIMEOUT;
		for (; i > 0; i--, sent++)
			if (nvme_delete_queue(&dev->queues[i], opcode))
				break;

		while (sent--) {
			timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout);
			if (timeout == 0)
				return;
			if (i)
				goto retry;
		}
		opcode = nvme_admin_delete_cq;
	}
}

/*
 * Return an error value only when the tagset allocation fails.
 */
static int nvme_dev_add(struct nvme_dev *dev)
{
	int ret;

	if (!dev->ctrl.tagset) {
		dev->tagset.ops = &nvme_mq_ops;
		dev->tagset.nr_hw_queues = dev->online_queues - 1;
		dev->tagset.timeout = NVME_IO_TIMEOUT;
		dev->tagset.numa_node = dev_to_node(dev->dev);
		dev->tagset.queue_depth =
				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
		dev->tagset.cmd_size = nvme_pci_cmd_size(dev, false);
		if ((dev->ctrl.sgls & ((1 << 0) | (1 << 1))) && sgl_threshold) {
			dev->tagset.cmd_size = max(dev->tagset.cmd_size,
					nvme_pci_cmd_size(dev, true));
		}
		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
		dev->tagset.driver_data = dev;

		ret = blk_mq_alloc_tag_set(&dev->tagset);
		if (ret) {
			dev_warn(dev->ctrl.device,
				"IO queues tagset allocation failed %d\n", ret);
			return ret;
		}
		dev->ctrl.tagset = &dev->tagset;

		nvme_dbbuf_set(dev);
	} else {
		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);

		/* Free previously allocated queues that are no longer usable */
		nvme_free_queues(dev, dev->online_queues);
	}

	return 0;
}

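/*
 * Bring the PCI function up far enough to talk to the controller: enable
 * memory access and bus mastering, set a 64-bit (falling back to 32-bit)
 * DMA mask, sanity-check that the BAR is readable, pre-allocate a single
 * interrupt vector for the admin queue, and derive the queue depth and
 * doorbell stride from the CAP register. Vendor-specific queue-depth
 * quirks are applied at the end.
 */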
static int nvme_pci_enable(struct nvme_dev *dev)
{
	int result = -ENOMEM;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_enable_device_mem(pdev))
		return result;

	pci_set_master(pdev);

	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
		goto disable;

	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
		result = -ENODEV;
		goto disable;
	}

	/*
	 * Some devices and/or platforms don't advertise or work with INTx
	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
	 * adjust this later.
	 */
	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
	if (result < 0)
		return result;

	dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);

	dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
				io_queue_depth);
	dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
	dev->dbs = dev->bar + 4096;

	/*
	 * Temporary fix for the Apple controller found in the MacBook8,1 and
	 * some MacBook7,1 to avoid controller resets and data loss.
	 */
	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
		dev->q_depth = 2;
		dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
			"set queue depth=%u to work around controller resets\n",
			dev->q_depth);
	} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
		   (pdev->device == 0xa821 || pdev->device == 0xa822) &&
		   NVME_CAP_MQES(dev->ctrl.cap) == 0) {
		dev->q_depth = 64;
		dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
			"set queue depth=%u\n", dev->q_depth);
	}

	nvme_map_cmb(dev);

	pci_enable_pcie_error_reporting(pdev);
	pci_save_state(pdev);
	return 0;

disable:
	pci_disable_device(pdev);
	return result;
}

static void nvme_dev_unmap(struct nvme_dev *dev)
{
	if (dev->bar)
		iounmap(dev->bar);
	pci_release_mem_regions(to_pci_dev(dev->dev));
}

static void nvme_pci_disable(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	nvme_release_cmb(dev);
	pci_free_irq_vectors(pdev);

	if (pci_is_enabled(pdev)) {
		pci_disable_pcie_error_reporting(pdev);
		pci_disable_device(pdev);
	}
}

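/*
 * Tear-down path shared by shutdown, reset and error handling. Note the
 * ordering: freeze and quiesce the namespace queues, delete the I/O queues
 * and disable the admin queue (only attempted while the controller still
 * looks alive), then disable the PCI device and cancel every request that
 * is still outstanding on either tagset.
 */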
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
	int i;
	bool dead = true;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	mutex_lock(&dev->shutdown_lock);
	if (pci_is_enabled(pdev)) {
		u32 csts = readl(dev->bar + NVME_REG_CSTS);

		if (dev->ctrl.state == NVME_CTRL_LIVE ||
		    dev->ctrl.state == NVME_CTRL_RESETTING)
			nvme_start_freeze(&dev->ctrl);
		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
			pdev->error_state != pci_channel_io_normal);
	}

	/*
	 * Give the controller a chance to complete all entered requests if
	 * doing a safe shutdown.
	 */
	if (!dead) {
		if (shutdown)
			nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
	}

	nvme_stop_queues(&dev->ctrl);

	if (!dead && dev->ctrl.queue_count > 0) {
		nvme_disable_io_queues(dev);
		nvme_disable_admin_queue(dev, shutdown);
	}
	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
		nvme_suspend_queue(&dev->queues[i]);

	nvme_pci_disable(dev);

	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);

	/*
	 * The driver will not be starting up queues again if shutting down,
	 * so we must flush all entered requests to their failed completion
	 * to avoid deadlocking the blk-mq hot-cpu notifier.
	 */
	if (shutdown)
		nvme_start_queues(&dev->ctrl);
	mutex_unlock(&dev->shutdown_lock);
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	nvme_dbbuf_dma_free(dev);
	put_device(dev->dev);
	if (dev->tagset.tags)
		blk_mq_free_tag_set(&dev->tagset);
	if (dev->ctrl.admin_q)
		blk_put_queue(dev->ctrl.admin_q);
	kfree(dev->queues);
	free_opal_dev(dev->ctrl.opal_dev);
	mempool_destroy(dev->iod_mempool);
	kfree(dev);
}

static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
{
	dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);

	nvme_get_ctrl(&dev->ctrl);
	nvme_dev_disable(dev, false);
	nvme_kill_queues(&dev->ctrl);
	if (!queue_work(nvme_wq, &dev->remove_work))
		nvme_put_ctrl(&dev->ctrl);
}

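/*
 * Controller (re)initialization, run from the reset work item. The state
 * machine is: RESETTING (entered before this work runs) -> CONNECTING while
 * the admin queue, optional HMB and I/O queues are brought up -> LIVE, or
 * ADMIN_ONLY when no usable I/O queue could be created. Any failure funnels
 * into nvme_remove_dead_ctrl().
 */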
static void nvme_reset_work(struct work_struct *work)
{
	struct nvme_dev *dev =
		container_of(work, struct nvme_dev, ctrl.reset_work);
	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
	int result = -ENODEV;
	enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;

	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
		goto out;

	/*
	 * If we're called to reset a live controller, first shut it down
	 * before moving on.
	 */
	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
		nvme_dev_disable(dev, false);

	/*
	 * Use the CONNECTING state, introduced for the nvme-fc/rdma
	 * transports, to mark the initialization procedure here.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller CONNECTING\n");
		goto out;
	}

	result = nvme_pci_enable(dev);
	if (result)
		goto out;

	result = nvme_pci_configure_admin_queue(dev);
	if (result)
		goto out;

	result = nvme_alloc_admin_tags(dev);
	if (result)
		goto out;

	/*
	 * Limit the max command size to prevent iod->sg allocations going
	 * over a single page.
	 */
	dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1;
	dev->ctrl.max_segments = NVME_MAX_SEGS;

	result = nvme_init_identify(&dev->ctrl);
	if (result)
		goto out;

	if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
		if (!dev->ctrl.opal_dev)
			dev->ctrl.opal_dev =
				init_opal_dev(&dev->ctrl, &nvme_sec_submit);
		else if (was_suspend)
			opal_unlock_from_suspend(dev->ctrl.opal_dev);
	} else {
		free_opal_dev(dev->ctrl.opal_dev);
		dev->ctrl.opal_dev = NULL;
	}

	if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
		result = nvme_dbbuf_dma_alloc(dev);
		if (result)
			dev_warn(dev->dev,
				 "unable to allocate dma for dbbuf\n");
	}

	if (dev->ctrl.hmpre) {
		result = nvme_setup_host_mem(dev);
		if (result < 0)
			goto out;
	}

	result = nvme_setup_io_queues(dev);
	if (result)
		goto out;

	/*
	 * Keep the controller around but remove all namespaces if we don't
	 * have any working I/O queue.
	 */
	if (dev->online_queues < 2) {
		dev_warn(dev->ctrl.device, "IO queues not created\n");
		nvme_kill_queues(&dev->ctrl);
		nvme_remove_namespaces(&dev->ctrl);
		new_state = NVME_CTRL_ADMIN_ONLY;
	} else {
		nvme_start_queues(&dev->ctrl);
		nvme_wait_freeze(&dev->ctrl);
		/* we only hit this if the tagset allocation fails */
		if (nvme_dev_add(dev))
			new_state = NVME_CTRL_ADMIN_ONLY;
		nvme_unfreeze(&dev->ctrl);
	}

	/*
	 * If only the admin queue is live, keep it for further investigation
	 * or recovery.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller state %d\n", new_state);
		goto out;
	}

	nvme_start_ctrl(&dev->ctrl);
	return;

out:
	nvme_remove_dead_ctrl(dev, result);
}

static void nvme_remove_dead_ctrl_work(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_get_drvdata(pdev))
		device_release_driver(&pdev->dev);
	nvme_put_ctrl(&dev->ctrl);
}

static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
	*val = readl(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
{
	writel(val, to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
	*val = readq(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
	struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);

	return snprintf(buf, size, "%s", dev_name(&pdev->dev));
}

static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
	.name = "pcie",
	.module = THIS_MODULE,
	.flags = NVME_F_METADATA_SUPPORTED,
	.reg_read32 = nvme_pci_reg_read32,
	.reg_write32 = nvme_pci_reg_write32,
	.reg_read64 = nvme_pci_reg_read64,
	.free_ctrl = nvme_pci_free_ctrl,
	.submit_async_event = nvme_pci_submit_async_event,
	.get_address = nvme_pci_get_address,
};

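/*
 * Claim the PCI memory regions and create the initial BAR mapping. Only
 * enough of the BAR to reach the register block and the admin doorbells is
 * mapped here (NVME_REG_DBS + 4096); nvme_setup_io_queues() later remaps it
 * to the size actually needed once the number of I/O queues is known.
 */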
static int nvme_dev_map(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_request_mem_regions(pdev, "nvme"))
		return -ENODEV;

	if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
		goto release;

	return 0;
release:
	pci_release_mem_regions(pdev);
	return -ENODEV;
}

static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
{
	if (pdev->vendor == 0x144d && pdev->device == 0xa802) {
		/*
		 * Several Samsung devices seem to drop off the PCIe bus
		 * randomly when APST is on and the deepest sleep state is
		 * used. This has been observed on a Samsung "SM951 NVMe
		 * SAMSUNG 256GB", a "PM951 NVMe SAMSUNG 512GB", and a
		 * "Samsung SSD 950 PRO 256GB", but it seems to be restricted
		 * to two Dell laptops.
		 */
		if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") &&
		    (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") ||
		     dmi_match(DMI_PRODUCT_NAME, "Precision 5510")))
			return NVME_QUIRK_NO_DEEPEST_PS;
	} else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
		/*
		 * Samsung SSD 960 EVO drops off the PCIe bus after system
		 * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as
		 * within a few minutes after bootup on a Coffee Lake board -
		 * ASUS PRIME Z370-A.
		 */
		if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
		    (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
		     dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
			return NVME_QUIRK_NO_APST;
	}

	return 0;
}

static void nvme_async_probe(void *data, async_cookie_t cookie)
{
	struct nvme_dev *dev = data;

	nvme_reset_ctrl_sync(&dev->ctrl);
	flush_work(&dev->ctrl.scan_work);
	nvme_put_ctrl(&dev->ctrl);
}

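/*
 * Probe allocates the nvme_dev and the queue array (one nvme_queue per
 * possible CPU plus the admin queue), maps the BAR, sets up the PRP DMA
 * pools and the iod mempool, registers the core controller, and then hands
 * the actual controller bring-up off to an async context
 * (nvme_async_probe -> nvme_reset_ctrl_sync) so probe itself does not block
 * on controller initialization.
 */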
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int node, result = -ENOMEM;
	struct nvme_dev *dev;
	unsigned long quirks = id->driver_data;
	size_t alloc_size;

	node = dev_to_node(&pdev->dev);
	if (node == NUMA_NO_NODE)
		set_dev_node(&pdev->dev, first_memory_node);

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
	if (!dev)
		return -ENOMEM;

	dev->queues = kcalloc_node(num_possible_cpus() + 1,
			sizeof(struct nvme_queue), GFP_KERNEL, node);
	if (!dev->queues)
		goto free;

	dev->dev = get_device(&pdev->dev);
	pci_set_drvdata(pdev, dev);

	result = nvme_dev_map(dev);
	if (result)
		goto put_pci;

	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
	mutex_init(&dev->shutdown_lock);
	init_completion(&dev->ioq_wait);

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto unmap;

	quirks |= check_vendor_combination_bug(pdev);

	/*
	 * Double check that our mempool alloc size will cover the biggest
	 * command we support.
	 */
	alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
						NVME_MAX_SEGS, true);
	WARN_ON_ONCE(alloc_size > PAGE_SIZE);

	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
						mempool_kfree,
						(void *) alloc_size,
						GFP_KERNEL, node);
	if (!dev->iod_mempool) {
		result = -ENOMEM;
		goto release_pools;
	}

	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
			quirks);
	if (result)
		goto release_mempool;

	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));

	nvme_get_ctrl(&dev->ctrl);
	async_schedule(nvme_async_probe, dev);

	return 0;

release_mempool:
	mempool_destroy(dev->iod_mempool);
release_pools:
	nvme_release_prp_pools(dev);
unmap:
	nvme_dev_unmap(dev);
put_pci:
	put_device(dev->dev);
free:
	kfree(dev->queues);
	kfree(dev);
	return result;
}

static void nvme_reset_prepare(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	nvme_dev_disable(dev, false);
}

static void nvme_reset_done(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	nvme_reset_ctrl_sync(&dev->ctrl);
}

static void nvme_shutdown(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	nvme_dev_disable(dev, true);
}

/*
 * The driver's remove may be called on a device in a partially initialized
 * state. This function must not have any dependencies on the device state in
 * order to proceed.
 */
static void nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);

	cancel_work_sync(&dev->ctrl.reset_work);
	pci_set_drvdata(pdev, NULL);

	if (!pci_device_is_present(pdev)) {
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
		nvme_dev_disable(dev, true);
	}

	flush_work(&dev->ctrl.reset_work);
	nvme_stop_ctrl(&dev->ctrl);
	nvme_remove_namespaces(&dev->ctrl);
	nvme_dev_disable(dev, true);
	nvme_free_host_mem(dev);
	nvme_dev_remove_admin(dev);
	nvme_free_queues(dev, 0);
	nvme_uninit_ctrl(&dev->ctrl);
	nvme_release_prp_pools(dev);
	nvme_dev_unmap(dev);
	nvme_put_ctrl(&dev->ctrl);
}

#ifdef CONFIG_PM_SLEEP
static int nvme_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	nvme_dev_disable(ndev, true);
	return 0;
}

static int nvme_resume(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	nvme_reset_ctrl(&ndev->ctrl);
	return 0;
}
#endif

static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);

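/*
 * PCIe AER recovery flow: error_detected() decides whether the device can
 * recover in place or needs a slot reset, slot_reset() restores the saved
 * PCI config space and schedules a controller reset, and resume() waits for
 * that reset to finish before clearing the uncorrectable error status.
 */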
static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
						pci_channel_state_t state)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	/*
	 * A frozen channel requires a reset. When detected, this method will
	 * shut down the controller to quiesce it. The controller will be
	 * restarted after the slot reset through the driver's slot_reset
	 * callback.
	 */
	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
		dev_warn(dev->ctrl.device,
			"frozen state error detected, reset controller\n");
		nvme_dev_disable(dev, false);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		dev_warn(dev->ctrl.device,
			"failure state error detected, request disconnect\n");
		return PCI_ERS_RESULT_DISCONNECT;
	}
	return PCI_ERS_RESULT_NEED_RESET;
}

static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	dev_info(dev->ctrl.device, "restart after slot reset\n");
	pci_restore_state(pdev);
	nvme_reset_ctrl(&dev->ctrl);
	return PCI_ERS_RESULT_RECOVERED;
}

static void nvme_error_resume(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	flush_work(&dev->ctrl.reset_work);
	pci_cleanup_aer_uncorrect_error_status(pdev);
}

static const struct pci_error_handlers nvme_err_handler = {
	.error_detected = nvme_error_detected,
	.slot_reset = nvme_slot_reset,
	.resume = nvme_error_resume,
	.reset_prepare = nvme_reset_prepare,
	.reset_done = nvme_reset_done,
};

static const struct pci_device_id nvme_id_table[] = {
	{ PCI_VDEVICE(INTEL, 0x0953),
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
	{ PCI_VDEVICE(INTEL, 0x0a53),
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
	{ PCI_VDEVICE(INTEL, 0x0a54),
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
	{ PCI_VDEVICE(INTEL, 0x0a55),
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
	{ PCI_VDEVICE(INTEL, 0xf1a5),	/* Intel 600P/P3100 */
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
				NVME_QUIRK_MEDIUM_PRIO_SQ },
	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
		.driver_data = NVME_QUIRK_IDENTIFY_CNS, },
	{ PCI_DEVICE(0x1bb1, 0x0100),	/* Seagate Nytro Flash Storage */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x1c58, 0x0003),	/* HGST adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x1c58, 0x0023),	/* WDC SN200 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x1c5f, 0x0540),	/* Memblaze Pblaze4 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x144d, 0xa821),	/* Samsung PM1725 */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x144d, 0xa822),	/* Samsung PM1725a */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x1d1d, 0x1f1f),	/* LightNVM qemu device */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
	{ PCI_DEVICE(0x1d1d, 0x2807),	/* CNEX WL */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
	{ PCI_DEVICE(0x1d1d, 0x2601),	/* CNEX Granby */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

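/*
 * PCI driver glue: the device IDs and quirks above, probe/remove/shutdown
 * below, plus the PCIe error handlers and simple SR-IOV configuration.
 */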
static struct pci_driver nvme_driver = {
	.name = "nvme",
	.id_table = nvme_id_table,
	.probe = nvme_probe,
	.remove = nvme_remove,
	.shutdown = nvme_shutdown,
	.driver = {
		.pm = &nvme_dev_pm_ops,
	},
	.sriov_configure = pci_sriov_configure_simple,
	.err_handler = &nvme_err_handler,
};

static int __init nvme_init(void)
{
	return pci_register_driver(&nvme_driver);
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	flush_workqueue(nvme_wq);
	_nvme_check_size();
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_init);
module_exit(nvme_exit);