/*
 * NVMe block driver based on vfio
 *
 * Copyright 2016 - 2018 Red Hat, Inc.
 *
 * Authors:
 *   Fam Zheng <famz@redhat.com>
 *   Paolo Bonzini <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <linux/vfio.h>
#include "qapi/error.h"
#include "qapi/qmp/qdict.h"
#include "qapi/qmp/qstring.h"
#include "qemu/error-report.h"
#include "qemu/module.h"
#include "qemu/cutils.h"
#include "qemu/option.h"
#include "qemu/vfio-helpers.h"
#include "block/block_int.h"
#include "trace.h"

#include "block/nvme.h"

#define NVME_SQ_ENTRY_BYTES 64
#define NVME_CQ_ENTRY_BYTES 16
#define NVME_QUEUE_SIZE 128
#define NVME_BAR_SIZE 8192

typedef struct {
    int32_t  head, tail;
    uint8_t  *queue;
    uint64_t iova;
    /* Hardware MMIO register */
    volatile uint32_t *doorbell;
} NVMeQueue;

typedef struct {
    BlockCompletionFunc *cb;
    void *opaque;
    int cid;
    void *prp_list_page;
    uint64_t prp_list_iova;
    bool busy;
} NVMeRequest;

typedef struct {
    CoQueue     free_req_queue;
    QemuMutex   lock;

    /* Fields protected by BQL */
    int         index;
    uint8_t     *prp_list_pages;

    /* Fields protected by @lock */
    NVMeQueue   sq, cq;
    int         cq_phase;
    NVMeRequest reqs[NVME_QUEUE_SIZE];
    bool        busy;
    int         need_kick;
    int         inflight;
} NVMeQueuePair;

/* Memory mapped registers */
typedef volatile struct {
    uint64_t cap;
    uint32_t vs;
    uint32_t intms;
    uint32_t intmc;
    uint32_t cc;
    uint32_t reserved0;
    uint32_t csts;
    uint32_t nssr;
    uint32_t aqa;
    uint64_t asq;
    uint64_t acq;
    uint32_t cmbloc;
    uint32_t cmbsz;
    uint8_t  reserved1[0xec0];
    uint8_t  cmd_set_specfic[0x100];
    uint32_t doorbells[];
} NVMeRegs;

QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000);

typedef struct {
    AioContext *aio_context;
    QEMUVFIOState *vfio;
    NVMeRegs *regs;
    /* The submission/completion queue pairs.
     * [0]: admin queue.
     * [1..]: io queues.
     */
    NVMeQueuePair **queues;
    int nr_queues;
    size_t page_size;
    /* How many uint32_t elements does each doorbell entry take. */
    size_t doorbell_scale;
    bool write_cache_supported;
    EventNotifier irq_notifier;
    uint64_t nsze; /* Namespace size reported by identify command */
    int nsid;      /* The namespace id to read/write data. */
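    /* Per-command byte limit, computed in nvme_identify() from the
     * controller's MDTS field and capped by the one-page PRP list each
     * NVMeRequest carries (see nvme_cmd_map_qiov()). */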
    uint64_t max_transfer;
    bool plugged;

    CoMutex dma_map_lock;
    CoQueue dma_flush_queue;

    /* Total size of mapped qiov, accessed under dma_map_lock */
    int dma_map_count;

    /* PCI address (required for nvme_refresh_filename()) */
    char *device;
} BDRVNVMeState;

#define NVME_BLOCK_OPT_DEVICE "device"
#define NVME_BLOCK_OPT_NAMESPACE "namespace"

static QemuOptsList runtime_opts = {
    .name = "nvme",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = NVME_BLOCK_OPT_DEVICE,
            .type = QEMU_OPT_STRING,
            .help = "NVMe PCI device address",
        },
        {
            .name = NVME_BLOCK_OPT_NAMESPACE,
            .type = QEMU_OPT_NUMBER,
            .help = "NVMe namespace",
        },
        { /* end of list */ }
    },
};

static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
                            int nentries, int entry_bytes, Error **errp)
{
    BDRVNVMeState *s = bs->opaque;
    size_t bytes;
    int r;

    bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
    q->head = q->tail = 0;
    q->queue = qemu_try_blockalign0(bs, bytes);

    if (!q->queue) {
        error_setg(errp, "Cannot allocate queue");
        return;
    }
    r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova);
    if (r) {
        error_setg(errp, "Cannot map queue");
    }
}

static void nvme_free_queue_pair(BlockDriverState *bs, NVMeQueuePair *q)
{
    qemu_vfree(q->prp_list_pages);
    qemu_vfree(q->sq.queue);
    qemu_vfree(q->cq.queue);
    qemu_mutex_destroy(&q->lock);
    g_free(q);
}

static void nvme_free_req_queue_cb(void *opaque)
{
    NVMeQueuePair *q = opaque;

    qemu_mutex_lock(&q->lock);
    while (qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
        /* Retry all pending requests */
    }
    qemu_mutex_unlock(&q->lock);
}

static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
                                             int idx, int size,
                                             Error **errp)
{
    int i, r;
    BDRVNVMeState *s = bs->opaque;
    Error *local_err = NULL;
    NVMeQueuePair *q = g_new0(NVMeQueuePair, 1);
    uint64_t prp_list_iova;

    qemu_mutex_init(&q->lock);
    q->index = idx;
    qemu_co_queue_init(&q->free_req_queue);
    q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE);
    r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
                          s->page_size * NVME_QUEUE_SIZE,
                          false, &prp_list_iova);
    if (r) {
        goto fail;
    }
    for (i = 0; i < NVME_QUEUE_SIZE; i++) {
        NVMeRequest *req = &q->reqs[i];
        req->cid = i + 1;
        req->prp_list_page = q->prp_list_pages + i * s->page_size;
        req->prp_list_iova = prp_list_iova + i * s->page_size;
    }
    nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto fail;
    }
    q->sq.doorbell = &s->regs->doorbells[idx * 2 * s->doorbell_scale];

    nvme_init_queue(bs, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto fail;
    }
    q->cq.doorbell = &s->regs->doorbells[idx * 2 * s->doorbell_scale + 1];

    return q;
fail:
    nvme_free_queue_pair(bs, q);
    return NULL;
}

/* With q->lock */
static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
{
    if (s->plugged || !q->need_kick) {
        return;
    }
    trace_nvme_kick(s, q->index);
    assert(!(q->sq.tail & 0xFF00));
    /* Fence the write to submission queue entry before notifying the
     * device. */
    smp_wmb();
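    /* Publish the new tail index; the controller then fetches the
     * q->need_kick entries queued since the last kick. */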
    *q->sq.doorbell = cpu_to_le32(q->sq.tail);
    q->inflight += q->need_kick;
    q->need_kick = 0;
}

/* Find a free request element if any, otherwise:
 * a) if in coroutine context, try to wait for one to become available;
 * b) if not in coroutine, return NULL;
 */
static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
{
    int i;
    NVMeRequest *req = NULL;

    qemu_mutex_lock(&q->lock);
    while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) {
        /* We have to leave one slot empty as that is the full queue case (head
         * == tail + 1). */
        if (qemu_in_coroutine()) {
            trace_nvme_free_req_queue_wait(q);
            qemu_co_queue_wait(&q->free_req_queue, &q->lock);
        } else {
            qemu_mutex_unlock(&q->lock);
            return NULL;
        }
    }
    for (i = 0; i < NVME_QUEUE_SIZE; i++) {
        if (!q->reqs[i].busy) {
            q->reqs[i].busy = true;
            req = &q->reqs[i];
            break;
        }
    }
    /* We have checked inflight and need_kick while holding q->lock, so one
     * free req must be available. */
    assert(req);
    qemu_mutex_unlock(&q->lock);
    return req;
}

static inline int nvme_translate_error(const NvmeCqe *c)
{
    uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
    if (status) {
        trace_nvme_error(le32_to_cpu(c->result),
                         le16_to_cpu(c->sq_head),
                         le16_to_cpu(c->sq_id),
                         le16_to_cpu(c->cid),
                         le16_to_cpu(status));
    }
    switch (status) {
    case 0:
        return 0;
    case 1:
        return -ENOSYS;
    case 2:
        return -EINVAL;
    default:
        return -EIO;
    }
}

/* With q->lock */
static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
{
    bool progress = false;
    NVMeRequest *preq;
    NVMeRequest req;
    NvmeCqe *c;

    trace_nvme_process_completion(s, q->index, q->inflight);
    if (q->busy || s->plugged) {
        trace_nvme_process_completion_queue_busy(s, q->index);
        return false;
    }
    q->busy = true;
    assert(q->inflight >= 0);
    while (q->inflight) {
        int16_t cid;
        c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
        if (!c->cid || (le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
            break;
        }
        q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
        if (!q->cq.head) {
            q->cq_phase = !q->cq_phase;
        }
        cid = le16_to_cpu(c->cid);
        if (cid == 0 || cid > NVME_QUEUE_SIZE) {
            fprintf(stderr, "Unexpected CID in completion queue: %" PRIu32 "\n",
                    cid);
            continue;
        }
        assert(cid <= NVME_QUEUE_SIZE);
        trace_nvme_complete_command(s, q->index, cid);
        preq = &q->reqs[cid - 1];
        req = *preq;
        assert(req.cid == cid);
        assert(req.cb);
        preq->busy = false;
        preq->cb = preq->opaque = NULL;
        qemu_mutex_unlock(&q->lock);
        req.cb(req.opaque, nvme_translate_error(c));
        qemu_mutex_lock(&q->lock);
        c->cid = cpu_to_le16(0);
        q->inflight--;
        /* Flip Phase Tag bit. */
        c->status = cpu_to_le16(le16_to_cpu(c->status) ^ 0x1);
        progress = true;
    }
    if (progress) {
        /* Notify the device so it can post more completions. */
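        /* Returning the advanced CQ head hands the consumed completion slots
         * back to the controller; coroutines parked in nvme_get_free_req()
         * are woken via a BH below because q->lock is still held here. */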
        smp_mb_release();
        *q->cq.doorbell = cpu_to_le32(q->cq.head);
        if (!qemu_co_queue_empty(&q->free_req_queue)) {
            aio_bh_schedule_oneshot(s->aio_context, nvme_free_req_queue_cb, q);
        }
    }
    q->busy = false;
    return progress;
}

static void nvme_trace_command(const NvmeCmd *cmd)
{
    int i;

    for (i = 0; i < 8; ++i) {
        uint8_t *cmdp = (uint8_t *)cmd + i * 8;
        trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3],
                                      cmdp[4], cmdp[5], cmdp[6], cmdp[7]);
    }
}

static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q,
                                NVMeRequest *req,
                                NvmeCmd *cmd, BlockCompletionFunc cb,
                                void *opaque)
{
    assert(!req->cb);
    req->cb = cb;
    req->opaque = opaque;
    cmd->cid = cpu_to_le32(req->cid);

    trace_nvme_submit_command(s, q->index, req->cid);
    nvme_trace_command(cmd);
    qemu_mutex_lock(&q->lock);
    memcpy((uint8_t *)q->sq.queue +
           q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
    q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
    q->need_kick++;
    nvme_kick(s, q);
    nvme_process_completion(s, q);
    qemu_mutex_unlock(&q->lock);
}

static void nvme_cmd_sync_cb(void *opaque, int ret)
{
    int *pret = opaque;
    *pret = ret;
    aio_wait_kick();
}

static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
                         NvmeCmd *cmd)
{
    NVMeRequest *req;
    BDRVNVMeState *s = bs->opaque;
    int ret = -EINPROGRESS;
    req = nvme_get_free_req(q);
    if (!req) {
        return -EBUSY;
    }
    nvme_submit_command(s, q, req, cmd, nvme_cmd_sync_cb, &ret);

    BDRV_POLL_WHILE(bs, ret == -EINPROGRESS);
    return ret;
}

static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
{
    BDRVNVMeState *s = bs->opaque;
    NvmeIdCtrl *idctrl;
    NvmeIdNs *idns;
    uint8_t *resp;
    int r;
    uint64_t iova;
    NvmeCmd cmd = {
        .opcode = NVME_ADM_CMD_IDENTIFY,
        .cdw10 = cpu_to_le32(0x1),
    };

    resp = qemu_try_blockalign0(bs, sizeof(NvmeIdCtrl));
    if (!resp) {
        error_setg(errp, "Cannot allocate buffer for identify response");
        goto out;
    }
    idctrl = (NvmeIdCtrl *)resp;
    idns = (NvmeIdNs *)resp;
    r = qemu_vfio_dma_map(s->vfio, resp, sizeof(NvmeIdCtrl), true, &iova);
    if (r) {
        error_setg(errp, "Cannot map buffer for DMA");
        goto out;
    }
    cmd.prp1 = cpu_to_le64(iova);

    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
        error_setg(errp, "Failed to identify controller");
        goto out;
    }

    if (le32_to_cpu(idctrl->nn) < namespace) {
        error_setg(errp, "Invalid namespace");
        goto out;
    }
    s->write_cache_supported = le32_to_cpu(idctrl->vwc) & 0x1;
    s->max_transfer = (idctrl->mdts ? 1 << idctrl->mdts : 0) * s->page_size;
    /* For now the page list buffer per command is one page, to hold at most
     * s->page_size / sizeof(uint64_t) entries. */
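    /* e.g. with 4 KiB pages: 4096 / 8 = 512 PRP entries, so the cap is
     * 512 * 4 KiB = 2 MiB per command. */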
    s->max_transfer = MIN_NON_ZERO(s->max_transfer,
                          s->page_size / sizeof(uint64_t) * s->page_size);

    memset(resp, 0, 4096);

    cmd.cdw10 = 0;
    cmd.nsid = cpu_to_le32(namespace);
    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
        error_setg(errp, "Failed to identify namespace");
        goto out;
    }

    s->nsze = le64_to_cpu(idns->nsze);

out:
    qemu_vfio_dma_unmap(s->vfio, resp);
    qemu_vfree(resp);
}

static bool nvme_poll_queues(BDRVNVMeState *s)
{
    bool progress = false;
    int i;

    for (i = 0; i < s->nr_queues; i++) {
        NVMeQueuePair *q = s->queues[i];
        qemu_mutex_lock(&q->lock);
        while (nvme_process_completion(s, q)) {
            /* Keep polling */
            progress = true;
        }
        qemu_mutex_unlock(&q->lock);
    }
    return progress;
}

static void nvme_handle_event(EventNotifier *n)
{
    BDRVNVMeState *s = container_of(n, BDRVNVMeState, irq_notifier);

    trace_nvme_handle_event(s);
    event_notifier_test_and_clear(n);
    nvme_poll_queues(s);
}

static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
{
    BDRVNVMeState *s = bs->opaque;
    int n = s->nr_queues;
    NVMeQueuePair *q;
    NvmeCmd cmd;
    int queue_size = NVME_QUEUE_SIZE;

    q = nvme_create_queue_pair(bs, n, queue_size, errp);
    if (!q) {
        return false;
    }
    cmd = (NvmeCmd) {
        .opcode = NVME_ADM_CMD_CREATE_CQ,
        .prp1 = cpu_to_le64(q->cq.iova),
        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
        .cdw11 = cpu_to_le32(0x3),
    };
    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
        error_setg(errp, "Failed to create io queue [%d]", n);
        nvme_free_queue_pair(bs, q);
        return false;
    }
    cmd = (NvmeCmd) {
        .opcode = NVME_ADM_CMD_CREATE_SQ,
        .prp1 = cpu_to_le64(q->sq.iova),
        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
        .cdw11 = cpu_to_le32(0x1 | (n << 16)),
    };
    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
        error_setg(errp, "Failed to create io queue [%d]", n);
        nvme_free_queue_pair(bs, q);
        return false;
    }
    s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
    s->queues[n] = q;
    s->nr_queues++;
    return true;
}

static bool nvme_poll_cb(void *opaque)
{
    EventNotifier *e = opaque;
    BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier);
    bool progress = false;

    trace_nvme_poll_cb(s);
    progress = nvme_poll_queues(s);
    return progress;
}

static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
                     Error **errp)
{
    BDRVNVMeState *s = bs->opaque;
    int ret;
    uint64_t cap;
    uint64_t timeout_ms;
    uint64_t deadline, now;
    Error *local_err = NULL;

    qemu_co_mutex_init(&s->dma_map_lock);
    qemu_co_queue_init(&s->dma_flush_queue);
    s->device = g_strdup(device);
    s->nsid = namespace;
    s->aio_context = bdrv_get_aio_context(bs);
    ret = event_notifier_init(&s->irq_notifier, 0);
    if (ret) {
        error_setg(errp, "Failed to init event notifier");
        return ret;
    }

    s->vfio = qemu_vfio_open_pci(device, errp);
    if (!s->vfio) {
        ret = -EINVAL;
        goto out;
    }

    s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, NVME_BAR_SIZE, errp);
    if (!s->regs) {
        ret = -EINVAL;
        goto out;
    }

    /* Perform initialize sequence as described in NVMe spec "7.6.1
     * Initialization". */
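    /* CAP fields used below: CSS (bit 37 = NVM command set supported),
     * MPSMIN (bits 51:48), DSTRD (doorbell stride, bits 35:32) and
     * TO (bits 31:24, in units of 500 ms). */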

    cap = le64_to_cpu(s->regs->cap);
    if (!(cap & (1ULL << 37))) {
        error_setg(errp, "Device doesn't support NVMe command set");
        ret = -EINVAL;
        goto out;
    }

    s->page_size = MAX(4096, 1 << (12 + ((cap >> 48) & 0xF)));
    s->doorbell_scale = (4 << (((cap >> 32) & 0xF))) / sizeof(uint32_t);
    bs->bl.opt_mem_alignment = s->page_size;
    timeout_ms = MIN(500 * ((cap >> 24) & 0xFF), 30000);

    /* Reset device to get a clean state. */
    s->regs->cc = cpu_to_le32(le32_to_cpu(s->regs->cc) & 0xFE);
    /* Wait for CSTS.RDY = 0. */
    deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * 1000000ULL;
    while (le32_to_cpu(s->regs->csts) & 0x1) {
        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
            error_setg(errp, "Timeout while waiting for device to reset (%"
                             PRId64 " ms)",
                       timeout_ms);
            ret = -ETIMEDOUT;
            goto out;
        }
    }

    /* Set up admin queue. */
    s->queues = g_new(NVMeQueuePair *, 1);
    s->nr_queues = 1;
    s->queues[0] = nvme_create_queue_pair(bs, 0, NVME_QUEUE_SIZE, errp);
    if (!s->queues[0]) {
        ret = -EINVAL;
        goto out;
    }
    QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
    s->regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << 16) | NVME_QUEUE_SIZE);
    s->regs->asq = cpu_to_le64(s->queues[0]->sq.iova);
    s->regs->acq = cpu_to_le64(s->queues[0]->cq.iova);

    /* After setting up all control registers we can enable device now. */
    s->regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) |
                              (ctz32(NVME_SQ_ENTRY_BYTES) << 16) |
                              0x1);
    /* Wait for CSTS.RDY = 1. */
    now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    deadline = now + timeout_ms * 1000000;
    while (!(le32_to_cpu(s->regs->csts) & 0x1)) {
        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
            error_setg(errp, "Timeout while waiting for device to start (%"
                             PRId64 " ms)",
                       timeout_ms);
            ret = -ETIMEDOUT;
            goto out;
        }
    }

    ret = qemu_vfio_pci_init_irq(s->vfio, &s->irq_notifier,
                                 VFIO_PCI_MSIX_IRQ_INDEX, errp);
    if (ret) {
        goto out;
    }
    aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
                           false, nvme_handle_event, nvme_poll_cb);

    nvme_identify(bs, namespace, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EIO;
        goto out;
    }

    /* Set up command queues. */
    if (!nvme_add_io_queue(bs, errp)) {
        ret = -EIO;
    }
out:
    /* Cleaning up is done in nvme_file_open() upon error. */
    return ret;
}

/* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example:
 *
 *     nvme://0000:44:00.0/1
 *
 * where the "nvme://" is a fixed form of the protocol prefix, the middle part
 * is the PCI address, and the last part is the namespace number starting from
 * 1 according to the NVMe spec.
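 *
 * e.g. "nvme://0000:44:00.0/1" maps to {"device": "0000:44:00.0",
 * "namespace": "1"} in @options; if the namespace part is omitted, the
 * default of 1 is applied in nvme_file_open().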
 */
static void nvme_parse_filename(const char *filename, QDict *options,
                                Error **errp)
{
    int pref = strlen("nvme://");

    if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) {
        const char *tmp = filename + pref;
        char *device;
        const char *namespace;
        unsigned long ns;
        const char *slash = strchr(tmp, '/');
        if (!slash) {
            qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp);
            return;
        }
        device = g_strndup(tmp, slash - tmp);
        qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device);
        g_free(device);
        namespace = slash + 1;
        if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) {
            error_setg(errp, "Invalid namespace '%s', positive number expected",
                       namespace);
            return;
        }
        qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE,
                      *namespace ? namespace : "1");
    }
}

static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
                                           Error **errp)
{
    int ret;
    BDRVNVMeState *s = bs->opaque;
    NvmeCmd cmd = {
        .opcode = NVME_ADM_CMD_SET_FEATURES,
        .nsid = cpu_to_le32(s->nsid),
        .cdw10 = cpu_to_le32(0x06),
        .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
    };

    ret = nvme_cmd_sync(bs, s->queues[0], &cmd);
    if (ret) {
        error_setg(errp, "Failed to configure NVMe write cache");
    }
    return ret;
}

static void nvme_close(BlockDriverState *bs)
{
    int i;
    BDRVNVMeState *s = bs->opaque;

    for (i = 0; i < s->nr_queues; ++i) {
        nvme_free_queue_pair(bs, s->queues[i]);
    }
    g_free(s->queues);
    aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
                           false, NULL, NULL);
    event_notifier_cleanup(&s->irq_notifier);
    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE);
    qemu_vfio_close(s->vfio);

    g_free(s->device);
}

static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
                          Error **errp)
{
    const char *device;
    QemuOpts *opts;
    int namespace;
    int ret;
    BDRVNVMeState *s = bs->opaque;

    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &error_abort);
    device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
    if (!device) {
        error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required");
        qemu_opts_del(opts);
        return -EINVAL;
    }

    namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1);
    ret = nvme_init(bs, device, namespace, errp);
    qemu_opts_del(opts);
    if (ret) {
        goto fail;
    }
    if (flags & BDRV_O_NOCACHE) {
        if (!s->write_cache_supported) {
            error_setg(errp,
                       "NVMe controller doesn't support write cache configuration");
            ret = -EINVAL;
        } else {
            ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE),
                                                  errp);
        }
        if (ret) {
            goto fail;
        }
    }
    bs->supported_write_flags = BDRV_REQ_FUA;
    return 0;
fail:
    nvme_close(bs);
    return ret;
}

static int64_t nvme_getlength(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;

    return s->nsze << BDRV_SECTOR_BITS;
}

/* Called with s->dma_map_lock */
static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs,
                                            QEMUIOVector *qiov)
{
    int r = 0;
    BDRVNVMeState *s = bs->opaque;

    s->dma_map_count -= qiov->size;
    if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) {
        r = qemu_vfio_dma_reset_temporary(s->vfio);
        if (!r) {
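            /* The temporary IOVA mappings were dropped, so coroutines waiting
             * in nvme_cmd_map_qiov() can retry their DMA mappings now. */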
            qemu_co_queue_restart_all(&s->dma_flush_queue);
        }
    }
    return r;
}

/* Called with s->dma_map_lock */
static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
                                          NVMeRequest *req, QEMUIOVector *qiov)
{
    BDRVNVMeState *s = bs->opaque;
    uint64_t *pagelist = req->prp_list_page;
    int i, j, r;
    int entries = 0;

    assert(qiov->size);
    assert(QEMU_IS_ALIGNED(qiov->size, s->page_size));
    assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t));
    for (i = 0; i < qiov->niov; ++i) {
        bool retry = true;
        uint64_t iova;
try_map:
        r = qemu_vfio_dma_map(s->vfio,
                              qiov->iov[i].iov_base,
                              qiov->iov[i].iov_len,
                              true, &iova);
        if (r == -ENOMEM && retry) {
            retry = false;
            trace_nvme_dma_flush_queue_wait(s);
            if (s->dma_map_count) {
                trace_nvme_dma_map_flush(s);
                qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock);
            } else {
                r = qemu_vfio_dma_reset_temporary(s->vfio);
                if (r) {
                    goto fail;
                }
            }
            goto try_map;
        }
        if (r) {
            goto fail;
        }

        for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) {
            pagelist[entries++] = cpu_to_le64(iova + j * s->page_size);
        }
        trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base,
                                    qiov->iov[i].iov_len / s->page_size);
    }

    s->dma_map_count += qiov->size;

    assert(entries <= s->page_size / sizeof(uint64_t));
    switch (entries) {
    case 0:
        abort();
    case 1:
        cmd->prp1 = pagelist[0];
        cmd->prp2 = 0;
        break;
    case 2:
        cmd->prp1 = pagelist[0];
        cmd->prp2 = pagelist[1];
        break;
    default:
        cmd->prp1 = pagelist[0];
        cmd->prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
        break;
    }
    trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
    for (i = 0; i < entries; ++i) {
        trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]);
    }
    return 0;
fail:
    /* No need to unmap [0 - i) iovs even if we've failed, since we don't
     * increment s->dma_map_count. This is okay for fixed mapping memory areas
     * because they are already mapped before calling this function; for
     * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by
     * calling qemu_vfio_dma_reset_temporary when necessary. */
    return r;
}

typedef struct {
    Coroutine *co;
    int ret;
    AioContext *ctx;
} NVMeCoData;

static void nvme_rw_cb_bh(void *opaque)
{
    NVMeCoData *data = opaque;
    qemu_coroutine_enter(data->co);
}

static void nvme_rw_cb(void *opaque, int ret)
{
    NVMeCoData *data = opaque;
    data->ret = ret;
    if (!data->co) {
        /* The rw coroutine hasn't yielded, don't try to enter. */
        return;
    }
    aio_bh_schedule_oneshot(data->ctx, nvme_rw_cb_bh, data);
}

static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov,
                                            bool is_write,
                                            int flags)
{
    int r;
    BDRVNVMeState *s = bs->opaque;
    NVMeQueuePair *ioq = s->queues[1];
    NVMeRequest *req;
    uint32_t cdw12 = (((bytes >> BDRV_SECTOR_BITS) - 1) & 0xFFFF) |
                     (flags & BDRV_REQ_FUA ? 1 << 30 : 0);
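    /* NVM read/write command: cdw10/cdw11 carry the 64-bit starting LBA and
     * cdw12 bits 15:0 the zero-based block count; BDRV_REQ_FUA maps to the
     * FUA bit (bit 30) in cdw12. */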
    NvmeCmd cmd = {
        .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ,
        .nsid = cpu_to_le32(s->nsid),
        .cdw10 = cpu_to_le32((offset >> BDRV_SECTOR_BITS) & 0xFFFFFFFF),
        .cdw11 = cpu_to_le32(((offset >> BDRV_SECTOR_BITS) >> 32) & 0xFFFFFFFF),
        .cdw12 = cpu_to_le32(cdw12),
    };
    NVMeCoData data = {
        .ctx = bdrv_get_aio_context(bs),
        .ret = -EINPROGRESS,
    };

    trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
    assert(s->nr_queues > 1);
    req = nvme_get_free_req(ioq);
    assert(req);

    qemu_co_mutex_lock(&s->dma_map_lock);
    r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
    qemu_co_mutex_unlock(&s->dma_map_lock);
    if (r) {
        req->busy = false;
        return r;
    }
    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);

    data.co = qemu_coroutine_self();
    while (data.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }

    qemu_co_mutex_lock(&s->dma_map_lock);
    r = nvme_cmd_unmap_qiov(bs, qiov);
    qemu_co_mutex_unlock(&s->dma_map_lock);
    if (r) {
        return r;
    }

    trace_nvme_rw_done(s, is_write, offset, bytes, data.ret);
    return data.ret;
}

static inline bool nvme_qiov_aligned(BlockDriverState *bs,
                                     const QEMUIOVector *qiov)
{
    int i;
    BDRVNVMeState *s = bs->opaque;

    for (i = 0; i < qiov->niov; ++i) {
        if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) ||
            !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) {
            trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
                                      qiov->iov[i].iov_len, s->page_size);
            return false;
        }
    }
    return true;
}

static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
                       QEMUIOVector *qiov, bool is_write, int flags)
{
    BDRVNVMeState *s = bs->opaque;
    int r;
    uint8_t *buf = NULL;
    QEMUIOVector local_qiov;

    assert(QEMU_IS_ALIGNED(offset, s->page_size));
    assert(QEMU_IS_ALIGNED(bytes, s->page_size));
    assert(bytes <= s->max_transfer);
    if (nvme_qiov_aligned(bs, qiov)) {
        return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
    }
    trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
    buf = qemu_try_blockalign(bs, bytes);

    if (!buf) {
        return -ENOMEM;
    }
    qemu_iovec_init(&local_qiov, 1);
    if (is_write) {
        qemu_iovec_to_buf(qiov, 0, buf, bytes);
    }
    qemu_iovec_add(&local_qiov, buf, bytes);
    r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags);
    qemu_iovec_destroy(&local_qiov);
    if (!r && !is_write) {
        qemu_iovec_from_buf(qiov, 0, buf, bytes);
    }
    qemu_vfree(buf);
    return r;
}

static coroutine_fn int nvme_co_preadv(BlockDriverState *bs,
                                       uint64_t offset, uint64_t bytes,
                                       QEMUIOVector *qiov, int flags)
{
    return nvme_co_prw(bs, offset, bytes, qiov, false, flags);
}

static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs,
                                        uint64_t offset, uint64_t bytes,
                                        QEMUIOVector *qiov, int flags)
{
    return nvme_co_prw(bs, offset, bytes, qiov, true, flags);
}

static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;
    NVMeQueuePair *ioq = s->queues[1];
    NVMeRequest *req;
    NvmeCmd cmd = {
        .opcode = NVME_CMD_FLUSH,
        .nsid = cpu_to_le32(s->nsid),
    };
    NVMeCoData data = {
        .ctx = bdrv_get_aio_context(bs),
        .ret = -EINPROGRESS,
    };

    assert(s->nr_queues > 1);
    req = nvme_get_free_req(ioq);
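    /* In coroutine context nvme_get_free_req() waits on free_req_queue rather
     * than returning NULL, hence the assert below. */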
    assert(req);
    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);

    data.co = qemu_coroutine_self();
    if (data.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }

    return data.ret;
}


static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
                               BlockReopenQueue *queue, Error **errp)
{
    return 0;
}

static void nvme_refresh_filename(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;

    snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i",
             s->device, s->nsid);
}

static void nvme_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVNVMeState *s = bs->opaque;

    bs->bl.opt_mem_alignment = s->page_size;
    bs->bl.request_alignment = s->page_size;
    bs->bl.max_transfer = s->max_transfer;
}

static void nvme_detach_aio_context(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;

    aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
                           false, NULL, NULL);
}

static void nvme_attach_aio_context(BlockDriverState *bs,
                                    AioContext *new_context)
{
    BDRVNVMeState *s = bs->opaque;

    s->aio_context = new_context;
    aio_set_event_notifier(new_context, &s->irq_notifier,
                           false, nvme_handle_event, nvme_poll_cb);
}

static void nvme_aio_plug(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;
    assert(!s->plugged);
    s->plugged = true;
}

static void nvme_aio_unplug(BlockDriverState *bs)
{
    int i;
    BDRVNVMeState *s = bs->opaque;
    assert(s->plugged);
    s->plugged = false;
    for (i = 1; i < s->nr_queues; i++) {
        NVMeQueuePair *q = s->queues[i];
        qemu_mutex_lock(&q->lock);
        nvme_kick(s, q);
        nvme_process_completion(s, q);
        qemu_mutex_unlock(&q->lock);
    }
}

static void nvme_register_buf(BlockDriverState *bs, void *host, size_t size)
{
    int ret;
    BDRVNVMeState *s = bs->opaque;

    ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL);
    if (ret) {
        /* FIXME: we may run out of IOVA addresses after repeated
         * bdrv_register_buf/bdrv_unregister_buf, because nvme_vfio_dma_unmap
         * doesn't reclaim addresses for fixed mappings. */
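        /* Registration is only an optimization (a permanent mapping set up
         * front), so the failure is reported but not returned; I/O falls
         * back to the per-request temporary mappings in nvme_cmd_map_qiov(). */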
        error_report("nvme_register_buf failed: %s", strerror(-ret));
    }
}

static void nvme_unregister_buf(BlockDriverState *bs, void *host)
{
    BDRVNVMeState *s = bs->opaque;

    qemu_vfio_dma_unmap(s->vfio, host);
}

static const char *const nvme_strong_runtime_opts[] = {
    NVME_BLOCK_OPT_DEVICE,
    NVME_BLOCK_OPT_NAMESPACE,

    NULL
};

static BlockDriver bdrv_nvme = {
    .format_name              = "nvme",
    .protocol_name            = "nvme",
    .instance_size            = sizeof(BDRVNVMeState),

    .bdrv_parse_filename      = nvme_parse_filename,
    .bdrv_file_open           = nvme_file_open,
    .bdrv_close               = nvme_close,
    .bdrv_getlength           = nvme_getlength,

    .bdrv_co_preadv           = nvme_co_preadv,
    .bdrv_co_pwritev          = nvme_co_pwritev,
    .bdrv_co_flush_to_disk    = nvme_co_flush,
    .bdrv_reopen_prepare      = nvme_reopen_prepare,

    .bdrv_refresh_filename    = nvme_refresh_filename,
    .bdrv_refresh_limits      = nvme_refresh_limits,
    .strong_runtime_opts      = nvme_strong_runtime_opts,

    .bdrv_detach_aio_context  = nvme_detach_aio_context,
    .bdrv_attach_aio_context  = nvme_attach_aio_context,

    .bdrv_io_plug             = nvme_aio_plug,
    .bdrv_io_unplug           = nvme_aio_unplug,

    .bdrv_register_buf        = nvme_register_buf,
    .bdrv_unregister_buf      = nvme_unregister_buf,
};

static void bdrv_nvme_init(void)
{
    bdrv_register(&bdrv_nvme);
}

block_init(bdrv_nvme_init);