/*
 * NVMe block driver based on vfio
 *
 * Copyright 2016 - 2018 Red Hat, Inc.
 *
 * Authors:
 *   Fam Zheng <famz@redhat.com>
 *   Paolo Bonzini <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <linux/vfio.h>
#include "qapi/error.h"
#include "qobject/qdict.h"
#include "qobject/qstring.h"
#include "qemu/defer-call.h"
#include "qemu/error-report.h"
#include "qemu/host-pci-mmio.h"
#include "qemu/main-loop.h"
#include "qemu/module.h"
#include "qemu/cutils.h"
#include "qemu/option.h"
#include "qemu/memalign.h"
#include "qemu/vfio-helpers.h"
#include "block/block-io.h"
#include "block/block_int.h"
#include "system/block-backend.h"
#include "system/replay.h"
#include "trace.h"

#include "block/nvme.h"

#define NVME_SQ_ENTRY_BYTES 64
#define NVME_CQ_ENTRY_BYTES 16
#define NVME_QUEUE_SIZE 128
#define NVME_DOORBELL_SIZE 4096

/*
 * We have to leave one slot empty as that is the full queue case where
 * head == tail + 1.
 */
#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)

typedef struct BDRVNVMeState BDRVNVMeState;

/* Same index is used for queues and IRQs */
#define INDEX_ADMIN     0
#define INDEX_IO(n)     (1 + n)

/* This driver shares a single MSIX IRQ for the admin and I/O queues */
enum {
    MSIX_SHARED_IRQ_IDX = 0,
    MSIX_IRQ_COUNT = 1
};

typedef struct {
    int32_t  head, tail;
    uint8_t  *queue;
    uint64_t iova;
    /* Hardware MMIO register */
    uint32_t *doorbell;
} NVMeQueue;

typedef struct {
    BlockCompletionFunc *cb;
    void *opaque;
    int cid;
    void *prp_list_page;
    uint64_t prp_list_iova;
    int free_req_next; /* q->reqs[] index of next free req */
} NVMeRequest;

typedef struct {
    QemuMutex   lock;

    /* Read from I/O code path, initialized under BQL */
    BDRVNVMeState *s;
    int         index;

    /* Fields protected by BQL */
    uint8_t     *prp_list_pages;

    /* Fields protected by @lock */
    CoQueue     free_req_queue;
    NVMeQueue   sq, cq;
    int         cq_phase;
    int         free_req_head;
    NVMeRequest reqs[NVME_NUM_REQS];
    int         need_kick;
    int         inflight;

    /* Thread-safe, no lock necessary */
    QEMUBH      *completion_bh;
} NVMeQueuePair;

struct BDRVNVMeState {
    AioContext *aio_context;
    QEMUVFIOState *vfio;
    void *bar0_wo_map;
    /* Memory mapped registers */
    struct {
        uint32_t sq_tail;
        uint32_t cq_head;
    } *doorbells;
    /* The submission/completion queue pairs.
     * [0]: admin queue.
     * [1..]: io queues.
     */
    NVMeQueuePair **queues;
    unsigned queue_count;
    size_t page_size;
    /* How many uint32_t elements does each doorbell entry take. */
    size_t doorbell_scale;
    bool write_cache_supported;
    EventNotifier irq_notifier[MSIX_IRQ_COUNT];

    uint64_t nsze; /* Namespace size reported by identify command */
    int nsid;      /* The namespace id to read/write data. */
    int blkshift;

    uint64_t max_transfer;

    bool supports_write_zeroes;
    bool supports_discard;

    CoMutex dma_map_lock;
    CoQueue dma_flush_queue;

    /* Total size of mapped qiov, accessed under dma_map_lock */
    int dma_map_count;

    /* PCI address (required for nvme_refresh_filename()) */
    char *device;

    struct {
        uint64_t completion_errors;
        uint64_t aligned_accesses;
        uint64_t unaligned_accesses;
    } stats;
};

#define NVME_BLOCK_OPT_DEVICE "device"
#define NVME_BLOCK_OPT_NAMESPACE "namespace"

static void nvme_process_completion_bh(void *opaque);

static QemuOptsList runtime_opts = {
    .name = "nvme",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = NVME_BLOCK_OPT_DEVICE,
            .type = QEMU_OPT_STRING,
            .help = "NVMe PCI device address",
        },
        {
            .name = NVME_BLOCK_OPT_NAMESPACE,
            .type = QEMU_OPT_NUMBER,
            .help = "NVMe namespace",
        },
        { /* end of list */ }
    },
};

/* Returns true on success, false on failure. */
static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
                            unsigned nentries, size_t entry_bytes, Error **errp)
{
    ERRP_GUARD();
    size_t bytes;
    int r;

    bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size());
    q->head = q->tail = 0;
    q->queue = qemu_try_memalign(qemu_real_host_page_size(), bytes);
    if (!q->queue) {
        error_setg(errp, "Cannot allocate queue");
        return false;
    }
    memset(q->queue, 0, bytes);
    r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova, errp);
    if (r) {
        error_prepend(errp, "Cannot map queue: ");
    }
    return r == 0;
}

static void nvme_free_queue(NVMeQueue *q)
{
    qemu_vfree(q->queue);
}

static void nvme_free_queue_pair(NVMeQueuePair *q)
{
    trace_nvme_free_queue_pair(q->index, q, &q->cq, &q->sq);
    if (q->completion_bh) {
        qemu_bh_delete(q->completion_bh);
    }
    nvme_free_queue(&q->sq);
    nvme_free_queue(&q->cq);
    qemu_vfree(q->prp_list_pages);
    qemu_mutex_destroy(&q->lock);
    g_free(q);
}

static void nvme_free_req_queue_cb(void *opaque)
{
    NVMeQueuePair *q = opaque;

    qemu_mutex_lock(&q->lock);
    while (q->free_req_head != -1 &&
           qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
        /* Retry waiting requests */
    }
    qemu_mutex_unlock(&q->lock);
}

static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
                                             AioContext *aio_context,
                                             unsigned idx, size_t size,
                                             Error **errp)
{
    ERRP_GUARD();
    int i, r;
    NVMeQueuePair *q;
    uint64_t prp_list_iova;
    size_t bytes;

    q = g_try_new0(NVMeQueuePair, 1);
    if (!q) {
        error_setg(errp, "Cannot allocate queue pair");
        return NULL;
    }
    trace_nvme_create_queue_pair(idx, q, size, aio_context,
                                 event_notifier_get_fd(s->irq_notifier));
    bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS,
                          qemu_real_host_page_size());
    q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size(), bytes);
    if (!q->prp_list_pages) {
        error_setg(errp, "Cannot allocate PRP page list");
        goto fail;
    }
    memset(q->prp_list_pages, 0, bytes);
    qemu_mutex_init(&q->lock);
    q->s = s;
    q->index = idx;
    qemu_co_queue_init(&q->free_req_queue);
    q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q);
    r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes,
                          false, &prp_list_iova, errp);
    if (r) {
        error_prepend(errp, "Cannot map buffer for DMA: ");
        goto fail;
    }
    q->free_req_head = -1;
    for (i = 0; i < NVME_NUM_REQS; i++) {
        NVMeRequest *req = &q->reqs[i];
        req->cid = i + 1;
        req->free_req_next = q->free_req_head;
        q->free_req_head = i;
        req->prp_list_page = q->prp_list_pages + i * s->page_size;
        req->prp_list_iova = prp_list_iova + i * s->page_size;
    }

    if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) {
        goto fail;
    }
    q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;

    if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) {
        goto fail;
    }
    q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;

    return q;
fail:
    nvme_free_queue_pair(q);
    return NULL;
}
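
/*
 * Doorbell layout note (illustrative, not from the original source): per the
 * NVMe spec, queue doorbells start at BAR0 offset 0x1000 and are spaced
 * (4 << CAP.DSTRD) bytes apart, SQ tail first, then CQ head.  doorbell_scale
 * (set in nvme_init() below) converts that stride into uint32_t units, so
 * s->doorbells[idx * s->doorbell_scale] addresses the pair for queue @idx.
 * E.g. with the common CAP.DSTRD == 0, queue 1 uses BAR0 offsets 0x1008
 * (SQ tail) and 0x100C (CQ head).
 */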

/* With q->lock */
static void nvme_kick(NVMeQueuePair *q)
{
    BDRVNVMeState *s = q->s;

    if (!q->need_kick) {
        return;
    }
    trace_nvme_kick(s, q->index);
    assert(!(q->sq.tail & 0xFF00));
    /* Fence the write to submission queue entry before notifying the device. */
    smp_wmb();
    host_pci_stl_le_p(q->sq.doorbell, q->sq.tail);
    q->inflight += q->need_kick;
    q->need_kick = 0;
}

static NVMeRequest *nvme_get_free_req_nofail_locked(NVMeQueuePair *q)
{
    NVMeRequest *req;

    req = &q->reqs[q->free_req_head];
    q->free_req_head = req->free_req_next;
    req->free_req_next = -1;
    return req;
}

/* Return a free request element if any, otherwise return NULL. */
static NVMeRequest *nvme_get_free_req_nowait(NVMeQueuePair *q)
{
    QEMU_LOCK_GUARD(&q->lock);
    if (q->free_req_head == -1) {
        return NULL;
    }
    return nvme_get_free_req_nofail_locked(q);
}

/*
 * Wait for a free request to become available if necessary, then
 * return it.
 */
static coroutine_fn NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
{
    QEMU_LOCK_GUARD(&q->lock);

    while (q->free_req_head == -1) {
        trace_nvme_free_req_queue_wait(q->s, q->index);
        qemu_co_queue_wait(&q->free_req_queue, &q->lock);
    }

    return nvme_get_free_req_nofail_locked(q);
}

/* With q->lock */
static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
{
    req->free_req_next = q->free_req_head;
    q->free_req_head = req - q->reqs;
}

/* With q->lock */
static void nvme_wake_free_req_locked(NVMeQueuePair *q)
{
    if (!qemu_co_queue_empty(&q->free_req_queue)) {
        replay_bh_schedule_oneshot_event(q->s->aio_context,
                                         nvme_free_req_queue_cb, q);
    }
}

/* Insert a request in the freelist and wake waiters */
static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req)
{
    qemu_mutex_lock(&q->lock);
    nvme_put_free_req_locked(q, req);
    nvme_wake_free_req_locked(q);
    qemu_mutex_unlock(&q->lock);
}

static inline int nvme_translate_error(const NvmeCqe *c)
{
    uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
    if (status) {
        trace_nvme_error(le32_to_cpu(c->result),
                         le16_to_cpu(c->sq_head),
                         le16_to_cpu(c->sq_id),
                         le16_to_cpu(c->cid),
                         le16_to_cpu(status));
    }
    switch (status) {
    case 0:
        return 0;
    case 1:
        return -ENOSYS;
    case 2:
        return -EINVAL;
    default:
        return -EIO;
    }
}
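
/*
 * Completion queue phase tracking, roughly: the controller sets bit 0 of the
 * CQE status field to the current "phase", which it flips every time it
 * wraps around the completion queue.  q->cq_phase starts at 0 (the queue
 * pair is zero-allocated) and is toggled in nvme_process_completion() below
 * whenever our head pointer wraps, so an entry is new exactly when its phase
 * bit differs from q->cq_phase.
 */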

/* With q->lock */
static bool nvme_process_completion(NVMeQueuePair *q)
{
    BDRVNVMeState *s = q->s;
    bool progress = false;
    NVMeRequest *preq;
    NVMeRequest req;
    NvmeCqe *c;

    trace_nvme_process_completion(s, q->index, q->inflight);

    /*
     * Support re-entrancy when a request cb() function invokes aio_poll().
     * Pending completions must be visible to aio_poll() so that a cb()
     * function can wait for the completion of another request.
     *
     * The aio_poll() loop will execute our BH and we'll resume completion
     * processing there.
     */
    qemu_bh_schedule(q->completion_bh);

    assert(q->inflight >= 0);
    while (q->inflight) {
        int ret;
        int16_t cid;

        c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
        if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
            break;
        }
        ret = nvme_translate_error(c);
        if (ret) {
            s->stats.completion_errors++;
        }
        q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
        if (!q->cq.head) {
            q->cq_phase = !q->cq_phase;
        }
        cid = le16_to_cpu(c->cid);
        if (cid == 0 || cid > NVME_NUM_REQS) {
            warn_report("NVMe: Unexpected CID in completion queue: %" PRIu32
                        ", should be within: 1..%u inclusively", cid,
                        NVME_NUM_REQS);
            continue;
        }
        trace_nvme_complete_command(s, q->index, cid);
        preq = &q->reqs[cid - 1];
        req = *preq;
        assert(req.cid == cid);
        assert(req.cb);
        nvme_put_free_req_locked(q, preq);
        preq->cb = preq->opaque = NULL;
        q->inflight--;
        qemu_mutex_unlock(&q->lock);
        req.cb(req.opaque, ret);
        qemu_mutex_lock(&q->lock);
        progress = true;
    }
    if (progress) {
        /* Notify the device so it can post more completions. */
        smp_mb_release();
        host_pci_stl_le_p(q->cq.doorbell, q->cq.head);
        nvme_wake_free_req_locked(q);
    }

    qemu_bh_cancel(q->completion_bh);

    return progress;
}

static void nvme_process_completion_bh(void *opaque)
{
    NVMeQueuePair *q = opaque;

    /*
     * We're being invoked because a nvme_process_completion() cb() function
     * called aio_poll(). The callback may be waiting for further completions
     * so notify the device that it has space to fill in more completions now.
     */
    smp_mb_release();
    host_pci_stl_le_p(q->cq.doorbell, q->cq.head);
    nvme_wake_free_req_locked(q);

    nvme_process_completion(q);
}

static void nvme_trace_command(const NvmeCmd *cmd)
{
    int i;

    if (!trace_event_get_state_backends(TRACE_NVME_SUBMIT_COMMAND_RAW)) {
        return;
    }
    for (i = 0; i < 8; ++i) {
        uint8_t *cmdp = (uint8_t *)cmd + i * 8;
        trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3],
                                      cmdp[4], cmdp[5], cmdp[6], cmdp[7]);
    }
}

static void nvme_deferred_fn(void *opaque)
{
    NVMeQueuePair *q = opaque;

    QEMU_LOCK_GUARD(&q->lock);
    nvme_kick(q);
    nvme_process_completion(q);
}

static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
                                NvmeCmd *cmd, BlockCompletionFunc cb,
                                void *opaque)
{
    assert(!req->cb);
    req->cb = cb;
    req->opaque = opaque;
    cmd->cid = cpu_to_le16(req->cid);

    trace_nvme_submit_command(q->s, q->index, req->cid);
    nvme_trace_command(cmd);
    qemu_mutex_lock(&q->lock);
    memcpy((uint8_t *)q->sq.queue +
           q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
    q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
    q->need_kick++;
    qemu_mutex_unlock(&q->lock);

    defer_call(nvme_deferred_fn, q);
}

static void nvme_admin_cmd_sync_cb(void *opaque, int ret)
{
    int *pret = opaque;
    *pret = ret;
    aio_wait_kick();
}

static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd)
{
    BDRVNVMeState *s = bs->opaque;
    NVMeQueuePair *q = s->queues[INDEX_ADMIN];
    AioContext *aio_context = bdrv_get_aio_context(bs);
    NVMeRequest *req;
    int ret = -EINPROGRESS;
    req = nvme_get_free_req_nowait(q);
    if (!req) {
        return -EBUSY;
    }
    nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret);

    AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
    return ret;
}

/* Returns true on success, false on failure. */
static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
{
    ERRP_GUARD();
    BDRVNVMeState *s = bs->opaque;
    bool ret = false;
    QEMU_AUTO_VFREE union {
        NvmeIdCtrl ctrl;
        NvmeIdNs ns;
    } *id = NULL;
    NvmeLBAF *lbaf;
    uint16_t oncs;
    int r;
    uint64_t iova;
    NvmeCmd cmd = {
        .opcode = NVME_ADM_CMD_IDENTIFY,
        .cdw10 = cpu_to_le32(0x1),
    };
    size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size());

    id = qemu_try_memalign(qemu_real_host_page_size(), id_size);
    if (!id) {
        error_setg(errp, "Cannot allocate buffer for identify response");
        goto out;
    }
    r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova, errp);
    if (r) {
        error_prepend(errp, "Cannot map buffer for DMA: ");
        goto out;
    }

    memset(id, 0, id_size);
    cmd.dptr.prp1 = cpu_to_le64(iova);
    if (nvme_admin_cmd_sync(bs, &cmd)) {
        error_setg(errp, "Failed to identify controller");
        goto out;
    }

    if (le32_to_cpu(id->ctrl.nn) < namespace) {
        error_setg(errp, "Invalid namespace");
        goto out;
    }
    s->write_cache_supported = le32_to_cpu(id->ctrl.vwc) & 0x1;
    s->max_transfer = (id->ctrl.mdts ? 1 << id->ctrl.mdts : 0) * s->page_size;
    /* For now the page list buffer per command is one page, to hold at most
     * s->page_size / sizeof(uint64_t) entries. */
    s->max_transfer = MIN_NON_ZERO(s->max_transfer,
                                   s->page_size / sizeof(uint64_t) * s->page_size);

    oncs = le16_to_cpu(id->ctrl.oncs);
    s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
    s->supports_discard = !!(oncs & NVME_ONCS_DSM);

    memset(id, 0, id_size);
    cmd.cdw10 = 0;
    cmd.nsid = cpu_to_le32(namespace);
    if (nvme_admin_cmd_sync(bs, &cmd)) {
        error_setg(errp, "Failed to identify namespace");
        goto out;
    }

    s->nsze = le64_to_cpu(id->ns.nsze);
    lbaf = &id->ns.lbaf[NVME_ID_NS_FLBAS_INDEX(id->ns.flbas)];

    if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(id->ns.dlfeat) &&
        NVME_ID_NS_DLFEAT_READ_BEHAVIOR(id->ns.dlfeat) ==
        NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) {
        bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
    }

    if (lbaf->ms) {
        error_setg(errp, "Namespaces with metadata are not yet supported");
        goto out;
    }

    if (lbaf->ds < BDRV_SECTOR_BITS || lbaf->ds > 12 ||
        (1 << lbaf->ds) > s->page_size)
    {
        error_setg(errp, "Namespace has unsupported block size (2^%d)",
                   lbaf->ds);
        goto out;
    }

    ret = true;
    s->blkshift = lbaf->ds;
out:
    qemu_vfio_dma_unmap(s->vfio, id);

    return ret;
}
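
/*
 * Worked example for the max_transfer computation above (illustrative): with
 * a 4 KiB controller page size, MDTS = 5 allows transfers of up to
 * (1 << 5) * 4 KiB = 128 KiB.  The single-page PRP list limits a command to
 * 4096 / 8 = 512 PRP entries, i.e. 512 * 4 KiB = 2 MiB, so MIN_NON_ZERO()
 * keeps whichever bound is smaller (here, the MDTS one).
 */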

static void nvme_poll_queue(NVMeQueuePair *q)
{
    const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
    NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];

    trace_nvme_poll_queue(q->s, q->index);
    /*
     * Do an early check for completions. q->lock isn't needed because
     * nvme_process_completion() only runs in the event loop thread and
     * cannot race with itself.
     */
    if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
        return;
    }

    qemu_mutex_lock(&q->lock);
    while (nvme_process_completion(q)) {
        /* Keep polling */
    }
    qemu_mutex_unlock(&q->lock);
}

static void nvme_poll_queues(BDRVNVMeState *s)
{
    int i;

    for (i = 0; i < s->queue_count; i++) {
        nvme_poll_queue(s->queues[i]);
    }
}

static void nvme_handle_event(EventNotifier *n)
{
    BDRVNVMeState *s = container_of(n, BDRVNVMeState,
                                    irq_notifier[MSIX_SHARED_IRQ_IDX]);

    trace_nvme_handle_event(s);
    event_notifier_test_and_clear(n);
    nvme_poll_queues(s);
}

static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
{
    BDRVNVMeState *s = bs->opaque;
    unsigned n = s->queue_count;
    NVMeQueuePair *q;
    NvmeCmd cmd;
    unsigned queue_size = NVME_QUEUE_SIZE;

    assert(n <= UINT16_MAX);
    q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
                               n, queue_size, errp);
    if (!q) {
        return false;
    }
    cmd = (NvmeCmd) {
        .opcode = NVME_ADM_CMD_CREATE_CQ,
        .dptr.prp1 = cpu_to_le64(q->cq.iova),
        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
        .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC),
    };
    if (nvme_admin_cmd_sync(bs, &cmd)) {
        error_setg(errp, "Failed to create CQ io queue [%u]", n);
        goto out_error;
    }
    cmd = (NvmeCmd) {
        .opcode = NVME_ADM_CMD_CREATE_SQ,
        .dptr.prp1 = cpu_to_le64(q->sq.iova),
        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
        .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)),
    };
    if (nvme_admin_cmd_sync(bs, &cmd)) {
        error_setg(errp, "Failed to create SQ io queue [%u]", n);
        goto out_error;
    }
    s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
    s->queues[n] = q;
    s->queue_count++;
    return true;
out_error:
    nvme_free_queue_pair(q);
    return false;
}
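
/*
 * Command dword encoding used above, for reference (illustrative): for both
 * Create I/O Completion Queue and Create I/O Submission Queue, CDW10 carries
 * the zero-based queue size in bits 31:16 and the queue identifier in bits
 * 15:0.  CDW11 sets Physically Contiguous (and, for the CQ, Interrupts
 * Enabled); for the SQ it also names the CQ that completions are posted to
 * in bits 31:16, which is why the same index @n appears there.
 */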
639 */ 640 if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) { 641 return; 642 } 643 644 qemu_mutex_lock(&q->lock); 645 while (nvme_process_completion(q)) { 646 /* Keep polling */ 647 } 648 qemu_mutex_unlock(&q->lock); 649 } 650 651 static void nvme_poll_queues(BDRVNVMeState *s) 652 { 653 int i; 654 655 for (i = 0; i < s->queue_count; i++) { 656 nvme_poll_queue(s->queues[i]); 657 } 658 } 659 660 static void nvme_handle_event(EventNotifier *n) 661 { 662 BDRVNVMeState *s = container_of(n, BDRVNVMeState, 663 irq_notifier[MSIX_SHARED_IRQ_IDX]); 664 665 trace_nvme_handle_event(s); 666 event_notifier_test_and_clear(n); 667 nvme_poll_queues(s); 668 } 669 670 static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) 671 { 672 BDRVNVMeState *s = bs->opaque; 673 unsigned n = s->queue_count; 674 NVMeQueuePair *q; 675 NvmeCmd cmd; 676 unsigned queue_size = NVME_QUEUE_SIZE; 677 678 assert(n <= UINT16_MAX); 679 q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs), 680 n, queue_size, errp); 681 if (!q) { 682 return false; 683 } 684 cmd = (NvmeCmd) { 685 .opcode = NVME_ADM_CMD_CREATE_CQ, 686 .dptr.prp1 = cpu_to_le64(q->cq.iova), 687 .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n), 688 .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC), 689 }; 690 if (nvme_admin_cmd_sync(bs, &cmd)) { 691 error_setg(errp, "Failed to create CQ io queue [%u]", n); 692 goto out_error; 693 } 694 cmd = (NvmeCmd) { 695 .opcode = NVME_ADM_CMD_CREATE_SQ, 696 .dptr.prp1 = cpu_to_le64(q->sq.iova), 697 .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n), 698 .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)), 699 }; 700 if (nvme_admin_cmd_sync(bs, &cmd)) { 701 error_setg(errp, "Failed to create SQ io queue [%u]", n); 702 goto out_error; 703 } 704 s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1); 705 s->queues[n] = q; 706 s->queue_count++; 707 return true; 708 out_error: 709 nvme_free_queue_pair(q); 710 return false; 711 } 712 713 static bool nvme_poll_cb(void *opaque) 714 { 715 EventNotifier *e = opaque; 716 BDRVNVMeState *s = container_of(e, BDRVNVMeState, 717 irq_notifier[MSIX_SHARED_IRQ_IDX]); 718 int i; 719 720 for (i = 0; i < s->queue_count; i++) { 721 NVMeQueuePair *q = s->queues[i]; 722 const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES; 723 NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset]; 724 725 /* 726 * q->lock isn't needed because nvme_process_completion() only runs in 727 * the event loop thread and cannot race with itself. 
728 */ 729 if ((le16_to_cpu(cqe->status) & 0x1) != q->cq_phase) { 730 return true; 731 } 732 } 733 return false; 734 } 735 736 static void nvme_poll_ready(EventNotifier *e) 737 { 738 BDRVNVMeState *s = container_of(e, BDRVNVMeState, 739 irq_notifier[MSIX_SHARED_IRQ_IDX]); 740 741 nvme_poll_queues(s); 742 } 743 744 static int nvme_init(BlockDriverState *bs, const char *device, int namespace, 745 Error **errp) 746 { 747 BDRVNVMeState *s = bs->opaque; 748 NVMeQueuePair *q; 749 AioContext *aio_context = bdrv_get_aio_context(bs); 750 int ret; 751 uint64_t cap; 752 uint32_t ver; 753 uint32_t cc; 754 uint64_t timeout_ms; 755 uint64_t deadline, now; 756 NvmeBar *regs = NULL; 757 758 qemu_co_mutex_init(&s->dma_map_lock); 759 qemu_co_queue_init(&s->dma_flush_queue); 760 s->device = g_strdup(device); 761 s->nsid = namespace; 762 s->aio_context = bdrv_get_aio_context(bs); 763 ret = event_notifier_init(&s->irq_notifier[MSIX_SHARED_IRQ_IDX], 0); 764 if (ret) { 765 error_setg(errp, "Failed to init event notifier"); 766 return ret; 767 } 768 769 s->vfio = qemu_vfio_open_pci(device, errp); 770 if (!s->vfio) { 771 ret = -EINVAL; 772 goto out; 773 } 774 775 regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar), 776 PROT_READ | PROT_WRITE, errp); 777 if (!regs) { 778 ret = -EINVAL; 779 goto out; 780 } 781 /* Perform initialize sequence as described in NVMe spec "7.6.1 782 * Initialization". */ 783 784 cap = host_pci_ldq_le_p(®s->cap); 785 trace_nvme_controller_capability_raw(cap); 786 trace_nvme_controller_capability("Maximum Queue Entries Supported", 787 1 + NVME_CAP_MQES(cap)); 788 trace_nvme_controller_capability("Contiguous Queues Required", 789 NVME_CAP_CQR(cap)); 790 trace_nvme_controller_capability("Doorbell Stride", 791 1 << (2 + NVME_CAP_DSTRD(cap))); 792 trace_nvme_controller_capability("Subsystem Reset Supported", 793 NVME_CAP_NSSRS(cap)); 794 trace_nvme_controller_capability("Memory Page Size Minimum", 795 1 << (12 + NVME_CAP_MPSMIN(cap))); 796 trace_nvme_controller_capability("Memory Page Size Maximum", 797 1 << (12 + NVME_CAP_MPSMAX(cap))); 798 if (!NVME_CAP_CSS(cap)) { 799 error_setg(errp, "Device doesn't support NVMe command set"); 800 ret = -EINVAL; 801 goto out; 802 } 803 804 s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap)); 805 s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t); 806 bs->bl.opt_mem_alignment = s->page_size; 807 bs->bl.request_alignment = s->page_size; 808 timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000); 809 810 ver = host_pci_ldl_le_p(®s->vs); 811 trace_nvme_controller_spec_version(extract32(ver, 16, 16), 812 extract32(ver, 8, 8), 813 extract32(ver, 0, 8)); 814 815 /* Reset device to get a clean state. */ 816 cc = host_pci_ldl_le_p(®s->cc); 817 host_pci_stl_le_p(®s->cc, cc & 0xFE); 818 /* Wait for CSTS.RDY = 0. */ 819 deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS; 820 while (NVME_CSTS_RDY(host_pci_ldl_le_p(®s->csts))) { 821 if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) { 822 error_setg(errp, "Timeout while waiting for device to reset (%" 823 PRId64 " ms)", 824 timeout_ms); 825 ret = -ETIMEDOUT; 826 goto out; 827 } 828 } 829 830 s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0, 831 sizeof(NvmeBar) + NVME_DOORBELL_SIZE, 832 PROT_WRITE, errp); 833 s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar)); 834 if (!s->doorbells) { 835 ret = -EINVAL; 836 goto out; 837 } 838 839 /* Set up admin queue. 

    /* Set up admin queue. */
    s->queues = g_new(NVMeQueuePair *, 1);
    q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp);
    if (!q) {
        ret = -EINVAL;
        goto out;
    }
    s->queues[INDEX_ADMIN] = q;
    s->queue_count = 1;
    QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000);
    host_pci_stl_le_p(&regs->aqa,
                      ((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) |
                      ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT));
    host_pci_stq_le_p(&regs->asq, q->sq.iova);
    host_pci_stq_le_p(&regs->acq, q->cq.iova);

    /* After setting up all control registers we can enable device now. */
    host_pci_stl_le_p(&regs->cc,
                      (ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) |
                      (ctz32(NVME_SQ_ENTRY_BYTES) << CC_IOSQES_SHIFT) |
                      CC_EN_MASK);
    /* Wait for CSTS.RDY = 1. */
    now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    deadline = now + timeout_ms * SCALE_MS;
    while (!NVME_CSTS_RDY(host_pci_ldl_le_p(&regs->csts))) {
        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
            error_setg(errp, "Timeout while waiting for device to start (%"
                             PRId64 " ms)",
                       timeout_ms);
            ret = -ETIMEDOUT;
            goto out;
        }
    }

    ret = qemu_vfio_pci_init_irq(s->vfio, s->irq_notifier,
                                 VFIO_PCI_MSIX_IRQ_INDEX, errp);
    if (ret) {
        goto out;
    }
    aio_set_event_notifier(bdrv_get_aio_context(bs),
                           &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                           nvme_handle_event, nvme_poll_cb,
                           nvme_poll_ready);

    if (!nvme_identify(bs, namespace, errp)) {
        ret = -EIO;
        goto out;
    }

    /* Set up command queues. */
    if (!nvme_add_io_queue(bs, errp)) {
        ret = -EIO;
    }
out:
    if (regs) {
        qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)regs, 0, sizeof(NvmeBar));
    }

    /* Cleaning up is done in nvme_open() upon error. */
    return ret;
}

/* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example:
 *
 *     nvme://0000:44:00.0/1
 *
 * where the "nvme://" is a fixed form of the protocol prefix, the middle part
 * is the PCI address, and the last part is the namespace number starting from
 * 1 according to the NVMe spec. */
static void nvme_parse_filename(const char *filename, QDict *options,
                                Error **errp)
{
    int pref = strlen("nvme://");

    if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) {
        const char *tmp = filename + pref;
        char *device;
        const char *namespace;
        unsigned long ns;
        const char *slash = strchr(tmp, '/');
        if (!slash) {
            qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp);
            return;
        }
        device = g_strndup(tmp, slash - tmp);
        qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device);
        g_free(device);
        namespace = slash + 1;
        if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) {
            error_setg(errp, "Invalid namespace '%s', positive number expected",
                       namespace);
            return;
        }
        qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE,
                      *namespace ? namespace : "1");
    }
}

static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
                                           Error **errp)
{
    int ret;
    BDRVNVMeState *s = bs->opaque;
    NvmeCmd cmd = {
        .opcode = NVME_ADM_CMD_SET_FEATURES,
        .nsid = cpu_to_le32(s->nsid),
        .cdw10 = cpu_to_le32(0x06),
        .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
    };

    ret = nvme_admin_cmd_sync(bs, &cmd);
    if (ret) {
        error_setg(errp, "Failed to configure NVMe write cache");
    }
    return ret;
}

static void nvme_close(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;

    for (unsigned i = 0; i < s->queue_count; ++i) {
        nvme_free_queue_pair(s->queues[i]);
    }
    g_free(s->queues);
    aio_set_event_notifier(bdrv_get_aio_context(bs),
                           &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                           NULL, NULL, NULL);
    event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
    qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map,
                            0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE);
    qemu_vfio_close(s->vfio);

    g_free(s->device);
}

static int nvme_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
{
    const char *device;
    QemuOpts *opts;
    int namespace;
    int ret;
    BDRVNVMeState *s = bs->opaque;

    bs->supported_write_flags = BDRV_REQ_FUA;

    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &error_abort);
    device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
    if (!device) {
        error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required");
        qemu_opts_del(opts);
        return -EINVAL;
    }

    namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1);
    ret = nvme_init(bs, device, namespace, errp);
    qemu_opts_del(opts);
    if (ret) {
        goto fail;
    }
    if (flags & BDRV_O_NOCACHE) {
        if (!s->write_cache_supported) {
            error_setg(errp,
                       "NVMe controller doesn't support write cache configuration");
            ret = -EINVAL;
        } else {
            ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE),
                                                  errp);
        }
        if (ret) {
            goto fail;
        }
    }
    return 0;
fail:
    nvme_close(bs);
    return ret;
}

static int64_t coroutine_fn nvme_co_getlength(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;
    return s->nsze << s->blkshift;
}

static uint32_t nvme_get_blocksize(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;
    assert(s->blkshift >= BDRV_SECTOR_BITS && s->blkshift <= 12);
    return UINT32_C(1) << s->blkshift;
}

static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
{
    uint32_t blocksize = nvme_get_blocksize(bs);
    bsz->phys = blocksize;
    bsz->log = blocksize;
    return 0;
}

/* Called with s->dma_map_lock */
static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs,
                                            QEMUIOVector *qiov)
{
    int r = 0;
    BDRVNVMeState *s = bs->opaque;

    s->dma_map_count -= qiov->size;
    if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) {
        r = qemu_vfio_dma_reset_temporary(s->vfio);
        if (!r) {
            qemu_co_queue_restart_all(&s->dma_flush_queue);
        }
    }
    return r;
}

/* Called with s->dma_map_lock */
static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
                                          NVMeRequest *req, QEMUIOVector *qiov)
{
    BDRVNVMeState *s = bs->opaque;
    uint64_t *pagelist = req->prp_list_page;
    int i, j, r;
    int entries = 0;
    Error *local_err = NULL, **errp = NULL;

    assert(qiov->size);
    assert(QEMU_IS_ALIGNED(qiov->size, s->page_size));
    assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t));
    for (i = 0; i < qiov->niov; ++i) {
        bool retry = true;
        uint64_t iova;
        size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len,
                                   qemu_real_host_page_size());
try_map:
        r = qemu_vfio_dma_map(s->vfio,
                              qiov->iov[i].iov_base,
                              len, true, &iova, errp);
        if (r == -ENOSPC) {
            /*
             * In addition to the -ENOMEM error, the VFIO_IOMMU_MAP_DMA
             * ioctl returns -ENOSPC to signal the user exhausted the DMA
             * mappings available for a container since Linux kernel commit
             * 492855939bdb ("vfio/type1: Limit DMA mappings per container",
             * April 2019, see CVE-2019-3882).
             *
             * This block driver already handles this error path by checking
             * for the -ENOMEM error, so we directly replace -ENOSPC by
             * -ENOMEM. Beside, -ENOSPC has a specific meaning for blockdev
             * coroutines: it triggers BLOCKDEV_ON_ERROR_ENOSPC and
             * BLOCK_ERROR_ACTION_STOP which stops the VM, asking the operator
             * to add more storage to the blockdev. Not something we can do
             * easily with an IOMMU :)
             */
            r = -ENOMEM;
        }
        if (r == -ENOMEM && retry) {
            /*
             * We exhausted the DMA mappings available for our container:
             * recycle the volatile IOVA mappings.
             */
            retry = false;
            trace_nvme_dma_flush_queue_wait(s);
            if (s->dma_map_count) {
                trace_nvme_dma_map_flush(s);
                qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock);
            } else {
                r = qemu_vfio_dma_reset_temporary(s->vfio);
                if (r) {
                    goto fail;
                }
            }
            errp = &local_err;

            goto try_map;
        }
        if (r) {
            goto fail;
        }

        for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) {
            pagelist[entries++] = cpu_to_le64(iova + j * s->page_size);
        }
        trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base,
                                    qiov->iov[i].iov_len / s->page_size);
    }

    s->dma_map_count += qiov->size;

    assert(entries <= s->page_size / sizeof(uint64_t));
    switch (entries) {
    case 0:
        abort();
    case 1:
        cmd->dptr.prp1 = pagelist[0];
        cmd->dptr.prp2 = 0;
        break;
    case 2:
        cmd->dptr.prp1 = pagelist[0];
        cmd->dptr.prp2 = pagelist[1];
        break;
    default:
        cmd->dptr.prp1 = pagelist[0];
        cmd->dptr.prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
        break;
    }
    trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
    for (i = 0; i < entries; ++i) {
        trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]);
    }
    return 0;
fail:
    /* No need to unmap [0 - i) iovs even if we've failed, since we don't
     * increment s->dma_map_count. This is okay for fixed mapping memory areas
     * because they are already mapped before calling this function; for
     * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by
     * calling qemu_vfio_dma_reset_temporary when necessary. */
    if (local_err) {
        error_reportf_err(local_err, "Cannot map buffer for DMA: ");
    }
    return r;
}
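
/*
 * PRP handling above, summarized (illustrative): pagelist[] is filled with
 * one page-sized, page-aligned IOVA per data page.  With a single page, PRP1
 * is that page and PRP2 is unused; with exactly two pages, PRP1/PRP2 point at
 * them directly; with more, PRP1 is the first page and PRP2 points into the
 * per-request PRP list page at offset sizeof(uint64_t), i.e. the list the
 * device walks starts at pagelist[1] while pagelist[0] doubles as PRP1.
 */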

typedef struct {
    Coroutine *co;
    int ret;
    AioContext *ctx;
} NVMeCoData;

static void nvme_rw_cb_bh(void *opaque)
{
    NVMeCoData *data = opaque;
    qemu_coroutine_enter(data->co);
}

static void nvme_rw_cb(void *opaque, int ret)
{
    NVMeCoData *data = opaque;
    data->ret = ret;
    if (!data->co) {
        /* The rw coroutine hasn't yielded, don't try to enter. */
        return;
    }
    replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
}

static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov,
                                            bool is_write,
                                            int flags)
{
    int r;
    BDRVNVMeState *s = bs->opaque;
    NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
    NVMeRequest *req;

    uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) |
                     (flags & BDRV_REQ_FUA ? 1 << 30 : 0);
    NvmeCmd cmd = {
        .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ,
        .nsid = cpu_to_le32(s->nsid),
        .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
        .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
        .cdw12 = cpu_to_le32(cdw12),
    };
    NVMeCoData data = {
        .ctx = bdrv_get_aio_context(bs),
        .ret = -EINPROGRESS,
    };

    trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
    assert(s->queue_count > 1);
    req = nvme_get_free_req(ioq);
    assert(req);

    qemu_co_mutex_lock(&s->dma_map_lock);
    r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
    qemu_co_mutex_unlock(&s->dma_map_lock);
    if (r) {
        nvme_put_free_req_and_wake(ioq, req);
        return r;
    }
    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

    data.co = qemu_coroutine_self();
    while (data.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }

    qemu_co_mutex_lock(&s->dma_map_lock);
    r = nvme_cmd_unmap_qiov(bs, qiov);
    qemu_co_mutex_unlock(&s->dma_map_lock);
    if (r) {
        return r;
    }

    trace_nvme_rw_done(s, is_write, offset, bytes, data.ret);
    return data.ret;
}

static inline bool nvme_qiov_aligned(BlockDriverState *bs,
                                     const QEMUIOVector *qiov)
{
    int i;
    BDRVNVMeState *s = bs->opaque;

    for (i = 0; i < qiov->niov; ++i) {
        if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base,
                                 qemu_real_host_page_size()) ||
            !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size())) {
            trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
                                      qiov->iov[i].iov_len, s->page_size);
            return false;
        }
    }
    return true;
}

static coroutine_fn int nvme_co_prw(BlockDriverState *bs,
                                    uint64_t offset, uint64_t bytes,
                                    QEMUIOVector *qiov, bool is_write,
                                    int flags)
{
    BDRVNVMeState *s = bs->opaque;
    int r;
    QEMU_AUTO_VFREE uint8_t *buf = NULL;
    QEMUIOVector local_qiov;
    size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size());
    assert(QEMU_IS_ALIGNED(offset, s->page_size));
    assert(QEMU_IS_ALIGNED(bytes, s->page_size));
    assert(bytes <= s->max_transfer);
    if (nvme_qiov_aligned(bs, qiov)) {
        s->stats.aligned_accesses++;
        return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
    }
    s->stats.unaligned_accesses++;
    trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
    buf = qemu_try_memalign(qemu_real_host_page_size(), len);

    if (!buf) {
        return -ENOMEM;
    }
    qemu_iovec_init(&local_qiov, 1);
    if (is_write) {
        qemu_iovec_to_buf(qiov, 0, buf, bytes);
    }
    qemu_iovec_add(&local_qiov, buf, bytes);
    r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags);
    qemu_iovec_destroy(&local_qiov);
    if (!r && !is_write) {
        qemu_iovec_from_buf(qiov, 0, buf, bytes);
    }
    return r;
}

static coroutine_fn int nvme_co_preadv(BlockDriverState *bs,
                                       int64_t offset, int64_t bytes,
                                       QEMUIOVector *qiov,
                                       BdrvRequestFlags flags)
{
    return nvme_co_prw(bs, offset, bytes, qiov, false, flags);
}

static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs,
                                        int64_t offset, int64_t bytes,
                                        QEMUIOVector *qiov,
                                        BdrvRequestFlags flags)
{
    return nvme_co_prw(bs, offset, bytes, qiov, true, flags);
}

static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;
    NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
    NVMeRequest *req;
    NvmeCmd cmd = {
        .opcode = NVME_CMD_FLUSH,
        .nsid = cpu_to_le32(s->nsid),
    };
    NVMeCoData data = {
        .ctx = bdrv_get_aio_context(bs),
        .ret = -EINPROGRESS,
    };

    assert(s->queue_count > 1);
    req = nvme_get_free_req(ioq);
    assert(req);
    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

    data.co = qemu_coroutine_self();
    if (data.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }

    return data.ret;
}


static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
                                              int64_t offset,
                                              int64_t bytes,
                                              BdrvRequestFlags flags)
{
    BDRVNVMeState *s = bs->opaque;
    NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
    NVMeRequest *req;
    uint32_t cdw12;

    if (!s->supports_write_zeroes) {
        return -ENOTSUP;
    }

    if (bytes == 0) {
        return 0;
    }

    cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
    /*
     * We should not lose information. pwrite_zeroes_alignment and
     * max_pwrite_zeroes guarantees it.
     */
    assert(((cdw12 + 1) << s->blkshift) == bytes);

    NvmeCmd cmd = {
        .opcode = NVME_CMD_WRITE_ZEROES,
        .nsid = cpu_to_le32(s->nsid),
        .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
        .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
    };

    NVMeCoData data = {
        .ctx = bdrv_get_aio_context(bs),
        .ret = -EINPROGRESS,
    };

    if (flags & BDRV_REQ_MAY_UNMAP) {
        cdw12 |= (1 << 25);
    }

    if (flags & BDRV_REQ_FUA) {
        cdw12 |= (1 << 30);
    }

    cmd.cdw12 = cpu_to_le32(cdw12);

    trace_nvme_write_zeroes(s, offset, bytes, flags);
    assert(s->queue_count > 1);
    req = nvme_get_free_req(ioq);
    assert(req);

    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

    data.co = qemu_coroutine_self();
    while (data.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }

    trace_nvme_rw_done(s, true, offset, bytes, data.ret);
    return data.ret;
}


static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
                                         int64_t offset,
                                         int64_t bytes)
{
    BDRVNVMeState *s = bs->opaque;
    NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
    NVMeRequest *req;
    QEMU_AUTO_VFREE NvmeDsmRange *buf = NULL;
    QEMUIOVector local_qiov;
    int ret;

    NvmeCmd cmd = {
        .opcode = NVME_CMD_DSM,
        .nsid = cpu_to_le32(s->nsid),
        .cdw10 = cpu_to_le32(0), /*number of ranges - 0 based*/
        .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/
    };

    NVMeCoData data = {
        .ctx = bdrv_get_aio_context(bs),
        .ret = -EINPROGRESS,
    };

    if (!s->supports_discard) {
        return -ENOTSUP;
    }

    assert(s->queue_count > 1);

    /*
     * Filling the @buf requires @offset and @bytes to satisfy restrictions
     * defined in nvme_refresh_limits().
     */
    assert(QEMU_IS_ALIGNED(bytes, 1UL << s->blkshift));
    assert(QEMU_IS_ALIGNED(offset, 1UL << s->blkshift));
    assert((bytes >> s->blkshift) <= UINT32_MAX);

    buf = qemu_try_memalign(s->page_size, s->page_size);
    if (!buf) {
        return -ENOMEM;
    }
    memset(buf, 0, s->page_size);
    buf->nlb = cpu_to_le32(bytes >> s->blkshift);
    buf->slba = cpu_to_le64(offset >> s->blkshift);
    buf->cattr = 0;

    qemu_iovec_init(&local_qiov, 1);
    qemu_iovec_add(&local_qiov, buf, 4096);

    req = nvme_get_free_req(ioq);
    assert(req);

    qemu_co_mutex_lock(&s->dma_map_lock);
    ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
    qemu_co_mutex_unlock(&s->dma_map_lock);

    if (ret) {
        nvme_put_free_req_and_wake(ioq, req);
        goto out;
    }

    trace_nvme_dsm(s, offset, bytes);

    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

    data.co = qemu_coroutine_self();
    while (data.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }

    qemu_co_mutex_lock(&s->dma_map_lock);
    ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
    qemu_co_mutex_unlock(&s->dma_map_lock);

    if (ret) {
        goto out;
    }

    ret = data.ret;
    trace_nvme_dsm_done(s, offset, bytes, ret);
out:
    qemu_iovec_destroy(&local_qiov);
    return ret;

}

static int coroutine_fn nvme_co_truncate(BlockDriverState *bs, int64_t offset,
                                         bool exact, PreallocMode prealloc,
                                         BdrvRequestFlags flags, Error **errp)
{
    int64_t cur_length;

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    cur_length = nvme_co_getlength(bs);
    if (offset != cur_length && exact) {
        error_setg(errp, "Cannot resize NVMe devices");
        return -ENOTSUP;
    } else if (offset > cur_length) {
        error_setg(errp, "Cannot grow NVMe devices");
        return -EINVAL;
    }

    return 0;
}

static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
                               BlockReopenQueue *queue, Error **errp)
{
    return 0;
}

static void nvme_refresh_filename(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;

    snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i",
             s->device, s->nsid);
}

static void nvme_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVNVMeState *s = bs->opaque;

    bs->bl.opt_mem_alignment = s->page_size;
    bs->bl.request_alignment = s->page_size;
    bs->bl.max_transfer = s->max_transfer;

    /*
     * Look at nvme_co_pwrite_zeroes: after shift and decrement we should get
     * at most 0xFFFF
     */
    bs->bl.max_pwrite_zeroes = 1ULL << (s->blkshift + 16);
    bs->bl.pwrite_zeroes_alignment = MAX(bs->bl.request_alignment,
                                         1UL << s->blkshift);

    bs->bl.max_pdiscard = (uint64_t)UINT32_MAX << s->blkshift;
    bs->bl.pdiscard_alignment = MAX(bs->bl.request_alignment,
                                    1UL << s->blkshift);
}
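
/*
 * Concrete example for the limits above (illustrative): with a 512-byte
 * logical block size (blkshift == 9), max_pwrite_zeroes is 1 << 25 = 32 MiB,
 * matching the 16-bit, zero-based NLB field in the Write Zeroes command, and
 * max_pdiscard is UINT32_MAX << 9, i.e. just under 2 TiB per DSM range.
 */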

static void nvme_detach_aio_context(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;

    for (unsigned i = 0; i < s->queue_count; i++) {
        NVMeQueuePair *q = s->queues[i];

        qemu_bh_delete(q->completion_bh);
        q->completion_bh = NULL;
    }

    aio_set_event_notifier(bdrv_get_aio_context(bs),
                           &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                           NULL, NULL, NULL);
}

static void nvme_attach_aio_context(BlockDriverState *bs,
                                    AioContext *new_context)
{
    BDRVNVMeState *s = bs->opaque;

    s->aio_context = new_context;
    aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                           nvme_handle_event, nvme_poll_cb,
                           nvme_poll_ready);

    for (unsigned i = 0; i < s->queue_count; i++) {
        NVMeQueuePair *q = s->queues[i];

        q->completion_bh =
            aio_bh_new(new_context, nvme_process_completion_bh, q);
    }
}

static bool nvme_register_buf(BlockDriverState *bs, void *host, size_t size,
                              Error **errp)
{
    int ret;
    BDRVNVMeState *s = bs->opaque;

    /*
     * FIXME: we may run out of IOVA addresses after repeated
     * bdrv_register_buf/bdrv_unregister_buf, because nvme_vfio_dma_unmap
     * doesn't reclaim addresses for fixed mappings.
     */
    ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL, errp);
    return ret == 0;
}

static void nvme_unregister_buf(BlockDriverState *bs, void *host, size_t size)
{
    BDRVNVMeState *s = bs->opaque;

    qemu_vfio_dma_unmap(s->vfio, host);
}

static BlockStatsSpecific *nvme_get_specific_stats(BlockDriverState *bs)
{
    BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
    BDRVNVMeState *s = bs->opaque;

    stats->driver = BLOCKDEV_DRIVER_NVME;
    stats->u.nvme = (BlockStatsSpecificNvme) {
        .completion_errors = s->stats.completion_errors,
        .aligned_accesses = s->stats.aligned_accesses,
        .unaligned_accesses = s->stats.unaligned_accesses,
    };

    return stats;
}

static const char *const nvme_strong_runtime_opts[] = {
    NVME_BLOCK_OPT_DEVICE,
    NVME_BLOCK_OPT_NAMESPACE,

    NULL
};

static BlockDriver bdrv_nvme = {
    .format_name              = "nvme",
    .protocol_name            = "nvme",
    .instance_size            = sizeof(BDRVNVMeState),

    .bdrv_co_create_opts      = bdrv_co_create_opts_simple,
    .create_opts              = &bdrv_create_opts_simple,

    .bdrv_parse_filename      = nvme_parse_filename,
    .bdrv_open                = nvme_open,
    .bdrv_close               = nvme_close,
    .bdrv_co_getlength        = nvme_co_getlength,
    .bdrv_probe_blocksizes    = nvme_probe_blocksizes,
    .bdrv_co_truncate         = nvme_co_truncate,

    .bdrv_co_preadv           = nvme_co_preadv,
    .bdrv_co_pwritev          = nvme_co_pwritev,

    .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
    .bdrv_co_pdiscard         = nvme_co_pdiscard,

    .bdrv_co_flush_to_disk    = nvme_co_flush,
    .bdrv_reopen_prepare      = nvme_reopen_prepare,

    .bdrv_refresh_filename    = nvme_refresh_filename,
    .bdrv_refresh_limits      = nvme_refresh_limits,
    .strong_runtime_opts      = nvme_strong_runtime_opts,
    .bdrv_get_specific_stats  = nvme_get_specific_stats,

    .bdrv_detach_aio_context  = nvme_detach_aio_context,
    .bdrv_attach_aio_context  = nvme_attach_aio_context,

    .bdrv_register_buf        = nvme_register_buf,
    .bdrv_unregister_buf      = nvme_unregister_buf,
};

static void bdrv_nvme_init(void)
{
    bdrv_register(&bdrv_nvme);
}

block_init(bdrv_nvme_init);