1 /* 2 * NVMe block driver based on vfio 3 * 4 * Copyright 2016 - 2018 Red Hat, Inc. 5 * 6 * Authors: 7 * Fam Zheng <famz@redhat.com> 8 * Paolo Bonzini <pbonzini@redhat.com> 9 * 10 * This work is licensed under the terms of the GNU GPL, version 2 or later. 11 * See the COPYING file in the top-level directory. 12 */ 13 14 #include "qemu/osdep.h" 15 #include <linux/vfio.h> 16 #include "qapi/error.h" 17 #include "qapi/qmp/qdict.h" 18 #include "qapi/qmp/qstring.h" 19 #include "qemu/defer-call.h" 20 #include "qemu/error-report.h" 21 #include "qemu/main-loop.h" 22 #include "qemu/module.h" 23 #include "qemu/cutils.h" 24 #include "qemu/option.h" 25 #include "qemu/memalign.h" 26 #include "qemu/vfio-helpers.h" 27 #include "block/block-io.h" 28 #include "block/block_int.h" 29 #include "sysemu/block-backend.h" 30 #include "sysemu/replay.h" 31 #include "trace.h" 32 33 #include "block/nvme.h" 34 35 #define NVME_SQ_ENTRY_BYTES 64 36 #define NVME_CQ_ENTRY_BYTES 16 37 #define NVME_QUEUE_SIZE 128 38 #define NVME_DOORBELL_SIZE 4096 39 40 /* 41 * We have to leave one slot empty as that is the full queue case where 42 * head == tail + 1. 43 */ 44 #define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1) 45 46 typedef struct BDRVNVMeState BDRVNVMeState; 47 48 /* Same index is used for queues and IRQs */ 49 #define INDEX_ADMIN 0 50 #define INDEX_IO(n) (1 + n) 51 52 /* This driver shares a single MSIX IRQ for the admin and I/O queues */ 53 enum { 54 MSIX_SHARED_IRQ_IDX = 0, 55 MSIX_IRQ_COUNT = 1 56 }; 57 58 typedef struct { 59 int32_t head, tail; 60 uint8_t *queue; 61 uint64_t iova; 62 /* Hardware MMIO register */ 63 volatile uint32_t *doorbell; 64 } NVMeQueue; 65 66 typedef struct { 67 BlockCompletionFunc *cb; 68 void *opaque; 69 int cid; 70 void *prp_list_page; 71 uint64_t prp_list_iova; 72 int free_req_next; /* q->reqs[] index of next free req */ 73 } NVMeRequest; 74 75 typedef struct { 76 QemuMutex lock; 77 78 /* Read from I/O code path, initialized under BQL */ 79 BDRVNVMeState *s; 80 int index; 81 82 /* Fields protected by BQL */ 83 uint8_t *prp_list_pages; 84 85 /* Fields protected by @lock */ 86 CoQueue free_req_queue; 87 NVMeQueue sq, cq; 88 int cq_phase; 89 int free_req_head; 90 NVMeRequest reqs[NVME_NUM_REQS]; 91 int need_kick; 92 int inflight; 93 94 /* Thread-safe, no lock necessary */ 95 QEMUBH *completion_bh; 96 } NVMeQueuePair; 97 98 struct BDRVNVMeState { 99 AioContext *aio_context; 100 QEMUVFIOState *vfio; 101 void *bar0_wo_map; 102 /* Memory mapped registers */ 103 volatile struct { 104 uint32_t sq_tail; 105 uint32_t cq_head; 106 } *doorbells; 107 /* The submission/completion queue pairs. 108 * [0]: admin queue. 109 * [1..]: io queues. 110 */ 111 NVMeQueuePair **queues; 112 unsigned queue_count; 113 size_t page_size; 114 /* How many uint32_t elements does each doorbell entry take. */ 115 size_t doorbell_scale; 116 bool write_cache_supported; 117 EventNotifier irq_notifier[MSIX_IRQ_COUNT]; 118 119 uint64_t nsze; /* Namespace size reported by identify command */ 120 int nsid; /* The namespace id to read/write data. */ 121 int blkshift; 122 123 uint64_t max_transfer; 124 125 bool supports_write_zeroes; 126 bool supports_discard; 127 128 CoMutex dma_map_lock; 129 CoQueue dma_flush_queue; 130 131 /* Total size of mapped qiov, accessed under dma_map_lock */ 132 int dma_map_count; 133 134 /* PCI address (required for nvme_refresh_filename()) */ 135 char *device; 136 137 struct { 138 uint64_t completion_errors; 139 uint64_t aligned_accesses; 140 uint64_t unaligned_accesses; 141 } stats; 142 }; 143 144 #define NVME_BLOCK_OPT_DEVICE "device" 145 #define NVME_BLOCK_OPT_NAMESPACE "namespace" 146 147 static void nvme_process_completion_bh(void *opaque); 148 149 static QemuOptsList runtime_opts = { 150 .name = "nvme", 151 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), 152 .desc = { 153 { 154 .name = NVME_BLOCK_OPT_DEVICE, 155 .type = QEMU_OPT_STRING, 156 .help = "NVMe PCI device address", 157 }, 158 { 159 .name = NVME_BLOCK_OPT_NAMESPACE, 160 .type = QEMU_OPT_NUMBER, 161 .help = "NVMe namespace", 162 }, 163 { /* end of list */ } 164 }, 165 }; 166 167 /* Returns true on success, false on failure. */ 168 static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q, 169 unsigned nentries, size_t entry_bytes, Error **errp) 170 { 171 size_t bytes; 172 int r; 173 174 bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size()); 175 q->head = q->tail = 0; 176 q->queue = qemu_try_memalign(qemu_real_host_page_size(), bytes); 177 if (!q->queue) { 178 error_setg(errp, "Cannot allocate queue"); 179 return false; 180 } 181 memset(q->queue, 0, bytes); 182 r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova, errp); 183 if (r) { 184 error_prepend(errp, "Cannot map queue: "); 185 } 186 return r == 0; 187 } 188 189 static void nvme_free_queue(NVMeQueue *q) 190 { 191 qemu_vfree(q->queue); 192 } 193 194 static void nvme_free_queue_pair(NVMeQueuePair *q) 195 { 196 trace_nvme_free_queue_pair(q->index, q, &q->cq, &q->sq); 197 if (q->completion_bh) { 198 qemu_bh_delete(q->completion_bh); 199 } 200 nvme_free_queue(&q->sq); 201 nvme_free_queue(&q->cq); 202 qemu_vfree(q->prp_list_pages); 203 qemu_mutex_destroy(&q->lock); 204 g_free(q); 205 } 206 207 static void nvme_free_req_queue_cb(void *opaque) 208 { 209 NVMeQueuePair *q = opaque; 210 211 qemu_mutex_lock(&q->lock); 212 while (q->free_req_head != -1 && 213 qemu_co_enter_next(&q->free_req_queue, &q->lock)) { 214 /* Retry waiting requests */ 215 } 216 qemu_mutex_unlock(&q->lock); 217 } 218 219 static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s, 220 AioContext *aio_context, 221 unsigned idx, size_t size, 222 Error **errp) 223 { 224 int i, r; 225 NVMeQueuePair *q; 226 uint64_t prp_list_iova; 227 size_t bytes; 228 229 q = g_try_new0(NVMeQueuePair, 1); 230 if (!q) { 231 error_setg(errp, "Cannot allocate queue pair"); 232 return NULL; 233 } 234 trace_nvme_create_queue_pair(idx, q, size, aio_context, 235 event_notifier_get_fd(s->irq_notifier)); 236 bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS, 237 qemu_real_host_page_size()); 238 q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size(), bytes); 239 if (!q->prp_list_pages) { 240 error_setg(errp, "Cannot allocate PRP page list"); 241 goto fail; 242 } 243 memset(q->prp_list_pages, 0, bytes); 244 qemu_mutex_init(&q->lock); 245 q->s = s; 246 q->index = idx; 247 qemu_co_queue_init(&q->free_req_queue); 248 q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q); 249 r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes, 250 false, &prp_list_iova, errp); 251 if (r) { 252 error_prepend(errp, "Cannot map buffer for DMA: "); 253 goto fail; 254 } 255 q->free_req_head = -1; 256 for (i = 0; i < NVME_NUM_REQS; i++) { 257 NVMeRequest *req = &q->reqs[i]; 258 req->cid = i + 1; 259 req->free_req_next = q->free_req_head; 260 q->free_req_head = i; 261 req->prp_list_page = q->prp_list_pages + i * s->page_size; 262 req->prp_list_iova = prp_list_iova + i * s->page_size; 263 } 264 265 if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) { 266 goto fail; 267 } 268 q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail; 269 270 if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) { 271 goto fail; 272 } 273 q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head; 274 275 return q; 276 fail: 277 nvme_free_queue_pair(q); 278 return NULL; 279 } 280 281 /* With q->lock */ 282 static void nvme_kick(NVMeQueuePair *q) 283 { 284 BDRVNVMeState *s = q->s; 285 286 if (!q->need_kick) { 287 return; 288 } 289 trace_nvme_kick(s, q->index); 290 assert(!(q->sq.tail & 0xFF00)); 291 /* Fence the write to submission queue entry before notifying the device. */ 292 smp_wmb(); 293 *q->sq.doorbell = cpu_to_le32(q->sq.tail); 294 q->inflight += q->need_kick; 295 q->need_kick = 0; 296 } 297 298 static NVMeRequest *nvme_get_free_req_nofail_locked(NVMeQueuePair *q) 299 { 300 NVMeRequest *req; 301 302 req = &q->reqs[q->free_req_head]; 303 q->free_req_head = req->free_req_next; 304 req->free_req_next = -1; 305 return req; 306 } 307 308 /* Return a free request element if any, otherwise return NULL. */ 309 static NVMeRequest *nvme_get_free_req_nowait(NVMeQueuePair *q) 310 { 311 QEMU_LOCK_GUARD(&q->lock); 312 if (q->free_req_head == -1) { 313 return NULL; 314 } 315 return nvme_get_free_req_nofail_locked(q); 316 } 317 318 /* 319 * Wait for a free request to become available if necessary, then 320 * return it. 321 */ 322 static coroutine_fn NVMeRequest *nvme_get_free_req(NVMeQueuePair *q) 323 { 324 QEMU_LOCK_GUARD(&q->lock); 325 326 while (q->free_req_head == -1) { 327 trace_nvme_free_req_queue_wait(q->s, q->index); 328 qemu_co_queue_wait(&q->free_req_queue, &q->lock); 329 } 330 331 return nvme_get_free_req_nofail_locked(q); 332 } 333 334 /* With q->lock */ 335 static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req) 336 { 337 req->free_req_next = q->free_req_head; 338 q->free_req_head = req - q->reqs; 339 } 340 341 /* With q->lock */ 342 static void nvme_wake_free_req_locked(NVMeQueuePair *q) 343 { 344 if (!qemu_co_queue_empty(&q->free_req_queue)) { 345 replay_bh_schedule_oneshot_event(q->s->aio_context, 346 nvme_free_req_queue_cb, q); 347 } 348 } 349 350 /* Insert a request in the freelist and wake waiters */ 351 static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req) 352 { 353 qemu_mutex_lock(&q->lock); 354 nvme_put_free_req_locked(q, req); 355 nvme_wake_free_req_locked(q); 356 qemu_mutex_unlock(&q->lock); 357 } 358 359 static inline int nvme_translate_error(const NvmeCqe *c) 360 { 361 uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF; 362 if (status) { 363 trace_nvme_error(le32_to_cpu(c->result), 364 le16_to_cpu(c->sq_head), 365 le16_to_cpu(c->sq_id), 366 le16_to_cpu(c->cid), 367 le16_to_cpu(status)); 368 } 369 switch (status) { 370 case 0: 371 return 0; 372 case 1: 373 return -ENOSYS; 374 case 2: 375 return -EINVAL; 376 default: 377 return -EIO; 378 } 379 } 380 381 /* With q->lock */ 382 static bool nvme_process_completion(NVMeQueuePair *q) 383 { 384 BDRVNVMeState *s = q->s; 385 bool progress = false; 386 NVMeRequest *preq; 387 NVMeRequest req; 388 NvmeCqe *c; 389 390 trace_nvme_process_completion(s, q->index, q->inflight); 391 392 /* 393 * Support re-entrancy when a request cb() function invokes aio_poll(). 394 * Pending completions must be visible to aio_poll() so that a cb() 395 * function can wait for the completion of another request. 396 * 397 * The aio_poll() loop will execute our BH and we'll resume completion 398 * processing there. 399 */ 400 qemu_bh_schedule(q->completion_bh); 401 402 assert(q->inflight >= 0); 403 while (q->inflight) { 404 int ret; 405 int16_t cid; 406 407 c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES]; 408 if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) { 409 break; 410 } 411 ret = nvme_translate_error(c); 412 if (ret) { 413 s->stats.completion_errors++; 414 } 415 q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE; 416 if (!q->cq.head) { 417 q->cq_phase = !q->cq_phase; 418 } 419 cid = le16_to_cpu(c->cid); 420 if (cid == 0 || cid > NVME_NUM_REQS) { 421 warn_report("NVMe: Unexpected CID in completion queue: %" PRIu32 422 ", should be within: 1..%u inclusively", cid, 423 NVME_NUM_REQS); 424 continue; 425 } 426 trace_nvme_complete_command(s, q->index, cid); 427 preq = &q->reqs[cid - 1]; 428 req = *preq; 429 assert(req.cid == cid); 430 assert(req.cb); 431 nvme_put_free_req_locked(q, preq); 432 preq->cb = preq->opaque = NULL; 433 q->inflight--; 434 qemu_mutex_unlock(&q->lock); 435 req.cb(req.opaque, ret); 436 qemu_mutex_lock(&q->lock); 437 progress = true; 438 } 439 if (progress) { 440 /* Notify the device so it can post more completions. */ 441 smp_mb_release(); 442 *q->cq.doorbell = cpu_to_le32(q->cq.head); 443 nvme_wake_free_req_locked(q); 444 } 445 446 qemu_bh_cancel(q->completion_bh); 447 448 return progress; 449 } 450 451 static void nvme_process_completion_bh(void *opaque) 452 { 453 NVMeQueuePair *q = opaque; 454 455 /* 456 * We're being invoked because a nvme_process_completion() cb() function 457 * called aio_poll(). The callback may be waiting for further completions 458 * so notify the device that it has space to fill in more completions now. 459 */ 460 smp_mb_release(); 461 *q->cq.doorbell = cpu_to_le32(q->cq.head); 462 nvme_wake_free_req_locked(q); 463 464 nvme_process_completion(q); 465 } 466 467 static void nvme_trace_command(const NvmeCmd *cmd) 468 { 469 int i; 470 471 if (!trace_event_get_state_backends(TRACE_NVME_SUBMIT_COMMAND_RAW)) { 472 return; 473 } 474 for (i = 0; i < 8; ++i) { 475 uint8_t *cmdp = (uint8_t *)cmd + i * 8; 476 trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3], 477 cmdp[4], cmdp[5], cmdp[6], cmdp[7]); 478 } 479 } 480 481 static void nvme_deferred_fn(void *opaque) 482 { 483 NVMeQueuePair *q = opaque; 484 485 QEMU_LOCK_GUARD(&q->lock); 486 nvme_kick(q); 487 nvme_process_completion(q); 488 } 489 490 static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req, 491 NvmeCmd *cmd, BlockCompletionFunc cb, 492 void *opaque) 493 { 494 assert(!req->cb); 495 req->cb = cb; 496 req->opaque = opaque; 497 cmd->cid = cpu_to_le16(req->cid); 498 499 trace_nvme_submit_command(q->s, q->index, req->cid); 500 nvme_trace_command(cmd); 501 qemu_mutex_lock(&q->lock); 502 memcpy((uint8_t *)q->sq.queue + 503 q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd)); 504 q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE; 505 q->need_kick++; 506 qemu_mutex_unlock(&q->lock); 507 508 defer_call(nvme_deferred_fn, q); 509 } 510 511 static void nvme_admin_cmd_sync_cb(void *opaque, int ret) 512 { 513 int *pret = opaque; 514 *pret = ret; 515 aio_wait_kick(); 516 } 517 518 static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd) 519 { 520 BDRVNVMeState *s = bs->opaque; 521 NVMeQueuePair *q = s->queues[INDEX_ADMIN]; 522 AioContext *aio_context = bdrv_get_aio_context(bs); 523 NVMeRequest *req; 524 int ret = -EINPROGRESS; 525 req = nvme_get_free_req_nowait(q); 526 if (!req) { 527 return -EBUSY; 528 } 529 nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret); 530 531 AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS); 532 return ret; 533 } 534 535 /* Returns true on success, false on failure. */ 536 static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp) 537 { 538 BDRVNVMeState *s = bs->opaque; 539 bool ret = false; 540 QEMU_AUTO_VFREE union { 541 NvmeIdCtrl ctrl; 542 NvmeIdNs ns; 543 } *id = NULL; 544 NvmeLBAF *lbaf; 545 uint16_t oncs; 546 int r; 547 uint64_t iova; 548 NvmeCmd cmd = { 549 .opcode = NVME_ADM_CMD_IDENTIFY, 550 .cdw10 = cpu_to_le32(0x1), 551 }; 552 size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size()); 553 554 id = qemu_try_memalign(qemu_real_host_page_size(), id_size); 555 if (!id) { 556 error_setg(errp, "Cannot allocate buffer for identify response"); 557 goto out; 558 } 559 r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova, errp); 560 if (r) { 561 error_prepend(errp, "Cannot map buffer for DMA: "); 562 goto out; 563 } 564 565 memset(id, 0, id_size); 566 cmd.dptr.prp1 = cpu_to_le64(iova); 567 if (nvme_admin_cmd_sync(bs, &cmd)) { 568 error_setg(errp, "Failed to identify controller"); 569 goto out; 570 } 571 572 if (le32_to_cpu(id->ctrl.nn) < namespace) { 573 error_setg(errp, "Invalid namespace"); 574 goto out; 575 } 576 s->write_cache_supported = le32_to_cpu(id->ctrl.vwc) & 0x1; 577 s->max_transfer = (id->ctrl.mdts ? 1 << id->ctrl.mdts : 0) * s->page_size; 578 /* For now the page list buffer per command is one page, to hold at most 579 * s->page_size / sizeof(uint64_t) entries. */ 580 s->max_transfer = MIN_NON_ZERO(s->max_transfer, 581 s->page_size / sizeof(uint64_t) * s->page_size); 582 583 oncs = le16_to_cpu(id->ctrl.oncs); 584 s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES); 585 s->supports_discard = !!(oncs & NVME_ONCS_DSM); 586 587 memset(id, 0, id_size); 588 cmd.cdw10 = 0; 589 cmd.nsid = cpu_to_le32(namespace); 590 if (nvme_admin_cmd_sync(bs, &cmd)) { 591 error_setg(errp, "Failed to identify namespace"); 592 goto out; 593 } 594 595 s->nsze = le64_to_cpu(id->ns.nsze); 596 lbaf = &id->ns.lbaf[NVME_ID_NS_FLBAS_INDEX(id->ns.flbas)]; 597 598 if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(id->ns.dlfeat) && 599 NVME_ID_NS_DLFEAT_READ_BEHAVIOR(id->ns.dlfeat) == 600 NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) { 601 bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP; 602 } 603 604 if (lbaf->ms) { 605 error_setg(errp, "Namespaces with metadata are not yet supported"); 606 goto out; 607 } 608 609 if (lbaf->ds < BDRV_SECTOR_BITS || lbaf->ds > 12 || 610 (1 << lbaf->ds) > s->page_size) 611 { 612 error_setg(errp, "Namespace has unsupported block size (2^%d)", 613 lbaf->ds); 614 goto out; 615 } 616 617 ret = true; 618 s->blkshift = lbaf->ds; 619 out: 620 qemu_vfio_dma_unmap(s->vfio, id); 621 622 return ret; 623 } 624 625 static void nvme_poll_queue(NVMeQueuePair *q) 626 { 627 const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES; 628 NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset]; 629 630 trace_nvme_poll_queue(q->s, q->index); 631 /* 632 * Do an early check for completions. q->lock isn't needed because 633 * nvme_process_completion() only runs in the event loop thread and 634 * cannot race with itself. 635 */ 636 if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) { 637 return; 638 } 639 640 qemu_mutex_lock(&q->lock); 641 while (nvme_process_completion(q)) { 642 /* Keep polling */ 643 } 644 qemu_mutex_unlock(&q->lock); 645 } 646 647 static void nvme_poll_queues(BDRVNVMeState *s) 648 { 649 int i; 650 651 for (i = 0; i < s->queue_count; i++) { 652 nvme_poll_queue(s->queues[i]); 653 } 654 } 655 656 static void nvme_handle_event(EventNotifier *n) 657 { 658 BDRVNVMeState *s = container_of(n, BDRVNVMeState, 659 irq_notifier[MSIX_SHARED_IRQ_IDX]); 660 661 trace_nvme_handle_event(s); 662 event_notifier_test_and_clear(n); 663 nvme_poll_queues(s); 664 } 665 666 static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) 667 { 668 BDRVNVMeState *s = bs->opaque; 669 unsigned n = s->queue_count; 670 NVMeQueuePair *q; 671 NvmeCmd cmd; 672 unsigned queue_size = NVME_QUEUE_SIZE; 673 674 assert(n <= UINT16_MAX); 675 q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs), 676 n, queue_size, errp); 677 if (!q) { 678 return false; 679 } 680 cmd = (NvmeCmd) { 681 .opcode = NVME_ADM_CMD_CREATE_CQ, 682 .dptr.prp1 = cpu_to_le64(q->cq.iova), 683 .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n), 684 .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC), 685 }; 686 if (nvme_admin_cmd_sync(bs, &cmd)) { 687 error_setg(errp, "Failed to create CQ io queue [%u]", n); 688 goto out_error; 689 } 690 cmd = (NvmeCmd) { 691 .opcode = NVME_ADM_CMD_CREATE_SQ, 692 .dptr.prp1 = cpu_to_le64(q->sq.iova), 693 .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n), 694 .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)), 695 }; 696 if (nvme_admin_cmd_sync(bs, &cmd)) { 697 error_setg(errp, "Failed to create SQ io queue [%u]", n); 698 goto out_error; 699 } 700 s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1); 701 s->queues[n] = q; 702 s->queue_count++; 703 return true; 704 out_error: 705 nvme_free_queue_pair(q); 706 return false; 707 } 708 709 static bool nvme_poll_cb(void *opaque) 710 { 711 EventNotifier *e = opaque; 712 BDRVNVMeState *s = container_of(e, BDRVNVMeState, 713 irq_notifier[MSIX_SHARED_IRQ_IDX]); 714 int i; 715 716 for (i = 0; i < s->queue_count; i++) { 717 NVMeQueuePair *q = s->queues[i]; 718 const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES; 719 NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset]; 720 721 /* 722 * q->lock isn't needed because nvme_process_completion() only runs in 723 * the event loop thread and cannot race with itself. 724 */ 725 if ((le16_to_cpu(cqe->status) & 0x1) != q->cq_phase) { 726 return true; 727 } 728 } 729 return false; 730 } 731 732 static void nvme_poll_ready(EventNotifier *e) 733 { 734 BDRVNVMeState *s = container_of(e, BDRVNVMeState, 735 irq_notifier[MSIX_SHARED_IRQ_IDX]); 736 737 nvme_poll_queues(s); 738 } 739 740 static int nvme_init(BlockDriverState *bs, const char *device, int namespace, 741 Error **errp) 742 { 743 BDRVNVMeState *s = bs->opaque; 744 NVMeQueuePair *q; 745 AioContext *aio_context = bdrv_get_aio_context(bs); 746 int ret; 747 uint64_t cap; 748 uint32_t ver; 749 uint64_t timeout_ms; 750 uint64_t deadline, now; 751 volatile NvmeBar *regs = NULL; 752 753 qemu_co_mutex_init(&s->dma_map_lock); 754 qemu_co_queue_init(&s->dma_flush_queue); 755 s->device = g_strdup(device); 756 s->nsid = namespace; 757 s->aio_context = bdrv_get_aio_context(bs); 758 ret = event_notifier_init(&s->irq_notifier[MSIX_SHARED_IRQ_IDX], 0); 759 if (ret) { 760 error_setg(errp, "Failed to init event notifier"); 761 return ret; 762 } 763 764 s->vfio = qemu_vfio_open_pci(device, errp); 765 if (!s->vfio) { 766 ret = -EINVAL; 767 goto out; 768 } 769 770 regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar), 771 PROT_READ | PROT_WRITE, errp); 772 if (!regs) { 773 ret = -EINVAL; 774 goto out; 775 } 776 /* Perform initialize sequence as described in NVMe spec "7.6.1 777 * Initialization". */ 778 779 cap = le64_to_cpu(regs->cap); 780 trace_nvme_controller_capability_raw(cap); 781 trace_nvme_controller_capability("Maximum Queue Entries Supported", 782 1 + NVME_CAP_MQES(cap)); 783 trace_nvme_controller_capability("Contiguous Queues Required", 784 NVME_CAP_CQR(cap)); 785 trace_nvme_controller_capability("Doorbell Stride", 786 1 << (2 + NVME_CAP_DSTRD(cap))); 787 trace_nvme_controller_capability("Subsystem Reset Supported", 788 NVME_CAP_NSSRS(cap)); 789 trace_nvme_controller_capability("Memory Page Size Minimum", 790 1 << (12 + NVME_CAP_MPSMIN(cap))); 791 trace_nvme_controller_capability("Memory Page Size Maximum", 792 1 << (12 + NVME_CAP_MPSMAX(cap))); 793 if (!NVME_CAP_CSS(cap)) { 794 error_setg(errp, "Device doesn't support NVMe command set"); 795 ret = -EINVAL; 796 goto out; 797 } 798 799 s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap)); 800 s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t); 801 bs->bl.opt_mem_alignment = s->page_size; 802 bs->bl.request_alignment = s->page_size; 803 timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000); 804 805 ver = le32_to_cpu(regs->vs); 806 trace_nvme_controller_spec_version(extract32(ver, 16, 16), 807 extract32(ver, 8, 8), 808 extract32(ver, 0, 8)); 809 810 /* Reset device to get a clean state. */ 811 regs->cc = cpu_to_le32(le32_to_cpu(regs->cc) & 0xFE); 812 /* Wait for CSTS.RDY = 0. */ 813 deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS; 814 while (NVME_CSTS_RDY(le32_to_cpu(regs->csts))) { 815 if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) { 816 error_setg(errp, "Timeout while waiting for device to reset (%" 817 PRId64 " ms)", 818 timeout_ms); 819 ret = -ETIMEDOUT; 820 goto out; 821 } 822 } 823 824 s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0, 825 sizeof(NvmeBar) + NVME_DOORBELL_SIZE, 826 PROT_WRITE, errp); 827 s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar)); 828 if (!s->doorbells) { 829 ret = -EINVAL; 830 goto out; 831 } 832 833 /* Set up admin queue. */ 834 s->queues = g_new(NVMeQueuePair *, 1); 835 q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp); 836 if (!q) { 837 ret = -EINVAL; 838 goto out; 839 } 840 s->queues[INDEX_ADMIN] = q; 841 s->queue_count = 1; 842 QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000); 843 regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) | 844 ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT)); 845 regs->asq = cpu_to_le64(q->sq.iova); 846 regs->acq = cpu_to_le64(q->cq.iova); 847 848 /* After setting up all control registers we can enable device now. */ 849 regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) | 850 (ctz32(NVME_SQ_ENTRY_BYTES) << CC_IOSQES_SHIFT) | 851 CC_EN_MASK); 852 /* Wait for CSTS.RDY = 1. */ 853 now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 854 deadline = now + timeout_ms * SCALE_MS; 855 while (!NVME_CSTS_RDY(le32_to_cpu(regs->csts))) { 856 if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) { 857 error_setg(errp, "Timeout while waiting for device to start (%" 858 PRId64 " ms)", 859 timeout_ms); 860 ret = -ETIMEDOUT; 861 goto out; 862 } 863 } 864 865 ret = qemu_vfio_pci_init_irq(s->vfio, s->irq_notifier, 866 VFIO_PCI_MSIX_IRQ_INDEX, errp); 867 if (ret) { 868 goto out; 869 } 870 aio_set_event_notifier(bdrv_get_aio_context(bs), 871 &s->irq_notifier[MSIX_SHARED_IRQ_IDX], 872 nvme_handle_event, nvme_poll_cb, 873 nvme_poll_ready); 874 875 if (!nvme_identify(bs, namespace, errp)) { 876 ret = -EIO; 877 goto out; 878 } 879 880 /* Set up command queues. */ 881 if (!nvme_add_io_queue(bs, errp)) { 882 ret = -EIO; 883 } 884 out: 885 if (regs) { 886 qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)regs, 0, sizeof(NvmeBar)); 887 } 888 889 /* Cleaning up is done in nvme_file_open() upon error. */ 890 return ret; 891 } 892 893 /* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example: 894 * 895 * nvme://0000:44:00.0/1 896 * 897 * where the "nvme://" is a fixed form of the protocol prefix, the middle part 898 * is the PCI address, and the last part is the namespace number starting from 899 * 1 according to the NVMe spec. */ 900 static void nvme_parse_filename(const char *filename, QDict *options, 901 Error **errp) 902 { 903 int pref = strlen("nvme://"); 904 905 if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) { 906 const char *tmp = filename + pref; 907 char *device; 908 const char *namespace; 909 unsigned long ns; 910 const char *slash = strchr(tmp, '/'); 911 if (!slash) { 912 qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp); 913 return; 914 } 915 device = g_strndup(tmp, slash - tmp); 916 qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device); 917 g_free(device); 918 namespace = slash + 1; 919 if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) { 920 error_setg(errp, "Invalid namespace '%s', positive number expected", 921 namespace); 922 return; 923 } 924 qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE, 925 *namespace ? namespace : "1"); 926 } 927 } 928 929 static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable, 930 Error **errp) 931 { 932 int ret; 933 BDRVNVMeState *s = bs->opaque; 934 NvmeCmd cmd = { 935 .opcode = NVME_ADM_CMD_SET_FEATURES, 936 .nsid = cpu_to_le32(s->nsid), 937 .cdw10 = cpu_to_le32(0x06), 938 .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00), 939 }; 940 941 ret = nvme_admin_cmd_sync(bs, &cmd); 942 if (ret) { 943 error_setg(errp, "Failed to configure NVMe write cache"); 944 } 945 return ret; 946 } 947 948 static void nvme_close(BlockDriverState *bs) 949 { 950 BDRVNVMeState *s = bs->opaque; 951 952 for (unsigned i = 0; i < s->queue_count; ++i) { 953 nvme_free_queue_pair(s->queues[i]); 954 } 955 g_free(s->queues); 956 aio_set_event_notifier(bdrv_get_aio_context(bs), 957 &s->irq_notifier[MSIX_SHARED_IRQ_IDX], 958 NULL, NULL, NULL); 959 event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]); 960 qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map, 961 0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE); 962 qemu_vfio_close(s->vfio); 963 964 g_free(s->device); 965 } 966 967 static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags, 968 Error **errp) 969 { 970 const char *device; 971 QemuOpts *opts; 972 int namespace; 973 int ret; 974 BDRVNVMeState *s = bs->opaque; 975 976 bs->supported_write_flags = BDRV_REQ_FUA; 977 978 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); 979 qemu_opts_absorb_qdict(opts, options, &error_abort); 980 device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE); 981 if (!device) { 982 error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required"); 983 qemu_opts_del(opts); 984 return -EINVAL; 985 } 986 987 namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1); 988 ret = nvme_init(bs, device, namespace, errp); 989 qemu_opts_del(opts); 990 if (ret) { 991 goto fail; 992 } 993 if (flags & BDRV_O_NOCACHE) { 994 if (!s->write_cache_supported) { 995 error_setg(errp, 996 "NVMe controller doesn't support write cache configuration"); 997 ret = -EINVAL; 998 } else { 999 ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE), 1000 errp); 1001 } 1002 if (ret) { 1003 goto fail; 1004 } 1005 } 1006 return 0; 1007 fail: 1008 nvme_close(bs); 1009 return ret; 1010 } 1011 1012 static int64_t coroutine_fn nvme_co_getlength(BlockDriverState *bs) 1013 { 1014 BDRVNVMeState *s = bs->opaque; 1015 return s->nsze << s->blkshift; 1016 } 1017 1018 static uint32_t nvme_get_blocksize(BlockDriverState *bs) 1019 { 1020 BDRVNVMeState *s = bs->opaque; 1021 assert(s->blkshift >= BDRV_SECTOR_BITS && s->blkshift <= 12); 1022 return UINT32_C(1) << s->blkshift; 1023 } 1024 1025 static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) 1026 { 1027 uint32_t blocksize = nvme_get_blocksize(bs); 1028 bsz->phys = blocksize; 1029 bsz->log = blocksize; 1030 return 0; 1031 } 1032 1033 /* Called with s->dma_map_lock */ 1034 static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs, 1035 QEMUIOVector *qiov) 1036 { 1037 int r = 0; 1038 BDRVNVMeState *s = bs->opaque; 1039 1040 s->dma_map_count -= qiov->size; 1041 if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) { 1042 r = qemu_vfio_dma_reset_temporary(s->vfio); 1043 if (!r) { 1044 qemu_co_queue_restart_all(&s->dma_flush_queue); 1045 } 1046 } 1047 return r; 1048 } 1049 1050 /* Called with s->dma_map_lock */ 1051 static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd, 1052 NVMeRequest *req, QEMUIOVector *qiov) 1053 { 1054 BDRVNVMeState *s = bs->opaque; 1055 uint64_t *pagelist = req->prp_list_page; 1056 int i, j, r; 1057 int entries = 0; 1058 Error *local_err = NULL, **errp = NULL; 1059 1060 assert(qiov->size); 1061 assert(QEMU_IS_ALIGNED(qiov->size, s->page_size)); 1062 assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t)); 1063 for (i = 0; i < qiov->niov; ++i) { 1064 bool retry = true; 1065 uint64_t iova; 1066 size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len, 1067 qemu_real_host_page_size()); 1068 try_map: 1069 r = qemu_vfio_dma_map(s->vfio, 1070 qiov->iov[i].iov_base, 1071 len, true, &iova, errp); 1072 if (r == -ENOSPC) { 1073 /* 1074 * In addition to the -ENOMEM error, the VFIO_IOMMU_MAP_DMA 1075 * ioctl returns -ENOSPC to signal the user exhausted the DMA 1076 * mappings available for a container since Linux kernel commit 1077 * 492855939bdb ("vfio/type1: Limit DMA mappings per container", 1078 * April 2019, see CVE-2019-3882). 1079 * 1080 * This block driver already handles this error path by checking 1081 * for the -ENOMEM error, so we directly replace -ENOSPC by 1082 * -ENOMEM. Beside, -ENOSPC has a specific meaning for blockdev 1083 * coroutines: it triggers BLOCKDEV_ON_ERROR_ENOSPC and 1084 * BLOCK_ERROR_ACTION_STOP which stops the VM, asking the operator 1085 * to add more storage to the blockdev. Not something we can do 1086 * easily with an IOMMU :) 1087 */ 1088 r = -ENOMEM; 1089 } 1090 if (r == -ENOMEM && retry) { 1091 /* 1092 * We exhausted the DMA mappings available for our container: 1093 * recycle the volatile IOVA mappings. 1094 */ 1095 retry = false; 1096 trace_nvme_dma_flush_queue_wait(s); 1097 if (s->dma_map_count) { 1098 trace_nvme_dma_map_flush(s); 1099 qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock); 1100 } else { 1101 r = qemu_vfio_dma_reset_temporary(s->vfio); 1102 if (r) { 1103 goto fail; 1104 } 1105 } 1106 errp = &local_err; 1107 1108 goto try_map; 1109 } 1110 if (r) { 1111 goto fail; 1112 } 1113 1114 for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) { 1115 pagelist[entries++] = cpu_to_le64(iova + j * s->page_size); 1116 } 1117 trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base, 1118 qiov->iov[i].iov_len / s->page_size); 1119 } 1120 1121 s->dma_map_count += qiov->size; 1122 1123 assert(entries <= s->page_size / sizeof(uint64_t)); 1124 switch (entries) { 1125 case 0: 1126 abort(); 1127 case 1: 1128 cmd->dptr.prp1 = pagelist[0]; 1129 cmd->dptr.prp2 = 0; 1130 break; 1131 case 2: 1132 cmd->dptr.prp1 = pagelist[0]; 1133 cmd->dptr.prp2 = pagelist[1]; 1134 break; 1135 default: 1136 cmd->dptr.prp1 = pagelist[0]; 1137 cmd->dptr.prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t)); 1138 break; 1139 } 1140 trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries); 1141 for (i = 0; i < entries; ++i) { 1142 trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]); 1143 } 1144 return 0; 1145 fail: 1146 /* No need to unmap [0 - i) iovs even if we've failed, since we don't 1147 * increment s->dma_map_count. This is okay for fixed mapping memory areas 1148 * because they are already mapped before calling this function; for 1149 * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by 1150 * calling qemu_vfio_dma_reset_temporary when necessary. */ 1151 if (local_err) { 1152 error_reportf_err(local_err, "Cannot map buffer for DMA: "); 1153 } 1154 return r; 1155 } 1156 1157 typedef struct { 1158 Coroutine *co; 1159 int ret; 1160 AioContext *ctx; 1161 } NVMeCoData; 1162 1163 static void nvme_rw_cb_bh(void *opaque) 1164 { 1165 NVMeCoData *data = opaque; 1166 qemu_coroutine_enter(data->co); 1167 } 1168 1169 static void nvme_rw_cb(void *opaque, int ret) 1170 { 1171 NVMeCoData *data = opaque; 1172 data->ret = ret; 1173 if (!data->co) { 1174 /* The rw coroutine hasn't yielded, don't try to enter. */ 1175 return; 1176 } 1177 replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data); 1178 } 1179 1180 static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs, 1181 uint64_t offset, uint64_t bytes, 1182 QEMUIOVector *qiov, 1183 bool is_write, 1184 int flags) 1185 { 1186 int r; 1187 BDRVNVMeState *s = bs->opaque; 1188 NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; 1189 NVMeRequest *req; 1190 1191 uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) | 1192 (flags & BDRV_REQ_FUA ? 1 << 30 : 0); 1193 NvmeCmd cmd = { 1194 .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ, 1195 .nsid = cpu_to_le32(s->nsid), 1196 .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF), 1197 .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF), 1198 .cdw12 = cpu_to_le32(cdw12), 1199 }; 1200 NVMeCoData data = { 1201 .ctx = bdrv_get_aio_context(bs), 1202 .ret = -EINPROGRESS, 1203 }; 1204 1205 trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov); 1206 assert(s->queue_count > 1); 1207 req = nvme_get_free_req(ioq); 1208 assert(req); 1209 1210 qemu_co_mutex_lock(&s->dma_map_lock); 1211 r = nvme_cmd_map_qiov(bs, &cmd, req, qiov); 1212 qemu_co_mutex_unlock(&s->dma_map_lock); 1213 if (r) { 1214 nvme_put_free_req_and_wake(ioq, req); 1215 return r; 1216 } 1217 nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); 1218 1219 data.co = qemu_coroutine_self(); 1220 while (data.ret == -EINPROGRESS) { 1221 qemu_coroutine_yield(); 1222 } 1223 1224 qemu_co_mutex_lock(&s->dma_map_lock); 1225 r = nvme_cmd_unmap_qiov(bs, qiov); 1226 qemu_co_mutex_unlock(&s->dma_map_lock); 1227 if (r) { 1228 return r; 1229 } 1230 1231 trace_nvme_rw_done(s, is_write, offset, bytes, data.ret); 1232 return data.ret; 1233 } 1234 1235 static inline bool nvme_qiov_aligned(BlockDriverState *bs, 1236 const QEMUIOVector *qiov) 1237 { 1238 int i; 1239 BDRVNVMeState *s = bs->opaque; 1240 1241 for (i = 0; i < qiov->niov; ++i) { 1242 if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, 1243 qemu_real_host_page_size()) || 1244 !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size())) { 1245 trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base, 1246 qiov->iov[i].iov_len, s->page_size); 1247 return false; 1248 } 1249 } 1250 return true; 1251 } 1252 1253 static coroutine_fn int nvme_co_prw(BlockDriverState *bs, 1254 uint64_t offset, uint64_t bytes, 1255 QEMUIOVector *qiov, bool is_write, 1256 int flags) 1257 { 1258 BDRVNVMeState *s = bs->opaque; 1259 int r; 1260 QEMU_AUTO_VFREE uint8_t *buf = NULL; 1261 QEMUIOVector local_qiov; 1262 size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size()); 1263 assert(QEMU_IS_ALIGNED(offset, s->page_size)); 1264 assert(QEMU_IS_ALIGNED(bytes, s->page_size)); 1265 assert(bytes <= s->max_transfer); 1266 if (nvme_qiov_aligned(bs, qiov)) { 1267 s->stats.aligned_accesses++; 1268 return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags); 1269 } 1270 s->stats.unaligned_accesses++; 1271 trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write); 1272 buf = qemu_try_memalign(qemu_real_host_page_size(), len); 1273 1274 if (!buf) { 1275 return -ENOMEM; 1276 } 1277 qemu_iovec_init(&local_qiov, 1); 1278 if (is_write) { 1279 qemu_iovec_to_buf(qiov, 0, buf, bytes); 1280 } 1281 qemu_iovec_add(&local_qiov, buf, bytes); 1282 r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags); 1283 qemu_iovec_destroy(&local_qiov); 1284 if (!r && !is_write) { 1285 qemu_iovec_from_buf(qiov, 0, buf, bytes); 1286 } 1287 return r; 1288 } 1289 1290 static coroutine_fn int nvme_co_preadv(BlockDriverState *bs, 1291 int64_t offset, int64_t bytes, 1292 QEMUIOVector *qiov, 1293 BdrvRequestFlags flags) 1294 { 1295 return nvme_co_prw(bs, offset, bytes, qiov, false, flags); 1296 } 1297 1298 static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs, 1299 int64_t offset, int64_t bytes, 1300 QEMUIOVector *qiov, 1301 BdrvRequestFlags flags) 1302 { 1303 return nvme_co_prw(bs, offset, bytes, qiov, true, flags); 1304 } 1305 1306 static coroutine_fn int nvme_co_flush(BlockDriverState *bs) 1307 { 1308 BDRVNVMeState *s = bs->opaque; 1309 NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; 1310 NVMeRequest *req; 1311 NvmeCmd cmd = { 1312 .opcode = NVME_CMD_FLUSH, 1313 .nsid = cpu_to_le32(s->nsid), 1314 }; 1315 NVMeCoData data = { 1316 .ctx = bdrv_get_aio_context(bs), 1317 .ret = -EINPROGRESS, 1318 }; 1319 1320 assert(s->queue_count > 1); 1321 req = nvme_get_free_req(ioq); 1322 assert(req); 1323 nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); 1324 1325 data.co = qemu_coroutine_self(); 1326 if (data.ret == -EINPROGRESS) { 1327 qemu_coroutine_yield(); 1328 } 1329 1330 return data.ret; 1331 } 1332 1333 1334 static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs, 1335 int64_t offset, 1336 int64_t bytes, 1337 BdrvRequestFlags flags) 1338 { 1339 BDRVNVMeState *s = bs->opaque; 1340 NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; 1341 NVMeRequest *req; 1342 uint32_t cdw12; 1343 1344 if (!s->supports_write_zeroes) { 1345 return -ENOTSUP; 1346 } 1347 1348 if (bytes == 0) { 1349 return 0; 1350 } 1351 1352 cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF; 1353 /* 1354 * We should not lose information. pwrite_zeroes_alignment and 1355 * max_pwrite_zeroes guarantees it. 1356 */ 1357 assert(((cdw12 + 1) << s->blkshift) == bytes); 1358 1359 NvmeCmd cmd = { 1360 .opcode = NVME_CMD_WRITE_ZEROES, 1361 .nsid = cpu_to_le32(s->nsid), 1362 .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF), 1363 .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF), 1364 }; 1365 1366 NVMeCoData data = { 1367 .ctx = bdrv_get_aio_context(bs), 1368 .ret = -EINPROGRESS, 1369 }; 1370 1371 if (flags & BDRV_REQ_MAY_UNMAP) { 1372 cdw12 |= (1 << 25); 1373 } 1374 1375 if (flags & BDRV_REQ_FUA) { 1376 cdw12 |= (1 << 30); 1377 } 1378 1379 cmd.cdw12 = cpu_to_le32(cdw12); 1380 1381 trace_nvme_write_zeroes(s, offset, bytes, flags); 1382 assert(s->queue_count > 1); 1383 req = nvme_get_free_req(ioq); 1384 assert(req); 1385 1386 nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); 1387 1388 data.co = qemu_coroutine_self(); 1389 while (data.ret == -EINPROGRESS) { 1390 qemu_coroutine_yield(); 1391 } 1392 1393 trace_nvme_rw_done(s, true, offset, bytes, data.ret); 1394 return data.ret; 1395 } 1396 1397 1398 static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, 1399 int64_t offset, 1400 int64_t bytes) 1401 { 1402 BDRVNVMeState *s = bs->opaque; 1403 NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; 1404 NVMeRequest *req; 1405 QEMU_AUTO_VFREE NvmeDsmRange *buf = NULL; 1406 QEMUIOVector local_qiov; 1407 int ret; 1408 1409 NvmeCmd cmd = { 1410 .opcode = NVME_CMD_DSM, 1411 .nsid = cpu_to_le32(s->nsid), 1412 .cdw10 = cpu_to_le32(0), /*number of ranges - 0 based*/ 1413 .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/ 1414 }; 1415 1416 NVMeCoData data = { 1417 .ctx = bdrv_get_aio_context(bs), 1418 .ret = -EINPROGRESS, 1419 }; 1420 1421 if (!s->supports_discard) { 1422 return -ENOTSUP; 1423 } 1424 1425 assert(s->queue_count > 1); 1426 1427 /* 1428 * Filling the @buf requires @offset and @bytes to satisfy restrictions 1429 * defined in nvme_refresh_limits(). 1430 */ 1431 assert(QEMU_IS_ALIGNED(bytes, 1UL << s->blkshift)); 1432 assert(QEMU_IS_ALIGNED(offset, 1UL << s->blkshift)); 1433 assert((bytes >> s->blkshift) <= UINT32_MAX); 1434 1435 buf = qemu_try_memalign(s->page_size, s->page_size); 1436 if (!buf) { 1437 return -ENOMEM; 1438 } 1439 memset(buf, 0, s->page_size); 1440 buf->nlb = cpu_to_le32(bytes >> s->blkshift); 1441 buf->slba = cpu_to_le64(offset >> s->blkshift); 1442 buf->cattr = 0; 1443 1444 qemu_iovec_init(&local_qiov, 1); 1445 qemu_iovec_add(&local_qiov, buf, 4096); 1446 1447 req = nvme_get_free_req(ioq); 1448 assert(req); 1449 1450 qemu_co_mutex_lock(&s->dma_map_lock); 1451 ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov); 1452 qemu_co_mutex_unlock(&s->dma_map_lock); 1453 1454 if (ret) { 1455 nvme_put_free_req_and_wake(ioq, req); 1456 goto out; 1457 } 1458 1459 trace_nvme_dsm(s, offset, bytes); 1460 1461 nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); 1462 1463 data.co = qemu_coroutine_self(); 1464 while (data.ret == -EINPROGRESS) { 1465 qemu_coroutine_yield(); 1466 } 1467 1468 qemu_co_mutex_lock(&s->dma_map_lock); 1469 ret = nvme_cmd_unmap_qiov(bs, &local_qiov); 1470 qemu_co_mutex_unlock(&s->dma_map_lock); 1471 1472 if (ret) { 1473 goto out; 1474 } 1475 1476 ret = data.ret; 1477 trace_nvme_dsm_done(s, offset, bytes, ret); 1478 out: 1479 qemu_iovec_destroy(&local_qiov); 1480 return ret; 1481 1482 } 1483 1484 static int coroutine_fn nvme_co_truncate(BlockDriverState *bs, int64_t offset, 1485 bool exact, PreallocMode prealloc, 1486 BdrvRequestFlags flags, Error **errp) 1487 { 1488 int64_t cur_length; 1489 1490 if (prealloc != PREALLOC_MODE_OFF) { 1491 error_setg(errp, "Unsupported preallocation mode '%s'", 1492 PreallocMode_str(prealloc)); 1493 return -ENOTSUP; 1494 } 1495 1496 cur_length = nvme_co_getlength(bs); 1497 if (offset != cur_length && exact) { 1498 error_setg(errp, "Cannot resize NVMe devices"); 1499 return -ENOTSUP; 1500 } else if (offset > cur_length) { 1501 error_setg(errp, "Cannot grow NVMe devices"); 1502 return -EINVAL; 1503 } 1504 1505 return 0; 1506 } 1507 1508 static int nvme_reopen_prepare(BDRVReopenState *reopen_state, 1509 BlockReopenQueue *queue, Error **errp) 1510 { 1511 return 0; 1512 } 1513 1514 static void nvme_refresh_filename(BlockDriverState *bs) 1515 { 1516 BDRVNVMeState *s = bs->opaque; 1517 1518 snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i", 1519 s->device, s->nsid); 1520 } 1521 1522 static void nvme_refresh_limits(BlockDriverState *bs, Error **errp) 1523 { 1524 BDRVNVMeState *s = bs->opaque; 1525 1526 bs->bl.opt_mem_alignment = s->page_size; 1527 bs->bl.request_alignment = s->page_size; 1528 bs->bl.max_transfer = s->max_transfer; 1529 1530 /* 1531 * Look at nvme_co_pwrite_zeroes: after shift and decrement we should get 1532 * at most 0xFFFF 1533 */ 1534 bs->bl.max_pwrite_zeroes = 1ULL << (s->blkshift + 16); 1535 bs->bl.pwrite_zeroes_alignment = MAX(bs->bl.request_alignment, 1536 1UL << s->blkshift); 1537 1538 bs->bl.max_pdiscard = (uint64_t)UINT32_MAX << s->blkshift; 1539 bs->bl.pdiscard_alignment = MAX(bs->bl.request_alignment, 1540 1UL << s->blkshift); 1541 } 1542 1543 static void nvme_detach_aio_context(BlockDriverState *bs) 1544 { 1545 BDRVNVMeState *s = bs->opaque; 1546 1547 for (unsigned i = 0; i < s->queue_count; i++) { 1548 NVMeQueuePair *q = s->queues[i]; 1549 1550 qemu_bh_delete(q->completion_bh); 1551 q->completion_bh = NULL; 1552 } 1553 1554 aio_set_event_notifier(bdrv_get_aio_context(bs), 1555 &s->irq_notifier[MSIX_SHARED_IRQ_IDX], 1556 NULL, NULL, NULL); 1557 } 1558 1559 static void nvme_attach_aio_context(BlockDriverState *bs, 1560 AioContext *new_context) 1561 { 1562 BDRVNVMeState *s = bs->opaque; 1563 1564 s->aio_context = new_context; 1565 aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX], 1566 nvme_handle_event, nvme_poll_cb, 1567 nvme_poll_ready); 1568 1569 for (unsigned i = 0; i < s->queue_count; i++) { 1570 NVMeQueuePair *q = s->queues[i]; 1571 1572 q->completion_bh = 1573 aio_bh_new(new_context, nvme_process_completion_bh, q); 1574 } 1575 } 1576 1577 static bool nvme_register_buf(BlockDriverState *bs, void *host, size_t size, 1578 Error **errp) 1579 { 1580 int ret; 1581 BDRVNVMeState *s = bs->opaque; 1582 1583 /* 1584 * FIXME: we may run out of IOVA addresses after repeated 1585 * bdrv_register_buf/bdrv_unregister_buf, because nvme_vfio_dma_unmap 1586 * doesn't reclaim addresses for fixed mappings. 1587 */ 1588 ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL, errp); 1589 return ret == 0; 1590 } 1591 1592 static void nvme_unregister_buf(BlockDriverState *bs, void *host, size_t size) 1593 { 1594 BDRVNVMeState *s = bs->opaque; 1595 1596 qemu_vfio_dma_unmap(s->vfio, host); 1597 } 1598 1599 static BlockStatsSpecific *nvme_get_specific_stats(BlockDriverState *bs) 1600 { 1601 BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1); 1602 BDRVNVMeState *s = bs->opaque; 1603 1604 stats->driver = BLOCKDEV_DRIVER_NVME; 1605 stats->u.nvme = (BlockStatsSpecificNvme) { 1606 .completion_errors = s->stats.completion_errors, 1607 .aligned_accesses = s->stats.aligned_accesses, 1608 .unaligned_accesses = s->stats.unaligned_accesses, 1609 }; 1610 1611 return stats; 1612 } 1613 1614 static const char *const nvme_strong_runtime_opts[] = { 1615 NVME_BLOCK_OPT_DEVICE, 1616 NVME_BLOCK_OPT_NAMESPACE, 1617 1618 NULL 1619 }; 1620 1621 static BlockDriver bdrv_nvme = { 1622 .format_name = "nvme", 1623 .protocol_name = "nvme", 1624 .instance_size = sizeof(BDRVNVMeState), 1625 1626 .bdrv_co_create_opts = bdrv_co_create_opts_simple, 1627 .create_opts = &bdrv_create_opts_simple, 1628 1629 .bdrv_parse_filename = nvme_parse_filename, 1630 .bdrv_file_open = nvme_file_open, 1631 .bdrv_close = nvme_close, 1632 .bdrv_co_getlength = nvme_co_getlength, 1633 .bdrv_probe_blocksizes = nvme_probe_blocksizes, 1634 .bdrv_co_truncate = nvme_co_truncate, 1635 1636 .bdrv_co_preadv = nvme_co_preadv, 1637 .bdrv_co_pwritev = nvme_co_pwritev, 1638 1639 .bdrv_co_pwrite_zeroes = nvme_co_pwrite_zeroes, 1640 .bdrv_co_pdiscard = nvme_co_pdiscard, 1641 1642 .bdrv_co_flush_to_disk = nvme_co_flush, 1643 .bdrv_reopen_prepare = nvme_reopen_prepare, 1644 1645 .bdrv_refresh_filename = nvme_refresh_filename, 1646 .bdrv_refresh_limits = nvme_refresh_limits, 1647 .strong_runtime_opts = nvme_strong_runtime_opts, 1648 .bdrv_get_specific_stats = nvme_get_specific_stats, 1649 1650 .bdrv_detach_aio_context = nvme_detach_aio_context, 1651 .bdrv_attach_aio_context = nvme_attach_aio_context, 1652 1653 .bdrv_register_buf = nvme_register_buf, 1654 .bdrv_unregister_buf = nvme_unregister_buf, 1655 }; 1656 1657 static void bdrv_nvme_init(void) 1658 { 1659 bdrv_register(&bdrv_nvme); 1660 } 1661 1662 block_init(bdrv_nvme_init); 1663