1 /* 2 * NVMe block driver based on vfio 3 * 4 * Copyright 2016 - 2018 Red Hat, Inc. 5 * 6 * Authors: 7 * Fam Zheng <famz@redhat.com> 8 * Paolo Bonzini <pbonzini@redhat.com> 9 * 10 * This work is licensed under the terms of the GNU GPL, version 2 or later. 11 * See the COPYING file in the top-level directory. 12 */ 13 14 #include "qemu/osdep.h" 15 #include <linux/vfio.h> 16 #include "qapi/error.h" 17 #include "qapi/qmp/qdict.h" 18 #include "qapi/qmp/qstring.h" 19 #include "qemu/defer-call.h" 20 #include "qemu/error-report.h" 21 #include "qemu/main-loop.h" 22 #include "qemu/module.h" 23 #include "qemu/cutils.h" 24 #include "qemu/option.h" 25 #include "qemu/memalign.h" 26 #include "qemu/vfio-helpers.h" 27 #include "block/block-io.h" 28 #include "block/block_int.h" 29 #include "sysemu/block-backend.h" 30 #include "sysemu/replay.h" 31 #include "trace.h" 32 33 #include "block/nvme.h" 34 35 #define NVME_SQ_ENTRY_BYTES 64 36 #define NVME_CQ_ENTRY_BYTES 16 37 #define NVME_QUEUE_SIZE 128 38 #define NVME_DOORBELL_SIZE 4096 39 40 /* 41 * We have to leave one slot empty as that is the full queue case where 42 * head == tail + 1. 43 */ 44 #define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1) 45 46 typedef struct BDRVNVMeState BDRVNVMeState; 47 48 /* Same index is used for queues and IRQs */ 49 #define INDEX_ADMIN 0 50 #define INDEX_IO(n) (1 + n) 51 52 /* This driver shares a single MSIX IRQ for the admin and I/O queues */ 53 enum { 54 MSIX_SHARED_IRQ_IDX = 0, 55 MSIX_IRQ_COUNT = 1 56 }; 57 58 typedef struct { 59 int32_t head, tail; 60 uint8_t *queue; 61 uint64_t iova; 62 /* Hardware MMIO register */ 63 volatile uint32_t *doorbell; 64 } NVMeQueue; 65 66 typedef struct { 67 BlockCompletionFunc *cb; 68 void *opaque; 69 int cid; 70 void *prp_list_page; 71 uint64_t prp_list_iova; 72 int free_req_next; /* q->reqs[] index of next free req */ 73 } NVMeRequest; 74 75 typedef struct { 76 QemuMutex lock; 77 78 /* Read from I/O code path, initialized under BQL */ 79 BDRVNVMeState *s; 80 int index; 81 82 /* Fields protected by BQL */ 83 uint8_t *prp_list_pages; 84 85 /* Fields protected by @lock */ 86 CoQueue free_req_queue; 87 NVMeQueue sq, cq; 88 int cq_phase; 89 int free_req_head; 90 NVMeRequest reqs[NVME_NUM_REQS]; 91 int need_kick; 92 int inflight; 93 94 /* Thread-safe, no lock necessary */ 95 QEMUBH *completion_bh; 96 } NVMeQueuePair; 97 98 struct BDRVNVMeState { 99 AioContext *aio_context; 100 QEMUVFIOState *vfio; 101 void *bar0_wo_map; 102 /* Memory mapped registers */ 103 volatile struct { 104 uint32_t sq_tail; 105 uint32_t cq_head; 106 } *doorbells; 107 /* The submission/completion queue pairs. 108 * [0]: admin queue. 109 * [1..]: io queues. 110 */ 111 NVMeQueuePair **queues; 112 unsigned queue_count; 113 size_t page_size; 114 /* How many uint32_t elements does each doorbell entry take. */ 115 size_t doorbell_scale; 116 bool write_cache_supported; 117 EventNotifier irq_notifier[MSIX_IRQ_COUNT]; 118 119 uint64_t nsze; /* Namespace size reported by identify command */ 120 int nsid; /* The namespace id to read/write data. */ 121 int blkshift; 122 123 uint64_t max_transfer; 124 125 bool supports_write_zeroes; 126 bool supports_discard; 127 128 CoMutex dma_map_lock; 129 CoQueue dma_flush_queue; 130 131 /* Total size of mapped qiov, accessed under dma_map_lock */ 132 int dma_map_count; 133 134 /* PCI address (required for nvme_refresh_filename()) */ 135 char *device; 136 137 struct { 138 uint64_t completion_errors; 139 uint64_t aligned_accesses; 140 uint64_t unaligned_accesses; 141 } stats; 142 }; 143 144 #define NVME_BLOCK_OPT_DEVICE "device" 145 #define NVME_BLOCK_OPT_NAMESPACE "namespace" 146 147 static void nvme_process_completion_bh(void *opaque); 148 149 static QemuOptsList runtime_opts = { 150 .name = "nvme", 151 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), 152 .desc = { 153 { 154 .name = NVME_BLOCK_OPT_DEVICE, 155 .type = QEMU_OPT_STRING, 156 .help = "NVMe PCI device address", 157 }, 158 { 159 .name = NVME_BLOCK_OPT_NAMESPACE, 160 .type = QEMU_OPT_NUMBER, 161 .help = "NVMe namespace", 162 }, 163 { /* end of list */ } 164 }, 165 }; 166 167 /* Returns true on success, false on failure. */ 168 static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q, 169 unsigned nentries, size_t entry_bytes, Error **errp) 170 { 171 size_t bytes; 172 int r; 173 174 bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size()); 175 q->head = q->tail = 0; 176 q->queue = qemu_try_memalign(qemu_real_host_page_size(), bytes); 177 if (!q->queue) { 178 error_setg(errp, "Cannot allocate queue"); 179 return false; 180 } 181 memset(q->queue, 0, bytes); 182 r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova, errp); 183 if (r) { 184 error_prepend(errp, "Cannot map queue: "); 185 } 186 return r == 0; 187 } 188 189 static void nvme_free_queue(NVMeQueue *q) 190 { 191 qemu_vfree(q->queue); 192 } 193 194 static void nvme_free_queue_pair(NVMeQueuePair *q) 195 { 196 trace_nvme_free_queue_pair(q->index, q, &q->cq, &q->sq); 197 if (q->completion_bh) { 198 qemu_bh_delete(q->completion_bh); 199 } 200 nvme_free_queue(&q->sq); 201 nvme_free_queue(&q->cq); 202 qemu_vfree(q->prp_list_pages); 203 qemu_mutex_destroy(&q->lock); 204 g_free(q); 205 } 206 207 static void nvme_free_req_queue_cb(void *opaque) 208 { 209 NVMeQueuePair *q = opaque; 210 211 qemu_mutex_lock(&q->lock); 212 while (q->free_req_head != -1 && 213 qemu_co_enter_next(&q->free_req_queue, &q->lock)) { 214 /* Retry waiting requests */ 215 } 216 qemu_mutex_unlock(&q->lock); 217 } 218 219 static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s, 220 AioContext *aio_context, 221 unsigned idx, size_t size, 222 Error **errp) 223 { 224 int i, r; 225 NVMeQueuePair *q; 226 uint64_t prp_list_iova; 227 size_t bytes; 228 229 q = g_try_new0(NVMeQueuePair, 1); 230 if (!q) { 231 error_setg(errp, "Cannot allocate queue pair"); 232 return NULL; 233 } 234 trace_nvme_create_queue_pair(idx, q, size, aio_context, 235 event_notifier_get_fd(s->irq_notifier)); 236 bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS, 237 qemu_real_host_page_size()); 238 q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size(), bytes); 239 if (!q->prp_list_pages) { 240 error_setg(errp, "Cannot allocate PRP page list"); 241 goto fail; 242 } 243 memset(q->prp_list_pages, 0, bytes); 244 qemu_mutex_init(&q->lock); 245 q->s = s; 246 q->index = idx; 247 qemu_co_queue_init(&q->free_req_queue); 248 q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q); 249 r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes, 250 false, &prp_list_iova, errp); 251 if (r) { 252 error_prepend(errp, "Cannot map buffer for DMA: "); 253 goto fail; 254 } 255 q->free_req_head = -1; 256 for (i = 0; i < NVME_NUM_REQS; i++) { 257 NVMeRequest *req = &q->reqs[i]; 258 req->cid = i + 1; 259 req->free_req_next = q->free_req_head; 260 q->free_req_head = i; 261 req->prp_list_page = q->prp_list_pages + i * s->page_size; 262 req->prp_list_iova = prp_list_iova + i * s->page_size; 263 } 264 265 if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) { 266 goto fail; 267 } 268 q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail; 269 270 if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) { 271 goto fail; 272 } 273 q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head; 274 275 return q; 276 fail: 277 nvme_free_queue_pair(q); 278 return NULL; 279 } 280 281 /* With q->lock */ 282 static void nvme_kick(NVMeQueuePair *q) 283 { 284 BDRVNVMeState *s = q->s; 285 286 if (!q->need_kick) { 287 return; 288 } 289 trace_nvme_kick(s, q->index); 290 assert(!(q->sq.tail & 0xFF00)); 291 /* Fence the write to submission queue entry before notifying the device. */ 292 smp_wmb(); 293 *q->sq.doorbell = cpu_to_le32(q->sq.tail); 294 q->inflight += q->need_kick; 295 q->need_kick = 0; 296 } 297 298 static NVMeRequest *nvme_get_free_req_nofail_locked(NVMeQueuePair *q) 299 { 300 NVMeRequest *req; 301 302 req = &q->reqs[q->free_req_head]; 303 q->free_req_head = req->free_req_next; 304 req->free_req_next = -1; 305 return req; 306 } 307 308 /* Return a free request element if any, otherwise return NULL. */ 309 static NVMeRequest *nvme_get_free_req_nowait(NVMeQueuePair *q) 310 { 311 QEMU_LOCK_GUARD(&q->lock); 312 if (q->free_req_head == -1) { 313 return NULL; 314 } 315 return nvme_get_free_req_nofail_locked(q); 316 } 317 318 /* 319 * Wait for a free request to become available if necessary, then 320 * return it. 321 */ 322 static coroutine_fn NVMeRequest *nvme_get_free_req(NVMeQueuePair *q) 323 { 324 QEMU_LOCK_GUARD(&q->lock); 325 326 while (q->free_req_head == -1) { 327 trace_nvme_free_req_queue_wait(q->s, q->index); 328 qemu_co_queue_wait(&q->free_req_queue, &q->lock); 329 } 330 331 return nvme_get_free_req_nofail_locked(q); 332 } 333 334 /* With q->lock */ 335 static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req) 336 { 337 req->free_req_next = q->free_req_head; 338 q->free_req_head = req - q->reqs; 339 } 340 341 /* With q->lock */ 342 static void nvme_wake_free_req_locked(NVMeQueuePair *q) 343 { 344 if (!qemu_co_queue_empty(&q->free_req_queue)) { 345 replay_bh_schedule_oneshot_event(q->s->aio_context, 346 nvme_free_req_queue_cb, q); 347 } 348 } 349 350 /* Insert a request in the freelist and wake waiters */ 351 static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req) 352 { 353 qemu_mutex_lock(&q->lock); 354 nvme_put_free_req_locked(q, req); 355 nvme_wake_free_req_locked(q); 356 qemu_mutex_unlock(&q->lock); 357 } 358 359 static inline int nvme_translate_error(const NvmeCqe *c) 360 { 361 uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF; 362 if (status) { 363 trace_nvme_error(le32_to_cpu(c->result), 364 le16_to_cpu(c->sq_head), 365 le16_to_cpu(c->sq_id), 366 le16_to_cpu(c->cid), 367 le16_to_cpu(status)); 368 } 369 switch (status) { 370 case 0: 371 return 0; 372 case 1: 373 return -ENOSYS; 374 case 2: 375 return -EINVAL; 376 default: 377 return -EIO; 378 } 379 } 380 381 /* With q->lock */ 382 static bool nvme_process_completion(NVMeQueuePair *q) 383 { 384 BDRVNVMeState *s = q->s; 385 bool progress = false; 386 NVMeRequest *preq; 387 NVMeRequest req; 388 NvmeCqe *c; 389 390 trace_nvme_process_completion(s, q->index, q->inflight); 391 392 /* 393 * Support re-entrancy when a request cb() function invokes aio_poll(). 394 * Pending completions must be visible to aio_poll() so that a cb() 395 * function can wait for the completion of another request. 396 * 397 * The aio_poll() loop will execute our BH and we'll resume completion 398 * processing there. 399 */ 400 qemu_bh_schedule(q->completion_bh); 401 402 assert(q->inflight >= 0); 403 while (q->inflight) { 404 int ret; 405 int16_t cid; 406 407 c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES]; 408 if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) { 409 break; 410 } 411 ret = nvme_translate_error(c); 412 if (ret) { 413 s->stats.completion_errors++; 414 } 415 q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE; 416 if (!q->cq.head) { 417 q->cq_phase = !q->cq_phase; 418 } 419 cid = le16_to_cpu(c->cid); 420 if (cid == 0 || cid > NVME_QUEUE_SIZE) { 421 warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", " 422 "queue size: %u", cid, NVME_QUEUE_SIZE); 423 continue; 424 } 425 trace_nvme_complete_command(s, q->index, cid); 426 preq = &q->reqs[cid - 1]; 427 req = *preq; 428 assert(req.cid == cid); 429 assert(req.cb); 430 nvme_put_free_req_locked(q, preq); 431 preq->cb = preq->opaque = NULL; 432 q->inflight--; 433 qemu_mutex_unlock(&q->lock); 434 req.cb(req.opaque, ret); 435 qemu_mutex_lock(&q->lock); 436 progress = true; 437 } 438 if (progress) { 439 /* Notify the device so it can post more completions. */ 440 smp_mb_release(); 441 *q->cq.doorbell = cpu_to_le32(q->cq.head); 442 nvme_wake_free_req_locked(q); 443 } 444 445 qemu_bh_cancel(q->completion_bh); 446 447 return progress; 448 } 449 450 static void nvme_process_completion_bh(void *opaque) 451 { 452 NVMeQueuePair *q = opaque; 453 454 /* 455 * We're being invoked because a nvme_process_completion() cb() function 456 * called aio_poll(). The callback may be waiting for further completions 457 * so notify the device that it has space to fill in more completions now. 458 */ 459 smp_mb_release(); 460 *q->cq.doorbell = cpu_to_le32(q->cq.head); 461 nvme_wake_free_req_locked(q); 462 463 nvme_process_completion(q); 464 } 465 466 static void nvme_trace_command(const NvmeCmd *cmd) 467 { 468 int i; 469 470 if (!trace_event_get_state_backends(TRACE_NVME_SUBMIT_COMMAND_RAW)) { 471 return; 472 } 473 for (i = 0; i < 8; ++i) { 474 uint8_t *cmdp = (uint8_t *)cmd + i * 8; 475 trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3], 476 cmdp[4], cmdp[5], cmdp[6], cmdp[7]); 477 } 478 } 479 480 static void nvme_deferred_fn(void *opaque) 481 { 482 NVMeQueuePair *q = opaque; 483 484 QEMU_LOCK_GUARD(&q->lock); 485 nvme_kick(q); 486 nvme_process_completion(q); 487 } 488 489 static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req, 490 NvmeCmd *cmd, BlockCompletionFunc cb, 491 void *opaque) 492 { 493 assert(!req->cb); 494 req->cb = cb; 495 req->opaque = opaque; 496 cmd->cid = cpu_to_le16(req->cid); 497 498 trace_nvme_submit_command(q->s, q->index, req->cid); 499 nvme_trace_command(cmd); 500 qemu_mutex_lock(&q->lock); 501 memcpy((uint8_t *)q->sq.queue + 502 q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd)); 503 q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE; 504 q->need_kick++; 505 qemu_mutex_unlock(&q->lock); 506 507 defer_call(nvme_deferred_fn, q); 508 } 509 510 static void nvme_admin_cmd_sync_cb(void *opaque, int ret) 511 { 512 int *pret = opaque; 513 *pret = ret; 514 aio_wait_kick(); 515 } 516 517 static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd) 518 { 519 BDRVNVMeState *s = bs->opaque; 520 NVMeQueuePair *q = s->queues[INDEX_ADMIN]; 521 AioContext *aio_context = bdrv_get_aio_context(bs); 522 NVMeRequest *req; 523 int ret = -EINPROGRESS; 524 req = nvme_get_free_req_nowait(q); 525 if (!req) { 526 return -EBUSY; 527 } 528 nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret); 529 530 AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS); 531 return ret; 532 } 533 534 /* Returns true on success, false on failure. */ 535 static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp) 536 { 537 BDRVNVMeState *s = bs->opaque; 538 bool ret = false; 539 QEMU_AUTO_VFREE union { 540 NvmeIdCtrl ctrl; 541 NvmeIdNs ns; 542 } *id = NULL; 543 NvmeLBAF *lbaf; 544 uint16_t oncs; 545 int r; 546 uint64_t iova; 547 NvmeCmd cmd = { 548 .opcode = NVME_ADM_CMD_IDENTIFY, 549 .cdw10 = cpu_to_le32(0x1), 550 }; 551 size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size()); 552 553 id = qemu_try_memalign(qemu_real_host_page_size(), id_size); 554 if (!id) { 555 error_setg(errp, "Cannot allocate buffer for identify response"); 556 goto out; 557 } 558 r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova, errp); 559 if (r) { 560 error_prepend(errp, "Cannot map buffer for DMA: "); 561 goto out; 562 } 563 564 memset(id, 0, id_size); 565 cmd.dptr.prp1 = cpu_to_le64(iova); 566 if (nvme_admin_cmd_sync(bs, &cmd)) { 567 error_setg(errp, "Failed to identify controller"); 568 goto out; 569 } 570 571 if (le32_to_cpu(id->ctrl.nn) < namespace) { 572 error_setg(errp, "Invalid namespace"); 573 goto out; 574 } 575 s->write_cache_supported = le32_to_cpu(id->ctrl.vwc) & 0x1; 576 s->max_transfer = (id->ctrl.mdts ? 1 << id->ctrl.mdts : 0) * s->page_size; 577 /* For now the page list buffer per command is one page, to hold at most 578 * s->page_size / sizeof(uint64_t) entries. */ 579 s->max_transfer = MIN_NON_ZERO(s->max_transfer, 580 s->page_size / sizeof(uint64_t) * s->page_size); 581 582 oncs = le16_to_cpu(id->ctrl.oncs); 583 s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES); 584 s->supports_discard = !!(oncs & NVME_ONCS_DSM); 585 586 memset(id, 0, id_size); 587 cmd.cdw10 = 0; 588 cmd.nsid = cpu_to_le32(namespace); 589 if (nvme_admin_cmd_sync(bs, &cmd)) { 590 error_setg(errp, "Failed to identify namespace"); 591 goto out; 592 } 593 594 s->nsze = le64_to_cpu(id->ns.nsze); 595 lbaf = &id->ns.lbaf[NVME_ID_NS_FLBAS_INDEX(id->ns.flbas)]; 596 597 if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(id->ns.dlfeat) && 598 NVME_ID_NS_DLFEAT_READ_BEHAVIOR(id->ns.dlfeat) == 599 NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) { 600 bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP; 601 } 602 603 if (lbaf->ms) { 604 error_setg(errp, "Namespaces with metadata are not yet supported"); 605 goto out; 606 } 607 608 if (lbaf->ds < BDRV_SECTOR_BITS || lbaf->ds > 12 || 609 (1 << lbaf->ds) > s->page_size) 610 { 611 error_setg(errp, "Namespace has unsupported block size (2^%d)", 612 lbaf->ds); 613 goto out; 614 } 615 616 ret = true; 617 s->blkshift = lbaf->ds; 618 out: 619 qemu_vfio_dma_unmap(s->vfio, id); 620 621 return ret; 622 } 623 624 static void nvme_poll_queue(NVMeQueuePair *q) 625 { 626 const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES; 627 NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset]; 628 629 trace_nvme_poll_queue(q->s, q->index); 630 /* 631 * Do an early check for completions. q->lock isn't needed because 632 * nvme_process_completion() only runs in the event loop thread and 633 * cannot race with itself. 634 */ 635 if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) { 636 return; 637 } 638 639 qemu_mutex_lock(&q->lock); 640 while (nvme_process_completion(q)) { 641 /* Keep polling */ 642 } 643 qemu_mutex_unlock(&q->lock); 644 } 645 646 static void nvme_poll_queues(BDRVNVMeState *s) 647 { 648 int i; 649 650 for (i = 0; i < s->queue_count; i++) { 651 nvme_poll_queue(s->queues[i]); 652 } 653 } 654 655 static void nvme_handle_event(EventNotifier *n) 656 { 657 BDRVNVMeState *s = container_of(n, BDRVNVMeState, 658 irq_notifier[MSIX_SHARED_IRQ_IDX]); 659 660 trace_nvme_handle_event(s); 661 event_notifier_test_and_clear(n); 662 nvme_poll_queues(s); 663 } 664 665 static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) 666 { 667 BDRVNVMeState *s = bs->opaque; 668 unsigned n = s->queue_count; 669 NVMeQueuePair *q; 670 NvmeCmd cmd; 671 unsigned queue_size = NVME_QUEUE_SIZE; 672 673 assert(n <= UINT16_MAX); 674 q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs), 675 n, queue_size, errp); 676 if (!q) { 677 return false; 678 } 679 cmd = (NvmeCmd) { 680 .opcode = NVME_ADM_CMD_CREATE_CQ, 681 .dptr.prp1 = cpu_to_le64(q->cq.iova), 682 .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n), 683 .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC), 684 }; 685 if (nvme_admin_cmd_sync(bs, &cmd)) { 686 error_setg(errp, "Failed to create CQ io queue [%u]", n); 687 goto out_error; 688 } 689 cmd = (NvmeCmd) { 690 .opcode = NVME_ADM_CMD_CREATE_SQ, 691 .dptr.prp1 = cpu_to_le64(q->sq.iova), 692 .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n), 693 .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)), 694 }; 695 if (nvme_admin_cmd_sync(bs, &cmd)) { 696 error_setg(errp, "Failed to create SQ io queue [%u]", n); 697 goto out_error; 698 } 699 s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1); 700 s->queues[n] = q; 701 s->queue_count++; 702 return true; 703 out_error: 704 nvme_free_queue_pair(q); 705 return false; 706 } 707 708 static bool nvme_poll_cb(void *opaque) 709 { 710 EventNotifier *e = opaque; 711 BDRVNVMeState *s = container_of(e, BDRVNVMeState, 712 irq_notifier[MSIX_SHARED_IRQ_IDX]); 713 int i; 714 715 for (i = 0; i < s->queue_count; i++) { 716 NVMeQueuePair *q = s->queues[i]; 717 const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES; 718 NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset]; 719 720 /* 721 * q->lock isn't needed because nvme_process_completion() only runs in 722 * the event loop thread and cannot race with itself. 723 */ 724 if ((le16_to_cpu(cqe->status) & 0x1) != q->cq_phase) { 725 return true; 726 } 727 } 728 return false; 729 } 730 731 static void nvme_poll_ready(EventNotifier *e) 732 { 733 BDRVNVMeState *s = container_of(e, BDRVNVMeState, 734 irq_notifier[MSIX_SHARED_IRQ_IDX]); 735 736 nvme_poll_queues(s); 737 } 738 739 static int nvme_init(BlockDriverState *bs, const char *device, int namespace, 740 Error **errp) 741 { 742 BDRVNVMeState *s = bs->opaque; 743 NVMeQueuePair *q; 744 AioContext *aio_context = bdrv_get_aio_context(bs); 745 int ret; 746 uint64_t cap; 747 uint32_t ver; 748 uint64_t timeout_ms; 749 uint64_t deadline, now; 750 volatile NvmeBar *regs = NULL; 751 752 qemu_co_mutex_init(&s->dma_map_lock); 753 qemu_co_queue_init(&s->dma_flush_queue); 754 s->device = g_strdup(device); 755 s->nsid = namespace; 756 s->aio_context = bdrv_get_aio_context(bs); 757 ret = event_notifier_init(&s->irq_notifier[MSIX_SHARED_IRQ_IDX], 0); 758 if (ret) { 759 error_setg(errp, "Failed to init event notifier"); 760 return ret; 761 } 762 763 s->vfio = qemu_vfio_open_pci(device, errp); 764 if (!s->vfio) { 765 ret = -EINVAL; 766 goto out; 767 } 768 769 regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar), 770 PROT_READ | PROT_WRITE, errp); 771 if (!regs) { 772 ret = -EINVAL; 773 goto out; 774 } 775 /* Perform initialize sequence as described in NVMe spec "7.6.1 776 * Initialization". */ 777 778 cap = le64_to_cpu(regs->cap); 779 trace_nvme_controller_capability_raw(cap); 780 trace_nvme_controller_capability("Maximum Queue Entries Supported", 781 1 + NVME_CAP_MQES(cap)); 782 trace_nvme_controller_capability("Contiguous Queues Required", 783 NVME_CAP_CQR(cap)); 784 trace_nvme_controller_capability("Doorbell Stride", 785 1 << (2 + NVME_CAP_DSTRD(cap))); 786 trace_nvme_controller_capability("Subsystem Reset Supported", 787 NVME_CAP_NSSRS(cap)); 788 trace_nvme_controller_capability("Memory Page Size Minimum", 789 1 << (12 + NVME_CAP_MPSMIN(cap))); 790 trace_nvme_controller_capability("Memory Page Size Maximum", 791 1 << (12 + NVME_CAP_MPSMAX(cap))); 792 if (!NVME_CAP_CSS(cap)) { 793 error_setg(errp, "Device doesn't support NVMe command set"); 794 ret = -EINVAL; 795 goto out; 796 } 797 798 s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap)); 799 s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t); 800 bs->bl.opt_mem_alignment = s->page_size; 801 bs->bl.request_alignment = s->page_size; 802 timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000); 803 804 ver = le32_to_cpu(regs->vs); 805 trace_nvme_controller_spec_version(extract32(ver, 16, 16), 806 extract32(ver, 8, 8), 807 extract32(ver, 0, 8)); 808 809 /* Reset device to get a clean state. */ 810 regs->cc = cpu_to_le32(le32_to_cpu(regs->cc) & 0xFE); 811 /* Wait for CSTS.RDY = 0. */ 812 deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS; 813 while (NVME_CSTS_RDY(le32_to_cpu(regs->csts))) { 814 if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) { 815 error_setg(errp, "Timeout while waiting for device to reset (%" 816 PRId64 " ms)", 817 timeout_ms); 818 ret = -ETIMEDOUT; 819 goto out; 820 } 821 } 822 823 s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0, 824 sizeof(NvmeBar) + NVME_DOORBELL_SIZE, 825 PROT_WRITE, errp); 826 s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar)); 827 if (!s->doorbells) { 828 ret = -EINVAL; 829 goto out; 830 } 831 832 /* Set up admin queue. */ 833 s->queues = g_new(NVMeQueuePair *, 1); 834 q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp); 835 if (!q) { 836 ret = -EINVAL; 837 goto out; 838 } 839 s->queues[INDEX_ADMIN] = q; 840 s->queue_count = 1; 841 QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000); 842 regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) | 843 ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT)); 844 regs->asq = cpu_to_le64(q->sq.iova); 845 regs->acq = cpu_to_le64(q->cq.iova); 846 847 /* After setting up all control registers we can enable device now. */ 848 regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) | 849 (ctz32(NVME_SQ_ENTRY_BYTES) << CC_IOSQES_SHIFT) | 850 CC_EN_MASK); 851 /* Wait for CSTS.RDY = 1. */ 852 now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 853 deadline = now + timeout_ms * SCALE_MS; 854 while (!NVME_CSTS_RDY(le32_to_cpu(regs->csts))) { 855 if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) { 856 error_setg(errp, "Timeout while waiting for device to start (%" 857 PRId64 " ms)", 858 timeout_ms); 859 ret = -ETIMEDOUT; 860 goto out; 861 } 862 } 863 864 ret = qemu_vfio_pci_init_irq(s->vfio, s->irq_notifier, 865 VFIO_PCI_MSIX_IRQ_INDEX, errp); 866 if (ret) { 867 goto out; 868 } 869 aio_set_event_notifier(bdrv_get_aio_context(bs), 870 &s->irq_notifier[MSIX_SHARED_IRQ_IDX], 871 nvme_handle_event, nvme_poll_cb, 872 nvme_poll_ready); 873 874 if (!nvme_identify(bs, namespace, errp)) { 875 ret = -EIO; 876 goto out; 877 } 878 879 /* Set up command queues. */ 880 if (!nvme_add_io_queue(bs, errp)) { 881 ret = -EIO; 882 } 883 out: 884 if (regs) { 885 qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)regs, 0, sizeof(NvmeBar)); 886 } 887 888 /* Cleaning up is done in nvme_file_open() upon error. */ 889 return ret; 890 } 891 892 /* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example: 893 * 894 * nvme://0000:44:00.0/1 895 * 896 * where the "nvme://" is a fixed form of the protocol prefix, the middle part 897 * is the PCI address, and the last part is the namespace number starting from 898 * 1 according to the NVMe spec. */ 899 static void nvme_parse_filename(const char *filename, QDict *options, 900 Error **errp) 901 { 902 int pref = strlen("nvme://"); 903 904 if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) { 905 const char *tmp = filename + pref; 906 char *device; 907 const char *namespace; 908 unsigned long ns; 909 const char *slash = strchr(tmp, '/'); 910 if (!slash) { 911 qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp); 912 return; 913 } 914 device = g_strndup(tmp, slash - tmp); 915 qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device); 916 g_free(device); 917 namespace = slash + 1; 918 if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) { 919 error_setg(errp, "Invalid namespace '%s', positive number expected", 920 namespace); 921 return; 922 } 923 qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE, 924 *namespace ? namespace : "1"); 925 } 926 } 927 928 static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable, 929 Error **errp) 930 { 931 int ret; 932 BDRVNVMeState *s = bs->opaque; 933 NvmeCmd cmd = { 934 .opcode = NVME_ADM_CMD_SET_FEATURES, 935 .nsid = cpu_to_le32(s->nsid), 936 .cdw10 = cpu_to_le32(0x06), 937 .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00), 938 }; 939 940 ret = nvme_admin_cmd_sync(bs, &cmd); 941 if (ret) { 942 error_setg(errp, "Failed to configure NVMe write cache"); 943 } 944 return ret; 945 } 946 947 static void nvme_close(BlockDriverState *bs) 948 { 949 BDRVNVMeState *s = bs->opaque; 950 951 for (unsigned i = 0; i < s->queue_count; ++i) { 952 nvme_free_queue_pair(s->queues[i]); 953 } 954 g_free(s->queues); 955 aio_set_event_notifier(bdrv_get_aio_context(bs), 956 &s->irq_notifier[MSIX_SHARED_IRQ_IDX], 957 NULL, NULL, NULL); 958 event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]); 959 qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map, 960 0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE); 961 qemu_vfio_close(s->vfio); 962 963 g_free(s->device); 964 } 965 966 static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags, 967 Error **errp) 968 { 969 const char *device; 970 QemuOpts *opts; 971 int namespace; 972 int ret; 973 BDRVNVMeState *s = bs->opaque; 974 975 bs->supported_write_flags = BDRV_REQ_FUA; 976 977 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); 978 qemu_opts_absorb_qdict(opts, options, &error_abort); 979 device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE); 980 if (!device) { 981 error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required"); 982 qemu_opts_del(opts); 983 return -EINVAL; 984 } 985 986 namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1); 987 ret = nvme_init(bs, device, namespace, errp); 988 qemu_opts_del(opts); 989 if (ret) { 990 goto fail; 991 } 992 if (flags & BDRV_O_NOCACHE) { 993 if (!s->write_cache_supported) { 994 error_setg(errp, 995 "NVMe controller doesn't support write cache configuration"); 996 ret = -EINVAL; 997 } else { 998 ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE), 999 errp); 1000 } 1001 if (ret) { 1002 goto fail; 1003 } 1004 } 1005 return 0; 1006 fail: 1007 nvme_close(bs); 1008 return ret; 1009 } 1010 1011 static int64_t coroutine_fn nvme_co_getlength(BlockDriverState *bs) 1012 { 1013 BDRVNVMeState *s = bs->opaque; 1014 return s->nsze << s->blkshift; 1015 } 1016 1017 static uint32_t nvme_get_blocksize(BlockDriverState *bs) 1018 { 1019 BDRVNVMeState *s = bs->opaque; 1020 assert(s->blkshift >= BDRV_SECTOR_BITS && s->blkshift <= 12); 1021 return UINT32_C(1) << s->blkshift; 1022 } 1023 1024 static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) 1025 { 1026 uint32_t blocksize = nvme_get_blocksize(bs); 1027 bsz->phys = blocksize; 1028 bsz->log = blocksize; 1029 return 0; 1030 } 1031 1032 /* Called with s->dma_map_lock */ 1033 static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs, 1034 QEMUIOVector *qiov) 1035 { 1036 int r = 0; 1037 BDRVNVMeState *s = bs->opaque; 1038 1039 s->dma_map_count -= qiov->size; 1040 if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) { 1041 r = qemu_vfio_dma_reset_temporary(s->vfio); 1042 if (!r) { 1043 qemu_co_queue_restart_all(&s->dma_flush_queue); 1044 } 1045 } 1046 return r; 1047 } 1048 1049 /* Called with s->dma_map_lock */ 1050 static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd, 1051 NVMeRequest *req, QEMUIOVector *qiov) 1052 { 1053 BDRVNVMeState *s = bs->opaque; 1054 uint64_t *pagelist = req->prp_list_page; 1055 int i, j, r; 1056 int entries = 0; 1057 Error *local_err = NULL, **errp = NULL; 1058 1059 assert(qiov->size); 1060 assert(QEMU_IS_ALIGNED(qiov->size, s->page_size)); 1061 assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t)); 1062 for (i = 0; i < qiov->niov; ++i) { 1063 bool retry = true; 1064 uint64_t iova; 1065 size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len, 1066 qemu_real_host_page_size()); 1067 try_map: 1068 r = qemu_vfio_dma_map(s->vfio, 1069 qiov->iov[i].iov_base, 1070 len, true, &iova, errp); 1071 if (r == -ENOSPC) { 1072 /* 1073 * In addition to the -ENOMEM error, the VFIO_IOMMU_MAP_DMA 1074 * ioctl returns -ENOSPC to signal the user exhausted the DMA 1075 * mappings available for a container since Linux kernel commit 1076 * 492855939bdb ("vfio/type1: Limit DMA mappings per container", 1077 * April 2019, see CVE-2019-3882). 1078 * 1079 * This block driver already handles this error path by checking 1080 * for the -ENOMEM error, so we directly replace -ENOSPC by 1081 * -ENOMEM. Beside, -ENOSPC has a specific meaning for blockdev 1082 * coroutines: it triggers BLOCKDEV_ON_ERROR_ENOSPC and 1083 * BLOCK_ERROR_ACTION_STOP which stops the VM, asking the operator 1084 * to add more storage to the blockdev. Not something we can do 1085 * easily with an IOMMU :) 1086 */ 1087 r = -ENOMEM; 1088 } 1089 if (r == -ENOMEM && retry) { 1090 /* 1091 * We exhausted the DMA mappings available for our container: 1092 * recycle the volatile IOVA mappings. 1093 */ 1094 retry = false; 1095 trace_nvme_dma_flush_queue_wait(s); 1096 if (s->dma_map_count) { 1097 trace_nvme_dma_map_flush(s); 1098 qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock); 1099 } else { 1100 r = qemu_vfio_dma_reset_temporary(s->vfio); 1101 if (r) { 1102 goto fail; 1103 } 1104 } 1105 errp = &local_err; 1106 1107 goto try_map; 1108 } 1109 if (r) { 1110 goto fail; 1111 } 1112 1113 for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) { 1114 pagelist[entries++] = cpu_to_le64(iova + j * s->page_size); 1115 } 1116 trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base, 1117 qiov->iov[i].iov_len / s->page_size); 1118 } 1119 1120 s->dma_map_count += qiov->size; 1121 1122 assert(entries <= s->page_size / sizeof(uint64_t)); 1123 switch (entries) { 1124 case 0: 1125 abort(); 1126 case 1: 1127 cmd->dptr.prp1 = pagelist[0]; 1128 cmd->dptr.prp2 = 0; 1129 break; 1130 case 2: 1131 cmd->dptr.prp1 = pagelist[0]; 1132 cmd->dptr.prp2 = pagelist[1]; 1133 break; 1134 default: 1135 cmd->dptr.prp1 = pagelist[0]; 1136 cmd->dptr.prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t)); 1137 break; 1138 } 1139 trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries); 1140 for (i = 0; i < entries; ++i) { 1141 trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]); 1142 } 1143 return 0; 1144 fail: 1145 /* No need to unmap [0 - i) iovs even if we've failed, since we don't 1146 * increment s->dma_map_count. This is okay for fixed mapping memory areas 1147 * because they are already mapped before calling this function; for 1148 * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by 1149 * calling qemu_vfio_dma_reset_temporary when necessary. */ 1150 if (local_err) { 1151 error_reportf_err(local_err, "Cannot map buffer for DMA: "); 1152 } 1153 return r; 1154 } 1155 1156 typedef struct { 1157 Coroutine *co; 1158 int ret; 1159 AioContext *ctx; 1160 } NVMeCoData; 1161 1162 static void nvme_rw_cb_bh(void *opaque) 1163 { 1164 NVMeCoData *data = opaque; 1165 qemu_coroutine_enter(data->co); 1166 } 1167 1168 static void nvme_rw_cb(void *opaque, int ret) 1169 { 1170 NVMeCoData *data = opaque; 1171 data->ret = ret; 1172 if (!data->co) { 1173 /* The rw coroutine hasn't yielded, don't try to enter. */ 1174 return; 1175 } 1176 replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data); 1177 } 1178 1179 static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs, 1180 uint64_t offset, uint64_t bytes, 1181 QEMUIOVector *qiov, 1182 bool is_write, 1183 int flags) 1184 { 1185 int r; 1186 BDRVNVMeState *s = bs->opaque; 1187 NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; 1188 NVMeRequest *req; 1189 1190 uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) | 1191 (flags & BDRV_REQ_FUA ? 1 << 30 : 0); 1192 NvmeCmd cmd = { 1193 .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ, 1194 .nsid = cpu_to_le32(s->nsid), 1195 .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF), 1196 .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF), 1197 .cdw12 = cpu_to_le32(cdw12), 1198 }; 1199 NVMeCoData data = { 1200 .ctx = bdrv_get_aio_context(bs), 1201 .ret = -EINPROGRESS, 1202 }; 1203 1204 trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov); 1205 assert(s->queue_count > 1); 1206 req = nvme_get_free_req(ioq); 1207 assert(req); 1208 1209 qemu_co_mutex_lock(&s->dma_map_lock); 1210 r = nvme_cmd_map_qiov(bs, &cmd, req, qiov); 1211 qemu_co_mutex_unlock(&s->dma_map_lock); 1212 if (r) { 1213 nvme_put_free_req_and_wake(ioq, req); 1214 return r; 1215 } 1216 nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); 1217 1218 data.co = qemu_coroutine_self(); 1219 while (data.ret == -EINPROGRESS) { 1220 qemu_coroutine_yield(); 1221 } 1222 1223 qemu_co_mutex_lock(&s->dma_map_lock); 1224 r = nvme_cmd_unmap_qiov(bs, qiov); 1225 qemu_co_mutex_unlock(&s->dma_map_lock); 1226 if (r) { 1227 return r; 1228 } 1229 1230 trace_nvme_rw_done(s, is_write, offset, bytes, data.ret); 1231 return data.ret; 1232 } 1233 1234 static inline bool nvme_qiov_aligned(BlockDriverState *bs, 1235 const QEMUIOVector *qiov) 1236 { 1237 int i; 1238 BDRVNVMeState *s = bs->opaque; 1239 1240 for (i = 0; i < qiov->niov; ++i) { 1241 if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, 1242 qemu_real_host_page_size()) || 1243 !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size())) { 1244 trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base, 1245 qiov->iov[i].iov_len, s->page_size); 1246 return false; 1247 } 1248 } 1249 return true; 1250 } 1251 1252 static coroutine_fn int nvme_co_prw(BlockDriverState *bs, 1253 uint64_t offset, uint64_t bytes, 1254 QEMUIOVector *qiov, bool is_write, 1255 int flags) 1256 { 1257 BDRVNVMeState *s = bs->opaque; 1258 int r; 1259 QEMU_AUTO_VFREE uint8_t *buf = NULL; 1260 QEMUIOVector local_qiov; 1261 size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size()); 1262 assert(QEMU_IS_ALIGNED(offset, s->page_size)); 1263 assert(QEMU_IS_ALIGNED(bytes, s->page_size)); 1264 assert(bytes <= s->max_transfer); 1265 if (nvme_qiov_aligned(bs, qiov)) { 1266 s->stats.aligned_accesses++; 1267 return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags); 1268 } 1269 s->stats.unaligned_accesses++; 1270 trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write); 1271 buf = qemu_try_memalign(qemu_real_host_page_size(), len); 1272 1273 if (!buf) { 1274 return -ENOMEM; 1275 } 1276 qemu_iovec_init(&local_qiov, 1); 1277 if (is_write) { 1278 qemu_iovec_to_buf(qiov, 0, buf, bytes); 1279 } 1280 qemu_iovec_add(&local_qiov, buf, bytes); 1281 r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags); 1282 qemu_iovec_destroy(&local_qiov); 1283 if (!r && !is_write) { 1284 qemu_iovec_from_buf(qiov, 0, buf, bytes); 1285 } 1286 return r; 1287 } 1288 1289 static coroutine_fn int nvme_co_preadv(BlockDriverState *bs, 1290 int64_t offset, int64_t bytes, 1291 QEMUIOVector *qiov, 1292 BdrvRequestFlags flags) 1293 { 1294 return nvme_co_prw(bs, offset, bytes, qiov, false, flags); 1295 } 1296 1297 static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs, 1298 int64_t offset, int64_t bytes, 1299 QEMUIOVector *qiov, 1300 BdrvRequestFlags flags) 1301 { 1302 return nvme_co_prw(bs, offset, bytes, qiov, true, flags); 1303 } 1304 1305 static coroutine_fn int nvme_co_flush(BlockDriverState *bs) 1306 { 1307 BDRVNVMeState *s = bs->opaque; 1308 NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; 1309 NVMeRequest *req; 1310 NvmeCmd cmd = { 1311 .opcode = NVME_CMD_FLUSH, 1312 .nsid = cpu_to_le32(s->nsid), 1313 }; 1314 NVMeCoData data = { 1315 .ctx = bdrv_get_aio_context(bs), 1316 .ret = -EINPROGRESS, 1317 }; 1318 1319 assert(s->queue_count > 1); 1320 req = nvme_get_free_req(ioq); 1321 assert(req); 1322 nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); 1323 1324 data.co = qemu_coroutine_self(); 1325 if (data.ret == -EINPROGRESS) { 1326 qemu_coroutine_yield(); 1327 } 1328 1329 return data.ret; 1330 } 1331 1332 1333 static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs, 1334 int64_t offset, 1335 int64_t bytes, 1336 BdrvRequestFlags flags) 1337 { 1338 BDRVNVMeState *s = bs->opaque; 1339 NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; 1340 NVMeRequest *req; 1341 uint32_t cdw12; 1342 1343 if (!s->supports_write_zeroes) { 1344 return -ENOTSUP; 1345 } 1346 1347 if (bytes == 0) { 1348 return 0; 1349 } 1350 1351 cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF; 1352 /* 1353 * We should not lose information. pwrite_zeroes_alignment and 1354 * max_pwrite_zeroes guarantees it. 1355 */ 1356 assert(((cdw12 + 1) << s->blkshift) == bytes); 1357 1358 NvmeCmd cmd = { 1359 .opcode = NVME_CMD_WRITE_ZEROES, 1360 .nsid = cpu_to_le32(s->nsid), 1361 .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF), 1362 .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF), 1363 }; 1364 1365 NVMeCoData data = { 1366 .ctx = bdrv_get_aio_context(bs), 1367 .ret = -EINPROGRESS, 1368 }; 1369 1370 if (flags & BDRV_REQ_MAY_UNMAP) { 1371 cdw12 |= (1 << 25); 1372 } 1373 1374 if (flags & BDRV_REQ_FUA) { 1375 cdw12 |= (1 << 30); 1376 } 1377 1378 cmd.cdw12 = cpu_to_le32(cdw12); 1379 1380 trace_nvme_write_zeroes(s, offset, bytes, flags); 1381 assert(s->queue_count > 1); 1382 req = nvme_get_free_req(ioq); 1383 assert(req); 1384 1385 nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); 1386 1387 data.co = qemu_coroutine_self(); 1388 while (data.ret == -EINPROGRESS) { 1389 qemu_coroutine_yield(); 1390 } 1391 1392 trace_nvme_rw_done(s, true, offset, bytes, data.ret); 1393 return data.ret; 1394 } 1395 1396 1397 static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, 1398 int64_t offset, 1399 int64_t bytes) 1400 { 1401 BDRVNVMeState *s = bs->opaque; 1402 NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; 1403 NVMeRequest *req; 1404 QEMU_AUTO_VFREE NvmeDsmRange *buf = NULL; 1405 QEMUIOVector local_qiov; 1406 int ret; 1407 1408 NvmeCmd cmd = { 1409 .opcode = NVME_CMD_DSM, 1410 .nsid = cpu_to_le32(s->nsid), 1411 .cdw10 = cpu_to_le32(0), /*number of ranges - 0 based*/ 1412 .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/ 1413 }; 1414 1415 NVMeCoData data = { 1416 .ctx = bdrv_get_aio_context(bs), 1417 .ret = -EINPROGRESS, 1418 }; 1419 1420 if (!s->supports_discard) { 1421 return -ENOTSUP; 1422 } 1423 1424 assert(s->queue_count > 1); 1425 1426 /* 1427 * Filling the @buf requires @offset and @bytes to satisfy restrictions 1428 * defined in nvme_refresh_limits(). 1429 */ 1430 assert(QEMU_IS_ALIGNED(bytes, 1UL << s->blkshift)); 1431 assert(QEMU_IS_ALIGNED(offset, 1UL << s->blkshift)); 1432 assert((bytes >> s->blkshift) <= UINT32_MAX); 1433 1434 buf = qemu_try_memalign(s->page_size, s->page_size); 1435 if (!buf) { 1436 return -ENOMEM; 1437 } 1438 memset(buf, 0, s->page_size); 1439 buf->nlb = cpu_to_le32(bytes >> s->blkshift); 1440 buf->slba = cpu_to_le64(offset >> s->blkshift); 1441 buf->cattr = 0; 1442 1443 qemu_iovec_init(&local_qiov, 1); 1444 qemu_iovec_add(&local_qiov, buf, 4096); 1445 1446 req = nvme_get_free_req(ioq); 1447 assert(req); 1448 1449 qemu_co_mutex_lock(&s->dma_map_lock); 1450 ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov); 1451 qemu_co_mutex_unlock(&s->dma_map_lock); 1452 1453 if (ret) { 1454 nvme_put_free_req_and_wake(ioq, req); 1455 goto out; 1456 } 1457 1458 trace_nvme_dsm(s, offset, bytes); 1459 1460 nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); 1461 1462 data.co = qemu_coroutine_self(); 1463 while (data.ret == -EINPROGRESS) { 1464 qemu_coroutine_yield(); 1465 } 1466 1467 qemu_co_mutex_lock(&s->dma_map_lock); 1468 ret = nvme_cmd_unmap_qiov(bs, &local_qiov); 1469 qemu_co_mutex_unlock(&s->dma_map_lock); 1470 1471 if (ret) { 1472 goto out; 1473 } 1474 1475 ret = data.ret; 1476 trace_nvme_dsm_done(s, offset, bytes, ret); 1477 out: 1478 qemu_iovec_destroy(&local_qiov); 1479 return ret; 1480 1481 } 1482 1483 static int coroutine_fn nvme_co_truncate(BlockDriverState *bs, int64_t offset, 1484 bool exact, PreallocMode prealloc, 1485 BdrvRequestFlags flags, Error **errp) 1486 { 1487 int64_t cur_length; 1488 1489 if (prealloc != PREALLOC_MODE_OFF) { 1490 error_setg(errp, "Unsupported preallocation mode '%s'", 1491 PreallocMode_str(prealloc)); 1492 return -ENOTSUP; 1493 } 1494 1495 cur_length = nvme_co_getlength(bs); 1496 if (offset != cur_length && exact) { 1497 error_setg(errp, "Cannot resize NVMe devices"); 1498 return -ENOTSUP; 1499 } else if (offset > cur_length) { 1500 error_setg(errp, "Cannot grow NVMe devices"); 1501 return -EINVAL; 1502 } 1503 1504 return 0; 1505 } 1506 1507 static int nvme_reopen_prepare(BDRVReopenState *reopen_state, 1508 BlockReopenQueue *queue, Error **errp) 1509 { 1510 return 0; 1511 } 1512 1513 static void nvme_refresh_filename(BlockDriverState *bs) 1514 { 1515 BDRVNVMeState *s = bs->opaque; 1516 1517 snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i", 1518 s->device, s->nsid); 1519 } 1520 1521 static void nvme_refresh_limits(BlockDriverState *bs, Error **errp) 1522 { 1523 BDRVNVMeState *s = bs->opaque; 1524 1525 bs->bl.opt_mem_alignment = s->page_size; 1526 bs->bl.request_alignment = s->page_size; 1527 bs->bl.max_transfer = s->max_transfer; 1528 1529 /* 1530 * Look at nvme_co_pwrite_zeroes: after shift and decrement we should get 1531 * at most 0xFFFF 1532 */ 1533 bs->bl.max_pwrite_zeroes = 1ULL << (s->blkshift + 16); 1534 bs->bl.pwrite_zeroes_alignment = MAX(bs->bl.request_alignment, 1535 1UL << s->blkshift); 1536 1537 bs->bl.max_pdiscard = (uint64_t)UINT32_MAX << s->blkshift; 1538 bs->bl.pdiscard_alignment = MAX(bs->bl.request_alignment, 1539 1UL << s->blkshift); 1540 } 1541 1542 static void nvme_detach_aio_context(BlockDriverState *bs) 1543 { 1544 BDRVNVMeState *s = bs->opaque; 1545 1546 for (unsigned i = 0; i < s->queue_count; i++) { 1547 NVMeQueuePair *q = s->queues[i]; 1548 1549 qemu_bh_delete(q->completion_bh); 1550 q->completion_bh = NULL; 1551 } 1552 1553 aio_set_event_notifier(bdrv_get_aio_context(bs), 1554 &s->irq_notifier[MSIX_SHARED_IRQ_IDX], 1555 NULL, NULL, NULL); 1556 } 1557 1558 static void nvme_attach_aio_context(BlockDriverState *bs, 1559 AioContext *new_context) 1560 { 1561 BDRVNVMeState *s = bs->opaque; 1562 1563 s->aio_context = new_context; 1564 aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX], 1565 nvme_handle_event, nvme_poll_cb, 1566 nvme_poll_ready); 1567 1568 for (unsigned i = 0; i < s->queue_count; i++) { 1569 NVMeQueuePair *q = s->queues[i]; 1570 1571 q->completion_bh = 1572 aio_bh_new(new_context, nvme_process_completion_bh, q); 1573 } 1574 } 1575 1576 static bool nvme_register_buf(BlockDriverState *bs, void *host, size_t size, 1577 Error **errp) 1578 { 1579 int ret; 1580 BDRVNVMeState *s = bs->opaque; 1581 1582 /* 1583 * FIXME: we may run out of IOVA addresses after repeated 1584 * bdrv_register_buf/bdrv_unregister_buf, because nvme_vfio_dma_unmap 1585 * doesn't reclaim addresses for fixed mappings. 1586 */ 1587 ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL, errp); 1588 return ret == 0; 1589 } 1590 1591 static void nvme_unregister_buf(BlockDriverState *bs, void *host, size_t size) 1592 { 1593 BDRVNVMeState *s = bs->opaque; 1594 1595 qemu_vfio_dma_unmap(s->vfio, host); 1596 } 1597 1598 static BlockStatsSpecific *nvme_get_specific_stats(BlockDriverState *bs) 1599 { 1600 BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1); 1601 BDRVNVMeState *s = bs->opaque; 1602 1603 stats->driver = BLOCKDEV_DRIVER_NVME; 1604 stats->u.nvme = (BlockStatsSpecificNvme) { 1605 .completion_errors = s->stats.completion_errors, 1606 .aligned_accesses = s->stats.aligned_accesses, 1607 .unaligned_accesses = s->stats.unaligned_accesses, 1608 }; 1609 1610 return stats; 1611 } 1612 1613 static const char *const nvme_strong_runtime_opts[] = { 1614 NVME_BLOCK_OPT_DEVICE, 1615 NVME_BLOCK_OPT_NAMESPACE, 1616 1617 NULL 1618 }; 1619 1620 static BlockDriver bdrv_nvme = { 1621 .format_name = "nvme", 1622 .protocol_name = "nvme", 1623 .instance_size = sizeof(BDRVNVMeState), 1624 1625 .bdrv_co_create_opts = bdrv_co_create_opts_simple, 1626 .create_opts = &bdrv_create_opts_simple, 1627 1628 .bdrv_parse_filename = nvme_parse_filename, 1629 .bdrv_file_open = nvme_file_open, 1630 .bdrv_close = nvme_close, 1631 .bdrv_co_getlength = nvme_co_getlength, 1632 .bdrv_probe_blocksizes = nvme_probe_blocksizes, 1633 .bdrv_co_truncate = nvme_co_truncate, 1634 1635 .bdrv_co_preadv = nvme_co_preadv, 1636 .bdrv_co_pwritev = nvme_co_pwritev, 1637 1638 .bdrv_co_pwrite_zeroes = nvme_co_pwrite_zeroes, 1639 .bdrv_co_pdiscard = nvme_co_pdiscard, 1640 1641 .bdrv_co_flush_to_disk = nvme_co_flush, 1642 .bdrv_reopen_prepare = nvme_reopen_prepare, 1643 1644 .bdrv_refresh_filename = nvme_refresh_filename, 1645 .bdrv_refresh_limits = nvme_refresh_limits, 1646 .strong_runtime_opts = nvme_strong_runtime_opts, 1647 .bdrv_get_specific_stats = nvme_get_specific_stats, 1648 1649 .bdrv_detach_aio_context = nvme_detach_aio_context, 1650 .bdrv_attach_aio_context = nvme_attach_aio_context, 1651 1652 .bdrv_register_buf = nvme_register_buf, 1653 .bdrv_unregister_buf = nvme_unregister_buf, 1654 }; 1655 1656 static void bdrv_nvme_init(void) 1657 { 1658 bdrv_register(&bdrv_nvme); 1659 } 1660 1661 block_init(bdrv_nvme_init); 1662