/*
 * NVMe block driver based on vfio
 *
 * Copyright 2016 - 2018 Red Hat, Inc.
 *
 * Authors:
 *   Fam Zheng <famz@redhat.com>
 *   Paolo Bonzini <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <linux/vfio.h>
#include "qapi/error.h"
#include "qapi/qmp/qdict.h"
#include "qapi/qmp/qstring.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/module.h"
#include "qemu/cutils.h"
#include "qemu/option.h"
#include "qemu/memalign.h"
#include "qemu/vfio-helpers.h"
#include "block/block-io.h"
#include "block/block_int.h"
#include "sysemu/replay.h"
#include "trace.h"

#include "block/nvme.h"

#define NVME_SQ_ENTRY_BYTES 64
#define NVME_CQ_ENTRY_BYTES 16
#define NVME_QUEUE_SIZE 128
#define NVME_DOORBELL_SIZE 4096

/*
 * We have to leave one slot empty as that is the full queue case where
 * head == tail + 1.
 */
#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)

typedef struct BDRVNVMeState BDRVNVMeState;

/* Same index is used for queues and IRQs */
#define INDEX_ADMIN 0
#define INDEX_IO(n) (1 + n)

/* This driver shares a single MSIX IRQ for the admin and I/O queues */
enum {
    MSIX_SHARED_IRQ_IDX = 0,
    MSIX_IRQ_COUNT = 1
};

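/*
 * NVMeQueue describes both submission and completion rings: for a
 * submission queue the driver advances @tail and @doorbell is the SQ tail
 * doorbell, while for a completion queue the driver advances @head and
 * @doorbell is the CQ head doorbell.
 */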
typedef struct {
    int32_t head, tail;
    uint8_t *queue;
    uint64_t iova;
    /* Hardware MMIO register */
    volatile uint32_t *doorbell;
} NVMeQueue;

typedef struct {
    BlockCompletionFunc *cb;
    void *opaque;
    int cid;
    void *prp_list_page;
    uint64_t prp_list_iova;
    int free_req_next; /* q->reqs[] index of next free req */
} NVMeRequest;

typedef struct {
    QemuMutex lock;

    /* Read from I/O code path, initialized under BQL */
    BDRVNVMeState *s;
    int index;

    /* Fields protected by BQL */
    uint8_t *prp_list_pages;

    /* Fields protected by @lock */
    CoQueue free_req_queue;
    NVMeQueue sq, cq;
    int cq_phase;
    int free_req_head;
    NVMeRequest reqs[NVME_NUM_REQS];
    int need_kick;
    int inflight;

    /* Thread-safe, no lock necessary */
    QEMUBH *completion_bh;
} NVMeQueuePair;

struct BDRVNVMeState {
    AioContext *aio_context;
    QEMUVFIOState *vfio;
    void *bar0_wo_map;
    /* Memory mapped registers */
    volatile struct {
        uint32_t sq_tail;
        uint32_t cq_head;
    } *doorbells;
    /* The submission/completion queue pairs.
     * [0]: admin queue.
     * [1..]: io queues.
     */
    NVMeQueuePair **queues;
    unsigned queue_count;
    size_t page_size;
    /* How many uint32_t elements does each doorbell entry take. */
    size_t doorbell_scale;
    bool write_cache_supported;
    EventNotifier irq_notifier[MSIX_IRQ_COUNT];

    uint64_t nsze; /* Namespace size reported by identify command */
    int nsid;      /* The namespace id to read/write data. */
    int blkshift;

    uint64_t max_transfer;
    bool plugged;

    bool supports_write_zeroes;
    bool supports_discard;

    CoMutex dma_map_lock;
    CoQueue dma_flush_queue;

    /* Total size of mapped qiov, accessed under dma_map_lock */
    int dma_map_count;

    /* PCI address (required for nvme_refresh_filename()) */
    char *device;

    struct {
        uint64_t completion_errors;
        uint64_t aligned_accesses;
        uint64_t unaligned_accesses;
    } stats;
};

#define NVME_BLOCK_OPT_DEVICE "device"
#define NVME_BLOCK_OPT_NAMESPACE "namespace"

static void nvme_process_completion_bh(void *opaque);

static QemuOptsList runtime_opts = {
    .name = "nvme",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = NVME_BLOCK_OPT_DEVICE,
            .type = QEMU_OPT_STRING,
            .help = "NVMe PCI device address",
        },
        {
            .name = NVME_BLOCK_OPT_NAMESPACE,
            .type = QEMU_OPT_NUMBER,
            .help = "NVMe namespace",
        },
        { /* end of list */ }
    },
};

/* Returns true on success, false on failure. */
static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
                            unsigned nentries, size_t entry_bytes, Error **errp)
{
    size_t bytes;
    int r;

    bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size());
    q->head = q->tail = 0;
    q->queue = qemu_try_memalign(qemu_real_host_page_size(), bytes);
    if (!q->queue) {
        error_setg(errp, "Cannot allocate queue");
        return false;
    }
    memset(q->queue, 0, bytes);
    r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova, errp);
    if (r) {
        error_prepend(errp, "Cannot map queue: ");
    }
    return r == 0;
}

static void nvme_free_queue(NVMeQueue *q)
{
    qemu_vfree(q->queue);
}

static void nvme_free_queue_pair(NVMeQueuePair *q)
{
    trace_nvme_free_queue_pair(q->index, q, &q->cq, &q->sq);
    if (q->completion_bh) {
        qemu_bh_delete(q->completion_bh);
    }
    nvme_free_queue(&q->sq);
    nvme_free_queue(&q->cq);
    qemu_vfree(q->prp_list_pages);
    qemu_mutex_destroy(&q->lock);
    g_free(q);
}

static void nvme_free_req_queue_cb(void *opaque)
{
    NVMeQueuePair *q = opaque;

    qemu_mutex_lock(&q->lock);
    while (q->free_req_head != -1 &&
           qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
        /* Retry waiting requests */
    }
    qemu_mutex_unlock(&q->lock);
}

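/*
 * The request slots created below form a singly linked freelist:
 * q->free_req_head is the index of the first free NVMeRequest in q->reqs[]
 * (or -1 when none is free) and each request's free_req_next points at the
 * next free slot.  Command identifiers are the slot index plus one, so CID 0
 * is never used.
 */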
static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
                                             AioContext *aio_context,
                                             unsigned idx, size_t size,
                                             Error **errp)
{
    int i, r;
    NVMeQueuePair *q;
    uint64_t prp_list_iova;
    size_t bytes;

    q = g_try_new0(NVMeQueuePair, 1);
    if (!q) {
        error_setg(errp, "Cannot allocate queue pair");
        return NULL;
    }
    trace_nvme_create_queue_pair(idx, q, size, aio_context,
                                 event_notifier_get_fd(s->irq_notifier));
    bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS,
                          qemu_real_host_page_size());
    q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size(), bytes);
    if (!q->prp_list_pages) {
        error_setg(errp, "Cannot allocate PRP page list");
        goto fail;
    }
    memset(q->prp_list_pages, 0, bytes);
    qemu_mutex_init(&q->lock);
    q->s = s;
    q->index = idx;
    qemu_co_queue_init(&q->free_req_queue);
    q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q);
    r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes,
                          false, &prp_list_iova, errp);
    if (r) {
        error_prepend(errp, "Cannot map buffer for DMA: ");
        goto fail;
    }
    q->free_req_head = -1;
    for (i = 0; i < NVME_NUM_REQS; i++) {
        NVMeRequest *req = &q->reqs[i];
        req->cid = i + 1;
        req->free_req_next = q->free_req_head;
        q->free_req_head = i;
        req->prp_list_page = q->prp_list_pages + i * s->page_size;
        req->prp_list_iova = prp_list_iova + i * s->page_size;
    }

    if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) {
        goto fail;
    }
    q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;

    if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) {
        goto fail;
    }
    q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;

    return q;
fail:
    nvme_free_queue_pair(q);
    return NULL;
}

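/*
 * q->need_kick counts submission queue entries added since the last doorbell
 * write; nvme_kick() publishes all of them with one MMIO write and is a
 * no-op while the queue is plugged or nothing is pending.
 */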
/* With q->lock */
static void nvme_kick(NVMeQueuePair *q)
{
    BDRVNVMeState *s = q->s;

    if (s->plugged || !q->need_kick) {
        return;
    }
    trace_nvme_kick(s, q->index);
    assert(!(q->sq.tail & 0xFF00));
    /* Fence the write to submission queue entry before notifying the device. */
    smp_wmb();
    *q->sq.doorbell = cpu_to_le32(q->sq.tail);
    q->inflight += q->need_kick;
    q->need_kick = 0;
}

static NVMeRequest *nvme_get_free_req_nofail_locked(NVMeQueuePair *q)
{
    NVMeRequest *req;

    req = &q->reqs[q->free_req_head];
    q->free_req_head = req->free_req_next;
    req->free_req_next = -1;
    return req;
}

/* Return a free request element if any, otherwise return NULL. */
static NVMeRequest *nvme_get_free_req_nowait(NVMeQueuePair *q)
{
    QEMU_LOCK_GUARD(&q->lock);
    if (q->free_req_head == -1) {
        return NULL;
    }
    return nvme_get_free_req_nofail_locked(q);
}

/*
 * Wait for a free request to become available if necessary, then
 * return it.
 */
static coroutine_fn NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
{
    QEMU_LOCK_GUARD(&q->lock);

    while (q->free_req_head == -1) {
        trace_nvme_free_req_queue_wait(q->s, q->index);
        qemu_co_queue_wait(&q->free_req_queue, &q->lock);
    }

    return nvme_get_free_req_nofail_locked(q);
}

/* With q->lock */
static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
{
    req->free_req_next = q->free_req_head;
    q->free_req_head = req - q->reqs;
}

/* With q->lock */
static void nvme_wake_free_req_locked(NVMeQueuePair *q)
{
    if (!qemu_co_queue_empty(&q->free_req_queue)) {
        replay_bh_schedule_oneshot_event(q->s->aio_context,
                                         nvme_free_req_queue_cb, q);
    }
}

/* Insert a request in the freelist and wake waiters */
static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req)
{
    qemu_mutex_lock(&q->lock);
    nvme_put_free_req_locked(q, req);
    nvme_wake_free_req_locked(q);
    qemu_mutex_unlock(&q->lock);
}

static inline int nvme_translate_error(const NvmeCqe *c)
{
    uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
    if (status) {
        trace_nvme_error(le32_to_cpu(c->result),
                         le16_to_cpu(c->sq_head),
                         le16_to_cpu(c->sq_id),
                         le16_to_cpu(c->cid),
                         status);
    }
    switch (status) {
    case 0:
        return 0;
    case 1:
        return -ENOSYS;
    case 2:
        return -EINVAL;
    default:
        return -EIO;
    }
}

/* With q->lock */
static bool nvme_process_completion(NVMeQueuePair *q)
{
    BDRVNVMeState *s = q->s;
    bool progress = false;
    NVMeRequest *preq;
    NVMeRequest req;
    NvmeCqe *c;

    trace_nvme_process_completion(s, q->index, q->inflight);
    if (s->plugged) {
        trace_nvme_process_completion_queue_plugged(s, q->index);
        return false;
    }

    /*
     * Support re-entrancy when a request cb() function invokes aio_poll().
     * Pending completions must be visible to aio_poll() so that a cb()
     * function can wait for the completion of another request.
     *
     * The aio_poll() loop will execute our BH and we'll resume completion
     * processing there.
     */
    qemu_bh_schedule(q->completion_bh);

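    /*
     * Completion queue entries are consumed by checking the phase tag (bit 0
     * of the CQE status field): the controller inverts it on every pass
     * through the queue, so an entry whose phase bit still equals q->cq_phase
     * has not been posted yet and the scan stops there.
     */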
    assert(q->inflight >= 0);
    while (q->inflight) {
        int ret;
        int16_t cid;

        c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
        if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
            break;
        }
        ret = nvme_translate_error(c);
        if (ret) {
            s->stats.completion_errors++;
        }
        q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
        if (!q->cq.head) {
            q->cq_phase = !q->cq_phase;
        }
        cid = le16_to_cpu(c->cid);
        if (cid == 0 || cid > NVME_QUEUE_SIZE) {
            warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", "
                        "queue size: %u", cid, NVME_QUEUE_SIZE);
            continue;
        }
        trace_nvme_complete_command(s, q->index, cid);
        preq = &q->reqs[cid - 1];
        req = *preq;
        assert(req.cid == cid);
        assert(req.cb);
        nvme_put_free_req_locked(q, preq);
        preq->cb = preq->opaque = NULL;
        q->inflight--;
        qemu_mutex_unlock(&q->lock);
        req.cb(req.opaque, ret);
        qemu_mutex_lock(&q->lock);
        progress = true;
    }
    if (progress) {
        /* Notify the device so it can post more completions. */
        smp_mb_release();
        *q->cq.doorbell = cpu_to_le32(q->cq.head);
        nvme_wake_free_req_locked(q);
    }

    qemu_bh_cancel(q->completion_bh);

    return progress;
}

static void nvme_process_completion_bh(void *opaque)
{
    NVMeQueuePair *q = opaque;

    /*
     * We're being invoked because a nvme_process_completion() cb() function
     * called aio_poll(). The callback may be waiting for further completions
     * so notify the device that it has space to fill in more completions now.
     */
    smp_mb_release();
    *q->cq.doorbell = cpu_to_le32(q->cq.head);
    nvme_wake_free_req_locked(q);

    nvme_process_completion(q);
}

static void nvme_trace_command(const NvmeCmd *cmd)
{
    int i;

    if (!trace_event_get_state_backends(TRACE_NVME_SUBMIT_COMMAND_RAW)) {
        return;
    }
    for (i = 0; i < 8; ++i) {
        uint8_t *cmdp = (uint8_t *)cmd + i * 8;
        trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3],
                                      cmdp[4], cmdp[5], cmdp[6], cmdp[7]);
    }
}

static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
                                NvmeCmd *cmd, BlockCompletionFunc cb,
                                void *opaque)
{
    assert(!req->cb);
    req->cb = cb;
    req->opaque = opaque;
    cmd->cid = cpu_to_le16(req->cid);

    trace_nvme_submit_command(q->s, q->index, req->cid);
    nvme_trace_command(cmd);
    qemu_mutex_lock(&q->lock);
    memcpy((uint8_t *)q->sq.queue +
           q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
    q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
    q->need_kick++;
    nvme_kick(q);
    nvme_process_completion(q);
    qemu_mutex_unlock(&q->lock);
}

static void nvme_admin_cmd_sync_cb(void *opaque, int ret)
{
    int *pret = opaque;
    *pret = ret;
    aio_wait_kick();
}

static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd)
{
    BDRVNVMeState *s = bs->opaque;
    NVMeQueuePair *q = s->queues[INDEX_ADMIN];
    AioContext *aio_context = bdrv_get_aio_context(bs);
    NVMeRequest *req;
    int ret = -EINPROGRESS;
    req = nvme_get_free_req_nowait(q);
    if (!req) {
        return -EBUSY;
    }
    nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret);

    AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
    return ret;
}

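/*
 * nvme_identify() below issues two Identify admin commands: first CNS 0x1
 * (cdw10 = 1) to fetch the controller data structure, then CNS 0x0 with
 * cmd.nsid set to fetch the namespace data structure for @namespace.
 */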
/* Returns true on success, false on failure. */
static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
{
    BDRVNVMeState *s = bs->opaque;
    bool ret = false;
    QEMU_AUTO_VFREE union {
        NvmeIdCtrl ctrl;
        NvmeIdNs ns;
    } *id = NULL;
    NvmeLBAF *lbaf;
    uint16_t oncs;
    int r;
    uint64_t iova;
    NvmeCmd cmd = {
        .opcode = NVME_ADM_CMD_IDENTIFY,
        .cdw10 = cpu_to_le32(0x1),
    };
    size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size());

    id = qemu_try_memalign(qemu_real_host_page_size(), id_size);
    if (!id) {
        error_setg(errp, "Cannot allocate buffer for identify response");
        goto out;
    }
    r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova, errp);
    if (r) {
        error_prepend(errp, "Cannot map buffer for DMA: ");
        goto out;
    }

    memset(id, 0, id_size);
    cmd.dptr.prp1 = cpu_to_le64(iova);
    if (nvme_admin_cmd_sync(bs, &cmd)) {
        error_setg(errp, "Failed to identify controller");
        goto out;
    }

    if (le32_to_cpu(id->ctrl.nn) < namespace) {
        error_setg(errp, "Invalid namespace");
        goto out;
    }
    s->write_cache_supported = le32_to_cpu(id->ctrl.vwc) & 0x1;
    s->max_transfer = (id->ctrl.mdts ? 1 << id->ctrl.mdts : 0) * s->page_size;
    /* For now the page list buffer per command is one page, to hold at most
     * s->page_size / sizeof(uint64_t) entries. */
    s->max_transfer = MIN_NON_ZERO(s->max_transfer,
                          s->page_size / sizeof(uint64_t) * s->page_size);

    oncs = le16_to_cpu(id->ctrl.oncs);
    s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
    s->supports_discard = !!(oncs & NVME_ONCS_DSM);

    memset(id, 0, id_size);
    cmd.cdw10 = 0;
    cmd.nsid = cpu_to_le32(namespace);
    if (nvme_admin_cmd_sync(bs, &cmd)) {
        error_setg(errp, "Failed to identify namespace");
        goto out;
    }

    s->nsze = le64_to_cpu(id->ns.nsze);
    lbaf = &id->ns.lbaf[NVME_ID_NS_FLBAS_INDEX(id->ns.flbas)];

    if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(id->ns.dlfeat) &&
            NVME_ID_NS_DLFEAT_READ_BEHAVIOR(id->ns.dlfeat) ==
                    NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) {
        bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
    }

    if (lbaf->ms) {
        error_setg(errp, "Namespaces with metadata are not yet supported");
        goto out;
    }

    if (lbaf->ds < BDRV_SECTOR_BITS || lbaf->ds > 12 ||
        (1 << lbaf->ds) > s->page_size)
    {
        error_setg(errp, "Namespace has unsupported block size (2^%d)",
                   lbaf->ds);
        goto out;
    }

    ret = true;
    s->blkshift = lbaf->ds;
out:
    qemu_vfio_dma_unmap(s->vfio, id);

    return ret;
}

static void nvme_poll_queue(NVMeQueuePair *q)
{
    const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
    NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];

    trace_nvme_poll_queue(q->s, q->index);
    /*
     * Do an early check for completions. q->lock isn't needed because
     * nvme_process_completion() only runs in the event loop thread and
     * cannot race with itself.
     */
    if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
        return;
    }

    qemu_mutex_lock(&q->lock);
    while (nvme_process_completion(q)) {
        /* Keep polling */
    }
    qemu_mutex_unlock(&q->lock);
}

static void nvme_poll_queues(BDRVNVMeState *s)
{
    int i;

    for (i = 0; i < s->queue_count; i++) {
        nvme_poll_queue(s->queues[i]);
    }
}

static void nvme_handle_event(EventNotifier *n)
{
    BDRVNVMeState *s = container_of(n, BDRVNVMeState,
                                    irq_notifier[MSIX_SHARED_IRQ_IDX]);

    trace_nvme_handle_event(s);
    event_notifier_test_and_clear(n);
    nvme_poll_queues(s);
}

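/*
 * nvme_add_io_queue() creates one I/O completion/submission queue pair via
 * the admin queue: cdw10 packs the zero-based queue size into bits 31:16 and
 * the queue identifier into bits 15:0; the CQ's cdw11 enables interrupts and
 * requests physically contiguous memory, and the SQ's cdw11 binds the
 * submission queue to the completion queue with the same identifier.
 */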
static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
{
    BDRVNVMeState *s = bs->opaque;
    unsigned n = s->queue_count;
    NVMeQueuePair *q;
    NvmeCmd cmd;
    unsigned queue_size = NVME_QUEUE_SIZE;

    assert(n <= UINT16_MAX);
    q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
                               n, queue_size, errp);
    if (!q) {
        return false;
    }
    cmd = (NvmeCmd) {
        .opcode = NVME_ADM_CMD_CREATE_CQ,
        .dptr.prp1 = cpu_to_le64(q->cq.iova),
        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
        .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC),
    };
    if (nvme_admin_cmd_sync(bs, &cmd)) {
        error_setg(errp, "Failed to create CQ io queue [%u]", n);
        goto out_error;
    }
    cmd = (NvmeCmd) {
        .opcode = NVME_ADM_CMD_CREATE_SQ,
        .dptr.prp1 = cpu_to_le64(q->sq.iova),
        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
        .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)),
    };
    if (nvme_admin_cmd_sync(bs, &cmd)) {
        error_setg(errp, "Failed to create SQ io queue [%u]", n);
        goto out_error;
    }
    s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
    s->queues[n] = q;
    s->queue_count++;
    return true;
out_error:
    nvme_free_queue_pair(q);
    return false;
}

static bool nvme_poll_cb(void *opaque)
{
    EventNotifier *e = opaque;
    BDRVNVMeState *s = container_of(e, BDRVNVMeState,
                                    irq_notifier[MSIX_SHARED_IRQ_IDX]);
    int i;

    for (i = 0; i < s->queue_count; i++) {
        NVMeQueuePair *q = s->queues[i];
        const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
        NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];

        /*
         * q->lock isn't needed because nvme_process_completion() only runs in
         * the event loop thread and cannot race with itself.
         */
        if ((le16_to_cpu(cqe->status) & 0x1) != q->cq_phase) {
            return true;
        }
    }
    return false;
}

static void nvme_poll_ready(EventNotifier *e)
{
    BDRVNVMeState *s = container_of(e, BDRVNVMeState,
                                    irq_notifier[MSIX_SHARED_IRQ_IDX]);

    nvme_poll_queues(s);
}

static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
                     Error **errp)
{
    BDRVNVMeState *s = bs->opaque;
    NVMeQueuePair *q;
    AioContext *aio_context = bdrv_get_aio_context(bs);
    int ret;
    uint64_t cap;
    uint32_t ver;
    uint64_t timeout_ms;
    uint64_t deadline, now;
    volatile NvmeBar *regs = NULL;

    qemu_co_mutex_init(&s->dma_map_lock);
    qemu_co_queue_init(&s->dma_flush_queue);
    s->device = g_strdup(device);
    s->nsid = namespace;
    s->aio_context = bdrv_get_aio_context(bs);
    ret = event_notifier_init(&s->irq_notifier[MSIX_SHARED_IRQ_IDX], 0);
    if (ret) {
        error_setg(errp, "Failed to init event notifier");
        return ret;
    }

    s->vfio = qemu_vfio_open_pci(device, errp);
    if (!s->vfio) {
        ret = -EINVAL;
        goto out;
    }

    regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar),
                                 PROT_READ | PROT_WRITE, errp);
    if (!regs) {
        ret = -EINVAL;
        goto out;
    }
    /* Perform initialization sequence as described in NVMe spec "7.6.1
     * Initialization". */

    cap = le64_to_cpu(regs->cap);
    trace_nvme_controller_capability_raw(cap);
    trace_nvme_controller_capability("Maximum Queue Entries Supported",
                                     1 + NVME_CAP_MQES(cap));
    trace_nvme_controller_capability("Contiguous Queues Required",
                                     NVME_CAP_CQR(cap));
    trace_nvme_controller_capability("Doorbell Stride",
                                     1 << (2 + NVME_CAP_DSTRD(cap)));
    trace_nvme_controller_capability("Subsystem Reset Supported",
                                     NVME_CAP_NSSRS(cap));
    trace_nvme_controller_capability("Memory Page Size Minimum",
                                     1 << (12 + NVME_CAP_MPSMIN(cap)));
    trace_nvme_controller_capability("Memory Page Size Maximum",
                                     1 << (12 + NVME_CAP_MPSMAX(cap)));
    if (!NVME_CAP_CSS(cap)) {
        error_setg(errp, "Device doesn't support NVMe command set");
        ret = -EINVAL;
        goto out;
    }

    s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap));
    s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
    bs->bl.opt_mem_alignment = s->page_size;
    bs->bl.request_alignment = s->page_size;
    timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000);

    ver = le32_to_cpu(regs->vs);
    trace_nvme_controller_spec_version(extract32(ver, 16, 16),
                                       extract32(ver, 8, 8),
                                       extract32(ver, 0, 8));

    /* Reset device to get a clean state. */
    regs->cc = cpu_to_le32(le32_to_cpu(regs->cc) & 0xFE);
    /* Wait for CSTS.RDY = 0. */
    deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS;
    while (NVME_CSTS_RDY(le32_to_cpu(regs->csts))) {
        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
            error_setg(errp, "Timeout while waiting for device to reset (%"
                             PRId64 " ms)",
                       timeout_ms);
            ret = -ETIMEDOUT;
            goto out;
        }
    }

    s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0,
                                           sizeof(NvmeBar) + NVME_DOORBELL_SIZE,
                                           PROT_WRITE, errp);
    s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar));
    if (!s->bar0_wo_map) {
        ret = -EINVAL;
        goto out;
    }

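    /*
     * Doorbell layout: s->doorbells points just past the fixed NvmeBar
     * register block in the write-only BAR0 mapping; queue pair @idx rings
     * its submission and completion doorbells through
     * s->doorbells[idx * doorbell_scale] (see nvme_create_queue_pair()).
     */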
    /* Set up admin queue. */
    s->queues = g_new(NVMeQueuePair *, 1);
    q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp);
    if (!q) {
        ret = -EINVAL;
        goto out;
    }
    s->queues[INDEX_ADMIN] = q;
    s->queue_count = 1;
    QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000);
    regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) |
                            ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT));
    regs->asq = cpu_to_le64(q->sq.iova);
    regs->acq = cpu_to_le64(q->cq.iova);

    /* After setting up all control registers we can enable the device now. */
    regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) |
                           (ctz32(NVME_SQ_ENTRY_BYTES) << CC_IOSQES_SHIFT) |
                           CC_EN_MASK);
    /* Wait for CSTS.RDY = 1. */
    now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    deadline = now + timeout_ms * SCALE_MS;
    while (!NVME_CSTS_RDY(le32_to_cpu(regs->csts))) {
        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
            error_setg(errp, "Timeout while waiting for device to start (%"
                             PRId64 " ms)",
                       timeout_ms);
            ret = -ETIMEDOUT;
            goto out;
        }
    }

    ret = qemu_vfio_pci_init_irq(s->vfio, s->irq_notifier,
                                 VFIO_PCI_MSIX_IRQ_INDEX, errp);
    if (ret) {
        goto out;
    }
    aio_set_event_notifier(bdrv_get_aio_context(bs),
                           &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                           nvme_handle_event, nvme_poll_cb,
                           nvme_poll_ready);

    if (!nvme_identify(bs, namespace, errp)) {
        ret = -EIO;
        goto out;
    }

    /* Set up command queues. */
    if (!nvme_add_io_queue(bs, errp)) {
        ret = -EIO;
    }
out:
    if (regs) {
        qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)regs, 0, sizeof(NvmeBar));
    }

    /* Cleaning up is done in nvme_file_open() upon error. */
    return ret;
}

/* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example:
 *
 * nvme://0000:44:00.0/1
 *
 * where the "nvme://" is a fixed form of the protocol prefix, the middle part
 * is the PCI address, and the last part is the namespace number starting from
 * 1 according to the NVMe spec. */
static void nvme_parse_filename(const char *filename, QDict *options,
                                Error **errp)
{
    int pref = strlen("nvme://");

    if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) {
        const char *tmp = filename + pref;
        char *device;
        const char *namespace;
        unsigned long ns;
        const char *slash = strchr(tmp, '/');
        if (!slash) {
            qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp);
            return;
        }
        device = g_strndup(tmp, slash - tmp);
        qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device);
        g_free(device);
        namespace = slash + 1;
        if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) {
            error_setg(errp, "Invalid namespace '%s', positive number expected",
                       namespace);
            return;
        }
        qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE,
                      *namespace ? namespace : "1");
    }
}

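/*
 * The Set Features command below selects Feature Identifier 0x06 (Volatile
 * Write Cache) in cdw10; bit 0 of cdw11 carries the requested enable state.
 */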
static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
                                           Error **errp)
{
    int ret;
    BDRVNVMeState *s = bs->opaque;
    NvmeCmd cmd = {
        .opcode = NVME_ADM_CMD_SET_FEATURES,
        .nsid = cpu_to_le32(s->nsid),
        .cdw10 = cpu_to_le32(0x06),
        .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
    };

    ret = nvme_admin_cmd_sync(bs, &cmd);
    if (ret) {
        error_setg(errp, "Failed to configure NVMe write cache");
    }
    return ret;
}

static void nvme_close(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;

    for (unsigned i = 0; i < s->queue_count; ++i) {
        nvme_free_queue_pair(s->queues[i]);
    }
    g_free(s->queues);
    aio_set_event_notifier(bdrv_get_aio_context(bs),
                           &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                           NULL, NULL, NULL);
    event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
    qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map,
                            0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE);
    qemu_vfio_close(s->vfio);

    g_free(s->device);
}

static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
                          Error **errp)
{
    const char *device;
    QemuOpts *opts;
    int namespace;
    int ret;
    BDRVNVMeState *s = bs->opaque;

    bs->supported_write_flags = BDRV_REQ_FUA;

    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &error_abort);
    device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
    if (!device) {
        error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required");
        qemu_opts_del(opts);
        return -EINVAL;
    }

    namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1);
    ret = nvme_init(bs, device, namespace, errp);
    qemu_opts_del(opts);
    if (ret) {
        goto fail;
    }
    if (flags & BDRV_O_NOCACHE) {
        if (!s->write_cache_supported) {
            error_setg(errp,
                       "NVMe controller doesn't support write cache configuration");
            ret = -EINVAL;
        } else {
            ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE),
                                                  errp);
        }
        if (ret) {
            goto fail;
        }
    }
    return 0;
fail:
    nvme_close(bs);
    return ret;
}

static int64_t coroutine_fn nvme_co_getlength(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;
    return s->nsze << s->blkshift;
}

static uint32_t nvme_get_blocksize(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;
    assert(s->blkshift >= BDRV_SECTOR_BITS && s->blkshift <= 12);
    return UINT32_C(1) << s->blkshift;
}

static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
{
    uint32_t blocksize = nvme_get_blocksize(bs);
    bsz->phys = blocksize;
    bsz->log = blocksize;
    return 0;
}

/* Called with s->dma_map_lock */
static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs,
                                            QEMUIOVector *qiov)
{
    int r = 0;
    BDRVNVMeState *s = bs->opaque;

    s->dma_map_count -= qiov->size;
    if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) {
        r = qemu_vfio_dma_reset_temporary(s->vfio);
        if (!r) {
            qemu_co_queue_restart_all(&s->dma_flush_queue);
        }
    }
    return r;
}

/* Called with s->dma_map_lock */
static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
                                          NVMeRequest *req, QEMUIOVector *qiov)
{
    BDRVNVMeState *s = bs->opaque;
    uint64_t *pagelist = req->prp_list_page;
    int i, j, r;
    int entries = 0;
    Error *local_err = NULL, **errp = NULL;

    assert(qiov->size);
    assert(QEMU_IS_ALIGNED(qiov->size, s->page_size));
    assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t));
    for (i = 0; i < qiov->niov; ++i) {
        bool retry = true;
        uint64_t iova;
        size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len,
                                   qemu_real_host_page_size());
try_map:
        r = qemu_vfio_dma_map(s->vfio,
                              qiov->iov[i].iov_base,
                              len, true, &iova, errp);
        if (r == -ENOSPC) {
            /*
             * In addition to the -ENOMEM error, the VFIO_IOMMU_MAP_DMA
             * ioctl returns -ENOSPC to signal the user exhausted the DMA
             * mappings available for a container since Linux kernel commit
             * 492855939bdb ("vfio/type1: Limit DMA mappings per container",
             * April 2019, see CVE-2019-3882).
             *
             * This block driver already handles this error path by checking
             * for the -ENOMEM error, so we directly replace -ENOSPC by
             * -ENOMEM. Beside, -ENOSPC has a specific meaning for blockdev
             * coroutines: it triggers BLOCKDEV_ON_ERROR_ENOSPC and
             * BLOCK_ERROR_ACTION_STOP which stops the VM, asking the operator
             * to add more storage to the blockdev. Not something we can do
             * easily with an IOMMU :)
             */
            r = -ENOMEM;
        }
        if (r == -ENOMEM && retry) {
            /*
             * We exhausted the DMA mappings available for our container:
             * recycle the volatile IOVA mappings.
             */
            retry = false;
            trace_nvme_dma_flush_queue_wait(s);
            if (s->dma_map_count) {
                trace_nvme_dma_map_flush(s);
                qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock);
            } else {
                r = qemu_vfio_dma_reset_temporary(s->vfio);
                if (r) {
                    goto fail;
                }
            }
            errp = &local_err;

            goto try_map;
        }
        if (r) {
            goto fail;
        }

        for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) {
            pagelist[entries++] = cpu_to_le64(iova + j * s->page_size);
        }
        trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base,
                                    qiov->iov[i].iov_len / s->page_size);
    }

    s->dma_map_count += qiov->size;

    assert(entries <= s->page_size / sizeof(uint64_t));
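    /*
     * NVMe PRP rules: prp1 always carries the first data page.  With exactly
     * two pages, prp2 carries the second page directly; with more, prp2
     * points at a PRP list holding the remaining entries, which is why it is
     * offset by one uint64_t into the per-request list page (skipping the
     * entry already placed in prp1).
     */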
    switch (entries) {
    case 0:
        abort();
    case 1:
        cmd->dptr.prp1 = pagelist[0];
        cmd->dptr.prp2 = 0;
        break;
    case 2:
        cmd->dptr.prp1 = pagelist[0];
        cmd->dptr.prp2 = pagelist[1];
        break;
    default:
        cmd->dptr.prp1 = pagelist[0];
        cmd->dptr.prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
        break;
    }
    trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
    for (i = 0; i < entries; ++i) {
        trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]);
    }
    return 0;
fail:
    /* No need to unmap [0 - i) iovs even if we've failed, since we don't
     * increment s->dma_map_count. This is okay for fixed mapping memory areas
     * because they are already mapped before calling this function; for
     * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by
     * calling qemu_vfio_dma_reset_temporary when necessary. */
    if (local_err) {
        error_reportf_err(local_err, "Cannot map buffer for DMA: ");
    }
    return r;
}

typedef struct {
    Coroutine *co;
    int ret;
    AioContext *ctx;
} NVMeCoData;

static void nvme_rw_cb_bh(void *opaque)
{
    NVMeCoData *data = opaque;
    qemu_coroutine_enter(data->co);
}

static void nvme_rw_cb(void *opaque, int ret)
{
    NVMeCoData *data = opaque;
    data->ret = ret;
    if (!data->co) {
        /* The rw coroutine hasn't yielded, don't try to enter. */
        return;
    }
    replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
}

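/*
 * NVMe read/write commands encode the starting LBA in cdw10 (low 32 bits)
 * and cdw11 (high 32 bits), the zero-based number of logical blocks in
 * cdw12 bits 15:0, and the Force Unit Access flag in cdw12 bit 30; that is
 * what nvme_co_prw_aligned() assembles below.
 */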
static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov,
                                            bool is_write,
                                            int flags)
{
    int r;
    BDRVNVMeState *s = bs->opaque;
    NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
    NVMeRequest *req;

    uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) |
                     (flags & BDRV_REQ_FUA ? 1 << 30 : 0);
    NvmeCmd cmd = {
        .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ,
        .nsid = cpu_to_le32(s->nsid),
        .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
        .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
        .cdw12 = cpu_to_le32(cdw12),
    };
    NVMeCoData data = {
        .ctx = bdrv_get_aio_context(bs),
        .ret = -EINPROGRESS,
    };

    trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
    assert(s->queue_count > 1);
    req = nvme_get_free_req(ioq);
    assert(req);

    qemu_co_mutex_lock(&s->dma_map_lock);
    r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
    qemu_co_mutex_unlock(&s->dma_map_lock);
    if (r) {
        nvme_put_free_req_and_wake(ioq, req);
        return r;
    }
    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

    data.co = qemu_coroutine_self();
    while (data.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }

    qemu_co_mutex_lock(&s->dma_map_lock);
    r = nvme_cmd_unmap_qiov(bs, qiov);
    qemu_co_mutex_unlock(&s->dma_map_lock);
    if (r) {
        return r;
    }

    trace_nvme_rw_done(s, is_write, offset, bytes, data.ret);
    return data.ret;
}

static inline bool nvme_qiov_aligned(BlockDriverState *bs,
                                     const QEMUIOVector *qiov)
{
    int i;
    BDRVNVMeState *s = bs->opaque;

    for (i = 0; i < qiov->niov; ++i) {
        if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base,
                                 qemu_real_host_page_size()) ||
            !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size())) {
            trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
                                      qiov->iov[i].iov_len, s->page_size);
            return false;
        }
    }
    return true;
}

static coroutine_fn int nvme_co_prw(BlockDriverState *bs,
                                    uint64_t offset, uint64_t bytes,
                                    QEMUIOVector *qiov, bool is_write,
                                    int flags)
{
    BDRVNVMeState *s = bs->opaque;
    int r;
    QEMU_AUTO_VFREE uint8_t *buf = NULL;
    QEMUIOVector local_qiov;
    size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size());
    assert(QEMU_IS_ALIGNED(offset, s->page_size));
    assert(QEMU_IS_ALIGNED(bytes, s->page_size));
    assert(bytes <= s->max_transfer);
    if (nvme_qiov_aligned(bs, qiov)) {
        s->stats.aligned_accesses++;
        return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
    }
    s->stats.unaligned_accesses++;
    trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
    buf = qemu_try_memalign(qemu_real_host_page_size(), len);

    if (!buf) {
        return -ENOMEM;
    }
    qemu_iovec_init(&local_qiov, 1);
    if (is_write) {
        qemu_iovec_to_buf(qiov, 0, buf, bytes);
    }
    qemu_iovec_add(&local_qiov, buf, bytes);
    r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags);
    qemu_iovec_destroy(&local_qiov);
    if (!r && !is_write) {
        qemu_iovec_from_buf(qiov, 0, buf, bytes);
    }
    return r;
}

static coroutine_fn int nvme_co_preadv(BlockDriverState *bs,
                                       int64_t offset, int64_t bytes,
                                       QEMUIOVector *qiov,
                                       BdrvRequestFlags flags)
{
    return nvme_co_prw(bs, offset, bytes, qiov, false, flags);
}

static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs,
                                        int64_t offset, int64_t bytes,
                                        QEMUIOVector *qiov,
                                        BdrvRequestFlags flags)
{
    return nvme_co_prw(bs, offset, bytes, qiov, true, flags);
}

static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;
    NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
    NVMeRequest *req;
    NvmeCmd cmd = {
        .opcode = NVME_CMD_FLUSH,
        .nsid = cpu_to_le32(s->nsid),
    };
    NVMeCoData data = {
        .ctx = bdrv_get_aio_context(bs),
        .ret = -EINPROGRESS,
    };

    assert(s->queue_count > 1);
    req = nvme_get_free_req(ioq);
    assert(req);
    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

    data.co = qemu_coroutine_self();
    if (data.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }

    return data.ret;
}


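/*
 * Write Zeroes shares the LBA/length encoding of the read/write commands;
 * in addition, cdw12 bit 25 asks the controller to deallocate the range
 * (mapped from BDRV_REQ_MAY_UNMAP) and bit 30 again carries FUA.
 */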
static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
                                              int64_t offset,
                                              int64_t bytes,
                                              BdrvRequestFlags flags)
{
    BDRVNVMeState *s = bs->opaque;
    NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
    NVMeRequest *req;
    uint32_t cdw12;

    if (!s->supports_write_zeroes) {
        return -ENOTSUP;
    }

    if (bytes == 0) {
        return 0;
    }

    cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
    /*
     * We should not lose information. pwrite_zeroes_alignment and
     * max_pwrite_zeroes guarantee it.
     */
    assert(((cdw12 + 1) << s->blkshift) == bytes);

    NvmeCmd cmd = {
        .opcode = NVME_CMD_WRITE_ZEROES,
        .nsid = cpu_to_le32(s->nsid),
        .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
        .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
    };

    NVMeCoData data = {
        .ctx = bdrv_get_aio_context(bs),
        .ret = -EINPROGRESS,
    };

    if (flags & BDRV_REQ_MAY_UNMAP) {
        cdw12 |= (1 << 25);
    }

    if (flags & BDRV_REQ_FUA) {
        cdw12 |= (1 << 30);
    }

    cmd.cdw12 = cpu_to_le32(cdw12);

    trace_nvme_write_zeroes(s, offset, bytes, flags);
    assert(s->queue_count > 1);
    req = nvme_get_free_req(ioq);
    assert(req);

    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

    data.co = qemu_coroutine_self();
    while (data.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }

    trace_nvme_rw_done(s, true, offset, bytes, data.ret);
    return data.ret;
}


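/*
 * Discard is implemented with a Dataset Management command carrying a single
 * range descriptor: cdw10 holds the zero-based number of ranges (0 here) and
 * cdw11 bit 2 (AD) requests deallocation; the NvmeDsmRange buffer itself is
 * passed to the controller through the usual PRP mapping.
 */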
static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
                                         int64_t offset,
                                         int64_t bytes)
{
    BDRVNVMeState *s = bs->opaque;
    NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
    NVMeRequest *req;
    QEMU_AUTO_VFREE NvmeDsmRange *buf = NULL;
    QEMUIOVector local_qiov;
    int ret;

    NvmeCmd cmd = {
        .opcode = NVME_CMD_DSM,
        .nsid = cpu_to_le32(s->nsid),
        .cdw10 = cpu_to_le32(0), /*number of ranges - 0 based*/
        .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/
    };

    NVMeCoData data = {
        .ctx = bdrv_get_aio_context(bs),
        .ret = -EINPROGRESS,
    };

    if (!s->supports_discard) {
        return -ENOTSUP;
    }

    assert(s->queue_count > 1);

    /*
     * Filling the @buf requires @offset and @bytes to satisfy restrictions
     * defined in nvme_refresh_limits().
     */
    assert(QEMU_IS_ALIGNED(bytes, 1UL << s->blkshift));
    assert(QEMU_IS_ALIGNED(offset, 1UL << s->blkshift));
    assert((bytes >> s->blkshift) <= UINT32_MAX);

    buf = qemu_try_memalign(s->page_size, s->page_size);
    if (!buf) {
        return -ENOMEM;
    }
    memset(buf, 0, s->page_size);
    buf->nlb = cpu_to_le32(bytes >> s->blkshift);
    buf->slba = cpu_to_le64(offset >> s->blkshift);
    buf->cattr = 0;

    qemu_iovec_init(&local_qiov, 1);
    qemu_iovec_add(&local_qiov, buf, 4096);

    req = nvme_get_free_req(ioq);
    assert(req);

    qemu_co_mutex_lock(&s->dma_map_lock);
    ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
    qemu_co_mutex_unlock(&s->dma_map_lock);

    if (ret) {
        nvme_put_free_req_and_wake(ioq, req);
        goto out;
    }

    trace_nvme_dsm(s, offset, bytes);

    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

    data.co = qemu_coroutine_self();
    while (data.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }

    qemu_co_mutex_lock(&s->dma_map_lock);
    ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
    qemu_co_mutex_unlock(&s->dma_map_lock);

    if (ret) {
        goto out;
    }

    ret = data.ret;
    trace_nvme_dsm_done(s, offset, bytes, ret);
out:
    qemu_iovec_destroy(&local_qiov);
    return ret;

}

static int coroutine_fn nvme_co_truncate(BlockDriverState *bs, int64_t offset,
                                         bool exact, PreallocMode prealloc,
                                         BdrvRequestFlags flags, Error **errp)
{
    int64_t cur_length;

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    cur_length = nvme_co_getlength(bs);
    if (offset != cur_length && exact) {
        error_setg(errp, "Cannot resize NVMe devices");
        return -ENOTSUP;
    } else if (offset > cur_length) {
        error_setg(errp, "Cannot grow NVMe devices");
        return -EINVAL;
    }

    return 0;
}

static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
                               BlockReopenQueue *queue, Error **errp)
{
    return 0;
}

static void nvme_refresh_filename(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;

    snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i",
             s->device, s->nsid);
}

static void nvme_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVNVMeState *s = bs->opaque;

    bs->bl.opt_mem_alignment = s->page_size;
    bs->bl.request_alignment = s->page_size;
    bs->bl.max_transfer = s->max_transfer;

    /*
     * Look at nvme_co_pwrite_zeroes: after shift and decrement we should get
     * at most 0xFFFF
     */
    bs->bl.max_pwrite_zeroes = 1ULL << (s->blkshift + 16);
    bs->bl.pwrite_zeroes_alignment = MAX(bs->bl.request_alignment,
                                         1UL << s->blkshift);

    bs->bl.max_pdiscard = (uint64_t)UINT32_MAX << s->blkshift;
    bs->bl.pdiscard_alignment = MAX(bs->bl.request_alignment,
                                    1UL << s->blkshift);
}

static void nvme_detach_aio_context(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;

    for (unsigned i = 0; i < s->queue_count; i++) {
        NVMeQueuePair *q = s->queues[i];

        qemu_bh_delete(q->completion_bh);
        q->completion_bh = NULL;
    }

    aio_set_event_notifier(bdrv_get_aio_context(bs),
                           &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                           NULL, NULL, NULL);
}

static void nvme_attach_aio_context(BlockDriverState *bs,
                                    AioContext *new_context)
{
    BDRVNVMeState *s = bs->opaque;

    s->aio_context = new_context;
    aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
                           nvme_handle_event, nvme_poll_cb,
                           nvme_poll_ready);

    for (unsigned i = 0; i < s->queue_count; i++) {
        NVMeQueuePair *q = s->queues[i];

        q->completion_bh =
            aio_bh_new(new_context, nvme_process_completion_bh, q);
    }
}

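/*
 * While requests are plugged, nvme_kick() and nvme_process_completion()
 * return early, so commands queued during the plugged section are published
 * with a single doorbell write per queue when nvme_co_io_unplug() runs.
 */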
static void coroutine_fn nvme_co_io_plug(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;
    assert(!s->plugged);
    s->plugged = true;
}

static void coroutine_fn nvme_co_io_unplug(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;
    assert(s->plugged);
    s->plugged = false;
    for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
        NVMeQueuePair *q = s->queues[i];
        qemu_mutex_lock(&q->lock);
        nvme_kick(q);
        nvme_process_completion(q);
        qemu_mutex_unlock(&q->lock);
    }
}

static bool nvme_register_buf(BlockDriverState *bs, void *host, size_t size,
                              Error **errp)
{
    int ret;
    BDRVNVMeState *s = bs->opaque;

    /*
     * FIXME: we may run out of IOVA addresses after repeated
     * bdrv_register_buf/bdrv_unregister_buf, because nvme_vfio_dma_unmap
     * doesn't reclaim addresses for fixed mappings.
     */
    ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL, errp);
    return ret == 0;
}

static void nvme_unregister_buf(BlockDriverState *bs, void *host, size_t size)
{
    BDRVNVMeState *s = bs->opaque;

    qemu_vfio_dma_unmap(s->vfio, host);
}

static BlockStatsSpecific *nvme_get_specific_stats(BlockDriverState *bs)
{
    BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
    BDRVNVMeState *s = bs->opaque;

    stats->driver = BLOCKDEV_DRIVER_NVME;
    stats->u.nvme = (BlockStatsSpecificNvme) {
        .completion_errors = s->stats.completion_errors,
        .aligned_accesses = s->stats.aligned_accesses,
        .unaligned_accesses = s->stats.unaligned_accesses,
    };

    return stats;
}

static const char *const nvme_strong_runtime_opts[] = {
    NVME_BLOCK_OPT_DEVICE,
    NVME_BLOCK_OPT_NAMESPACE,

    NULL
};

static BlockDriver bdrv_nvme = {
    .format_name              = "nvme",
    .protocol_name            = "nvme",
    .instance_size            = sizeof(BDRVNVMeState),

    .bdrv_co_create_opts      = bdrv_co_create_opts_simple,
    .create_opts              = &bdrv_create_opts_simple,

    .bdrv_parse_filename      = nvme_parse_filename,
    .bdrv_file_open           = nvme_file_open,
    .bdrv_close               = nvme_close,
    .bdrv_co_getlength        = nvme_co_getlength,
    .bdrv_probe_blocksizes    = nvme_probe_blocksizes,
    .bdrv_co_truncate         = nvme_co_truncate,

    .bdrv_co_preadv           = nvme_co_preadv,
    .bdrv_co_pwritev          = nvme_co_pwritev,

    .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
    .bdrv_co_pdiscard         = nvme_co_pdiscard,

    .bdrv_co_flush_to_disk    = nvme_co_flush,
    .bdrv_reopen_prepare      = nvme_reopen_prepare,

    .bdrv_refresh_filename    = nvme_refresh_filename,
    .bdrv_refresh_limits      = nvme_refresh_limits,
    .strong_runtime_opts      = nvme_strong_runtime_opts,
    .bdrv_get_specific_stats  = nvme_get_specific_stats,

    .bdrv_detach_aio_context  = nvme_detach_aio_context,
    .bdrv_attach_aio_context  = nvme_attach_aio_context,

    .bdrv_co_io_plug          = nvme_co_io_plug,
    .bdrv_co_io_unplug        = nvme_co_io_unplug,

    .bdrv_register_buf        = nvme_register_buf,
    .bdrv_unregister_buf      = nvme_unregister_buf,
};

static void bdrv_nvme_init(void)
{
    bdrv_register(&bdrv_nvme);
}

block_init(bdrv_nvme_init);