1 /* 2 * NVMe block driver based on vfio 3 * 4 * Copyright 2016 - 2018 Red Hat, Inc. 5 * 6 * Authors: 7 * Fam Zheng <famz@redhat.com> 8 * Paolo Bonzini <pbonzini@redhat.com> 9 * 10 * This work is licensed under the terms of the GNU GPL, version 2 or later. 11 * See the COPYING file in the top-level directory. 12 */ 13 14 #include "qemu/osdep.h" 15 #include <linux/vfio.h> 16 #include "qapi/error.h" 17 #include "qapi/qmp/qdict.h" 18 #include "qapi/qmp/qstring.h" 19 #include "qemu/error-report.h" 20 #include "qemu/main-loop.h" 21 #include "qemu/module.h" 22 #include "qemu/cutils.h" 23 #include "qemu/option.h" 24 #include "qemu/memalign.h" 25 #include "qemu/vfio-helpers.h" 26 #include "block/block_int.h" 27 #include "sysemu/replay.h" 28 #include "trace.h" 29 30 #include "block/nvme.h" 31 32 #define NVME_SQ_ENTRY_BYTES 64 33 #define NVME_CQ_ENTRY_BYTES 16 34 #define NVME_QUEUE_SIZE 128 35 #define NVME_DOORBELL_SIZE 4096 36 37 /* 38 * We have to leave one slot empty as that is the full queue case where 39 * head == tail + 1. 40 */ 41 #define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1) 42 43 typedef struct BDRVNVMeState BDRVNVMeState; 44 45 /* Same index is used for queues and IRQs */ 46 #define INDEX_ADMIN 0 47 #define INDEX_IO(n) (1 + n) 48 49 /* This driver shares a single MSIX IRQ for the admin and I/O queues */ 50 enum { 51 MSIX_SHARED_IRQ_IDX = 0, 52 MSIX_IRQ_COUNT = 1 53 }; 54 55 typedef struct { 56 int32_t head, tail; 57 uint8_t *queue; 58 uint64_t iova; 59 /* Hardware MMIO register */ 60 volatile uint32_t *doorbell; 61 } NVMeQueue; 62 63 typedef struct { 64 BlockCompletionFunc *cb; 65 void *opaque; 66 int cid; 67 void *prp_list_page; 68 uint64_t prp_list_iova; 69 int free_req_next; /* q->reqs[] index of next free req */ 70 } NVMeRequest; 71 72 typedef struct { 73 QemuMutex lock; 74 75 /* Read from I/O code path, initialized under BQL */ 76 BDRVNVMeState *s; 77 int index; 78 79 /* Fields protected by BQL */ 80 uint8_t *prp_list_pages; 81 82 /* Fields protected by @lock */ 83 CoQueue free_req_queue; 84 NVMeQueue sq, cq; 85 int cq_phase; 86 int free_req_head; 87 NVMeRequest reqs[NVME_NUM_REQS]; 88 int need_kick; 89 int inflight; 90 91 /* Thread-safe, no lock necessary */ 92 QEMUBH *completion_bh; 93 } NVMeQueuePair; 94 95 struct BDRVNVMeState { 96 AioContext *aio_context; 97 QEMUVFIOState *vfio; 98 void *bar0_wo_map; 99 /* Memory mapped registers */ 100 volatile struct { 101 uint32_t sq_tail; 102 uint32_t cq_head; 103 } *doorbells; 104 /* The submission/completion queue pairs. 105 * [0]: admin queue. 106 * [1..]: io queues. 107 */ 108 NVMeQueuePair **queues; 109 unsigned queue_count; 110 size_t page_size; 111 /* How many uint32_t elements does each doorbell entry take. */ 112 size_t doorbell_scale; 113 bool write_cache_supported; 114 EventNotifier irq_notifier[MSIX_IRQ_COUNT]; 115 116 uint64_t nsze; /* Namespace size reported by identify command */ 117 int nsid; /* The namespace id to read/write data. 
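* Taken from the 'namespace' runtime option; nvme_file_open() defaults it to 1.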
*/ 118 int blkshift; 119 120 uint64_t max_transfer; 121 bool plugged; 122 123 bool supports_write_zeroes; 124 bool supports_discard; 125 126 CoMutex dma_map_lock; 127 CoQueue dma_flush_queue; 128 129 /* Total size of mapped qiov, accessed under dma_map_lock */ 130 int dma_map_count; 131 132 /* PCI address (required for nvme_refresh_filename()) */ 133 char *device; 134 135 struct { 136 uint64_t completion_errors; 137 uint64_t aligned_accesses; 138 uint64_t unaligned_accesses; 139 } stats; 140 }; 141 142 #define NVME_BLOCK_OPT_DEVICE "device" 143 #define NVME_BLOCK_OPT_NAMESPACE "namespace" 144 145 static void nvme_process_completion_bh(void *opaque); 146 147 static QemuOptsList runtime_opts = { 148 .name = "nvme", 149 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), 150 .desc = { 151 { 152 .name = NVME_BLOCK_OPT_DEVICE, 153 .type = QEMU_OPT_STRING, 154 .help = "NVMe PCI device address", 155 }, 156 { 157 .name = NVME_BLOCK_OPT_NAMESPACE, 158 .type = QEMU_OPT_NUMBER, 159 .help = "NVMe namespace", 160 }, 161 { /* end of list */ } 162 }, 163 }; 164 165 /* Returns true on success, false on failure. */ 166 static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q, 167 unsigned nentries, size_t entry_bytes, Error **errp) 168 { 169 size_t bytes; 170 int r; 171 172 bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size()); 173 q->head = q->tail = 0; 174 q->queue = qemu_try_memalign(qemu_real_host_page_size(), bytes); 175 if (!q->queue) { 176 error_setg(errp, "Cannot allocate queue"); 177 return false; 178 } 179 memset(q->queue, 0, bytes); 180 r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova, errp); 181 if (r) { 182 error_prepend(errp, "Cannot map queue: "); 183 } 184 return r == 0; 185 } 186 187 static void nvme_free_queue(NVMeQueue *q) 188 { 189 qemu_vfree(q->queue); 190 } 191 192 static void nvme_free_queue_pair(NVMeQueuePair *q) 193 { 194 trace_nvme_free_queue_pair(q->index, q, &q->cq, &q->sq); 195 if (q->completion_bh) { 196 qemu_bh_delete(q->completion_bh); 197 } 198 nvme_free_queue(&q->sq); 199 nvme_free_queue(&q->cq); 200 qemu_vfree(q->prp_list_pages); 201 qemu_mutex_destroy(&q->lock); 202 g_free(q); 203 } 204 205 static void nvme_free_req_queue_cb(void *opaque) 206 { 207 NVMeQueuePair *q = opaque; 208 209 qemu_mutex_lock(&q->lock); 210 while (q->free_req_head != -1 && 211 qemu_co_enter_next(&q->free_req_queue, &q->lock)) { 212 /* Retry waiting requests */ 213 } 214 qemu_mutex_unlock(&q->lock); 215 } 216 217 static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s, 218 AioContext *aio_context, 219 unsigned idx, size_t size, 220 Error **errp) 221 { 222 int i, r; 223 NVMeQueuePair *q; 224 uint64_t prp_list_iova; 225 size_t bytes; 226 227 q = g_try_new0(NVMeQueuePair, 1); 228 if (!q) { 229 error_setg(errp, "Cannot allocate queue pair"); 230 return NULL; 231 } 232 trace_nvme_create_queue_pair(idx, q, size, aio_context, 233 event_notifier_get_fd(s->irq_notifier)); 234 bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS, 235 qemu_real_host_page_size()); 236 q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size(), bytes); 237 if (!q->prp_list_pages) { 238 error_setg(errp, "Cannot allocate PRP page list"); 239 goto fail; 240 } 241 memset(q->prp_list_pages, 0, bytes); 242 qemu_mutex_init(&q->lock); 243 q->s = s; 244 q->index = idx; 245 qemu_co_queue_init(&q->free_req_queue); 246 q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q); 247 r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes, 248 false, &prp_list_iova, errp); 249 if (r) { 
250 error_prepend(errp, "Cannot map buffer for DMA: "); 251 goto fail; 252 } 253 q->free_req_head = -1; 254 for (i = 0; i < NVME_NUM_REQS; i++) { 255 NVMeRequest *req = &q->reqs[i]; 256 req->cid = i + 1; 257 req->free_req_next = q->free_req_head; 258 q->free_req_head = i; 259 req->prp_list_page = q->prp_list_pages + i * s->page_size; 260 req->prp_list_iova = prp_list_iova + i * s->page_size; 261 } 262 263 if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) { 264 goto fail; 265 } 266 q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail; 267 268 if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) { 269 goto fail; 270 } 271 q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head; 272 273 return q; 274 fail: 275 nvme_free_queue_pair(q); 276 return NULL; 277 } 278 279 /* With q->lock */ 280 static void nvme_kick(NVMeQueuePair *q) 281 { 282 BDRVNVMeState *s = q->s; 283 284 if (s->plugged || !q->need_kick) { 285 return; 286 } 287 trace_nvme_kick(s, q->index); 288 assert(!(q->sq.tail & 0xFF00)); 289 /* Fence the write to submission queue entry before notifying the device. */ 290 smp_wmb(); 291 *q->sq.doorbell = cpu_to_le32(q->sq.tail); 292 q->inflight += q->need_kick; 293 q->need_kick = 0; 294 } 295 296 static NVMeRequest *nvme_get_free_req_nofail_locked(NVMeQueuePair *q) 297 { 298 NVMeRequest *req; 299 300 req = &q->reqs[q->free_req_head]; 301 q->free_req_head = req->free_req_next; 302 req->free_req_next = -1; 303 return req; 304 } 305 306 /* Return a free request element if any, otherwise return NULL. */ 307 static NVMeRequest *nvme_get_free_req_nowait(NVMeQueuePair *q) 308 { 309 QEMU_LOCK_GUARD(&q->lock); 310 if (q->free_req_head == -1) { 311 return NULL; 312 } 313 return nvme_get_free_req_nofail_locked(q); 314 } 315 316 /* 317 * Wait for a free request to become available if necessary, then 318 * return it. 
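* Called from coroutine context; waiters are woken via nvme_wake_free_req_locked()
* once a request is returned to the freelist.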
319 */ 320 static coroutine_fn NVMeRequest *nvme_get_free_req(NVMeQueuePair *q) 321 { 322 QEMU_LOCK_GUARD(&q->lock); 323 324 while (q->free_req_head == -1) { 325 trace_nvme_free_req_queue_wait(q->s, q->index); 326 qemu_co_queue_wait(&q->free_req_queue, &q->lock); 327 } 328 329 return nvme_get_free_req_nofail_locked(q); 330 } 331 332 /* With q->lock */ 333 static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req) 334 { 335 req->free_req_next = q->free_req_head; 336 q->free_req_head = req - q->reqs; 337 } 338 339 /* With q->lock */ 340 static void nvme_wake_free_req_locked(NVMeQueuePair *q) 341 { 342 if (!qemu_co_queue_empty(&q->free_req_queue)) { 343 replay_bh_schedule_oneshot_event(q->s->aio_context, 344 nvme_free_req_queue_cb, q); 345 } 346 } 347 348 /* Insert a request in the freelist and wake waiters */ 349 static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req) 350 { 351 qemu_mutex_lock(&q->lock); 352 nvme_put_free_req_locked(q, req); 353 nvme_wake_free_req_locked(q); 354 qemu_mutex_unlock(&q->lock); 355 } 356 357 static inline int nvme_translate_error(const NvmeCqe *c) 358 { 359 uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF; 360 if (status) { 361 trace_nvme_error(le32_to_cpu(c->result), 362 le16_to_cpu(c->sq_head), 363 le16_to_cpu(c->sq_id), 364 le16_to_cpu(c->cid), 365 le16_to_cpu(status)); 366 } 367 switch (status) { 368 case 0: 369 return 0; 370 case 1: 371 return -ENOSYS; 372 case 2: 373 return -EINVAL; 374 default: 375 return -EIO; 376 } 377 } 378 379 /* With q->lock */ 380 static bool nvme_process_completion(NVMeQueuePair *q) 381 { 382 BDRVNVMeState *s = q->s; 383 bool progress = false; 384 NVMeRequest *preq; 385 NVMeRequest req; 386 NvmeCqe *c; 387 388 trace_nvme_process_completion(s, q->index, q->inflight); 389 if (s->plugged) { 390 trace_nvme_process_completion_queue_plugged(s, q->index); 391 return false; 392 } 393 394 /* 395 * Support re-entrancy when a request cb() function invokes aio_poll(). 396 * Pending completions must be visible to aio_poll() so that a cb() 397 * function can wait for the completion of another request. 398 * 399 * The aio_poll() loop will execute our BH and we'll resume completion 400 * processing there. 401 */ 402 qemu_bh_schedule(q->completion_bh); 403 404 assert(q->inflight >= 0); 405 while (q->inflight) { 406 int ret; 407 int16_t cid; 408 409 c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES]; 410 if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) { 411 break; 412 } 413 ret = nvme_translate_error(c); 414 if (ret) { 415 s->stats.completion_errors++; 416 } 417 q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE; 418 if (!q->cq.head) { 419 q->cq_phase = !q->cq_phase; 420 } 421 cid = le16_to_cpu(c->cid); 422 if (cid == 0 || cid > NVME_QUEUE_SIZE) { 423 warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", " 424 "queue size: %u", cid, NVME_QUEUE_SIZE); 425 continue; 426 } 427 trace_nvme_complete_command(s, q->index, cid); 428 preq = &q->reqs[cid - 1]; 429 req = *preq; 430 assert(req.cid == cid); 431 assert(req.cb); 432 nvme_put_free_req_locked(q, preq); 433 preq->cb = preq->opaque = NULL; 434 q->inflight--; 435 qemu_mutex_unlock(&q->lock); 436 req.cb(req.opaque, ret); 437 qemu_mutex_lock(&q->lock); 438 progress = true; 439 } 440 if (progress) { 441 /* Notify the device so it can post more completions. 
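* Writing the updated CQ head to the doorbell tells the controller which
* completion entries have been consumed and may be overwritten.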
*/ 442 smp_mb_release(); 443 *q->cq.doorbell = cpu_to_le32(q->cq.head); 444 nvme_wake_free_req_locked(q); 445 } 446 447 qemu_bh_cancel(q->completion_bh); 448 449 return progress; 450 } 451 452 static void nvme_process_completion_bh(void *opaque) 453 { 454 NVMeQueuePair *q = opaque; 455 456 /* 457 * We're being invoked because a nvme_process_completion() cb() function 458 * called aio_poll(). The callback may be waiting for further completions 459 * so notify the device that it has space to fill in more completions now. 460 */ 461 smp_mb_release(); 462 *q->cq.doorbell = cpu_to_le32(q->cq.head); 463 nvme_wake_free_req_locked(q); 464 465 nvme_process_completion(q); 466 } 467 468 static void nvme_trace_command(const NvmeCmd *cmd) 469 { 470 int i; 471 472 if (!trace_event_get_state_backends(TRACE_NVME_SUBMIT_COMMAND_RAW)) { 473 return; 474 } 475 for (i = 0; i < 8; ++i) { 476 uint8_t *cmdp = (uint8_t *)cmd + i * 8; 477 trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3], 478 cmdp[4], cmdp[5], cmdp[6], cmdp[7]); 479 } 480 } 481 482 static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req, 483 NvmeCmd *cmd, BlockCompletionFunc cb, 484 void *opaque) 485 { 486 assert(!req->cb); 487 req->cb = cb; 488 req->opaque = opaque; 489 cmd->cid = cpu_to_le16(req->cid); 490 491 trace_nvme_submit_command(q->s, q->index, req->cid); 492 nvme_trace_command(cmd); 493 qemu_mutex_lock(&q->lock); 494 memcpy((uint8_t *)q->sq.queue + 495 q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd)); 496 q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE; 497 q->need_kick++; 498 nvme_kick(q); 499 nvme_process_completion(q); 500 qemu_mutex_unlock(&q->lock); 501 } 502 503 static void nvme_admin_cmd_sync_cb(void *opaque, int ret) 504 { 505 int *pret = opaque; 506 *pret = ret; 507 aio_wait_kick(); 508 } 509 510 static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd) 511 { 512 BDRVNVMeState *s = bs->opaque; 513 NVMeQueuePair *q = s->queues[INDEX_ADMIN]; 514 AioContext *aio_context = bdrv_get_aio_context(bs); 515 NVMeRequest *req; 516 int ret = -EINPROGRESS; 517 req = nvme_get_free_req_nowait(q); 518 if (!req) { 519 return -EBUSY; 520 } 521 nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret); 522 523 AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS); 524 return ret; 525 } 526 527 /* Returns true on success, false on failure. 
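* Issues Identify Controller (CNS 0x1) followed by Identify Namespace, and caches
* the transfer limits and optional features used by the I/O path.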
*/ 528 static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp) 529 { 530 BDRVNVMeState *s = bs->opaque; 531 bool ret = false; 532 QEMU_AUTO_VFREE union { 533 NvmeIdCtrl ctrl; 534 NvmeIdNs ns; 535 } *id = NULL; 536 NvmeLBAF *lbaf; 537 uint16_t oncs; 538 int r; 539 uint64_t iova; 540 NvmeCmd cmd = { 541 .opcode = NVME_ADM_CMD_IDENTIFY, 542 .cdw10 = cpu_to_le32(0x1), 543 }; 544 size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size()); 545 546 id = qemu_try_memalign(qemu_real_host_page_size(), id_size); 547 if (!id) { 548 error_setg(errp, "Cannot allocate buffer for identify response"); 549 goto out; 550 } 551 r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova, errp); 552 if (r) { 553 error_prepend(errp, "Cannot map buffer for DMA: "); 554 goto out; 555 } 556 557 memset(id, 0, id_size); 558 cmd.dptr.prp1 = cpu_to_le64(iova); 559 if (nvme_admin_cmd_sync(bs, &cmd)) { 560 error_setg(errp, "Failed to identify controller"); 561 goto out; 562 } 563 564 if (le32_to_cpu(id->ctrl.nn) < namespace) { 565 error_setg(errp, "Invalid namespace"); 566 goto out; 567 } 568 s->write_cache_supported = le32_to_cpu(id->ctrl.vwc) & 0x1; 569 s->max_transfer = (id->ctrl.mdts ? 1 << id->ctrl.mdts : 0) * s->page_size; 570 /* For now the page list buffer per command is one page, to hold at most 571 * s->page_size / sizeof(uint64_t) entries. */ 572 s->max_transfer = MIN_NON_ZERO(s->max_transfer, 573 s->page_size / sizeof(uint64_t) * s->page_size); 574 575 oncs = le16_to_cpu(id->ctrl.oncs); 576 s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES); 577 s->supports_discard = !!(oncs & NVME_ONCS_DSM); 578 579 memset(id, 0, id_size); 580 cmd.cdw10 = 0; 581 cmd.nsid = cpu_to_le32(namespace); 582 if (nvme_admin_cmd_sync(bs, &cmd)) { 583 error_setg(errp, "Failed to identify namespace"); 584 goto out; 585 } 586 587 s->nsze = le64_to_cpu(id->ns.nsze); 588 lbaf = &id->ns.lbaf[NVME_ID_NS_FLBAS_INDEX(id->ns.flbas)]; 589 590 if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(id->ns.dlfeat) && 591 NVME_ID_NS_DLFEAT_READ_BEHAVIOR(id->ns.dlfeat) == 592 NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) { 593 bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP; 594 } 595 596 if (lbaf->ms) { 597 error_setg(errp, "Namespaces with metadata are not yet supported"); 598 goto out; 599 } 600 601 if (lbaf->ds < BDRV_SECTOR_BITS || lbaf->ds > 12 || 602 (1 << lbaf->ds) > s->page_size) 603 { 604 error_setg(errp, "Namespace has unsupported block size (2^%d)", 605 lbaf->ds); 606 goto out; 607 } 608 609 ret = true; 610 s->blkshift = lbaf->ds; 611 out: 612 qemu_vfio_dma_unmap(s->vfio, id); 613 614 return ret; 615 } 616 617 static void nvme_poll_queue(NVMeQueuePair *q) 618 { 619 const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES; 620 NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset]; 621 622 trace_nvme_poll_queue(q->s, q->index); 623 /* 624 * Do an early check for completions. q->lock isn't needed because 625 * nvme_process_completion() only runs in the event loop thread and 626 * cannot race with itself. 
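* A completion is pending only when the phase bit of the CQE at cq.head differs
* from cq_phase, so bail out early if they still match.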
627 */ 628 if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) { 629 return; 630 } 631 632 qemu_mutex_lock(&q->lock); 633 while (nvme_process_completion(q)) { 634 /* Keep polling */ 635 } 636 qemu_mutex_unlock(&q->lock); 637 } 638 639 static void nvme_poll_queues(BDRVNVMeState *s) 640 { 641 int i; 642 643 for (i = 0; i < s->queue_count; i++) { 644 nvme_poll_queue(s->queues[i]); 645 } 646 } 647 648 static void nvme_handle_event(EventNotifier *n) 649 { 650 BDRVNVMeState *s = container_of(n, BDRVNVMeState, 651 irq_notifier[MSIX_SHARED_IRQ_IDX]); 652 653 trace_nvme_handle_event(s); 654 event_notifier_test_and_clear(n); 655 nvme_poll_queues(s); 656 } 657 658 static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) 659 { 660 BDRVNVMeState *s = bs->opaque; 661 unsigned n = s->queue_count; 662 NVMeQueuePair *q; 663 NvmeCmd cmd; 664 unsigned queue_size = NVME_QUEUE_SIZE; 665 666 assert(n <= UINT16_MAX); 667 q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs), 668 n, queue_size, errp); 669 if (!q) { 670 return false; 671 } 672 cmd = (NvmeCmd) { 673 .opcode = NVME_ADM_CMD_CREATE_CQ, 674 .dptr.prp1 = cpu_to_le64(q->cq.iova), 675 .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n), 676 .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC), 677 }; 678 if (nvme_admin_cmd_sync(bs, &cmd)) { 679 error_setg(errp, "Failed to create CQ io queue [%u]", n); 680 goto out_error; 681 } 682 cmd = (NvmeCmd) { 683 .opcode = NVME_ADM_CMD_CREATE_SQ, 684 .dptr.prp1 = cpu_to_le64(q->sq.iova), 685 .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n), 686 .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)), 687 }; 688 if (nvme_admin_cmd_sync(bs, &cmd)) { 689 error_setg(errp, "Failed to create SQ io queue [%u]", n); 690 goto out_error; 691 } 692 s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1); 693 s->queues[n] = q; 694 s->queue_count++; 695 return true; 696 out_error: 697 nvme_free_queue_pair(q); 698 return false; 699 } 700 701 static bool nvme_poll_cb(void *opaque) 702 { 703 EventNotifier *e = opaque; 704 BDRVNVMeState *s = container_of(e, BDRVNVMeState, 705 irq_notifier[MSIX_SHARED_IRQ_IDX]); 706 int i; 707 708 for (i = 0; i < s->queue_count; i++) { 709 NVMeQueuePair *q = s->queues[i]; 710 const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES; 711 NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset]; 712 713 /* 714 * q->lock isn't needed because nvme_process_completion() only runs in 715 * the event loop thread and cannot race with itself. 
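* Returning true signals that at least one queue has a pending completion, so the
* event loop invokes nvme_poll_ready() to process it without waiting for the
* MSI-X event notifier.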
716 */ 717 if ((le16_to_cpu(cqe->status) & 0x1) != q->cq_phase) { 718 return true; 719 } 720 } 721 return false; 722 } 723 724 static void nvme_poll_ready(EventNotifier *e) 725 { 726 BDRVNVMeState *s = container_of(e, BDRVNVMeState, 727 irq_notifier[MSIX_SHARED_IRQ_IDX]); 728 729 nvme_poll_queues(s); 730 } 731 732 static int nvme_init(BlockDriverState *bs, const char *device, int namespace, 733 Error **errp) 734 { 735 BDRVNVMeState *s = bs->opaque; 736 NVMeQueuePair *q; 737 AioContext *aio_context = bdrv_get_aio_context(bs); 738 int ret; 739 uint64_t cap; 740 uint32_t ver; 741 uint64_t timeout_ms; 742 uint64_t deadline, now; 743 volatile NvmeBar *regs = NULL; 744 745 qemu_co_mutex_init(&s->dma_map_lock); 746 qemu_co_queue_init(&s->dma_flush_queue); 747 s->device = g_strdup(device); 748 s->nsid = namespace; 749 s->aio_context = bdrv_get_aio_context(bs); 750 ret = event_notifier_init(&s->irq_notifier[MSIX_SHARED_IRQ_IDX], 0); 751 if (ret) { 752 error_setg(errp, "Failed to init event notifier"); 753 return ret; 754 } 755 756 s->vfio = qemu_vfio_open_pci(device, errp); 757 if (!s->vfio) { 758 ret = -EINVAL; 759 goto out; 760 } 761 762 regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar), 763 PROT_READ | PROT_WRITE, errp); 764 if (!regs) { 765 ret = -EINVAL; 766 goto out; 767 } 768 /* Perform initialize sequence as described in NVMe spec "7.6.1 769 * Initialization". */ 770 771 cap = le64_to_cpu(regs->cap); 772 trace_nvme_controller_capability_raw(cap); 773 trace_nvme_controller_capability("Maximum Queue Entries Supported", 774 1 + NVME_CAP_MQES(cap)); 775 trace_nvme_controller_capability("Contiguous Queues Required", 776 NVME_CAP_CQR(cap)); 777 trace_nvme_controller_capability("Doorbell Stride", 778 1 << (2 + NVME_CAP_DSTRD(cap))); 779 trace_nvme_controller_capability("Subsystem Reset Supported", 780 NVME_CAP_NSSRS(cap)); 781 trace_nvme_controller_capability("Memory Page Size Minimum", 782 1 << (12 + NVME_CAP_MPSMIN(cap))); 783 trace_nvme_controller_capability("Memory Page Size Maximum", 784 1 << (12 + NVME_CAP_MPSMAX(cap))); 785 if (!NVME_CAP_CSS(cap)) { 786 error_setg(errp, "Device doesn't support NVMe command set"); 787 ret = -EINVAL; 788 goto out; 789 } 790 791 s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap)); 792 s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t); 793 bs->bl.opt_mem_alignment = s->page_size; 794 bs->bl.request_alignment = s->page_size; 795 timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000); 796 797 ver = le32_to_cpu(regs->vs); 798 trace_nvme_controller_spec_version(extract32(ver, 16, 16), 799 extract32(ver, 8, 8), 800 extract32(ver, 0, 8)); 801 802 /* Reset device to get a clean state. */ 803 regs->cc = cpu_to_le32(le32_to_cpu(regs->cc) & 0xFE); 804 /* Wait for CSTS.RDY = 0. */ 805 deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS; 806 while (NVME_CSTS_RDY(le32_to_cpu(regs->csts))) { 807 if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) { 808 error_setg(errp, "Timeout while waiting for device to reset (%" 809 PRId64 " ms)", 810 timeout_ms); 811 ret = -ETIMEDOUT; 812 goto out; 813 } 814 } 815 816 s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0, 817 sizeof(NvmeBar) + NVME_DOORBELL_SIZE, 818 PROT_WRITE, errp); 819 s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar)); 820 if (!s->doorbells) { 821 ret = -EINVAL; 822 goto out; 823 } 824 825 /* Set up admin queue. 
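* The admin queue pair is created while the controller is still disabled; its size
* and base addresses are programmed into AQA/ASQ/ACQ below, before CC.EN is set.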
*/ 826 s->queues = g_new(NVMeQueuePair *, 1); 827 q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp); 828 if (!q) { 829 ret = -EINVAL; 830 goto out; 831 } 832 s->queues[INDEX_ADMIN] = q; 833 s->queue_count = 1; 834 QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000); 835 regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) | 836 ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT)); 837 regs->asq = cpu_to_le64(q->sq.iova); 838 regs->acq = cpu_to_le64(q->cq.iova); 839 840 /* After setting up all control registers we can enable device now. */ 841 regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) | 842 (ctz32(NVME_SQ_ENTRY_BYTES) << CC_IOSQES_SHIFT) | 843 CC_EN_MASK); 844 /* Wait for CSTS.RDY = 1. */ 845 now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 846 deadline = now + timeout_ms * SCALE_MS; 847 while (!NVME_CSTS_RDY(le32_to_cpu(regs->csts))) { 848 if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) { 849 error_setg(errp, "Timeout while waiting for device to start (%" 850 PRId64 " ms)", 851 timeout_ms); 852 ret = -ETIMEDOUT; 853 goto out; 854 } 855 } 856 857 ret = qemu_vfio_pci_init_irq(s->vfio, s->irq_notifier, 858 VFIO_PCI_MSIX_IRQ_INDEX, errp); 859 if (ret) { 860 goto out; 861 } 862 aio_set_event_notifier(bdrv_get_aio_context(bs), 863 &s->irq_notifier[MSIX_SHARED_IRQ_IDX], 864 false, nvme_handle_event, nvme_poll_cb, 865 nvme_poll_ready); 866 867 if (!nvme_identify(bs, namespace, errp)) { 868 ret = -EIO; 869 goto out; 870 } 871 872 /* Set up command queues. */ 873 if (!nvme_add_io_queue(bs, errp)) { 874 ret = -EIO; 875 } 876 out: 877 if (regs) { 878 qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)regs, 0, sizeof(NvmeBar)); 879 } 880 881 /* Cleaning up is done in nvme_file_open() upon error. */ 882 return ret; 883 } 884 885 /* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example: 886 * 887 * nvme://0000:44:00.0/1 888 * 889 * where the "nvme://" is a fixed form of the protocol prefix, the middle part 890 * is the PCI address, and the last part is the namespace number starting from 891 * 1 according to the NVMe spec. */ 892 static void nvme_parse_filename(const char *filename, QDict *options, 893 Error **errp) 894 { 895 int pref = strlen("nvme://"); 896 897 if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) { 898 const char *tmp = filename + pref; 899 char *device; 900 const char *namespace; 901 unsigned long ns; 902 const char *slash = strchr(tmp, '/'); 903 if (!slash) { 904 qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp); 905 return; 906 } 907 device = g_strndup(tmp, slash - tmp); 908 qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device); 909 g_free(device); 910 namespace = slash + 1; 911 if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) { 912 error_setg(errp, "Invalid namespace '%s', positive number expected", 913 namespace); 914 return; 915 } 916 qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE, 917 *namespace ? namespace : "1"); 918 } 919 } 920 921 static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable, 922 Error **errp) 923 { 924 int ret; 925 BDRVNVMeState *s = bs->opaque; 926 NvmeCmd cmd = { 927 .opcode = NVME_ADM_CMD_SET_FEATURES, 928 .nsid = cpu_to_le32(s->nsid), 929 .cdw10 = cpu_to_le32(0x06), 930 .cdw11 = cpu_to_le32(enable ? 
0x01 : 0x00), 931 }; 932 933 ret = nvme_admin_cmd_sync(bs, &cmd); 934 if (ret) { 935 error_setg(errp, "Failed to configure NVMe write cache"); 936 } 937 return ret; 938 } 939 940 static void nvme_close(BlockDriverState *bs) 941 { 942 BDRVNVMeState *s = bs->opaque; 943 944 for (unsigned i = 0; i < s->queue_count; ++i) { 945 nvme_free_queue_pair(s->queues[i]); 946 } 947 g_free(s->queues); 948 aio_set_event_notifier(bdrv_get_aio_context(bs), 949 &s->irq_notifier[MSIX_SHARED_IRQ_IDX], 950 false, NULL, NULL, NULL); 951 event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]); 952 qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map, 953 0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE); 954 qemu_vfio_close(s->vfio); 955 956 g_free(s->device); 957 } 958 959 static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags, 960 Error **errp) 961 { 962 const char *device; 963 QemuOpts *opts; 964 int namespace; 965 int ret; 966 BDRVNVMeState *s = bs->opaque; 967 968 bs->supported_write_flags = BDRV_REQ_FUA; 969 970 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); 971 qemu_opts_absorb_qdict(opts, options, &error_abort); 972 device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE); 973 if (!device) { 974 error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required"); 975 qemu_opts_del(opts); 976 return -EINVAL; 977 } 978 979 namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1); 980 ret = nvme_init(bs, device, namespace, errp); 981 qemu_opts_del(opts); 982 if (ret) { 983 goto fail; 984 } 985 if (flags & BDRV_O_NOCACHE) { 986 if (!s->write_cache_supported) { 987 error_setg(errp, 988 "NVMe controller doesn't support write cache configuration"); 989 ret = -EINVAL; 990 } else { 991 ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE), 992 errp); 993 } 994 if (ret) { 995 goto fail; 996 } 997 } 998 return 0; 999 fail: 1000 nvme_close(bs); 1001 return ret; 1002 } 1003 1004 static int64_t nvme_getlength(BlockDriverState *bs) 1005 { 1006 BDRVNVMeState *s = bs->opaque; 1007 return s->nsze << s->blkshift; 1008 } 1009 1010 static uint32_t nvme_get_blocksize(BlockDriverState *bs) 1011 { 1012 BDRVNVMeState *s = bs->opaque; 1013 assert(s->blkshift >= BDRV_SECTOR_BITS && s->blkshift <= 12); 1014 return UINT32_C(1) << s->blkshift; 1015 } 1016 1017 static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) 1018 { 1019 uint32_t blocksize = nvme_get_blocksize(bs); 1020 bsz->phys = blocksize; 1021 bsz->log = blocksize; 1022 return 0; 1023 } 1024 1025 /* Called with s->dma_map_lock */ 1026 static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs, 1027 QEMUIOVector *qiov) 1028 { 1029 int r = 0; 1030 BDRVNVMeState *s = bs->opaque; 1031 1032 s->dma_map_count -= qiov->size; 1033 if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) { 1034 r = qemu_vfio_dma_reset_temporary(s->vfio); 1035 if (!r) { 1036 qemu_co_queue_restart_all(&s->dma_flush_queue); 1037 } 1038 } 1039 return r; 1040 } 1041 1042 /* Called with s->dma_map_lock */ 1043 static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd, 1044 NVMeRequest *req, QEMUIOVector *qiov) 1045 { 1046 BDRVNVMeState *s = bs->opaque; 1047 uint64_t *pagelist = req->prp_list_page; 1048 int i, j, r; 1049 int entries = 0; 1050 Error *local_err = NULL, **errp = NULL; 1051 1052 assert(qiov->size); 1053 assert(QEMU_IS_ALIGNED(qiov->size, s->page_size)); 1054 assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t)); 1055 for (i = 0; i < qiov->niov; ++i) { 1056 bool retry = true; 1057 
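/* Try to DMA-map this iovec element; on IOVA exhaustion the code below retries
 * once after recycling the temporary mappings. */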
uint64_t iova; 1058 size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len, 1059 qemu_real_host_page_size()); 1060 try_map: 1061 r = qemu_vfio_dma_map(s->vfio, 1062 qiov->iov[i].iov_base, 1063 len, true, &iova, errp); 1064 if (r == -ENOSPC) { 1065 /* 1066 * In addition to the -ENOMEM error, the VFIO_IOMMU_MAP_DMA 1067 * ioctl returns -ENOSPC to signal the user exhausted the DMA 1068 * mappings available for a container since Linux kernel commit 1069 * 492855939bdb ("vfio/type1: Limit DMA mappings per container", 1070 * April 2019, see CVE-2019-3882). 1071 * 1072 * This block driver already handles this error path by checking 1073 * for the -ENOMEM error, so we directly replace -ENOSPC by 1074 * -ENOMEM. Beside, -ENOSPC has a specific meaning for blockdev 1075 * coroutines: it triggers BLOCKDEV_ON_ERROR_ENOSPC and 1076 * BLOCK_ERROR_ACTION_STOP which stops the VM, asking the operator 1077 * to add more storage to the blockdev. Not something we can do 1078 * easily with an IOMMU :) 1079 */ 1080 r = -ENOMEM; 1081 } 1082 if (r == -ENOMEM && retry) { 1083 /* 1084 * We exhausted the DMA mappings available for our container: 1085 * recycle the volatile IOVA mappings. 1086 */ 1087 retry = false; 1088 trace_nvme_dma_flush_queue_wait(s); 1089 if (s->dma_map_count) { 1090 trace_nvme_dma_map_flush(s); 1091 qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock); 1092 } else { 1093 r = qemu_vfio_dma_reset_temporary(s->vfio); 1094 if (r) { 1095 goto fail; 1096 } 1097 } 1098 errp = &local_err; 1099 1100 goto try_map; 1101 } 1102 if (r) { 1103 goto fail; 1104 } 1105 1106 for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) { 1107 pagelist[entries++] = cpu_to_le64(iova + j * s->page_size); 1108 } 1109 trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base, 1110 qiov->iov[i].iov_len / s->page_size); 1111 } 1112 1113 s->dma_map_count += qiov->size; 1114 1115 assert(entries <= s->page_size / sizeof(uint64_t)); 1116 switch (entries) { 1117 case 0: 1118 abort(); 1119 case 1: 1120 cmd->dptr.prp1 = pagelist[0]; 1121 cmd->dptr.prp2 = 0; 1122 break; 1123 case 2: 1124 cmd->dptr.prp1 = pagelist[0]; 1125 cmd->dptr.prp2 = pagelist[1]; 1126 break; 1127 default: 1128 cmd->dptr.prp1 = pagelist[0]; 1129 cmd->dptr.prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t)); 1130 break; 1131 } 1132 trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries); 1133 for (i = 0; i < entries; ++i) { 1134 trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]); 1135 } 1136 return 0; 1137 fail: 1138 /* No need to unmap [0 - i) iovs even if we've failed, since we don't 1139 * increment s->dma_map_count. This is okay for fixed mapping memory areas 1140 * because they are already mapped before calling this function; for 1141 * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by 1142 * calling qemu_vfio_dma_reset_temporary when necessary. */ 1143 if (local_err) { 1144 error_reportf_err(local_err, "Cannot map buffer for DMA: "); 1145 } 1146 return r; 1147 } 1148 1149 typedef struct { 1150 Coroutine *co; 1151 int ret; 1152 AioContext *ctx; 1153 } NVMeCoData; 1154 1155 static void nvme_rw_cb_bh(void *opaque) 1156 { 1157 NVMeCoData *data = opaque; 1158 qemu_coroutine_enter(data->co); 1159 } 1160 1161 static void nvme_rw_cb(void *opaque, int ret) 1162 { 1163 NVMeCoData *data = opaque; 1164 data->ret = ret; 1165 if (!data->co) { 1166 /* The rw coroutine hasn't yielded, don't try to enter. 
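* The submitting coroutine will see data->ret != -EINPROGRESS and skip its yield.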
*/ 1167 return; 1168 } 1169 replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data); 1170 } 1171 1172 static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs, 1173 uint64_t offset, uint64_t bytes, 1174 QEMUIOVector *qiov, 1175 bool is_write, 1176 int flags) 1177 { 1178 int r; 1179 BDRVNVMeState *s = bs->opaque; 1180 NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; 1181 NVMeRequest *req; 1182 1183 uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) | 1184 (flags & BDRV_REQ_FUA ? 1 << 30 : 0); 1185 NvmeCmd cmd = { 1186 .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ, 1187 .nsid = cpu_to_le32(s->nsid), 1188 .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF), 1189 .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF), 1190 .cdw12 = cpu_to_le32(cdw12), 1191 }; 1192 NVMeCoData data = { 1193 .ctx = bdrv_get_aio_context(bs), 1194 .ret = -EINPROGRESS, 1195 }; 1196 1197 trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov); 1198 assert(s->queue_count > 1); 1199 req = nvme_get_free_req(ioq); 1200 assert(req); 1201 1202 qemu_co_mutex_lock(&s->dma_map_lock); 1203 r = nvme_cmd_map_qiov(bs, &cmd, req, qiov); 1204 qemu_co_mutex_unlock(&s->dma_map_lock); 1205 if (r) { 1206 nvme_put_free_req_and_wake(ioq, req); 1207 return r; 1208 } 1209 nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); 1210 1211 data.co = qemu_coroutine_self(); 1212 while (data.ret == -EINPROGRESS) { 1213 qemu_coroutine_yield(); 1214 } 1215 1216 qemu_co_mutex_lock(&s->dma_map_lock); 1217 r = nvme_cmd_unmap_qiov(bs, qiov); 1218 qemu_co_mutex_unlock(&s->dma_map_lock); 1219 if (r) { 1220 return r; 1221 } 1222 1223 trace_nvme_rw_done(s, is_write, offset, bytes, data.ret); 1224 return data.ret; 1225 } 1226 1227 static inline bool nvme_qiov_aligned(BlockDriverState *bs, 1228 const QEMUIOVector *qiov) 1229 { 1230 int i; 1231 BDRVNVMeState *s = bs->opaque; 1232 1233 for (i = 0; i < qiov->niov; ++i) { 1234 if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, 1235 qemu_real_host_page_size()) || 1236 !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size())) { 1237 trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base, 1238 qiov->iov[i].iov_len, s->page_size); 1239 return false; 1240 } 1241 } 1242 return true; 1243 } 1244 1245 static coroutine_fn int nvme_co_prw(BlockDriverState *bs, 1246 uint64_t offset, uint64_t bytes, 1247 QEMUIOVector *qiov, bool is_write, 1248 int flags) 1249 { 1250 BDRVNVMeState *s = bs->opaque; 1251 int r; 1252 QEMU_AUTO_VFREE uint8_t *buf = NULL; 1253 QEMUIOVector local_qiov; 1254 size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size()); 1255 assert(QEMU_IS_ALIGNED(offset, s->page_size)); 1256 assert(QEMU_IS_ALIGNED(bytes, s->page_size)); 1257 assert(bytes <= s->max_transfer); 1258 if (nvme_qiov_aligned(bs, qiov)) { 1259 s->stats.aligned_accesses++; 1260 return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags); 1261 } 1262 s->stats.unaligned_accesses++; 1263 trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write); 1264 buf = qemu_try_memalign(qemu_real_host_page_size(), len); 1265 1266 if (!buf) { 1267 return -ENOMEM; 1268 } 1269 qemu_iovec_init(&local_qiov, 1); 1270 if (is_write) { 1271 qemu_iovec_to_buf(qiov, 0, buf, bytes); 1272 } 1273 qemu_iovec_add(&local_qiov, buf, bytes); 1274 r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags); 1275 qemu_iovec_destroy(&local_qiov); 1276 if (!r && !is_write) { 1277 qemu_iovec_from_buf(qiov, 0, buf, bytes); 1278 } 1279 return r; 1280 } 1281 1282 static coroutine_fn int 
nvme_co_preadv(BlockDriverState *bs, 1283 int64_t offset, int64_t bytes, 1284 QEMUIOVector *qiov, 1285 BdrvRequestFlags flags) 1286 { 1287 return nvme_co_prw(bs, offset, bytes, qiov, false, flags); 1288 } 1289 1290 static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs, 1291 int64_t offset, int64_t bytes, 1292 QEMUIOVector *qiov, 1293 BdrvRequestFlags flags) 1294 { 1295 return nvme_co_prw(bs, offset, bytes, qiov, true, flags); 1296 } 1297 1298 static coroutine_fn int nvme_co_flush(BlockDriverState *bs) 1299 { 1300 BDRVNVMeState *s = bs->opaque; 1301 NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; 1302 NVMeRequest *req; 1303 NvmeCmd cmd = { 1304 .opcode = NVME_CMD_FLUSH, 1305 .nsid = cpu_to_le32(s->nsid), 1306 }; 1307 NVMeCoData data = { 1308 .ctx = bdrv_get_aio_context(bs), 1309 .ret = -EINPROGRESS, 1310 }; 1311 1312 assert(s->queue_count > 1); 1313 req = nvme_get_free_req(ioq); 1314 assert(req); 1315 nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); 1316 1317 data.co = qemu_coroutine_self(); 1318 if (data.ret == -EINPROGRESS) { 1319 qemu_coroutine_yield(); 1320 } 1321 1322 return data.ret; 1323 } 1324 1325 1326 static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs, 1327 int64_t offset, 1328 int64_t bytes, 1329 BdrvRequestFlags flags) 1330 { 1331 BDRVNVMeState *s = bs->opaque; 1332 NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; 1333 NVMeRequest *req; 1334 uint32_t cdw12; 1335 1336 if (!s->supports_write_zeroes) { 1337 return -ENOTSUP; 1338 } 1339 1340 if (bytes == 0) { 1341 return 0; 1342 } 1343 1344 cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF; 1345 /* 1346 * We should not lose information. pwrite_zeroes_alignment and 1347 * max_pwrite_zeroes guarantees it. 1348 */ 1349 assert(((cdw12 + 1) << s->blkshift) == bytes); 1350 1351 NvmeCmd cmd = { 1352 .opcode = NVME_CMD_WRITE_ZEROES, 1353 .nsid = cpu_to_le32(s->nsid), 1354 .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF), 1355 .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF), 1356 }; 1357 1358 NVMeCoData data = { 1359 .ctx = bdrv_get_aio_context(bs), 1360 .ret = -EINPROGRESS, 1361 }; 1362 1363 if (flags & BDRV_REQ_MAY_UNMAP) { 1364 cdw12 |= (1 << 25); 1365 } 1366 1367 if (flags & BDRV_REQ_FUA) { 1368 cdw12 |= (1 << 30); 1369 } 1370 1371 cmd.cdw12 = cpu_to_le32(cdw12); 1372 1373 trace_nvme_write_zeroes(s, offset, bytes, flags); 1374 assert(s->queue_count > 1); 1375 req = nvme_get_free_req(ioq); 1376 assert(req); 1377 1378 nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); 1379 1380 data.co = qemu_coroutine_self(); 1381 while (data.ret == -EINPROGRESS) { 1382 qemu_coroutine_yield(); 1383 } 1384 1385 trace_nvme_rw_done(s, true, offset, bytes, data.ret); 1386 return data.ret; 1387 } 1388 1389 1390 static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs, 1391 int64_t offset, 1392 int64_t bytes) 1393 { 1394 BDRVNVMeState *s = bs->opaque; 1395 NVMeQueuePair *ioq = s->queues[INDEX_IO(0)]; 1396 NVMeRequest *req; 1397 QEMU_AUTO_VFREE NvmeDsmRange *buf = NULL; 1398 QEMUIOVector local_qiov; 1399 int ret; 1400 1401 NvmeCmd cmd = { 1402 .opcode = NVME_CMD_DSM, 1403 .nsid = cpu_to_le32(s->nsid), 1404 .cdw10 = cpu_to_le32(0), /*number of ranges - 0 based*/ 1405 .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/ 1406 }; 1407 1408 NVMeCoData data = { 1409 .ctx = bdrv_get_aio_context(bs), 1410 .ret = -EINPROGRESS, 1411 }; 1412 1413 if (!s->supports_discard) { 1414 return -ENOTSUP; 1415 } 1416 1417 assert(s->queue_count > 1); 1418 1419 /* 1420 * Filling the @buf requires @offset and @bytes to satisfy 
restrictions 1421 * defined in nvme_refresh_limits(). 1422 */ 1423 assert(QEMU_IS_ALIGNED(bytes, 1UL << s->blkshift)); 1424 assert(QEMU_IS_ALIGNED(offset, 1UL << s->blkshift)); 1425 assert((bytes >> s->blkshift) <= UINT32_MAX); 1426 1427 buf = qemu_try_memalign(s->page_size, s->page_size); 1428 if (!buf) { 1429 return -ENOMEM; 1430 } 1431 memset(buf, 0, s->page_size); 1432 buf->nlb = cpu_to_le32(bytes >> s->blkshift); 1433 buf->slba = cpu_to_le64(offset >> s->blkshift); 1434 buf->cattr = 0; 1435 1436 qemu_iovec_init(&local_qiov, 1); 1437 qemu_iovec_add(&local_qiov, buf, 4096); 1438 1439 req = nvme_get_free_req(ioq); 1440 assert(req); 1441 1442 qemu_co_mutex_lock(&s->dma_map_lock); 1443 ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov); 1444 qemu_co_mutex_unlock(&s->dma_map_lock); 1445 1446 if (ret) { 1447 nvme_put_free_req_and_wake(ioq, req); 1448 goto out; 1449 } 1450 1451 trace_nvme_dsm(s, offset, bytes); 1452 1453 nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data); 1454 1455 data.co = qemu_coroutine_self(); 1456 while (data.ret == -EINPROGRESS) { 1457 qemu_coroutine_yield(); 1458 } 1459 1460 qemu_co_mutex_lock(&s->dma_map_lock); 1461 ret = nvme_cmd_unmap_qiov(bs, &local_qiov); 1462 qemu_co_mutex_unlock(&s->dma_map_lock); 1463 1464 if (ret) { 1465 goto out; 1466 } 1467 1468 ret = data.ret; 1469 trace_nvme_dsm_done(s, offset, bytes, ret); 1470 out: 1471 qemu_iovec_destroy(&local_qiov); 1472 return ret; 1473 1474 } 1475 1476 static int coroutine_fn nvme_co_truncate(BlockDriverState *bs, int64_t offset, 1477 bool exact, PreallocMode prealloc, 1478 BdrvRequestFlags flags, Error **errp) 1479 { 1480 int64_t cur_length; 1481 1482 if (prealloc != PREALLOC_MODE_OFF) { 1483 error_setg(errp, "Unsupported preallocation mode '%s'", 1484 PreallocMode_str(prealloc)); 1485 return -ENOTSUP; 1486 } 1487 1488 cur_length = nvme_getlength(bs); 1489 if (offset != cur_length && exact) { 1490 error_setg(errp, "Cannot resize NVMe devices"); 1491 return -ENOTSUP; 1492 } else if (offset > cur_length) { 1493 error_setg(errp, "Cannot grow NVMe devices"); 1494 return -EINVAL; 1495 } 1496 1497 return 0; 1498 } 1499 1500 static int nvme_reopen_prepare(BDRVReopenState *reopen_state, 1501 BlockReopenQueue *queue, Error **errp) 1502 { 1503 return 0; 1504 } 1505 1506 static void nvme_refresh_filename(BlockDriverState *bs) 1507 { 1508 BDRVNVMeState *s = bs->opaque; 1509 1510 snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i", 1511 s->device, s->nsid); 1512 } 1513 1514 static void nvme_refresh_limits(BlockDriverState *bs, Error **errp) 1515 { 1516 BDRVNVMeState *s = bs->opaque; 1517 1518 bs->bl.opt_mem_alignment = s->page_size; 1519 bs->bl.request_alignment = s->page_size; 1520 bs->bl.max_transfer = s->max_transfer; 1521 1522 /* 1523 * Look at nvme_co_pwrite_zeroes: after shift and decrement we should get 1524 * at most 0xFFFF 1525 */ 1526 bs->bl.max_pwrite_zeroes = 1ULL << (s->blkshift + 16); 1527 bs->bl.pwrite_zeroes_alignment = MAX(bs->bl.request_alignment, 1528 1UL << s->blkshift); 1529 1530 bs->bl.max_pdiscard = (uint64_t)UINT32_MAX << s->blkshift; 1531 bs->bl.pdiscard_alignment = MAX(bs->bl.request_alignment, 1532 1UL << s->blkshift); 1533 } 1534 1535 static void nvme_detach_aio_context(BlockDriverState *bs) 1536 { 1537 BDRVNVMeState *s = bs->opaque; 1538 1539 for (unsigned i = 0; i < s->queue_count; i++) { 1540 NVMeQueuePair *q = s->queues[i]; 1541 1542 qemu_bh_delete(q->completion_bh); 1543 q->completion_bh = NULL; 1544 } 1545 1546 aio_set_event_notifier(bdrv_get_aio_context(bs), 1547 
&s->irq_notifier[MSIX_SHARED_IRQ_IDX], 1548 false, NULL, NULL, NULL); 1549 } 1550 1551 static void nvme_attach_aio_context(BlockDriverState *bs, 1552 AioContext *new_context) 1553 { 1554 BDRVNVMeState *s = bs->opaque; 1555 1556 s->aio_context = new_context; 1557 aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX], 1558 false, nvme_handle_event, nvme_poll_cb, 1559 nvme_poll_ready); 1560 1561 for (unsigned i = 0; i < s->queue_count; i++) { 1562 NVMeQueuePair *q = s->queues[i]; 1563 1564 q->completion_bh = 1565 aio_bh_new(new_context, nvme_process_completion_bh, q); 1566 } 1567 } 1568 1569 static void nvme_aio_plug(BlockDriverState *bs) 1570 { 1571 BDRVNVMeState *s = bs->opaque; 1572 assert(!s->plugged); 1573 s->plugged = true; 1574 } 1575 1576 static void nvme_aio_unplug(BlockDriverState *bs) 1577 { 1578 BDRVNVMeState *s = bs->opaque; 1579 assert(s->plugged); 1580 s->plugged = false; 1581 for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) { 1582 NVMeQueuePair *q = s->queues[i]; 1583 qemu_mutex_lock(&q->lock); 1584 nvme_kick(q); 1585 nvme_process_completion(q); 1586 qemu_mutex_unlock(&q->lock); 1587 } 1588 } 1589 1590 static void nvme_register_buf(BlockDriverState *bs, void *host, size_t size) 1591 { 1592 int ret; 1593 Error *local_err = NULL; 1594 BDRVNVMeState *s = bs->opaque; 1595 1596 ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL, &local_err); 1597 if (ret) { 1598 /* FIXME: we may run out of IOVA addresses after repeated 1599 * bdrv_register_buf/bdrv_unregister_buf, because nvme_vfio_dma_unmap 1600 * doesn't reclaim addresses for fixed mappings. */ 1601 error_reportf_err(local_err, "nvme_register_buf failed: "); 1602 } 1603 } 1604 1605 static void nvme_unregister_buf(BlockDriverState *bs, void *host) 1606 { 1607 BDRVNVMeState *s = bs->opaque; 1608 1609 qemu_vfio_dma_unmap(s->vfio, host); 1610 } 1611 1612 static BlockStatsSpecific *nvme_get_specific_stats(BlockDriverState *bs) 1613 { 1614 BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1); 1615 BDRVNVMeState *s = bs->opaque; 1616 1617 stats->driver = BLOCKDEV_DRIVER_NVME; 1618 stats->u.nvme = (BlockStatsSpecificNvme) { 1619 .completion_errors = s->stats.completion_errors, 1620 .aligned_accesses = s->stats.aligned_accesses, 1621 .unaligned_accesses = s->stats.unaligned_accesses, 1622 }; 1623 1624 return stats; 1625 } 1626 1627 static const char *const nvme_strong_runtime_opts[] = { 1628 NVME_BLOCK_OPT_DEVICE, 1629 NVME_BLOCK_OPT_NAMESPACE, 1630 1631 NULL 1632 }; 1633 1634 static BlockDriver bdrv_nvme = { 1635 .format_name = "nvme", 1636 .protocol_name = "nvme", 1637 .instance_size = sizeof(BDRVNVMeState), 1638 1639 .bdrv_co_create_opts = bdrv_co_create_opts_simple, 1640 .create_opts = &bdrv_create_opts_simple, 1641 1642 .bdrv_parse_filename = nvme_parse_filename, 1643 .bdrv_file_open = nvme_file_open, 1644 .bdrv_close = nvme_close, 1645 .bdrv_getlength = nvme_getlength, 1646 .bdrv_probe_blocksizes = nvme_probe_blocksizes, 1647 .bdrv_co_truncate = nvme_co_truncate, 1648 1649 .bdrv_co_preadv = nvme_co_preadv, 1650 .bdrv_co_pwritev = nvme_co_pwritev, 1651 1652 .bdrv_co_pwrite_zeroes = nvme_co_pwrite_zeroes, 1653 .bdrv_co_pdiscard = nvme_co_pdiscard, 1654 1655 .bdrv_co_flush_to_disk = nvme_co_flush, 1656 .bdrv_reopen_prepare = nvme_reopen_prepare, 1657 1658 .bdrv_refresh_filename = nvme_refresh_filename, 1659 .bdrv_refresh_limits = nvme_refresh_limits, 1660 .strong_runtime_opts = nvme_strong_runtime_opts, 1661 .bdrv_get_specific_stats = nvme_get_specific_stats, 1662 1663 .bdrv_detach_aio_context = 
nvme_detach_aio_context,
    .bdrv_attach_aio_context = nvme_attach_aio_context,

    .bdrv_io_plug = nvme_aio_plug,
    .bdrv_io_unplug = nvme_aio_unplug,

    .bdrv_register_buf = nvme_register_buf,
    .bdrv_unregister_buf = nvme_unregister_buf,
};

static void bdrv_nvme_init(void)
{
    bdrv_register(&bdrv_nvme);
}

block_init(bdrv_nvme_init);
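/*
 * Example usage (illustrative, not exhaustive): the NVMe controller is typically
 * bound to vfio-pci first, and the namespace is given either in the URI parsed by
 * nvme_parse_filename() above or as a separate option, e.g. something like:
 *
 *   -drive file=nvme://0000:44:00.0/1,if=none,id=drive0
 *   -blockdev node-name=drive0,driver=nvme,device=0000:44:00.0,namespace=1
 *
 * "device" and "namespace" correspond to NVME_BLOCK_OPT_DEVICE and
 * NVME_BLOCK_OPT_NAMESPACE in runtime_opts.
 */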