xref: /openbmc/qemu/block/nvme.c (revision f7160f32)
1 /*
2  * NVMe block driver based on vfio
3  *
4  * Copyright 2016 - 2018 Red Hat, Inc.
5  *
6  * Authors:
7  *   Fam Zheng <famz@redhat.com>
8  *   Paolo Bonzini <pbonzini@redhat.com>
9  *
10  * This work is licensed under the terms of the GNU GPL, version 2 or later.
11  * See the COPYING file in the top-level directory.
12  */
13 
14 #include "qemu/osdep.h"
15 #include <linux/vfio.h>
16 #include "qapi/error.h"
17 #include "qapi/qmp/qdict.h"
18 #include "qapi/qmp/qstring.h"
19 #include "qemu/error-report.h"
20 #include "qemu/main-loop.h"
21 #include "qemu/module.h"
22 #include "qemu/cutils.h"
23 #include "qemu/option.h"
24 #include "qemu/vfio-helpers.h"
25 #include "block/block_int.h"
26 #include "sysemu/replay.h"
27 #include "trace.h"
28 
29 #include "block/nvme.h"
30 
/* Size in bytes of one submission queue entry (stride into the SQ ring). */
#define NVME_SQ_ENTRY_BYTES 64
/* Size in bytes of one completion queue entry (stride into the CQ ring). */
#define NVME_CQ_ENTRY_BYTES 16
/* Number of entries in every submission/completion ring of this driver. */
#define NVME_QUEUE_SIZE 128
/* Number of bytes of BAR 0 that nvme_init() maps. */
#define NVME_BAR_SIZE 8192

/*
 * We have to leave one slot empty as that is the full queue case where
 * head == tail + 1.
 */
#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)

/* Forward declaration; the structure itself is defined further down. */
typedef struct BDRVNVMeState BDRVNVMeState;
43 
/* State of a single ring: either a submission or a completion queue. */
typedef struct {
    /* Ring indices, advanced modulo NVME_QUEUE_SIZE by the code using them. */
    int32_t  head, tail;
    /* Host memory backing the ring (allocated in nvme_init_queue()). */
    uint8_t  *queue;
    /* Address of the ring as returned by qemu_vfio_dma_map(). */
    uint64_t iova;
    /* Hardware MMIO register */
    volatile uint32_t *doorbell;
} NVMeQueue;
51 
/* Bookkeeping for one command slot of a queue pair. */
typedef struct {
    /* Completion callback and its argument; cb is NULL while the slot is idle. */
    BlockCompletionFunc *cb;
    void *opaque;
    /* Command identifier: the slot's index in q->reqs[] plus one. */
    int cid;
    /* This slot's page inside the queue pair's prp_list_pages buffer... */
    void *prp_list_page;
    /* ...and the DMA address of that same page. */
    uint64_t prp_list_iova;
    int free_req_next; /* q->reqs[] index of next free req */
} NVMeRequest;
60 
/*
 * A submission queue together with its completion queue, plus the request
 * slots used to track the commands travelling through them.
 */
typedef struct {
    QemuMutex   lock;

    /* Read from I/O code path, initialized under BQL */
    BDRVNVMeState   *s;
    int             index;

    /* Fields protected by BQL */
    uint8_t     *prp_list_pages;

    /* Fields protected by @lock */
    CoQueue     free_req_queue;     /* coroutines waiting for a free slot */
    NVMeQueue   sq, cq;
    int         cq_phase;           /* flipped each time cq.head wraps to 0 */
    int         free_req_head;      /* reqs[] index of first free slot or -1 */
    NVMeRequest reqs[NVME_NUM_REQS];
    int         need_kick;          /* commands queued since the last doorbell */
    int         inflight;           /* commands kicked and not yet completed */

    /* Thread-safe, no lock necessary */
    QEMUBH      *completion_bh;
} NVMeQueuePair;
83 
/*
 * Memory mapped registers
 *
 * The member order and sizes describe the device register layout, so they
 * must not be rearranged. The build-time check below pins the doorbell array
 * at offset 0x1000.
 */
typedef volatile struct {
    uint64_t cap;
    uint32_t vs;
    uint32_t intms;
    uint32_t intmc;
    uint32_t cc;
    uint32_t reserved0;
    uint32_t csts;
    uint32_t nssr;
    uint32_t aqa;
    uint64_t asq;
    uint64_t acq;
    uint32_t cmbloc;
    uint32_t cmbsz;
    uint8_t  reserved1[0xec0];
    uint8_t  cmd_set_specfic[0x100];
    /* Indexed in units of s->doorbell_scale, see nvme_create_queue_pair(). */
    uint32_t doorbells[];
} NVMeRegs;

QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000);
105 
/* Per-BlockDriverState driver state (stored in bs->opaque). */
struct BDRVNVMeState {
    AioContext *aio_context;
    QEMUVFIOState *vfio;
    /* Mapping of BAR 0, established in nvme_init(). */
    NVMeRegs *regs;
    /* The submission/completion queue pairs.
     * [0]: admin queue.
     * [1..]: io queues.
     */
    NVMeQueuePair **queues;
    int nr_queues;
    /* Page size derived from the CAP register in nvme_init(). */
    size_t page_size;
    /* How many uint32_t elements does each doorbell entry take. */
    size_t doorbell_scale;
    bool write_cache_supported;
    EventNotifier irq_notifier;

    uint64_t nsze; /* Namespace size reported by identify command */
    int nsid;      /* The namespace id to read/write data. */
    int blkshift;  /* log2 of the block size, taken from the identify data */

    /* Upper bound for one transfer in bytes, computed in nvme_identify(). */
    uint64_t max_transfer;
    /* While set, nvme_kick() and nvme_process_completion() do nothing. */
    bool plugged;

    /* Derived from the ONCS field of the identify controller data. */
    bool supports_write_zeroes;
    bool supports_discard;

    CoMutex dma_map_lock;
    CoQueue dma_flush_queue;

    /* Total size of mapped qiov, accessed under dma_map_lock */
    int dma_map_count;

    /* PCI address (required for nvme_refresh_filename()) */
    char *device;
};
141 
/* Names of the runtime options understood by this driver. */
#define NVME_BLOCK_OPT_DEVICE "device"
#define NVME_BLOCK_OPT_NAMESPACE "namespace"

/* Defined below; needed earlier when the completion BH is created. */
static void nvme_process_completion_bh(void *opaque);

/* Option schema used by nvme_file_open() to absorb the options QDict. */
static QemuOptsList runtime_opts = {
    .name = "nvme",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = NVME_BLOCK_OPT_DEVICE,
            .type = QEMU_OPT_STRING,
            .help = "NVMe PCI device address",
        },
        {
            .name = NVME_BLOCK_OPT_NAMESPACE,
            .type = QEMU_OPT_NUMBER,
            .help = "NVMe namespace",
        },
        { /* end of list */ }
    },
};
164 
165 static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
166                             int nentries, int entry_bytes, Error **errp)
167 {
168     BDRVNVMeState *s = bs->opaque;
169     size_t bytes;
170     int r;
171 
172     bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
173     q->head = q->tail = 0;
174     q->queue = qemu_try_blockalign0(bs, bytes);
175 
176     if (!q->queue) {
177         error_setg(errp, "Cannot allocate queue");
178         return;
179     }
180     r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova);
181     if (r) {
182         error_setg(errp, "Cannot map queue");
183     }
184 }
185 
186 static void nvme_free_queue_pair(NVMeQueuePair *q)
187 {
188     if (q->completion_bh) {
189         qemu_bh_delete(q->completion_bh);
190     }
191     qemu_vfree(q->prp_list_pages);
192     qemu_vfree(q->sq.queue);
193     qemu_vfree(q->cq.queue);
194     qemu_mutex_destroy(&q->lock);
195     g_free(q);
196 }
197 
198 static void nvme_free_req_queue_cb(void *opaque)
199 {
200     NVMeQueuePair *q = opaque;
201 
202     qemu_mutex_lock(&q->lock);
203     while (qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
204         /* Retry all pending requests */
205     }
206     qemu_mutex_unlock(&q->lock);
207 }
208 
209 static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
210                                              int idx, int size,
211                                              Error **errp)
212 {
213     int i, r;
214     BDRVNVMeState *s = bs->opaque;
215     Error *local_err = NULL;
216     NVMeQueuePair *q = g_new0(NVMeQueuePair, 1);
217     uint64_t prp_list_iova;
218 
219     qemu_mutex_init(&q->lock);
220     q->s = s;
221     q->index = idx;
222     qemu_co_queue_init(&q->free_req_queue);
223     q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
224     q->completion_bh = aio_bh_new(bdrv_get_aio_context(bs),
225                                   nvme_process_completion_bh, q);
226     r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
227                           s->page_size * NVME_NUM_REQS,
228                           false, &prp_list_iova);
229     if (r) {
230         goto fail;
231     }
232     q->free_req_head = -1;
233     for (i = 0; i < NVME_NUM_REQS; i++) {
234         NVMeRequest *req = &q->reqs[i];
235         req->cid = i + 1;
236         req->free_req_next = q->free_req_head;
237         q->free_req_head = i;
238         req->prp_list_page = q->prp_list_pages + i * s->page_size;
239         req->prp_list_iova = prp_list_iova + i * s->page_size;
240     }
241 
242     nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
243     if (local_err) {
244         error_propagate(errp, local_err);
245         goto fail;
246     }
247     q->sq.doorbell = &s->regs->doorbells[idx * 2 * s->doorbell_scale];
248 
249     nvme_init_queue(bs, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
250     if (local_err) {
251         error_propagate(errp, local_err);
252         goto fail;
253     }
254     q->cq.doorbell = &s->regs->doorbells[(idx * 2 + 1) * s->doorbell_scale];
255 
256     return q;
257 fail:
258     nvme_free_queue_pair(q);
259     return NULL;
260 }
261 
262 /* With q->lock */
263 static void nvme_kick(NVMeQueuePair *q)
264 {
265     BDRVNVMeState *s = q->s;
266 
267     if (s->plugged || !q->need_kick) {
268         return;
269     }
270     trace_nvme_kick(s, q->index);
271     assert(!(q->sq.tail & 0xFF00));
272     /* Fence the write to submission queue entry before notifying the device. */
273     smp_wmb();
274     *q->sq.doorbell = cpu_to_le32(q->sq.tail);
275     q->inflight += q->need_kick;
276     q->need_kick = 0;
277 }
278 
279 /* Find a free request element if any, otherwise:
280  * a) if in coroutine context, try to wait for one to become available;
281  * b) if not in coroutine, return NULL;
282  */
283 static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
284 {
285     NVMeRequest *req;
286 
287     qemu_mutex_lock(&q->lock);
288 
289     while (q->free_req_head == -1) {
290         if (qemu_in_coroutine()) {
291             trace_nvme_free_req_queue_wait(q);
292             qemu_co_queue_wait(&q->free_req_queue, &q->lock);
293         } else {
294             qemu_mutex_unlock(&q->lock);
295             return NULL;
296         }
297     }
298 
299     req = &q->reqs[q->free_req_head];
300     q->free_req_head = req->free_req_next;
301     req->free_req_next = -1;
302 
303     qemu_mutex_unlock(&q->lock);
304     return req;
305 }
306 
307 /* With q->lock */
308 static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
309 {
310     req->free_req_next = q->free_req_head;
311     q->free_req_head = req - q->reqs;
312 }
313 
314 /* With q->lock */
315 static void nvme_wake_free_req_locked(NVMeQueuePair *q)
316 {
317     if (!qemu_co_queue_empty(&q->free_req_queue)) {
318         replay_bh_schedule_oneshot_event(q->s->aio_context,
319                 nvme_free_req_queue_cb, q);
320     }
321 }
322 
323 /* Insert a request in the freelist and wake waiters */
324 static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req)
325 {
326     qemu_mutex_lock(&q->lock);
327     nvme_put_free_req_locked(q, req);
328     nvme_wake_free_req_locked(q);
329     qemu_mutex_unlock(&q->lock);
330 }
331 
332 static inline int nvme_translate_error(const NvmeCqe *c)
333 {
334     uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
335     if (status) {
336         trace_nvme_error(le32_to_cpu(c->result),
337                          le16_to_cpu(c->sq_head),
338                          le16_to_cpu(c->sq_id),
339                          le16_to_cpu(c->cid),
340                          le16_to_cpu(status));
341     }
342     switch (status) {
343     case 0:
344         return 0;
345     case 1:
346         return -ENOSYS;
347     case 2:
348         return -EINVAL;
349     default:
350         return -EIO;
351     }
352 }
353 
354 /* With q->lock */
355 static bool nvme_process_completion(NVMeQueuePair *q)
356 {
357     BDRVNVMeState *s = q->s;
358     bool progress = false;
359     NVMeRequest *preq;
360     NVMeRequest req;
361     NvmeCqe *c;
362 
363     trace_nvme_process_completion(s, q->index, q->inflight);
364     if (s->plugged) {
365         trace_nvme_process_completion_queue_plugged(s, q->index);
366         return false;
367     }
368 
369     /*
370      * Support re-entrancy when a request cb() function invokes aio_poll().
371      * Pending completions must be visible to aio_poll() so that a cb()
372      * function can wait for the completion of another request.
373      *
374      * The aio_poll() loop will execute our BH and we'll resume completion
375      * processing there.
376      */
377     qemu_bh_schedule(q->completion_bh);
378 
379     assert(q->inflight >= 0);
380     while (q->inflight) {
381         int ret;
382         int16_t cid;
383 
384         c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
385         if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
386             break;
387         }
388         ret = nvme_translate_error(c);
389         q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
390         if (!q->cq.head) {
391             q->cq_phase = !q->cq_phase;
392         }
393         cid = le16_to_cpu(c->cid);
394         if (cid == 0 || cid > NVME_QUEUE_SIZE) {
395             fprintf(stderr, "Unexpected CID in completion queue: %" PRIu32 "\n",
396                     cid);
397             continue;
398         }
399         trace_nvme_complete_command(s, q->index, cid);
400         preq = &q->reqs[cid - 1];
401         req = *preq;
402         assert(req.cid == cid);
403         assert(req.cb);
404         nvme_put_free_req_locked(q, preq);
405         preq->cb = preq->opaque = NULL;
406         q->inflight--;
407         qemu_mutex_unlock(&q->lock);
408         req.cb(req.opaque, ret);
409         qemu_mutex_lock(&q->lock);
410         progress = true;
411     }
412     if (progress) {
413         /* Notify the device so it can post more completions. */
414         smp_mb_release();
415         *q->cq.doorbell = cpu_to_le32(q->cq.head);
416         nvme_wake_free_req_locked(q);
417     }
418 
419     qemu_bh_cancel(q->completion_bh);
420 
421     return progress;
422 }
423 
/*
 * Bottom half created per queue pair in nvme_create_queue_pair() and
 * scheduled at the start of nvme_process_completion(): updates the completion
 * doorbell, wakes coroutines waiting for a free request and then continues
 * completion processing.
 *
 * NOTE(review): nvme_wake_free_req_locked() and nvme_process_completion() are
 * annotated "With q->lock", yet this function does not take q->lock itself -
 * confirm whether the lock is expected to be held here.
 */
static void nvme_process_completion_bh(void *opaque)
{
    NVMeQueuePair *q = opaque;

    /*
     * We're being invoked because a nvme_process_completion() cb() function
     * called aio_poll(). The callback may be waiting for further completions
     * so notify the device that it has space to fill in more completions now.
     */
    smp_mb_release();
    *q->cq.doorbell = cpu_to_le32(q->cq.head);
    nvme_wake_free_req_locked(q);

    nvme_process_completion(q);
}
439 
440 static void nvme_trace_command(const NvmeCmd *cmd)
441 {
442     int i;
443 
444     for (i = 0; i < 8; ++i) {
445         uint8_t *cmdp = (uint8_t *)cmd + i * 8;
446         trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3],
447                                       cmdp[4], cmdp[5], cmdp[6], cmdp[7]);
448     }
449 }
450 
451 static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
452                                 NvmeCmd *cmd, BlockCompletionFunc cb,
453                                 void *opaque)
454 {
455     assert(!req->cb);
456     req->cb = cb;
457     req->opaque = opaque;
458     cmd->cid = cpu_to_le32(req->cid);
459 
460     trace_nvme_submit_command(q->s, q->index, req->cid);
461     nvme_trace_command(cmd);
462     qemu_mutex_lock(&q->lock);
463     memcpy((uint8_t *)q->sq.queue +
464            q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
465     q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
466     q->need_kick++;
467     nvme_kick(q);
468     nvme_process_completion(q);
469     qemu_mutex_unlock(&q->lock);
470 }
471 
/*
 * Completion callback of nvme_cmd_sync(): store the result in the int that
 * @opaque points to and kick waiters so the polling loop notices it.
 */
static void nvme_cmd_sync_cb(void *opaque, int ret)
{
    int *result = opaque;

    *result = ret;
    aio_wait_kick();
}
478 
479 static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
480                          NvmeCmd *cmd)
481 {
482     NVMeRequest *req;
483     int ret = -EINPROGRESS;
484     req = nvme_get_free_req(q);
485     if (!req) {
486         return -EBUSY;
487     }
488     nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);
489 
490     BDRV_POLL_WHILE(bs, ret == -EINPROGRESS);
491     return ret;
492 }
493 
494 static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
495 {
496     BDRVNVMeState *s = bs->opaque;
497     NvmeIdCtrl *idctrl;
498     NvmeIdNs *idns;
499     NvmeLBAF *lbaf;
500     uint8_t *resp;
501     uint16_t oncs;
502     int r;
503     uint64_t iova;
504     NvmeCmd cmd = {
505         .opcode = NVME_ADM_CMD_IDENTIFY,
506         .cdw10 = cpu_to_le32(0x1),
507     };
508 
509     resp = qemu_try_blockalign0(bs, sizeof(NvmeIdCtrl));
510     if (!resp) {
511         error_setg(errp, "Cannot allocate buffer for identify response");
512         goto out;
513     }
514     idctrl = (NvmeIdCtrl *)resp;
515     idns = (NvmeIdNs *)resp;
516     r = qemu_vfio_dma_map(s->vfio, resp, sizeof(NvmeIdCtrl), true, &iova);
517     if (r) {
518         error_setg(errp, "Cannot map buffer for DMA");
519         goto out;
520     }
521     cmd.prp1 = cpu_to_le64(iova);
522 
523     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
524         error_setg(errp, "Failed to identify controller");
525         goto out;
526     }
527 
528     if (le32_to_cpu(idctrl->nn) < namespace) {
529         error_setg(errp, "Invalid namespace");
530         goto out;
531     }
532     s->write_cache_supported = le32_to_cpu(idctrl->vwc) & 0x1;
533     s->max_transfer = (idctrl->mdts ? 1 << idctrl->mdts : 0) * s->page_size;
534     /* For now the page list buffer per command is one page, to hold at most
535      * s->page_size / sizeof(uint64_t) entries. */
536     s->max_transfer = MIN_NON_ZERO(s->max_transfer,
537                           s->page_size / sizeof(uint64_t) * s->page_size);
538 
539     oncs = le16_to_cpu(idctrl->oncs);
540     s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROS);
541     s->supports_discard = !!(oncs & NVME_ONCS_DSM);
542 
543     memset(resp, 0, 4096);
544 
545     cmd.cdw10 = 0;
546     cmd.nsid = cpu_to_le32(namespace);
547     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
548         error_setg(errp, "Failed to identify namespace");
549         goto out;
550     }
551 
552     s->nsze = le64_to_cpu(idns->nsze);
553     lbaf = &idns->lbaf[NVME_ID_NS_FLBAS_INDEX(idns->flbas)];
554 
555     if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(idns->dlfeat) &&
556             NVME_ID_NS_DLFEAT_READ_BEHAVIOR(idns->dlfeat) ==
557                     NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) {
558         bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
559     }
560 
561     if (lbaf->ms) {
562         error_setg(errp, "Namespaces with metadata are not yet supported");
563         goto out;
564     }
565 
566     if (lbaf->ds < BDRV_SECTOR_BITS || lbaf->ds > 12 ||
567         (1 << lbaf->ds) > s->page_size)
568     {
569         error_setg(errp, "Namespace has unsupported block size (2^%d)",
570                    lbaf->ds);
571         goto out;
572     }
573 
574     s->blkshift = lbaf->ds;
575 out:
576     qemu_vfio_dma_unmap(s->vfio, resp);
577     qemu_vfree(resp);
578 }
579 
580 static bool nvme_poll_queues(BDRVNVMeState *s)
581 {
582     bool progress = false;
583     int i;
584 
585     for (i = 0; i < s->nr_queues; i++) {
586         NVMeQueuePair *q = s->queues[i];
587         const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
588         NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
589 
590         /*
591          * Do an early check for completions. q->lock isn't needed because
592          * nvme_process_completion() only runs in the event loop thread and
593          * cannot race with itself.
594          */
595         if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
596             continue;
597         }
598 
599         qemu_mutex_lock(&q->lock);
600         while (nvme_process_completion(q)) {
601             /* Keep polling */
602             progress = true;
603         }
604         qemu_mutex_unlock(&q->lock);
605     }
606     return progress;
607 }
608 
609 static void nvme_handle_event(EventNotifier *n)
610 {
611     BDRVNVMeState *s = container_of(n, BDRVNVMeState, irq_notifier);
612 
613     trace_nvme_handle_event(s);
614     event_notifier_test_and_clear(n);
615     nvme_poll_queues(s);
616 }
617 
618 static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
619 {
620     BDRVNVMeState *s = bs->opaque;
621     int n = s->nr_queues;
622     NVMeQueuePair *q;
623     NvmeCmd cmd;
624     int queue_size = NVME_QUEUE_SIZE;
625 
626     q = nvme_create_queue_pair(bs, n, queue_size, errp);
627     if (!q) {
628         return false;
629     }
630     cmd = (NvmeCmd) {
631         .opcode = NVME_ADM_CMD_CREATE_CQ,
632         .prp1 = cpu_to_le64(q->cq.iova),
633         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
634         .cdw11 = cpu_to_le32(0x3),
635     };
636     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
637         error_setg(errp, "Failed to create io queue [%d]", n);
638         nvme_free_queue_pair(q);
639         return false;
640     }
641     cmd = (NvmeCmd) {
642         .opcode = NVME_ADM_CMD_CREATE_SQ,
643         .prp1 = cpu_to_le64(q->sq.iova),
644         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
645         .cdw11 = cpu_to_le32(0x1 | (n << 16)),
646     };
647     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
648         error_setg(errp, "Failed to create io queue [%d]", n);
649         nvme_free_queue_pair(q);
650         return false;
651     }
652     s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
653     s->queues[n] = q;
654     s->nr_queues++;
655     return true;
656 }
657 
658 static bool nvme_poll_cb(void *opaque)
659 {
660     EventNotifier *e = opaque;
661     BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier);
662 
663     trace_nvme_poll_cb(s);
664     return nvme_poll_queues(s);
665 }
666 
667 static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
668                      Error **errp)
669 {
670     BDRVNVMeState *s = bs->opaque;
671     int ret;
672     uint64_t cap;
673     uint64_t timeout_ms;
674     uint64_t deadline, now;
675     Error *local_err = NULL;
676 
677     qemu_co_mutex_init(&s->dma_map_lock);
678     qemu_co_queue_init(&s->dma_flush_queue);
679     s->device = g_strdup(device);
680     s->nsid = namespace;
681     s->aio_context = bdrv_get_aio_context(bs);
682     ret = event_notifier_init(&s->irq_notifier, 0);
683     if (ret) {
684         error_setg(errp, "Failed to init event notifier");
685         return ret;
686     }
687 
688     s->vfio = qemu_vfio_open_pci(device, errp);
689     if (!s->vfio) {
690         ret = -EINVAL;
691         goto out;
692     }
693 
694     s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, NVME_BAR_SIZE, errp);
695     if (!s->regs) {
696         ret = -EINVAL;
697         goto out;
698     }
699 
700     /* Perform initialize sequence as described in NVMe spec "7.6.1
701      * Initialization". */
702 
703     cap = le64_to_cpu(s->regs->cap);
704     if (!(cap & (1ULL << 37))) {
705         error_setg(errp, "Device doesn't support NVMe command set");
706         ret = -EINVAL;
707         goto out;
708     }
709 
710     s->page_size = MAX(4096, 1 << (12 + ((cap >> 48) & 0xF)));
711     s->doorbell_scale = (4 << (((cap >> 32) & 0xF))) / sizeof(uint32_t);
712     bs->bl.opt_mem_alignment = s->page_size;
713     timeout_ms = MIN(500 * ((cap >> 24) & 0xFF), 30000);
714 
715     /* Reset device to get a clean state. */
716     s->regs->cc = cpu_to_le32(le32_to_cpu(s->regs->cc) & 0xFE);
717     /* Wait for CSTS.RDY = 0. */
718     deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * 1000000ULL;
719     while (le32_to_cpu(s->regs->csts) & 0x1) {
720         if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
721             error_setg(errp, "Timeout while waiting for device to reset (%"
722                              PRId64 " ms)",
723                        timeout_ms);
724             ret = -ETIMEDOUT;
725             goto out;
726         }
727     }
728 
729     /* Set up admin queue. */
730     s->queues = g_new(NVMeQueuePair *, 1);
731     s->queues[0] = nvme_create_queue_pair(bs, 0, NVME_QUEUE_SIZE, errp);
732     if (!s->queues[0]) {
733         ret = -EINVAL;
734         goto out;
735     }
736     s->nr_queues = 1;
737     QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
738     s->regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << 16) | NVME_QUEUE_SIZE);
739     s->regs->asq = cpu_to_le64(s->queues[0]->sq.iova);
740     s->regs->acq = cpu_to_le64(s->queues[0]->cq.iova);
741 
742     /* After setting up all control registers we can enable device now. */
743     s->regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) |
744                               (ctz32(NVME_SQ_ENTRY_BYTES) << 16) |
745                               0x1);
746     /* Wait for CSTS.RDY = 1. */
747     now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
748     deadline = now + timeout_ms * 1000000;
749     while (!(le32_to_cpu(s->regs->csts) & 0x1)) {
750         if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
751             error_setg(errp, "Timeout while waiting for device to start (%"
752                              PRId64 " ms)",
753                        timeout_ms);
754             ret = -ETIMEDOUT;
755             goto out;
756         }
757     }
758 
759     ret = qemu_vfio_pci_init_irq(s->vfio, &s->irq_notifier,
760                                  VFIO_PCI_MSIX_IRQ_INDEX, errp);
761     if (ret) {
762         goto out;
763     }
764     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
765                            false, nvme_handle_event, nvme_poll_cb);
766 
767     nvme_identify(bs, namespace, &local_err);
768     if (local_err) {
769         error_propagate(errp, local_err);
770         ret = -EIO;
771         goto out;
772     }
773 
774     /* Set up command queues. */
775     if (!nvme_add_io_queue(bs, errp)) {
776         ret = -EIO;
777     }
778 out:
779     /* Cleaning up is done in nvme_file_open() upon error. */
780     return ret;
781 }
782 
783 /* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example:
784  *
785  *     nvme://0000:44:00.0/1
786  *
787  * where the "nvme://" is a fixed form of the protocol prefix, the middle part
788  * is the PCI address, and the last part is the namespace number starting from
789  * 1 according to the NVMe spec. */
790 static void nvme_parse_filename(const char *filename, QDict *options,
791                                 Error **errp)
792 {
793     int pref = strlen("nvme://");
794 
795     if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) {
796         const char *tmp = filename + pref;
797         char *device;
798         const char *namespace;
799         unsigned long ns;
800         const char *slash = strchr(tmp, '/');
801         if (!slash) {
802             qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp);
803             return;
804         }
805         device = g_strndup(tmp, slash - tmp);
806         qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device);
807         g_free(device);
808         namespace = slash + 1;
809         if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) {
810             error_setg(errp, "Invalid namespace '%s', positive number expected",
811                        namespace);
812             return;
813         }
814         qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE,
815                       *namespace ? namespace : "1");
816     }
817 }
818 
819 static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
820                                            Error **errp)
821 {
822     int ret;
823     BDRVNVMeState *s = bs->opaque;
824     NvmeCmd cmd = {
825         .opcode = NVME_ADM_CMD_SET_FEATURES,
826         .nsid = cpu_to_le32(s->nsid),
827         .cdw10 = cpu_to_le32(0x06),
828         .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
829     };
830 
831     ret = nvme_cmd_sync(bs, s->queues[0], &cmd);
832     if (ret) {
833         error_setg(errp, "Failed to configure NVMe write cache");
834     }
835     return ret;
836 }
837 
838 static void nvme_close(BlockDriverState *bs)
839 {
840     int i;
841     BDRVNVMeState *s = bs->opaque;
842 
843     for (i = 0; i < s->nr_queues; ++i) {
844         nvme_free_queue_pair(s->queues[i]);
845     }
846     g_free(s->queues);
847     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
848                            false, NULL, NULL);
849     event_notifier_cleanup(&s->irq_notifier);
850     qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE);
851     qemu_vfio_close(s->vfio);
852 
853     g_free(s->device);
854 }
855 
856 static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
857                           Error **errp)
858 {
859     const char *device;
860     QemuOpts *opts;
861     int namespace;
862     int ret;
863     BDRVNVMeState *s = bs->opaque;
864 
865     bs->supported_write_flags = BDRV_REQ_FUA;
866 
867     opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
868     qemu_opts_absorb_qdict(opts, options, &error_abort);
869     device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
870     if (!device) {
871         error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required");
872         qemu_opts_del(opts);
873         return -EINVAL;
874     }
875 
876     namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1);
877     ret = nvme_init(bs, device, namespace, errp);
878     qemu_opts_del(opts);
879     if (ret) {
880         goto fail;
881     }
882     if (flags & BDRV_O_NOCACHE) {
883         if (!s->write_cache_supported) {
884             error_setg(errp,
885                        "NVMe controller doesn't support write cache configuration");
886             ret = -EINVAL;
887         } else {
888             ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE),
889                                                   errp);
890         }
891         if (ret) {
892             goto fail;
893         }
894     }
895     return 0;
896 fail:
897     nvme_close(bs);
898     return ret;
899 }
900 
901 static int64_t nvme_getlength(BlockDriverState *bs)
902 {
903     BDRVNVMeState *s = bs->opaque;
904     return s->nsze << s->blkshift;
905 }
906 
907 static uint32_t nvme_get_blocksize(BlockDriverState *bs)
908 {
909     BDRVNVMeState *s = bs->opaque;
910     assert(s->blkshift >= BDRV_SECTOR_BITS && s->blkshift <= 12);
911     return UINT32_C(1) << s->blkshift;
912 }
913 
914 static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
915 {
916     uint32_t blocksize = nvme_get_blocksize(bs);
917     bsz->phys = blocksize;
918     bsz->log = blocksize;
919     return 0;
920 }
921 
922 /* Called with s->dma_map_lock */
923 static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs,
924                                             QEMUIOVector *qiov)
925 {
926     int r = 0;
927     BDRVNVMeState *s = bs->opaque;
928 
929     s->dma_map_count -= qiov->size;
930     if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) {
931         r = qemu_vfio_dma_reset_temporary(s->vfio);
932         if (!r) {
933             qemu_co_queue_restart_all(&s->dma_flush_queue);
934         }
935     }
936     return r;
937 }
938 
/*
 * Called with s->dma_map_lock
 *
 * DMA-map every element of @qiov and describe the resulting pages to the
 * device through the PRP fields of @cmd, using the PRP list page of @req to
 * collect one entry per page.
 *
 * @qiov must be non-empty, a multiple of the page size, and small enough for
 * its page entries to fit into a single PRP list page (all asserted).
 *
 * Returns 0 on success or the error returned by the mapping helpers.
 */
static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
                                          NVMeRequest *req, QEMUIOVector *qiov)
{
    BDRVNVMeState *s = bs->opaque;
    uint64_t *pagelist = req->prp_list_page;
    int i, j, r;
    int entries = 0;

    assert(qiov->size);
    assert(QEMU_IS_ALIGNED(qiov->size, s->page_size));
    assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t));
    for (i = 0; i < qiov->niov; ++i) {
        /* Each element gets exactly one retry after an -ENOMEM failure. */
        bool retry = true;
        uint64_t iova;
try_map:
        r = qemu_vfio_dma_map(s->vfio,
                              qiov->iov[i].iov_base,
                              qiov->iov[i].iov_len,
                              true, &iova);
        if (r == -ENOMEM && retry) {
            retry = false;
            trace_nvme_dma_flush_queue_wait(s);
            if (s->dma_map_count) {
                /* Other requests hold mappings: wait until they are flushed. */
                trace_nvme_dma_map_flush(s);
                qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock);
            } else {
                /* Nothing else is mapped: drop the temporary mappings now. */
                r = qemu_vfio_dma_reset_temporary(s->vfio);
                if (r) {
                    goto fail;
                }
            }
            goto try_map;
        }
        if (r) {
            goto fail;
        }

        /* One little-endian page list entry per page of this element. */
        for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) {
            pagelist[entries++] = cpu_to_le64(iova + j * s->page_size);
        }
        trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base,
                                    qiov->iov[i].iov_len / s->page_size);
    }

    s->dma_map_count += qiov->size;

    assert(entries <= s->page_size / sizeof(uint64_t));
    switch (entries) {
    case 0:
        abort();
    case 1:
        cmd->prp1 = pagelist[0];
        cmd->prp2 = 0;
        break;
    case 2:
        cmd->prp1 = pagelist[0];
        cmd->prp2 = pagelist[1];
        break;
    default:
        /*
         * The first page goes into prp1; prp2 points at the second entry of
         * the list page, where the remaining pages start.
         */
        cmd->prp1 = pagelist[0];
        cmd->prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
        break;
    }
    trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
    for (i = 0; i < entries; ++i) {
        trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]);
    }
    return 0;
fail:
    /* No need to unmap [0 - i) iovs even if we've failed, since we don't
     * increment s->dma_map_count. This is okay for fixed mapping memory areas
     * because they are already mapped before calling this function; for
     * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by
     * calling qemu_vfio_dma_reset_temporary when necessary. */
    return r;
}
1016 
1017 typedef struct {
1018     Coroutine *co;
1019     int ret;
1020     AioContext *ctx;
1021 } NVMeCoData;
1022 
1023 static void nvme_rw_cb_bh(void *opaque)
1024 {
1025     NVMeCoData *data = opaque;
1026     qemu_coroutine_enter(data->co);
1027 }
1028 
1029 static void nvme_rw_cb(void *opaque, int ret)
1030 {
1031     NVMeCoData *data = opaque;
1032     data->ret = ret;
1033     if (!data->co) {
1034         /* The rw coroutine hasn't yielded, don't try to enter. */
1035         return;
1036     }
1037     replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
1038 }
1039 
1040 static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
1041                                             uint64_t offset, uint64_t bytes,
1042                                             QEMUIOVector *qiov,
1043                                             bool is_write,
1044                                             int flags)
1045 {
1046     int r;
1047     BDRVNVMeState *s = bs->opaque;
1048     NVMeQueuePair *ioq = s->queues[1];
1049     NVMeRequest *req;
1050 
1051     uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) |
1052                        (flags & BDRV_REQ_FUA ? 1 << 30 : 0);
1053     NvmeCmd cmd = {
1054         .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ,
1055         .nsid = cpu_to_le32(s->nsid),
1056         .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
1057         .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
1058         .cdw12 = cpu_to_le32(cdw12),
1059     };
1060     NVMeCoData data = {
1061         .ctx = bdrv_get_aio_context(bs),
1062         .ret = -EINPROGRESS,
1063     };
1064 
1065     trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
1066     assert(s->nr_queues > 1);
1067     req = nvme_get_free_req(ioq);
1068     assert(req);
1069 
1070     qemu_co_mutex_lock(&s->dma_map_lock);
1071     r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
1072     qemu_co_mutex_unlock(&s->dma_map_lock);
1073     if (r) {
1074         nvme_put_free_req_and_wake(ioq, req);
1075         return r;
1076     }
1077     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1078 
1079     data.co = qemu_coroutine_self();
1080     while (data.ret == -EINPROGRESS) {
1081         qemu_coroutine_yield();
1082     }
1083 
1084     qemu_co_mutex_lock(&s->dma_map_lock);
1085     r = nvme_cmd_unmap_qiov(bs, qiov);
1086     qemu_co_mutex_unlock(&s->dma_map_lock);
1087     if (r) {
1088         return r;
1089     }
1090 
1091     trace_nvme_rw_done(s, is_write, offset, bytes, data.ret);
1092     return data.ret;
1093 }
1094 
1095 static inline bool nvme_qiov_aligned(BlockDriverState *bs,
1096                                      const QEMUIOVector *qiov)
1097 {
1098     int i;
1099     BDRVNVMeState *s = bs->opaque;
1100 
1101     for (i = 0; i < qiov->niov; ++i) {
1102         if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) ||
1103             !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) {
1104             trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
1105                                       qiov->iov[i].iov_len, s->page_size);
1106             return false;
1107         }
1108     }
1109     return true;
1110 }
1111 
1112 static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
1113                        QEMUIOVector *qiov, bool is_write, int flags)
1114 {
1115     BDRVNVMeState *s = bs->opaque;
1116     int r;
1117     uint8_t *buf = NULL;
1118     QEMUIOVector local_qiov;
1119 
1120     assert(QEMU_IS_ALIGNED(offset, s->page_size));
1121     assert(QEMU_IS_ALIGNED(bytes, s->page_size));
1122     assert(bytes <= s->max_transfer);
1123     if (nvme_qiov_aligned(bs, qiov)) {
1124         return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
1125     }
1126     trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
1127     buf = qemu_try_blockalign(bs, bytes);
1128 
1129     if (!buf) {
1130         return -ENOMEM;
1131     }
1132     qemu_iovec_init(&local_qiov, 1);
1133     if (is_write) {
1134         qemu_iovec_to_buf(qiov, 0, buf, bytes);
1135     }
1136     qemu_iovec_add(&local_qiov, buf, bytes);
1137     r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags);
1138     qemu_iovec_destroy(&local_qiov);
1139     if (!r && !is_write) {
1140         qemu_iovec_from_buf(qiov, 0, buf, bytes);
1141     }
1142     qemu_vfree(buf);
1143     return r;
1144 }
1145 
1146 static coroutine_fn int nvme_co_preadv(BlockDriverState *bs,
1147                                        uint64_t offset, uint64_t bytes,
1148                                        QEMUIOVector *qiov, int flags)
1149 {
1150     return nvme_co_prw(bs, offset, bytes, qiov, false, flags);
1151 }
1152 
1153 static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs,
1154                                         uint64_t offset, uint64_t bytes,
1155                                         QEMUIOVector *qiov, int flags)
1156 {
1157     return nvme_co_prw(bs, offset, bytes, qiov, true, flags);
1158 }
1159 
1160 static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
1161 {
1162     BDRVNVMeState *s = bs->opaque;
1163     NVMeQueuePair *ioq = s->queues[1];
1164     NVMeRequest *req;
1165     NvmeCmd cmd = {
1166         .opcode = NVME_CMD_FLUSH,
1167         .nsid = cpu_to_le32(s->nsid),
1168     };
1169     NVMeCoData data = {
1170         .ctx = bdrv_get_aio_context(bs),
1171         .ret = -EINPROGRESS,
1172     };
1173 
1174     assert(s->nr_queues > 1);
1175     req = nvme_get_free_req(ioq);
1176     assert(req);
1177     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1178 
1179     data.co = qemu_coroutine_self();
1180     if (data.ret == -EINPROGRESS) {
1181         qemu_coroutine_yield();
1182     }
1183 
1184     return data.ret;
1185 }
1186 
1187 
1188 static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
1189                                               int64_t offset,
1190                                               int bytes,
1191                                               BdrvRequestFlags flags)
1192 {
1193     BDRVNVMeState *s = bs->opaque;
1194     NVMeQueuePair *ioq = s->queues[1];
1195     NVMeRequest *req;
1196 
1197     uint32_t cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
1198 
1199     if (!s->supports_write_zeroes) {
1200         return -ENOTSUP;
1201     }
1202 
1203     NvmeCmd cmd = {
1204         .opcode = NVME_CMD_WRITE_ZEROS,
1205         .nsid = cpu_to_le32(s->nsid),
1206         .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
1207         .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
1208     };
1209 
1210     NVMeCoData data = {
1211         .ctx = bdrv_get_aio_context(bs),
1212         .ret = -EINPROGRESS,
1213     };
1214 
1215     if (flags & BDRV_REQ_MAY_UNMAP) {
1216         cdw12 |= (1 << 25);
1217     }
1218 
1219     if (flags & BDRV_REQ_FUA) {
1220         cdw12 |= (1 << 30);
1221     }
1222 
1223     cmd.cdw12 = cpu_to_le32(cdw12);
1224 
1225     trace_nvme_write_zeroes(s, offset, bytes, flags);
1226     assert(s->nr_queues > 1);
1227     req = nvme_get_free_req(ioq);
1228     assert(req);
1229 
1230     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1231 
1232     data.co = qemu_coroutine_self();
1233     while (data.ret == -EINPROGRESS) {
1234         qemu_coroutine_yield();
1235     }
1236 
1237     trace_nvme_rw_done(s, true, offset, bytes, data.ret);
1238     return data.ret;
1239 }
1240 
1241 
1242 static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
1243                                          int64_t offset,
1244                                          int bytes)
1245 {
1246     BDRVNVMeState *s = bs->opaque;
1247     NVMeQueuePair *ioq = s->queues[1];
1248     NVMeRequest *req;
1249     NvmeDsmRange *buf;
1250     QEMUIOVector local_qiov;
1251     int ret;
1252 
1253     NvmeCmd cmd = {
1254         .opcode = NVME_CMD_DSM,
1255         .nsid = cpu_to_le32(s->nsid),
1256         .cdw10 = cpu_to_le32(0), /*number of ranges - 0 based*/
1257         .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/
1258     };
1259 
1260     NVMeCoData data = {
1261         .ctx = bdrv_get_aio_context(bs),
1262         .ret = -EINPROGRESS,
1263     };
1264 
1265     if (!s->supports_discard) {
1266         return -ENOTSUP;
1267     }
1268 
1269     assert(s->nr_queues > 1);
1270 
1271     buf = qemu_try_blockalign0(bs, s->page_size);
1272     if (!buf) {
1273         return -ENOMEM;
1274     }
1275 
1276     buf->nlb = cpu_to_le32(bytes >> s->blkshift);
1277     buf->slba = cpu_to_le64(offset >> s->blkshift);
1278     buf->cattr = 0;
1279 
1280     qemu_iovec_init(&local_qiov, 1);
1281     qemu_iovec_add(&local_qiov, buf, 4096);
1282 
1283     req = nvme_get_free_req(ioq);
1284     assert(req);
1285 
1286     qemu_co_mutex_lock(&s->dma_map_lock);
1287     ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
1288     qemu_co_mutex_unlock(&s->dma_map_lock);
1289 
1290     if (ret) {
1291         nvme_put_free_req_and_wake(ioq, req);
1292         goto out;
1293     }
1294 
1295     trace_nvme_dsm(s, offset, bytes);
1296 
1297     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1298 
1299     data.co = qemu_coroutine_self();
1300     while (data.ret == -EINPROGRESS) {
1301         qemu_coroutine_yield();
1302     }
1303 
1304     qemu_co_mutex_lock(&s->dma_map_lock);
1305     ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
1306     qemu_co_mutex_unlock(&s->dma_map_lock);
1307 
1308     if (ret) {
1309         goto out;
1310     }
1311 
1312     ret = data.ret;
1313     trace_nvme_dsm_done(s, offset, bytes, ret);
1314 out:
1315     qemu_iovec_destroy(&local_qiov);
1316     qemu_vfree(buf);
1317     return ret;
1318 
1319 }
1320 
1321 
1322 static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
1323                                BlockReopenQueue *queue, Error **errp)
1324 {
1325     return 0;
1326 }
1327 
1328 static void nvme_refresh_filename(BlockDriverState *bs)
1329 {
1330     BDRVNVMeState *s = bs->opaque;
1331 
1332     snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i",
1333              s->device, s->nsid);
1334 }
1335 
1336 static void nvme_refresh_limits(BlockDriverState *bs, Error **errp)
1337 {
1338     BDRVNVMeState *s = bs->opaque;
1339 
1340     bs->bl.opt_mem_alignment = s->page_size;
1341     bs->bl.request_alignment = s->page_size;
1342     bs->bl.max_transfer = s->max_transfer;
1343 }
1344 
1345 static void nvme_detach_aio_context(BlockDriverState *bs)
1346 {
1347     BDRVNVMeState *s = bs->opaque;
1348 
1349     for (int i = 0; i < s->nr_queues; i++) {
1350         NVMeQueuePair *q = s->queues[i];
1351 
1352         qemu_bh_delete(q->completion_bh);
1353         q->completion_bh = NULL;
1354     }
1355 
1356     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
1357                            false, NULL, NULL);
1358 }
1359 
1360 static void nvme_attach_aio_context(BlockDriverState *bs,
1361                                     AioContext *new_context)
1362 {
1363     BDRVNVMeState *s = bs->opaque;
1364 
1365     s->aio_context = new_context;
1366     aio_set_event_notifier(new_context, &s->irq_notifier,
1367                            false, nvme_handle_event, nvme_poll_cb);
1368 
1369     for (int i = 0; i < s->nr_queues; i++) {
1370         NVMeQueuePair *q = s->queues[i];
1371 
1372         q->completion_bh =
1373             aio_bh_new(new_context, nvme_process_completion_bh, q);
1374     }
1375 }
1376 
1377 static void nvme_aio_plug(BlockDriverState *bs)
1378 {
1379     BDRVNVMeState *s = bs->opaque;
1380     assert(!s->plugged);
1381     s->plugged = true;
1382 }
1383 
1384 static void nvme_aio_unplug(BlockDriverState *bs)
1385 {
1386     int i;
1387     BDRVNVMeState *s = bs->opaque;
1388     assert(s->plugged);
1389     s->plugged = false;
1390     for (i = 1; i < s->nr_queues; i++) {
1391         NVMeQueuePair *q = s->queues[i];
1392         qemu_mutex_lock(&q->lock);
1393         nvme_kick(q);
1394         nvme_process_completion(q);
1395         qemu_mutex_unlock(&q->lock);
1396     }
1397 }
1398 
1399 static void nvme_register_buf(BlockDriverState *bs, void *host, size_t size)
1400 {
1401     int ret;
1402     BDRVNVMeState *s = bs->opaque;
1403 
1404     ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL);
1405     if (ret) {
1406         /* FIXME: we may run out of IOVA addresses after repeated
1407          * bdrv_register_buf/bdrv_unregister_buf, because nvme_vfio_dma_unmap
1408          * doesn't reclaim addresses for fixed mappings. */
1409         error_report("nvme_register_buf failed: %s", strerror(-ret));
1410     }
1411 }
1412 
1413 static void nvme_unregister_buf(BlockDriverState *bs, void *host)
1414 {
1415     BDRVNVMeState *s = bs->opaque;
1416 
1417     qemu_vfio_dma_unmap(s->vfio, host);
1418 }
1419 
1420 static const char *const nvme_strong_runtime_opts[] = {
1421     NVME_BLOCK_OPT_DEVICE,
1422     NVME_BLOCK_OPT_NAMESPACE,
1423 
1424     NULL
1425 };
1426 
1427 static BlockDriver bdrv_nvme = {
1428     .format_name              = "nvme",
1429     .protocol_name            = "nvme",
1430     .instance_size            = sizeof(BDRVNVMeState),
1431 
1432     .bdrv_co_create_opts      = bdrv_co_create_opts_simple,
1433     .create_opts              = &bdrv_create_opts_simple,
1434 
1435     .bdrv_parse_filename      = nvme_parse_filename,
1436     .bdrv_file_open           = nvme_file_open,
1437     .bdrv_close               = nvme_close,
1438     .bdrv_getlength           = nvme_getlength,
1439     .bdrv_probe_blocksizes    = nvme_probe_blocksizes,
1440 
1441     .bdrv_co_preadv           = nvme_co_preadv,
1442     .bdrv_co_pwritev          = nvme_co_pwritev,
1443 
1444     .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
1445     .bdrv_co_pdiscard         = nvme_co_pdiscard,
1446 
1447     .bdrv_co_flush_to_disk    = nvme_co_flush,
1448     .bdrv_reopen_prepare      = nvme_reopen_prepare,
1449 
1450     .bdrv_refresh_filename    = nvme_refresh_filename,
1451     .bdrv_refresh_limits      = nvme_refresh_limits,
1452     .strong_runtime_opts      = nvme_strong_runtime_opts,
1453 
1454     .bdrv_detach_aio_context  = nvme_detach_aio_context,
1455     .bdrv_attach_aio_context  = nvme_attach_aio_context,
1456 
1457     .bdrv_io_plug             = nvme_aio_plug,
1458     .bdrv_io_unplug           = nvme_aio_unplug,
1459 
1460     .bdrv_register_buf        = nvme_register_buf,
1461     .bdrv_unregister_buf      = nvme_unregister_buf,
1462 };
1463 
1464 static void bdrv_nvme_init(void)
1465 {
1466     bdrv_register(&bdrv_nvme);
1467 }
1468 
1469 block_init(bdrv_nvme_init);
1470