xref: /openbmc/qemu/block/nvme.c (revision e6546342)
1 /*
2  * NVMe block driver based on vfio
3  *
4  * Copyright 2016 - 2018 Red Hat, Inc.
5  *
6  * Authors:
7  *   Fam Zheng <famz@redhat.com>
8  *   Paolo Bonzini <pbonzini@redhat.com>
9  *
10  * This work is licensed under the terms of the GNU GPL, version 2 or later.
11  * See the COPYING file in the top-level directory.
12  */
13 
14 #include "qemu/osdep.h"
15 #include <linux/vfio.h>
16 #include "qapi/error.h"
17 #include "qapi/qmp/qdict.h"
18 #include "qapi/qmp/qstring.h"
19 #include "qemu/error-report.h"
20 #include "qemu/main-loop.h"
21 #include "qemu/module.h"
22 #include "qemu/cutils.h"
23 #include "qemu/option.h"
24 #include "qemu/vfio-helpers.h"
25 #include "block/block_int.h"
26 #include "sysemu/replay.h"
27 #include "trace.h"
28 
29 #include "block/nvme.h"
30 
31 #define NVME_SQ_ENTRY_BYTES 64
32 #define NVME_CQ_ENTRY_BYTES 16
33 #define NVME_QUEUE_SIZE 128
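/*
 * Size of the BAR0 mapping: the controller register block (NvmeBar) plus the
 * doorbell registers, which the NVMe spec places at offset 0x1000.
 */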
34 #define NVME_BAR_SIZE 8192
35 
36 /*
37  * We have to leave one slot empty so that the full-ring state, where
38  * head == tail + 1 (modulo the queue size), stays distinguishable from empty.
39  */
40 #define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
41 
42 typedef struct BDRVNVMeState BDRVNVMeState;
43 
44 typedef struct {
45     int32_t  head, tail;
46     uint8_t  *queue;
47     uint64_t iova;
48     /* Hardware MMIO register */
49     volatile uint32_t *doorbell;
50 } NVMeQueue;
51 
52 typedef struct {
53     BlockCompletionFunc *cb;
54     void *opaque;
55     int cid;
56     void *prp_list_page;
57     uint64_t prp_list_iova;
58     int free_req_next; /* q->reqs[] index of next free req */
59 } NVMeRequest;
60 
61 typedef struct {
62     QemuMutex   lock;
63 
64     /* Read from I/O code path, initialized under BQL */
65     BDRVNVMeState   *s;
66     int             index;
67 
68     /* Fields protected by BQL */
69     uint8_t     *prp_list_pages;
70 
71     /* Fields protected by @lock */
72     CoQueue     free_req_queue;
73     NVMeQueue   sq, cq;
74     int         cq_phase;
75     int         free_req_head;
76     NVMeRequest reqs[NVME_NUM_REQS];
77     int         need_kick;
78     int         inflight;
79 
80     /* Thread-safe, no lock necessary */
81     QEMUBH      *completion_bh;
82 } NVMeQueuePair;
83 
84 /* Memory mapped registers */
85 typedef volatile struct {
86     NvmeBar ctrl;
87     struct {
88         uint32_t sq_tail;
89         uint32_t cq_head;
90     } doorbells[];
91 } NVMeRegs;
92 
93 #define INDEX_ADMIN     0
94 #define INDEX_IO(n)     (1 + (n))
95 
96 /* This driver shares a single MSIX IRQ for the admin and I/O queues */
97 enum {
98     MSIX_SHARED_IRQ_IDX = 0,
99     MSIX_IRQ_COUNT = 1
100 };
101 
102 struct BDRVNVMeState {
103     AioContext *aio_context;
104     QEMUVFIOState *vfio;
105     NVMeRegs *regs;
106     /*
107      * The submission/completion queue pairs.
108      * [0]: admin queue; [1..]: I/O queues.
109      */
110     NVMeQueuePair **queues;
111     int nr_queues;
112     size_t page_size;
113     /* Doorbell stride, expressed in uint32_t units (derived from CAP.DSTRD). */
114     size_t doorbell_scale;
115     bool write_cache_supported;
116     EventNotifier irq_notifier[MSIX_IRQ_COUNT];
117 
118     uint64_t nsze; /* Namespace size reported by identify command */
119     int nsid;      /* The namespace id to read/write data. */
120     int blkshift;
121 
122     uint64_t max_transfer;
123     bool plugged;
124 
125     bool supports_write_zeroes;
126     bool supports_discard;
127 
128     CoMutex dma_map_lock;
129     CoQueue dma_flush_queue;
130 
131     /* Total size of mapped qiov, accessed under dma_map_lock */
132     int dma_map_count;
133 
134     /* PCI address (required for nvme_refresh_filename()) */
135     char *device;
136 };
137 
138 #define NVME_BLOCK_OPT_DEVICE "device"
139 #define NVME_BLOCK_OPT_NAMESPACE "namespace"
140 
141 static void nvme_process_completion_bh(void *opaque);
142 
143 static QemuOptsList runtime_opts = {
144     .name = "nvme",
145     .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
146     .desc = {
147         {
148             .name = NVME_BLOCK_OPT_DEVICE,
149             .type = QEMU_OPT_STRING,
150             .help = "NVMe PCI device address",
151         },
152         {
153             .name = NVME_BLOCK_OPT_NAMESPACE,
154             .type = QEMU_OPT_NUMBER,
155             .help = "NVMe namespace",
156         },
157         { /* end of list */ }
158     },
159 };
160 
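/*
 * Allocate one page-aligned, zeroed ring buffer for a submission or
 * completion queue and map it for DMA; q->iova receives the bus address
 * that is later programmed into the controller (ASQ/ACQ or Create SQ/CQ).
 */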
161 static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
162                             int nentries, int entry_bytes, Error **errp)
163 {
164     size_t bytes;
165     int r;
166 
167     bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
168     q->head = q->tail = 0;
169     q->queue = qemu_try_memalign(s->page_size, bytes);
170     if (!q->queue) {
171         error_setg(errp, "Cannot allocate queue");
172         return;
173     }
174     memset(q->queue, 0, bytes);
175     r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova);
176     if (r) {
177         error_setg(errp, "Cannot map queue");
178     }
179 }
180 
181 static void nvme_free_queue_pair(NVMeQueuePair *q)
182 {
183     if (q->completion_bh) {
184         qemu_bh_delete(q->completion_bh);
185     }
186     qemu_vfree(q->prp_list_pages);
187     qemu_vfree(q->sq.queue);
188     qemu_vfree(q->cq.queue);
189     qemu_mutex_destroy(&q->lock);
190     g_free(q);
191 }
192 
193 static void nvme_free_req_queue_cb(void *opaque)
194 {
195     NVMeQueuePair *q = opaque;
196 
197     qemu_mutex_lock(&q->lock);
198     while (qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
199         /* Retry all pending requests */
200     }
201     qemu_mutex_unlock(&q->lock);
202 }
203 
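/*
 * Create one queue pair together with its request pool. Each of the
 * NVME_NUM_REQS requests owns a page used as its PRP list; free requests
 * are threaded into a singly linked list through free_req_next. CIDs are
 * assigned as array index + 1 so that CID 0 is never used, which lets
 * nvme_process_completion() map a completion back to reqs[cid - 1].
 * With the common CAP.DSTRD == 0, doorbell_scale is 1 and queue pair idx
 * uses the SQ tail doorbell at BAR offset 0x1000 + idx * 8, with the CQ
 * head doorbell 4 bytes after it.
 */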
204 static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
205                                              AioContext *aio_context,
206                                              int idx, int size,
207                                              Error **errp)
208 {
209     int i, r;
210     Error *local_err = NULL;
211     NVMeQueuePair *q;
212     uint64_t prp_list_iova;
213 
214     q = g_try_new0(NVMeQueuePair, 1);
215     if (!q) {
216         return NULL;
217     }
218     q->prp_list_pages = qemu_try_memalign(s->page_size,
219                                           s->page_size * NVME_NUM_REQS);
220     if (!q->prp_list_pages) {
221         goto fail;
222     }
223     memset(q->prp_list_pages, 0, s->page_size * NVME_NUM_REQS);
224     qemu_mutex_init(&q->lock);
225     q->s = s;
226     q->index = idx;
227     qemu_co_queue_init(&q->free_req_queue);
228     q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q);
229     r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
230                           s->page_size * NVME_NUM_REQS,
231                           false, &prp_list_iova);
232     if (r) {
233         goto fail;
234     }
235     q->free_req_head = -1;
236     for (i = 0; i < NVME_NUM_REQS; i++) {
237         NVMeRequest *req = &q->reqs[i];
238         req->cid = i + 1;
239         req->free_req_next = q->free_req_head;
240         q->free_req_head = i;
241         req->prp_list_page = q->prp_list_pages + i * s->page_size;
242         req->prp_list_iova = prp_list_iova + i * s->page_size;
243     }
244 
245     nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
246     if (local_err) {
247         error_propagate(errp, local_err);
248         goto fail;
249     }
250     q->sq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].sq_tail;
251 
252     nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
253     if (local_err) {
254         error_propagate(errp, local_err);
255         goto fail;
256     }
257     q->cq.doorbell = &s->regs->doorbells[idx * s->doorbell_scale].cq_head;
258 
259     return q;
260 fail:
261     nvme_free_queue_pair(q);
262     return NULL;
263 }
264 
265 /* With q->lock */
266 static void nvme_kick(NVMeQueuePair *q)
267 {
268     BDRVNVMeState *s = q->s;
269 
270     if (s->plugged || !q->need_kick) {
271         return;
272     }
273     trace_nvme_kick(s, q->index);
274     assert(!(q->sq.tail & 0xFF00));
275     /* Fence the write to the submission queue entry before notifying the device. */
276     smp_wmb();
277     *q->sq.doorbell = cpu_to_le32(q->sq.tail);
278     q->inflight += q->need_kick;
279     q->need_kick = 0;
280 }
281 
282 /* Find a free request element if any; otherwise:
283  * a) if in coroutine context, wait for one to become available;
284  * b) if not in coroutine context, return NULL.
285  */
286 static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
287 {
288     NVMeRequest *req;
289 
290     qemu_mutex_lock(&q->lock);
291 
292     while (q->free_req_head == -1) {
293         if (qemu_in_coroutine()) {
294             trace_nvme_free_req_queue_wait(q);
295             qemu_co_queue_wait(&q->free_req_queue, &q->lock);
296         } else {
297             qemu_mutex_unlock(&q->lock);
298             return NULL;
299         }
300     }
301 
302     req = &q->reqs[q->free_req_head];
303     q->free_req_head = req->free_req_next;
304     req->free_req_next = -1;
305 
306     qemu_mutex_unlock(&q->lock);
307     return req;
308 }
309 
310 /* With q->lock */
311 static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
312 {
313     req->free_req_next = q->free_req_head;
314     q->free_req_head = req - q->reqs;
315 }
316 
317 /* With q->lock */
318 static void nvme_wake_free_req_locked(NVMeQueuePair *q)
319 {
320     if (!qemu_co_queue_empty(&q->free_req_queue)) {
321         replay_bh_schedule_oneshot_event(q->s->aio_context,
322                 nvme_free_req_queue_cb, q);
323     }
324 }
325 
326 /* Insert a request in the freelist and wake waiters */
327 static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req)
328 {
329     qemu_mutex_lock(&q->lock);
330     nvme_put_free_req_locked(q, req);
331     nvme_wake_free_req_locked(q);
332     qemu_mutex_unlock(&q->lock);
333 }
334 
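/*
 * Translate the Status Code of a completion entry (bits 8:1 of the status
 * field, i.e. ignoring the phase tag and the Status Code Type) into a
 * negative errno value; anything unrecognized becomes -EIO.
 */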
335 static inline int nvme_translate_error(const NvmeCqe *c)
336 {
337     uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
338     if (status) {
339         trace_nvme_error(le32_to_cpu(c->result),
340                          le16_to_cpu(c->sq_head),
341                          le16_to_cpu(c->sq_id),
342                          le16_to_cpu(c->cid),
343                          status);
344     }
345     switch (status) {
346     case 0:
347         return 0;
348     case 1:
349         return -ENOSYS;
350     case 2:
351         return -EINVAL;
352     default:
353         return -EIO;
354     }
355 }
356 
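/*
 * Reap completion queue entries. An entry is new when its phase tag bit
 * differs from q->cq_phase, which flips every time cq.head wraps around
 * NVME_QUEUE_SIZE. For each entry the request is released and its callback
 * is invoked with q->lock temporarily dropped. Returns true if any entry
 * was processed.
 */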
357 /* With q->lock */
358 static bool nvme_process_completion(NVMeQueuePair *q)
359 {
360     BDRVNVMeState *s = q->s;
361     bool progress = false;
362     NVMeRequest *preq;
363     NVMeRequest req;
364     NvmeCqe *c;
365 
366     trace_nvme_process_completion(s, q->index, q->inflight);
367     if (s->plugged) {
368         trace_nvme_process_completion_queue_plugged(s, q->index);
369         return false;
370     }
371 
372     /*
373      * Support re-entrancy when a request cb() function invokes aio_poll().
374      * Pending completions must be visible to aio_poll() so that a cb()
375      * function can wait for the completion of another request.
376      *
377      * The aio_poll() loop will execute our BH and we'll resume completion
378      * processing there.
379      */
380     qemu_bh_schedule(q->completion_bh);
381 
382     assert(q->inflight >= 0);
383     while (q->inflight) {
384         int ret;
385         uint16_t cid;
386 
387         c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
388         if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
389             break;
390         }
391         ret = nvme_translate_error(c);
392         q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
393         if (!q->cq.head) {
394             q->cq_phase = !q->cq_phase;
395         }
396         cid = le16_to_cpu(c->cid);
397         if (cid == 0 || cid > NVME_NUM_REQS) {
398             fprintf(stderr, "Unexpected CID in completion queue: %" PRIu16 "\n",
399                     cid);
400             continue;
401         }
402         trace_nvme_complete_command(s, q->index, cid);
403         preq = &q->reqs[cid - 1];
404         req = *preq;
405         assert(req.cid == cid);
406         assert(req.cb);
407         nvme_put_free_req_locked(q, preq);
408         preq->cb = preq->opaque = NULL;
409         q->inflight--;
410         qemu_mutex_unlock(&q->lock);
411         req.cb(req.opaque, ret);
412         qemu_mutex_lock(&q->lock);
413         progress = true;
414     }
415     if (progress) {
416         /* Notify the device so it can post more completions. */
417         smp_mb_release();
418         *q->cq.doorbell = cpu_to_le32(q->cq.head);
419         nvme_wake_free_req_locked(q);
420     }
421 
422     qemu_bh_cancel(q->completion_bh);
423 
424     return progress;
425 }
426 
427 static void nvme_process_completion_bh(void *opaque)
428 {
429     NVMeQueuePair *q = opaque;
430 
431     /*
432      * We're being invoked because a nvme_process_completion() cb() function
433      * called aio_poll(). The callback may be waiting for further completions
434      * so notify the device that it has space to fill in more completions now.
435      */
436     smp_mb_release();
437     *q->cq.doorbell = cpu_to_le32(q->cq.head);
438     nvme_wake_free_req_locked(q);
439 
440     nvme_process_completion(q);
441 }
442 
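/* Dump the 64-byte command as eight 8-byte rows when raw tracing is enabled. */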
443 static void nvme_trace_command(const NvmeCmd *cmd)
444 {
445     int i;
446 
447     if (!trace_event_get_state_backends(TRACE_NVME_SUBMIT_COMMAND_RAW)) {
448         return;
449     }
450     for (i = 0; i < 8; ++i) {
451         uint8_t *cmdp = (uint8_t *)cmd + i * 8;
452         trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3],
453                                       cmdp[4], cmdp[5], cmdp[6], cmdp[7]);
454     }
455 }
456 
457 static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
458                                 NvmeCmd *cmd, BlockCompletionFunc cb,
459                                 void *opaque)
460 {
461     assert(!req->cb);
462     req->cb = cb;
463     req->opaque = opaque;
464     cmd->cid = cpu_to_le16(req->cid);
465 
466     trace_nvme_submit_command(q->s, q->index, req->cid);
467     nvme_trace_command(cmd);
468     qemu_mutex_lock(&q->lock);
469     memcpy((uint8_t *)q->sq.queue +
470            q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
471     q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
472     q->need_kick++;
473     nvme_kick(q);
474     nvme_process_completion(q);
475     qemu_mutex_unlock(&q->lock);
476 }
477 
478 static void nvme_cmd_sync_cb(void *opaque, int ret)
479 {
480     int *pret = opaque;
481     *pret = ret;
482     aio_wait_kick();
483 }
484 
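/*
 * Submit a command and wait for its completion: the callback stores the
 * translated result over the -EINPROGRESS sentinel, and AIO_WAIT_WHILE()
 * drives the AioContext until that happens. Used for admin commands only.
 */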
485 static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
486                          NvmeCmd *cmd)
487 {
488     AioContext *aio_context = bdrv_get_aio_context(bs);
489     NVMeRequest *req;
490     int ret = -EINPROGRESS;
491     req = nvme_get_free_req(q);
492     if (!req) {
493         return -EBUSY;
494     }
495     nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);
496 
497     AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
498     return ret;
499 }
500 
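/*
 * Probe the controller and the target namespace with two Identify commands
 * (CNS 0x1 for the controller, then CNS 0x0 for the namespace) and cache the
 * results: write cache support, maximum transfer size, ONCS capabilities,
 * namespace size and block size.
 */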
501 static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
502 {
503     BDRVNVMeState *s = bs->opaque;
504     union {
505         NvmeIdCtrl ctrl;
506         NvmeIdNs ns;
507     } *id;
508     NvmeLBAF *lbaf;
509     uint16_t oncs;
510     int r;
511     uint64_t iova;
512     NvmeCmd cmd = {
513         .opcode = NVME_ADM_CMD_IDENTIFY,
514         .cdw10 = cpu_to_le32(0x1),
515     };
516 
517     id = qemu_try_memalign(s->page_size, sizeof(*id));
518     if (!id) {
519         error_setg(errp, "Cannot allocate buffer for identify response");
520         goto out;
521     }
522     r = qemu_vfio_dma_map(s->vfio, id, sizeof(*id), true, &iova);
523     if (r) {
524         error_setg(errp, "Cannot map buffer for DMA");
525         goto out;
526     }
527 
528     memset(id, 0, sizeof(*id));
529     cmd.dptr.prp1 = cpu_to_le64(iova);
530     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
531         error_setg(errp, "Failed to identify controller");
532         goto out;
533     }
534 
535     if (le32_to_cpu(id->ctrl.nn) < namespace) {
536         error_setg(errp, "Invalid namespace");
537         goto out;
538     }
539     s->write_cache_supported = le32_to_cpu(id->ctrl.vwc) & 0x1;
540     s->max_transfer = (id->ctrl.mdts ? 1 << id->ctrl.mdts : 0) * s->page_size;
541     /* For now the page list buffer per command is one page, to hold at most
542      * s->page_size / sizeof(uint64_t) entries. */
543     s->max_transfer = MIN_NON_ZERO(s->max_transfer,
544                           s->page_size / sizeof(uint64_t) * s->page_size);
545 
546     oncs = le16_to_cpu(id->ctrl.oncs);
547     s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
548     s->supports_discard = !!(oncs & NVME_ONCS_DSM);
549 
550     memset(id, 0, sizeof(*id));
551     cmd.cdw10 = 0;
552     cmd.nsid = cpu_to_le32(namespace);
553     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
554         error_setg(errp, "Failed to identify namespace");
555         goto out;
556     }
557 
558     s->nsze = le64_to_cpu(id->ns.nsze);
559     lbaf = &id->ns.lbaf[NVME_ID_NS_FLBAS_INDEX(id->ns.flbas)];
560 
561     if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(id->ns.dlfeat) &&
562             NVME_ID_NS_DLFEAT_READ_BEHAVIOR(id->ns.dlfeat) ==
563                     NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) {
564         bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
565     }
566 
567     if (lbaf->ms) {
568         error_setg(errp, "Namespaces with metadata are not yet supported");
569         goto out;
570     }
571 
572     if (lbaf->ds < BDRV_SECTOR_BITS || lbaf->ds > 12 ||
573         (1 << lbaf->ds) > s->page_size)
574     {
575         error_setg(errp, "Namespace has unsupported block size (2^%d)",
576                    lbaf->ds);
577         goto out;
578     }
579 
580     s->blkshift = lbaf->ds;
581 out:
582     qemu_vfio_dma_unmap(s->vfio, id);
583     qemu_vfree(id);
584 }
585 
586 static bool nvme_poll_queue(NVMeQueuePair *q)
587 {
588     bool progress = false;
589 
590     const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
591     NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
592 
593     /*
594      * Do an early check for completions. q->lock isn't needed because
595      * nvme_process_completion() only runs in the event loop thread and
596      * cannot race with itself.
597      */
598     if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
599         return false;
600     }
601 
602     qemu_mutex_lock(&q->lock);
603     while (nvme_process_completion(q)) {
604         /* Keep polling */
605         progress = true;
606     }
607     qemu_mutex_unlock(&q->lock);
608 
609     return progress;
610 }
611 
612 static bool nvme_poll_queues(BDRVNVMeState *s)
613 {
614     bool progress = false;
615     int i;
616 
617     for (i = 0; i < s->nr_queues; i++) {
618         if (nvme_poll_queue(s->queues[i])) {
619             progress = true;
620         }
621     }
622     return progress;
623 }
624 
625 static void nvme_handle_event(EventNotifier *n)
626 {
627     BDRVNVMeState *s = container_of(n, BDRVNVMeState,
628                                     irq_notifier[MSIX_SHARED_IRQ_IDX]);
629 
630     trace_nvme_handle_event(s);
631     event_notifier_test_and_clear(n);
632     nvme_poll_queues(s);
633 }
634 
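/*
 * Add one I/O queue pair. The completion queue has to exist before the
 * submission queue that posts to it: cdw10 carries the 0-based queue size
 * and the queue ID, cdw11 requests a physically contiguous queue (plus
 * interrupts for the CQ), and the SQ's cdw11 also carries the bound CQ ID
 * in its upper 16 bits.
 */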
635 static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
636 {
637     BDRVNVMeState *s = bs->opaque;
638     int n = s->nr_queues;
639     NVMeQueuePair *q;
640     NvmeCmd cmd;
641     int queue_size = NVME_QUEUE_SIZE;
642 
643     q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
644                                n, queue_size, errp);
645     if (!q) {
646         return false;
647     }
648     cmd = (NvmeCmd) {
649         .opcode = NVME_ADM_CMD_CREATE_CQ,
650         .dptr.prp1 = cpu_to_le64(q->cq.iova),
651         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
652         .cdw11 = cpu_to_le32(0x3),
653     };
654     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
655         error_setg(errp, "Failed to create CQ io queue [%d]", n);
656         goto out_error;
657     }
658     cmd = (NvmeCmd) {
659         .opcode = NVME_ADM_CMD_CREATE_SQ,
660         .dptr.prp1 = cpu_to_le64(q->sq.iova),
661         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
662         .cdw11 = cpu_to_le32(0x1 | (n << 16)),
663     };
664     if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
665         error_setg(errp, "Failed to create SQ io queue [%d]", n);
666         goto out_error;
667     }
668     s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
669     s->queues[n] = q;
670     s->nr_queues++;
671     return true;
672 out_error:
673     nvme_free_queue_pair(q);
674     return false;
675 }
676 
677 static bool nvme_poll_cb(void *opaque)
678 {
679     EventNotifier *e = opaque;
680     BDRVNVMeState *s = container_of(e, BDRVNVMeState,
681                                     irq_notifier[MSIX_SHARED_IRQ_IDX]);
682 
683     trace_nvme_poll_cb(s);
684     return nvme_poll_queues(s);
685 }
686 
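/*
 * Bring the controller up following the spec's initialization sequence:
 * open the device through VFIO, map BAR0, check CAP.CSS for NVM command set
 * support, disable the controller and wait for CSTS.RDY to clear, program
 * the admin queue (AQA/ASQ/ACQ), re-enable it and wait for CSTS.RDY, then
 * attach the MSI-X event notifier and identify the namespace.
 */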
687 static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
688                      Error **errp)
689 {
690     BDRVNVMeState *s = bs->opaque;
691     AioContext *aio_context = bdrv_get_aio_context(bs);
692     int ret;
693     uint64_t cap;
694     uint64_t timeout_ms;
695     uint64_t deadline, now;
696     Error *local_err = NULL;
697 
698     qemu_co_mutex_init(&s->dma_map_lock);
699     qemu_co_queue_init(&s->dma_flush_queue);
700     s->device = g_strdup(device);
701     s->nsid = namespace;
702     s->aio_context = bdrv_get_aio_context(bs);
703     ret = event_notifier_init(&s->irq_notifier[MSIX_SHARED_IRQ_IDX], 0);
704     if (ret) {
705         error_setg(errp, "Failed to init event notifier");
706         return ret;
707     }
708 
709     s->vfio = qemu_vfio_open_pci(device, errp);
710     if (!s->vfio) {
711         ret = -EINVAL;
712         goto out;
713     }
714 
715     s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, NVME_BAR_SIZE, errp);
716     if (!s->regs) {
717         ret = -EINVAL;
718         goto out;
719     }
720 
721     /* Perform the initialization sequence as described in NVMe spec "7.6.1
722      * Initialization". */
723 
724     cap = le64_to_cpu(s->regs->ctrl.cap);
725     if (!(cap & (1ULL << 37))) {
726         error_setg(errp, "Device doesn't support NVMe command set");
727         ret = -EINVAL;
728         goto out;
729     }
730 
731     s->page_size = MAX(4096, 1 << (12 + ((cap >> 48) & 0xF)));
732     s->doorbell_scale = (4 << (((cap >> 32) & 0xF))) / sizeof(uint32_t);
733     bs->bl.opt_mem_alignment = s->page_size;
734     timeout_ms = MIN(500 * ((cap >> 24) & 0xFF), 30000);
735 
736     /* Reset device to get a clean state. */
737     s->regs->ctrl.cc = cpu_to_le32(le32_to_cpu(s->regs->ctrl.cc) & 0xFE);
738     /* Wait for CSTS.RDY = 0. */
739     deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS;
740     while (le32_to_cpu(s->regs->ctrl.csts) & 0x1) {
741         if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
742             error_setg(errp, "Timeout while waiting for device to reset (%"
743                              PRIu64 " ms)",
744                        timeout_ms);
745             ret = -ETIMEDOUT;
746             goto out;
747         }
748     }
749 
750     /* Set up admin queue. */
751     s->queues = g_new(NVMeQueuePair *, 1);
752     s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0,
753                                                           NVME_QUEUE_SIZE,
754                                                           errp);
755     if (!s->queues[INDEX_ADMIN]) {
756         ret = -EINVAL;
757         goto out;
758     }
759     s->nr_queues = 1;
760     QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
761     s->regs->ctrl.aqa = cpu_to_le32((NVME_QUEUE_SIZE << 16) | NVME_QUEUE_SIZE);
762     s->regs->ctrl.asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova);
763     s->regs->ctrl.acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova);
764 
765     /* After setting up all control registers we can enable device now. */
766     s->regs->ctrl.cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) |
767                               (ctz32(NVME_SQ_ENTRY_BYTES) << 16) |
768                               0x1);
769     /* Wait for CSTS.RDY = 1. */
770     now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
771     deadline = now + timeout_ms * SCALE_MS;
772     while (!(le32_to_cpu(s->regs->ctrl.csts) & 0x1)) {
773         if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
774             error_setg(errp, "Timeout while waiting for device to start (%"
775                              PRIu64 " ms)",
776                        timeout_ms);
777             ret = -ETIMEDOUT;
778             goto out;
779         }
780     }
781 
782     ret = qemu_vfio_pci_init_irq(s->vfio, s->irq_notifier,
783                                  VFIO_PCI_MSIX_IRQ_INDEX, errp);
784     if (ret) {
785         goto out;
786     }
787     aio_set_event_notifier(bdrv_get_aio_context(bs),
788                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
789                            false, nvme_handle_event, nvme_poll_cb);
790 
791     nvme_identify(bs, namespace, &local_err);
792     if (local_err) {
793         error_propagate(errp, local_err);
794         ret = -EIO;
795         goto out;
796     }
797 
798     /* Set up command queues. */
799     if (!nvme_add_io_queue(bs, errp)) {
800         ret = -EIO;
801     }
802 out:
803     /* Cleaning up is done in nvme_file_open() upon error. */
804     return ret;
805 }
806 
807 /* Parse a filename of the form nvme://XXXX:XX:XX.X/X. Example:
808  *
809  *     nvme://0000:44:00.0/1
810  *
811  * where "nvme://" is the fixed protocol prefix, the middle part is the PCI
812  * address, and the last part is the namespace number, which starts from 1
813  * according to the NVMe spec. */
814 static void nvme_parse_filename(const char *filename, QDict *options,
815                                 Error **errp)
816 {
817     int pref = strlen("nvme://");
818 
819     if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) {
820         const char *tmp = filename + pref;
821         char *device;
822         const char *namespace;
823         unsigned long ns;
824         const char *slash = strchr(tmp, '/');
825         if (!slash) {
826             qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp);
827             return;
828         }
829         device = g_strndup(tmp, slash - tmp);
830         qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device);
831         g_free(device);
832         namespace = slash + 1;
833         if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) {
834             error_setg(errp, "Invalid namespace '%s', positive number expected",
835                        namespace);
836             return;
837         }
838         qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE,
839                       *namespace ? namespace : "1");
840     }
841 }
842 
843 static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
844                                            Error **errp)
845 {
846     int ret;
847     BDRVNVMeState *s = bs->opaque;
848     NvmeCmd cmd = {
849         .opcode = NVME_ADM_CMD_SET_FEATURES,
850         .nsid = cpu_to_le32(s->nsid),
851         .cdw10 = cpu_to_le32(0x06),
852         .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
853     };
854 
855     ret = nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd);
856     if (ret) {
857         error_setg(errp, "Failed to configure NVMe write cache");
858     }
859     return ret;
860 }
861 
862 static void nvme_close(BlockDriverState *bs)
863 {
864     int i;
865     BDRVNVMeState *s = bs->opaque;
866 
867     for (i = 0; i < s->nr_queues; ++i) {
868         nvme_free_queue_pair(s->queues[i]);
869     }
870     g_free(s->queues);
871     aio_set_event_notifier(bdrv_get_aio_context(bs),
872                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
873                            false, NULL, NULL);
874     event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
875     qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE);
876     qemu_vfio_close(s->vfio);
877 
878     g_free(s->device);
879 }
880 
881 static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
882                           Error **errp)
883 {
884     const char *device;
885     QemuOpts *opts;
886     int namespace;
887     int ret;
888     BDRVNVMeState *s = bs->opaque;
889 
890     bs->supported_write_flags = BDRV_REQ_FUA;
891 
892     opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
893     qemu_opts_absorb_qdict(opts, options, &error_abort);
894     device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
895     if (!device) {
896         error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required");
897         qemu_opts_del(opts);
898         return -EINVAL;
899     }
900 
901     namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1);
902     ret = nvme_init(bs, device, namespace, errp);
903     qemu_opts_del(opts);
904     if (ret) {
905         goto fail;
906     }
907     if (flags & BDRV_O_NOCACHE) {
908         if (!s->write_cache_supported) {
909             error_setg(errp,
910                        "NVMe controller doesn't support write cache configuration");
911             ret = -EINVAL;
912         } else {
913             ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE),
914                                                   errp);
915         }
916         if (ret) {
917             goto fail;
918         }
919     }
920     return 0;
921 fail:
922     nvme_close(bs);
923     return ret;
924 }
925 
926 static int64_t nvme_getlength(BlockDriverState *bs)
927 {
928     BDRVNVMeState *s = bs->opaque;
929     return s->nsze << s->blkshift;
930 }
931 
932 static uint32_t nvme_get_blocksize(BlockDriverState *bs)
933 {
934     BDRVNVMeState *s = bs->opaque;
935     assert(s->blkshift >= BDRV_SECTOR_BITS && s->blkshift <= 12);
936     return UINT32_C(1) << s->blkshift;
937 }
938 
939 static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
940 {
941     uint32_t blocksize = nvme_get_blocksize(bs);
942     bsz->phys = blocksize;
943     bsz->log = blocksize;
944     return 0;
945 }
946 
947 /* Called with s->dma_map_lock */
948 static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs,
949                                             QEMUIOVector *qiov)
950 {
951     int r = 0;
952     BDRVNVMeState *s = bs->opaque;
953 
954     s->dma_map_count -= qiov->size;
955     if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) {
956         r = qemu_vfio_dma_reset_temporary(s->vfio);
957         if (!r) {
958             qemu_co_queue_restart_all(&s->dma_flush_queue);
959         }
960     }
961     return r;
962 }
963 
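/*
 * DMA-map a page-aligned qiov and fill in the command's data pointer:
 * a single page goes into PRP1, exactly two pages use PRP1 and PRP2, and
 * anything larger points PRP2 at this request's PRP list page (starting at
 * its second entry, because the first page is already carried in PRP1).
 * On -ENOMEM the code waits for in-flight mappings to drain or resets the
 * temporary IOVA mappings, then retries the map once.
 */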
964 /* Called with s->dma_map_lock */
965 static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
966                                           NVMeRequest *req, QEMUIOVector *qiov)
967 {
968     BDRVNVMeState *s = bs->opaque;
969     uint64_t *pagelist = req->prp_list_page;
970     int i, j, r;
971     int entries = 0;
972 
973     assert(qiov->size);
974     assert(QEMU_IS_ALIGNED(qiov->size, s->page_size));
975     assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t));
976     for (i = 0; i < qiov->niov; ++i) {
977         bool retry = true;
978         uint64_t iova;
979 try_map:
980         r = qemu_vfio_dma_map(s->vfio,
981                               qiov->iov[i].iov_base,
982                               qiov->iov[i].iov_len,
983                               true, &iova);
984         if (r == -ENOMEM && retry) {
985             retry = false;
986             trace_nvme_dma_flush_queue_wait(s);
987             if (s->dma_map_count) {
988                 trace_nvme_dma_map_flush(s);
989                 qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock);
990             } else {
991                 r = qemu_vfio_dma_reset_temporary(s->vfio);
992                 if (r) {
993                     goto fail;
994                 }
995             }
996             goto try_map;
997         }
998         if (r) {
999             goto fail;
1000         }
1001 
1002         for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) {
1003             pagelist[entries++] = cpu_to_le64(iova + j * s->page_size);
1004         }
1005         trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base,
1006                                     qiov->iov[i].iov_len / s->page_size);
1007     }
1008 
1009     s->dma_map_count += qiov->size;
1010 
1011     assert(entries <= s->page_size / sizeof(uint64_t));
1012     switch (entries) {
1013     case 0:
1014         abort();
1015     case 1:
1016         cmd->dptr.prp1 = pagelist[0];
1017         cmd->dptr.prp2 = 0;
1018         break;
1019     case 2:
1020         cmd->dptr.prp1 = pagelist[0];
1021         cmd->dptr.prp2 = pagelist[1];
1022         break;
1023     default:
1024         cmd->dptr.prp1 = pagelist[0];
1025         cmd->dptr.prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
1026         break;
1027     }
1028     trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
1029     for (i = 0; i < entries; ++i) {
1030         trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]);
1031     }
1032     return 0;
1033 fail:
1034     /* No need to unmap [0 - i) iovs even if we've failed, since we don't
1035      * increment s->dma_map_count. This is okay for fixed mapping memory areas
1036      * because they are already mapped before calling this function; for
1037      * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by
1038      * calling qemu_vfio_dma_reset_temporary when necessary. */
1039     return r;
1040 }
1041 
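/*
 * Glue between the callback-based submission path and coroutine callers:
 * nvme_rw_cb() records the result and, if the coroutine has already yielded,
 * schedules a BH in its AioContext to re-enter it.
 */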
1042 typedef struct {
1043     Coroutine *co;
1044     int ret;
1045     AioContext *ctx;
1046 } NVMeCoData;
1047 
1048 static void nvme_rw_cb_bh(void *opaque)
1049 {
1050     NVMeCoData *data = opaque;
1051     qemu_coroutine_enter(data->co);
1052 }
1053 
1054 static void nvme_rw_cb(void *opaque, int ret)
1055 {
1056     NVMeCoData *data = opaque;
1057     data->ret = ret;
1058     if (!data->co) {
1059         /* The rw coroutine hasn't yielded, don't try to enter. */
1060         return;
1061     }
1062     replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
1063 }
1064 
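/*
 * Issue a single NVMe read or write for a page-aligned request. cdw10/cdw11
 * carry the 64-bit starting LBA, cdw12 holds the 0-based block count in its
 * low 16 bits plus the FUA flag in bit 30; the coroutine yields until
 * nvme_rw_cb() reports the completion.
 */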
1065 static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
1066                                             uint64_t offset, uint64_t bytes,
1067                                             QEMUIOVector *qiov,
1068                                             bool is_write,
1069                                             int flags)
1070 {
1071     int r;
1072     BDRVNVMeState *s = bs->opaque;
1073     NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
1074     NVMeRequest *req;
1075 
1076     uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) |
1077                        (flags & BDRV_REQ_FUA ? 1 << 30 : 0);
1078     NvmeCmd cmd = {
1079         .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ,
1080         .nsid = cpu_to_le32(s->nsid),
1081         .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
1082         .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
1083         .cdw12 = cpu_to_le32(cdw12),
1084     };
1085     NVMeCoData data = {
1086         .ctx = bdrv_get_aio_context(bs),
1087         .ret = -EINPROGRESS,
1088     };
1089 
1090     trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
1091     assert(s->nr_queues > 1);
1092     req = nvme_get_free_req(ioq);
1093     assert(req);
1094 
1095     qemu_co_mutex_lock(&s->dma_map_lock);
1096     r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
1097     qemu_co_mutex_unlock(&s->dma_map_lock);
1098     if (r) {
1099         nvme_put_free_req_and_wake(ioq, req);
1100         return r;
1101     }
1102     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1103 
1104     data.co = qemu_coroutine_self();
1105     while (data.ret == -EINPROGRESS) {
1106         qemu_coroutine_yield();
1107     }
1108 
1109     qemu_co_mutex_lock(&s->dma_map_lock);
1110     r = nvme_cmd_unmap_qiov(bs, qiov);
1111     qemu_co_mutex_unlock(&s->dma_map_lock);
1112     if (r) {
1113         return r;
1114     }
1115 
1116     trace_nvme_rw_done(s, is_write, offset, bytes, data.ret);
1117     return data.ret;
1118 }
1119 
1120 static inline bool nvme_qiov_aligned(BlockDriverState *bs,
1121                                      const QEMUIOVector *qiov)
1122 {
1123     int i;
1124     BDRVNVMeState *s = bs->opaque;
1125 
1126     for (i = 0; i < qiov->niov; ++i) {
1127         if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) ||
1128             !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) {
1129             trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
1130                                       qiov->iov[i].iov_len, s->page_size);
1131             return false;
1132         }
1133     }
1134     return true;
1135 }
1136 
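/*
 * Top-level read/write entry point: requests whose buffers are page aligned
 * go straight to nvme_co_prw_aligned(); anything else is bounced through a
 * temporary page-aligned buffer.
 */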
1137 static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
1138                        QEMUIOVector *qiov, bool is_write, int flags)
1139 {
1140     BDRVNVMeState *s = bs->opaque;
1141     int r;
1142     uint8_t *buf = NULL;
1143     QEMUIOVector local_qiov;
1144 
1145     assert(QEMU_IS_ALIGNED(offset, s->page_size));
1146     assert(QEMU_IS_ALIGNED(bytes, s->page_size));
1147     assert(bytes <= s->max_transfer);
1148     if (nvme_qiov_aligned(bs, qiov)) {
1149         return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
1150     }
1151     trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
1152     buf = qemu_try_memalign(s->page_size, bytes);
1153 
1154     if (!buf) {
1155         return -ENOMEM;
1156     }
1157     qemu_iovec_init(&local_qiov, 1);
1158     if (is_write) {
1159         qemu_iovec_to_buf(qiov, 0, buf, bytes);
1160     }
1161     qemu_iovec_add(&local_qiov, buf, bytes);
1162     r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags);
1163     qemu_iovec_destroy(&local_qiov);
1164     if (!r && !is_write) {
1165         qemu_iovec_from_buf(qiov, 0, buf, bytes);
1166     }
1167     qemu_vfree(buf);
1168     return r;
1169 }
1170 
1171 static coroutine_fn int nvme_co_preadv(BlockDriverState *bs,
1172                                        uint64_t offset, uint64_t bytes,
1173                                        QEMUIOVector *qiov, int flags)
1174 {
1175     return nvme_co_prw(bs, offset, bytes, qiov, false, flags);
1176 }
1177 
1178 static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs,
1179                                         uint64_t offset, uint64_t bytes,
1180                                         QEMUIOVector *qiov, int flags)
1181 {
1182     return nvme_co_prw(bs, offset, bytes, qiov, true, flags);
1183 }
1184 
1185 static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
1186 {
1187     BDRVNVMeState *s = bs->opaque;
1188     NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
1189     NVMeRequest *req;
1190     NvmeCmd cmd = {
1191         .opcode = NVME_CMD_FLUSH,
1192         .nsid = cpu_to_le32(s->nsid),
1193     };
1194     NVMeCoData data = {
1195         .ctx = bdrv_get_aio_context(bs),
1196         .ret = -EINPROGRESS,
1197     };
1198 
1199     assert(s->nr_queues > 1);
1200     req = nvme_get_free_req(ioq);
1201     assert(req);
1202     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1203 
1204     data.co = qemu_coroutine_self();
1205     if (data.ret == -EINPROGRESS) {
1206         qemu_coroutine_yield();
1207     }
1208 
1209     return data.ret;
1210 }
1211 
1212 
1213 static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
1214                                               int64_t offset,
1215                                               int bytes,
1216                                               BdrvRequestFlags flags)
1217 {
1218     BDRVNVMeState *s = bs->opaque;
1219     NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
1220     NVMeRequest *req;
1221 
1222     uint32_t cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
1223 
1224     if (!s->supports_write_zeroes) {
1225         return -ENOTSUP;
1226     }
1227 
1228     NvmeCmd cmd = {
1229         .opcode = NVME_CMD_WRITE_ZEROES,
1230         .nsid = cpu_to_le32(s->nsid),
1231         .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
1232         .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
1233     };
1234 
1235     NVMeCoData data = {
1236         .ctx = bdrv_get_aio_context(bs),
1237         .ret = -EINPROGRESS,
1238     };
1239 
1240     if (flags & BDRV_REQ_MAY_UNMAP) {
1241         cdw12 |= (1 << 25);
1242     }
1243 
1244     if (flags & BDRV_REQ_FUA) {
1245         cdw12 |= (1 << 30);
1246     }
1247 
1248     cmd.cdw12 = cpu_to_le32(cdw12);
1249 
1250     trace_nvme_write_zeroes(s, offset, bytes, flags);
1251     assert(s->nr_queues > 1);
1252     req = nvme_get_free_req(ioq);
1253     assert(req);
1254 
1255     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1256 
1257     data.co = qemu_coroutine_self();
1258     while (data.ret == -EINPROGRESS) {
1259         qemu_coroutine_yield();
1260     }
1261 
1262     trace_nvme_rw_done(s, true, offset, bytes, data.ret);
1263     return data.ret;
1264 }
1265 
1266 
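/*
 * Discard is implemented as a Dataset Management command carrying a single
 * deallocate range; the range descriptor lives in a page-aligned buffer that
 * is mapped through the same PRP path as regular I/O.
 */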
1267 static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
1268                                          int64_t offset,
1269                                          int bytes)
1270 {
1271     BDRVNVMeState *s = bs->opaque;
1272     NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
1273     NVMeRequest *req;
1274     NvmeDsmRange *buf;
1275     QEMUIOVector local_qiov;
1276     int ret;
1277 
1278     NvmeCmd cmd = {
1279         .opcode = NVME_CMD_DSM,
1280         .nsid = cpu_to_le32(s->nsid),
1281         .cdw10 = cpu_to_le32(0), /* number of ranges (0-based) */
1282         .cdw11 = cpu_to_le32(1 << 2), /* deallocate bit */
1283     };
1284 
1285     NVMeCoData data = {
1286         .ctx = bdrv_get_aio_context(bs),
1287         .ret = -EINPROGRESS,
1288     };
1289 
1290     if (!s->supports_discard) {
1291         return -ENOTSUP;
1292     }
1293 
1294     assert(s->nr_queues > 1);
1295 
1296     buf = qemu_try_memalign(s->page_size, s->page_size);
1297     if (!buf) {
1298         return -ENOMEM;
1299     }
1300     memset(buf, 0, s->page_size);
1301     buf->nlb = cpu_to_le32(bytes >> s->blkshift);
1302     buf->slba = cpu_to_le64(offset >> s->blkshift);
1303     buf->cattr = 0;
1304 
1305     qemu_iovec_init(&local_qiov, 1);
1306     qemu_iovec_add(&local_qiov, buf, s->page_size);
1307 
1308     req = nvme_get_free_req(ioq);
1309     assert(req);
1310 
1311     qemu_co_mutex_lock(&s->dma_map_lock);
1312     ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
1313     qemu_co_mutex_unlock(&s->dma_map_lock);
1314 
1315     if (ret) {
1316         nvme_put_free_req_and_wake(ioq, req);
1317         goto out;
1318     }
1319 
1320     trace_nvme_dsm(s, offset, bytes);
1321 
1322     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1323 
1324     data.co = qemu_coroutine_self();
1325     while (data.ret == -EINPROGRESS) {
1326         qemu_coroutine_yield();
1327     }
1328 
1329     qemu_co_mutex_lock(&s->dma_map_lock);
1330     ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
1331     qemu_co_mutex_unlock(&s->dma_map_lock);
1332 
1333     if (ret) {
1334         goto out;
1335     }
1336 
1337     ret = data.ret;
1338     trace_nvme_dsm_done(s, offset, bytes, ret);
1339 out:
1340     qemu_iovec_destroy(&local_qiov);
1341     qemu_vfree(buf);
1342     return ret;
1343 
1344 }
1345 
1346 
1347 static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
1348                                BlockReopenQueue *queue, Error **errp)
1349 {
1350     return 0;
1351 }
1352 
1353 static void nvme_refresh_filename(BlockDriverState *bs)
1354 {
1355     BDRVNVMeState *s = bs->opaque;
1356 
1357     snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i",
1358              s->device, s->nsid);
1359 }
1360 
1361 static void nvme_refresh_limits(BlockDriverState *bs, Error **errp)
1362 {
1363     BDRVNVMeState *s = bs->opaque;
1364 
1365     bs->bl.opt_mem_alignment = s->page_size;
1366     bs->bl.request_alignment = s->page_size;
1367     bs->bl.max_transfer = s->max_transfer;
1368 }
1369 
1370 static void nvme_detach_aio_context(BlockDriverState *bs)
1371 {
1372     BDRVNVMeState *s = bs->opaque;
1373 
1374     for (int i = 0; i < s->nr_queues; i++) {
1375         NVMeQueuePair *q = s->queues[i];
1376 
1377         qemu_bh_delete(q->completion_bh);
1378         q->completion_bh = NULL;
1379     }
1380 
1381     aio_set_event_notifier(bdrv_get_aio_context(bs),
1382                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
1383                            false, NULL, NULL);
1384 }
1385 
1386 static void nvme_attach_aio_context(BlockDriverState *bs,
1387                                     AioContext *new_context)
1388 {
1389     BDRVNVMeState *s = bs->opaque;
1390 
1391     s->aio_context = new_context;
1392     aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
1393                            false, nvme_handle_event, nvme_poll_cb);
1394 
1395     for (int i = 0; i < s->nr_queues; i++) {
1396         NVMeQueuePair *q = s->queues[i];
1397 
1398         q->completion_bh =
1399             aio_bh_new(new_context, nvme_process_completion_bh, q);
1400     }
1401 }
1402 
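/*
 * I/O plugging: while plugged, nvme_kick() and completion processing are
 * deferred so that several submissions share a single doorbell write;
 * unplugging kicks each I/O queue once and reaps any pending completions.
 */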
1403 static void nvme_aio_plug(BlockDriverState *bs)
1404 {
1405     BDRVNVMeState *s = bs->opaque;
1406     assert(!s->plugged);
1407     s->plugged = true;
1408 }
1409 
1410 static void nvme_aio_unplug(BlockDriverState *bs)
1411 {
1412     int i;
1413     BDRVNVMeState *s = bs->opaque;
1414     assert(s->plugged);
1415     s->plugged = false;
1416     for (i = INDEX_IO(0); i < s->nr_queues; i++) {
1417         NVMeQueuePair *q = s->queues[i];
1418         qemu_mutex_lock(&q->lock);
1419         nvme_kick(q);
1420         nvme_process_completion(q);
1421         qemu_mutex_unlock(&q->lock);
1422     }
1423 }
1424 
1425 static void nvme_register_buf(BlockDriverState *bs, void *host, size_t size)
1426 {
1427     int ret;
1428     BDRVNVMeState *s = bs->opaque;
1429 
1430     ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL);
1431     if (ret) {
1432         /* FIXME: we may run out of IOVA addresses after repeated
1433          * bdrv_register_buf/bdrv_unregister_buf, because qemu_vfio_dma_unmap
1434          * doesn't reclaim addresses for fixed mappings. */
1435         error_report("nvme_register_buf failed: %s", strerror(-ret));
1436     }
1437 }
1438 
1439 static void nvme_unregister_buf(BlockDriverState *bs, void *host)
1440 {
1441     BDRVNVMeState *s = bs->opaque;
1442 
1443     qemu_vfio_dma_unmap(s->vfio, host);
1444 }
1445 
1446 static const char *const nvme_strong_runtime_opts[] = {
1447     NVME_BLOCK_OPT_DEVICE,
1448     NVME_BLOCK_OPT_NAMESPACE,
1449 
1450     NULL
1451 };
1452 
1453 static BlockDriver bdrv_nvme = {
1454     .format_name              = "nvme",
1455     .protocol_name            = "nvme",
1456     .instance_size            = sizeof(BDRVNVMeState),
1457 
1458     .bdrv_co_create_opts      = bdrv_co_create_opts_simple,
1459     .create_opts              = &bdrv_create_opts_simple,
1460 
1461     .bdrv_parse_filename      = nvme_parse_filename,
1462     .bdrv_file_open           = nvme_file_open,
1463     .bdrv_close               = nvme_close,
1464     .bdrv_getlength           = nvme_getlength,
1465     .bdrv_probe_blocksizes    = nvme_probe_blocksizes,
1466 
1467     .bdrv_co_preadv           = nvme_co_preadv,
1468     .bdrv_co_pwritev          = nvme_co_pwritev,
1469 
1470     .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
1471     .bdrv_co_pdiscard         = nvme_co_pdiscard,
1472 
1473     .bdrv_co_flush_to_disk    = nvme_co_flush,
1474     .bdrv_reopen_prepare      = nvme_reopen_prepare,
1475 
1476     .bdrv_refresh_filename    = nvme_refresh_filename,
1477     .bdrv_refresh_limits      = nvme_refresh_limits,
1478     .strong_runtime_opts      = nvme_strong_runtime_opts,
1479 
1480     .bdrv_detach_aio_context  = nvme_detach_aio_context,
1481     .bdrv_attach_aio_context  = nvme_attach_aio_context,
1482 
1483     .bdrv_io_plug             = nvme_aio_plug,
1484     .bdrv_io_unplug           = nvme_aio_unplug,
1485 
1486     .bdrv_register_buf        = nvme_register_buf,
1487     .bdrv_unregister_buf      = nvme_unregister_buf,
1488 };
1489 
1490 static void bdrv_nvme_init(void)
1491 {
1492     bdrv_register(&bdrv_nvme);
1493 }
1494 
1495 block_init(bdrv_nvme_init);
1496