xref: /openbmc/qemu/block/nvme.c (revision b1f4b9b8)
1 /*
2  * NVMe block driver based on vfio
3  *
4  * Copyright 2016 - 2018 Red Hat, Inc.
5  *
6  * Authors:
7  *   Fam Zheng <famz@redhat.com>
8  *   Paolo Bonzini <pbonzini@redhat.com>
9  *
10  * This work is licensed under the terms of the GNU GPL, version 2 or later.
11  * See the COPYING file in the top-level directory.
12  */
13 
14 #include "qemu/osdep.h"
15 #include <linux/vfio.h>
16 #include "qapi/error.h"
17 #include "qapi/qmp/qdict.h"
18 #include "qapi/qmp/qstring.h"
19 #include "qemu/error-report.h"
20 #include "qemu/main-loop.h"
21 #include "qemu/module.h"
22 #include "qemu/cutils.h"
23 #include "qemu/option.h"
24 #include "qemu/memalign.h"
25 #include "qemu/vfio-helpers.h"
26 #include "block/block_int.h"
27 #include "sysemu/replay.h"
28 #include "trace.h"
29 
30 #include "block/nvme.h"
31 
32 #define NVME_SQ_ENTRY_BYTES 64
33 #define NVME_CQ_ENTRY_BYTES 16
34 #define NVME_QUEUE_SIZE 128
35 #define NVME_DOORBELL_SIZE 4096
36 
37 /*
38  * We have to leave one slot empty as that is the full queue case where
39  * head == tail + 1.
40  */
41 #define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
42 
43 typedef struct BDRVNVMeState BDRVNVMeState;
44 
45 /* Same index is used for queues and IRQs */
46 #define INDEX_ADMIN     0
47 #define INDEX_IO(n)     (1 + n)
48 
49 /* This driver shares a single MSIX IRQ for the admin and I/O queues */
50 enum {
51     MSIX_SHARED_IRQ_IDX = 0,
52     MSIX_IRQ_COUNT = 1
53 };
54 
55 typedef struct {
56     int32_t  head, tail;
57     uint8_t  *queue;
58     uint64_t iova;
59     /* Hardware MMIO register */
60     volatile uint32_t *doorbell;
61 } NVMeQueue;
62 
63 typedef struct {
64     BlockCompletionFunc *cb;
65     void *opaque;
66     int cid;
67     void *prp_list_page;
68     uint64_t prp_list_iova;
69     int free_req_next; /* q->reqs[] index of next free req */
70 } NVMeRequest;
71 
72 typedef struct {
73     QemuMutex   lock;
74 
75     /* Read from I/O code path, initialized under BQL */
76     BDRVNVMeState   *s;
77     int             index;
78 
79     /* Fields protected by BQL */
80     uint8_t     *prp_list_pages;
81 
82     /* Fields protected by @lock */
83     CoQueue     free_req_queue;
84     NVMeQueue   sq, cq;
85     int         cq_phase;
86     int         free_req_head;
87     NVMeRequest reqs[NVME_NUM_REQS];
88     int         need_kick;
89     int         inflight;
90 
91     /* Thread-safe, no lock necessary */
92     QEMUBH      *completion_bh;
93 } NVMeQueuePair;
94 
95 struct BDRVNVMeState {
96     AioContext *aio_context;
97     QEMUVFIOState *vfio;
98     void *bar0_wo_map;
99     /* Memory mapped registers */
100     volatile struct {
101         uint32_t sq_tail;
102         uint32_t cq_head;
103     } *doorbells;
104     /* The submission/completion queue pairs.
105      * [0]: admin queue.
106      * [1..]: I/O queues.
107      */
108     NVMeQueuePair **queues;
109     unsigned queue_count;
110     size_t page_size;
111     /* How many uint32_t elements each doorbell entry takes. */
112     size_t doorbell_scale;
113     bool write_cache_supported;
114     EventNotifier irq_notifier[MSIX_IRQ_COUNT];
115 
116     uint64_t nsze; /* Namespace size reported by identify command */
117     int nsid;      /* The namespace id to read/write data. */
118     int blkshift;
119 
120     uint64_t max_transfer;
121     bool plugged;
122 
123     bool supports_write_zeroes;
124     bool supports_discard;
125 
126     CoMutex dma_map_lock;
127     CoQueue dma_flush_queue;
128 
129     /* Total size of mapped qiov, accessed under dma_map_lock */
130     int dma_map_count;
131 
132     /* PCI address (required for nvme_refresh_filename()) */
133     char *device;
134 
135     struct {
136         uint64_t completion_errors;
137         uint64_t aligned_accesses;
138         uint64_t unaligned_accesses;
139     } stats;
140 };
141 
142 #define NVME_BLOCK_OPT_DEVICE "device"
143 #define NVME_BLOCK_OPT_NAMESPACE "namespace"
144 
145 static void nvme_process_completion_bh(void *opaque);
146 
147 static QemuOptsList runtime_opts = {
148     .name = "nvme",
149     .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
150     .desc = {
151         {
152             .name = NVME_BLOCK_OPT_DEVICE,
153             .type = QEMU_OPT_STRING,
154             .help = "NVMe PCI device address",
155         },
156         {
157             .name = NVME_BLOCK_OPT_NAMESPACE,
158             .type = QEMU_OPT_NUMBER,
159             .help = "NVMe namespace",
160         },
161         { /* end of list */ }
162     },
163 };
164 
165 /* Returns true on success, false on failure. */
166 static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
167                             unsigned nentries, size_t entry_bytes, Error **errp)
168 {
169     size_t bytes;
170     int r;
171 
172     bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size);
173     q->head = q->tail = 0;
174     q->queue = qemu_try_memalign(qemu_real_host_page_size, bytes);
175     if (!q->queue) {
176         error_setg(errp, "Cannot allocate queue");
177         return false;
178     }
179     memset(q->queue, 0, bytes);
180     r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova, errp);
181     if (r) {
182         error_prepend(errp, "Cannot map queue: ");
183     }
184     return r == 0;
185 }
186 
187 static void nvme_free_queue(NVMeQueue *q)
188 {
189     qemu_vfree(q->queue);
190 }
191 
192 static void nvme_free_queue_pair(NVMeQueuePair *q)
193 {
194     trace_nvme_free_queue_pair(q->index, q, &q->cq, &q->sq);
195     if (q->completion_bh) {
196         qemu_bh_delete(q->completion_bh);
197     }
198     nvme_free_queue(&q->sq);
199     nvme_free_queue(&q->cq);
200     qemu_vfree(q->prp_list_pages);
201     qemu_mutex_destroy(&q->lock);
202     g_free(q);
203 }
204 
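/* BH callback that wakes coroutines waiting in free_req_queue for a request slot */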
205 static void nvme_free_req_queue_cb(void *opaque)
206 {
207     NVMeQueuePair *q = opaque;
208 
209     qemu_mutex_lock(&q->lock);
210     while (q->free_req_head != -1 &&
211            qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
212         /* Retry waiting requests */
213     }
214     qemu_mutex_unlock(&q->lock);
215 }
216 
217 static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
218                                              AioContext *aio_context,
219                                              unsigned idx, size_t size,
220                                              Error **errp)
221 {
222     int i, r;
223     NVMeQueuePair *q;
224     uint64_t prp_list_iova;
225     size_t bytes;
226 
227     q = g_try_new0(NVMeQueuePair, 1);
228     if (!q) {
229         error_setg(errp, "Cannot allocate queue pair");
230         return NULL;
231     }
232     trace_nvme_create_queue_pair(idx, q, size, aio_context,
233                                  event_notifier_get_fd(s->irq_notifier));
234     bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS,
235                           qemu_real_host_page_size);
236     q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size, bytes);
237     if (!q->prp_list_pages) {
238         error_setg(errp, "Cannot allocate PRP page list");
239         goto fail;
240     }
241     memset(q->prp_list_pages, 0, bytes);
242     qemu_mutex_init(&q->lock);
243     q->s = s;
244     q->index = idx;
245     qemu_co_queue_init(&q->free_req_queue);
246     q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q);
247     r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes,
248                           false, &prp_list_iova, errp);
249     if (r) {
250         error_prepend(errp, "Cannot map buffer for DMA: ");
251         goto fail;
252     }
253     q->free_req_head = -1;
254     for (i = 0; i < NVME_NUM_REQS; i++) {
255         NVMeRequest *req = &q->reqs[i];
256         req->cid = i + 1;
257         req->free_req_next = q->free_req_head;
258         q->free_req_head = i;
259         req->prp_list_page = q->prp_list_pages + i * s->page_size;
260         req->prp_list_iova = prp_list_iova + i * s->page_size;
261     }
262 
263     if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) {
264         goto fail;
265     }
266     q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
267 
268     if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) {
269         goto fail;
270     }
271     q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
272 
273     return q;
274 fail:
275     nvme_free_queue_pair(q);
276     return NULL;
277 }
278 
279 /* With q->lock */
280 static void nvme_kick(NVMeQueuePair *q)
281 {
282     BDRVNVMeState *s = q->s;
283 
284     if (s->plugged || !q->need_kick) {
285         return;
286     }
287     trace_nvme_kick(s, q->index);
288     assert(!(q->sq.tail & 0xFF00));
289     /* Fence the write to the submission queue entry before notifying the device. */
290     smp_wmb();
291     *q->sq.doorbell = cpu_to_le32(q->sq.tail);
292     q->inflight += q->need_kick;
293     q->need_kick = 0;
294 }
295 
296 /* Find a free request element if any, otherwise:
297  * a) if in coroutine context, try to wait for one to become available;
298  * b) if not in coroutine context, return NULL.
299  */
300 static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
301 {
302     NVMeRequest *req;
303 
304     qemu_mutex_lock(&q->lock);
305 
306     while (q->free_req_head == -1) {
307         if (qemu_in_coroutine()) {
308             trace_nvme_free_req_queue_wait(q->s, q->index);
309             qemu_co_queue_wait(&q->free_req_queue, &q->lock);
310         } else {
311             qemu_mutex_unlock(&q->lock);
312             return NULL;
313         }
314     }
315 
316     req = &q->reqs[q->free_req_head];
317     q->free_req_head = req->free_req_next;
318     req->free_req_next = -1;
319 
320     qemu_mutex_unlock(&q->lock);
321     return req;
322 }
323 
324 /* With q->lock */
325 static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
326 {
327     req->free_req_next = q->free_req_head;
328     q->free_req_head = req - q->reqs;
329 }
330 
331 /* With q->lock */
332 static void nvme_wake_free_req_locked(NVMeQueuePair *q)
333 {
334     if (!qemu_co_queue_empty(&q->free_req_queue)) {
335         replay_bh_schedule_oneshot_event(q->s->aio_context,
336                 nvme_free_req_queue_cb, q);
337     }
338 }
339 
340 /* Insert a request in the freelist and wake waiters */
341 static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req)
342 {
343     qemu_mutex_lock(&q->lock);
344     nvme_put_free_req_locked(q, req);
345     nvme_wake_free_req_locked(q);
346     qemu_mutex_unlock(&q->lock);
347 }
348 
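/* Map the status field of a completion queue entry to a negative errno value */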
349 static inline int nvme_translate_error(const NvmeCqe *c)
350 {
351     uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
352     if (status) {
353         trace_nvme_error(le32_to_cpu(c->result),
354                          le16_to_cpu(c->sq_head),
355                          le16_to_cpu(c->sq_id),
356                          le16_to_cpu(c->cid),
357                          le16_to_cpu(status));
358     }
359     switch (status) {
360     case 0:
361         return 0;
362     case 1:
363         return -ENOSYS;
364     case 2:
365         return -EINVAL;
366     default:
367         return -EIO;
368     }
369 }
370 
371 /* With q->lock */
372 static bool nvme_process_completion(NVMeQueuePair *q)
373 {
374     BDRVNVMeState *s = q->s;
375     bool progress = false;
376     NVMeRequest *preq;
377     NVMeRequest req;
378     NvmeCqe *c;
379 
380     trace_nvme_process_completion(s, q->index, q->inflight);
381     if (s->plugged) {
382         trace_nvme_process_completion_queue_plugged(s, q->index);
383         return false;
384     }
385 
386     /*
387      * Support re-entrancy when a request cb() function invokes aio_poll().
388      * Pending completions must be visible to aio_poll() so that a cb()
389      * function can wait for the completion of another request.
390      *
391      * The aio_poll() loop will execute our BH and we'll resume completion
392      * processing there.
393      */
394     qemu_bh_schedule(q->completion_bh);
395 
396     assert(q->inflight >= 0);
397     while (q->inflight) {
398         int ret;
399         int16_t cid;
400 
401         c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
402         if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
403             break;
404         }
405         ret = nvme_translate_error(c);
406         if (ret) {
407             s->stats.completion_errors++;
408         }
409         q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
410         if (!q->cq.head) {
411             q->cq_phase = !q->cq_phase;
412         }
413         cid = le16_to_cpu(c->cid);
414         if (cid == 0 || cid > NVME_QUEUE_SIZE) {
415             warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", "
416                         "queue size: %u", cid, NVME_QUEUE_SIZE);
417             continue;
418         }
419         trace_nvme_complete_command(s, q->index, cid);
420         preq = &q->reqs[cid - 1];
421         req = *preq;
422         assert(req.cid == cid);
423         assert(req.cb);
424         nvme_put_free_req_locked(q, preq);
425         preq->cb = preq->opaque = NULL;
426         q->inflight--;
427         qemu_mutex_unlock(&q->lock);
428         req.cb(req.opaque, ret);
429         qemu_mutex_lock(&q->lock);
430         progress = true;
431     }
432     if (progress) {
433         /* Notify the device so it can post more completions. */
434         smp_mb_release();
435         *q->cq.doorbell = cpu_to_le32(q->cq.head);
436         nvme_wake_free_req_locked(q);
437     }
438 
439     qemu_bh_cancel(q->completion_bh);
440 
441     return progress;
442 }
443 
444 static void nvme_process_completion_bh(void *opaque)
445 {
446     NVMeQueuePair *q = opaque;
447 
448     /*
449      * We're being invoked because a nvme_process_completion() cb() function
450      * called aio_poll(). The callback may be waiting for further completions
451      * so notify the device that it has space to fill in more completions now.
452      */
453     smp_mb_release();
454     *q->cq.doorbell = cpu_to_le32(q->cq.head);
455     nvme_wake_free_req_locked(q);
456 
457     nvme_process_completion(q);
458 }
459 
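/* Dump the 64-byte command as eight rows of eight bytes when raw command tracing is enabled */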
460 static void nvme_trace_command(const NvmeCmd *cmd)
461 {
462     int i;
463 
464     if (!trace_event_get_state_backends(TRACE_NVME_SUBMIT_COMMAND_RAW)) {
465         return;
466     }
467     for (i = 0; i < 8; ++i) {
468         uint8_t *cmdp = (uint8_t *)cmd + i * 8;
469         trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3],
470                                       cmdp[4], cmdp[5], cmdp[6], cmdp[7]);
471     }
472 }
473 
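/*
 * Fill in @req with @cb/@opaque, copy @cmd into the submission queue, then
 * kick the queue and process any completions that are already pending.
 */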
474 static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
475                                 NvmeCmd *cmd, BlockCompletionFunc cb,
476                                 void *opaque)
477 {
478     assert(!req->cb);
479     req->cb = cb;
480     req->opaque = opaque;
481     cmd->cid = cpu_to_le16(req->cid);
482 
483     trace_nvme_submit_command(q->s, q->index, req->cid);
484     nvme_trace_command(cmd);
485     qemu_mutex_lock(&q->lock);
486     memcpy((uint8_t *)q->sq.queue +
487            q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
488     q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
489     q->need_kick++;
490     nvme_kick(q);
491     nvme_process_completion(q);
492     qemu_mutex_unlock(&q->lock);
493 }
494 
495 static void nvme_admin_cmd_sync_cb(void *opaque, int ret)
496 {
497     int *pret = opaque;
498     *pret = ret;
499     aio_wait_kick();
500 }
501 
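/*
 * Submit an admin command and wait for it synchronously with AIO_WAIT_WHILE().
 * Returns 0 on success, a negative errno on error, or -EBUSY if no free
 * request slot is available.
 */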
502 static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd)
503 {
504     BDRVNVMeState *s = bs->opaque;
505     NVMeQueuePair *q = s->queues[INDEX_ADMIN];
506     AioContext *aio_context = bdrv_get_aio_context(bs);
507     NVMeRequest *req;
508     int ret = -EINPROGRESS;
509     req = nvme_get_free_req(q);
510     if (!req) {
511         return -EBUSY;
512     }
513     nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret);
514 
515     AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
516     return ret;
517 }
518 
519 /* Returns true on success, false on failure. */
520 static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
521 {
522     BDRVNVMeState *s = bs->opaque;
523     bool ret = false;
524     QEMU_AUTO_VFREE union {
525         NvmeIdCtrl ctrl;
526         NvmeIdNs ns;
527     } *id = NULL;
528     NvmeLBAF *lbaf;
529     uint16_t oncs;
530     int r;
531     uint64_t iova;
532     NvmeCmd cmd = {
533         .opcode = NVME_ADM_CMD_IDENTIFY,
534         .cdw10 = cpu_to_le32(0x1),
535     };
536     size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size);
537 
538     id = qemu_try_memalign(qemu_real_host_page_size, id_size);
539     if (!id) {
540         error_setg(errp, "Cannot allocate buffer for identify response");
541         goto out;
542     }
543     r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova, errp);
544     if (r) {
545         error_prepend(errp, "Cannot map buffer for DMA: ");
546         goto out;
547     }
548 
549     memset(id, 0, id_size);
550     cmd.dptr.prp1 = cpu_to_le64(iova);
551     if (nvme_admin_cmd_sync(bs, &cmd)) {
552         error_setg(errp, "Failed to identify controller");
553         goto out;
554     }
555 
556     if (le32_to_cpu(id->ctrl.nn) < namespace) {
557         error_setg(errp, "Invalid namespace");
558         goto out;
559     }
560     s->write_cache_supported = le32_to_cpu(id->ctrl.vwc) & 0x1;
561     s->max_transfer = (id->ctrl.mdts ? 1 << id->ctrl.mdts : 0) * s->page_size;
562     /* For now the page list buffer per command is one page, to hold at most
563      * s->page_size / sizeof(uint64_t) entries. */
564     s->max_transfer = MIN_NON_ZERO(s->max_transfer,
565                           s->page_size / sizeof(uint64_t) * s->page_size);
566 
567     oncs = le16_to_cpu(id->ctrl.oncs);
568     s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
569     s->supports_discard = !!(oncs & NVME_ONCS_DSM);
570 
571     memset(id, 0, id_size);
572     cmd.cdw10 = 0;
573     cmd.nsid = cpu_to_le32(namespace);
574     if (nvme_admin_cmd_sync(bs, &cmd)) {
575         error_setg(errp, "Failed to identify namespace");
576         goto out;
577     }
578 
579     s->nsze = le64_to_cpu(id->ns.nsze);
580     lbaf = &id->ns.lbaf[NVME_ID_NS_FLBAS_INDEX(id->ns.flbas)];
581 
582     if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(id->ns.dlfeat) &&
583             NVME_ID_NS_DLFEAT_READ_BEHAVIOR(id->ns.dlfeat) ==
584                     NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) {
585         bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
586     }
587 
588     if (lbaf->ms) {
589         error_setg(errp, "Namespaces with metadata are not yet supported");
590         goto out;
591     }
592 
593     if (lbaf->ds < BDRV_SECTOR_BITS || lbaf->ds > 12 ||
594         (1 << lbaf->ds) > s->page_size)
595     {
596         error_setg(errp, "Namespace has unsupported block size (2^%d)",
597                    lbaf->ds);
598         goto out;
599     }
600 
601     ret = true;
602     s->blkshift = lbaf->ds;
603 out:
604     qemu_vfio_dma_unmap(s->vfio, id);
605 
606     return ret;
607 }
608 
609 static void nvme_poll_queue(NVMeQueuePair *q)
610 {
611     const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
612     NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
613 
614     trace_nvme_poll_queue(q->s, q->index);
615     /*
616      * Do an early check for completions. q->lock isn't needed because
617      * nvme_process_completion() only runs in the event loop thread and
618      * cannot race with itself.
619      */
620     if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
621         return;
622     }
623 
624     qemu_mutex_lock(&q->lock);
625     while (nvme_process_completion(q)) {
626         /* Keep polling */
627     }
628     qemu_mutex_unlock(&q->lock);
629 }
630 
631 static void nvme_poll_queues(BDRVNVMeState *s)
632 {
633     int i;
634 
635     for (i = 0; i < s->queue_count; i++) {
636         nvme_poll_queue(s->queues[i]);
637     }
638 }
639 
640 static void nvme_handle_event(EventNotifier *n)
641 {
642     BDRVNVMeState *s = container_of(n, BDRVNVMeState,
643                                     irq_notifier[MSIX_SHARED_IRQ_IDX]);
644 
645     trace_nvme_handle_event(s);
646     event_notifier_test_and_clear(n);
647     nvme_poll_queues(s);
648 }
649 
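/*
 * Add one I/O queue pair: allocate it locally, then register it with the
 * controller via the Create I/O Completion/Submission Queue admin commands.
 * Returns true on success, false on failure.
 */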
650 static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
651 {
652     BDRVNVMeState *s = bs->opaque;
653     unsigned n = s->queue_count;
654     NVMeQueuePair *q;
655     NvmeCmd cmd;
656     unsigned queue_size = NVME_QUEUE_SIZE;
657 
658     assert(n <= UINT16_MAX);
659     q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
660                                n, queue_size, errp);
661     if (!q) {
662         return false;
663     }
664     cmd = (NvmeCmd) {
665         .opcode = NVME_ADM_CMD_CREATE_CQ,
666         .dptr.prp1 = cpu_to_le64(q->cq.iova),
667         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
668         .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC),
669     };
670     if (nvme_admin_cmd_sync(bs, &cmd)) {
671         error_setg(errp, "Failed to create CQ io queue [%u]", n);
672         goto out_error;
673     }
674     cmd = (NvmeCmd) {
675         .opcode = NVME_ADM_CMD_CREATE_SQ,
676         .dptr.prp1 = cpu_to_le64(q->sq.iova),
677         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
678         .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)),
679     };
680     if (nvme_admin_cmd_sync(bs, &cmd)) {
681         error_setg(errp, "Failed to create SQ io queue [%u]", n);
682         goto out_error;
683     }
684     s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
685     s->queues[n] = q;
686     s->queue_count++;
687     return true;
688 out_error:
689     nvme_free_queue_pair(q);
690     return false;
691 }
692 
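/* Polling callback: returns true if any completion queue has a new entry to process */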
693 static bool nvme_poll_cb(void *opaque)
694 {
695     EventNotifier *e = opaque;
696     BDRVNVMeState *s = container_of(e, BDRVNVMeState,
697                                     irq_notifier[MSIX_SHARED_IRQ_IDX]);
698     int i;
699 
700     for (i = 0; i < s->queue_count; i++) {
701         NVMeQueuePair *q = s->queues[i];
702         const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
703         NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
704 
705         /*
706          * q->lock isn't needed because nvme_process_completion() only runs in
707          * the event loop thread and cannot race with itself.
708          */
709         if ((le16_to_cpu(cqe->status) & 0x1) != q->cq_phase) {
710             return true;
711         }
712     }
713     return false;
714 }
715 
716 static void nvme_poll_ready(EventNotifier *e)
717 {
718     BDRVNVMeState *s = container_of(e, BDRVNVMeState,
719                                     irq_notifier[MSIX_SHARED_IRQ_IDX]);
720 
721     nvme_poll_queues(s);
722 }
723 
724 static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
725                      Error **errp)
726 {
727     BDRVNVMeState *s = bs->opaque;
728     NVMeQueuePair *q;
729     AioContext *aio_context = bdrv_get_aio_context(bs);
730     int ret;
731     uint64_t cap;
732     uint32_t ver;
733     uint64_t timeout_ms;
734     uint64_t deadline, now;
735     volatile NvmeBar *regs = NULL;
736 
737     qemu_co_mutex_init(&s->dma_map_lock);
738     qemu_co_queue_init(&s->dma_flush_queue);
739     s->device = g_strdup(device);
740     s->nsid = namespace;
741     s->aio_context = bdrv_get_aio_context(bs);
742     ret = event_notifier_init(&s->irq_notifier[MSIX_SHARED_IRQ_IDX], 0);
743     if (ret) {
744         error_setg(errp, "Failed to init event notifier");
745         return ret;
746     }
747 
748     s->vfio = qemu_vfio_open_pci(device, errp);
749     if (!s->vfio) {
750         ret = -EINVAL;
751         goto out;
752     }
753 
754     regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar),
755                                  PROT_READ | PROT_WRITE, errp);
756     if (!regs) {
757         ret = -EINVAL;
758         goto out;
759     }
760     /* Perform the initialization sequence as described in NVMe spec
761      * "7.6.1 Initialization". */
762 
763     cap = le64_to_cpu(regs->cap);
764     trace_nvme_controller_capability_raw(cap);
765     trace_nvme_controller_capability("Maximum Queue Entries Supported",
766                                      1 + NVME_CAP_MQES(cap));
767     trace_nvme_controller_capability("Contiguous Queues Required",
768                                      NVME_CAP_CQR(cap));
769     trace_nvme_controller_capability("Doorbell Stride",
770                                      1 << (2 + NVME_CAP_DSTRD(cap)));
771     trace_nvme_controller_capability("Subsystem Reset Supported",
772                                      NVME_CAP_NSSRS(cap));
773     trace_nvme_controller_capability("Memory Page Size Minimum",
774                                      1 << (12 + NVME_CAP_MPSMIN(cap)));
775     trace_nvme_controller_capability("Memory Page Size Maximum",
776                                      1 << (12 + NVME_CAP_MPSMAX(cap)));
777     if (!NVME_CAP_CSS(cap)) {
778         error_setg(errp, "Device doesn't support NVMe command set");
779         ret = -EINVAL;
780         goto out;
781     }
782 
783     s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap));
784     s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
785     bs->bl.opt_mem_alignment = s->page_size;
786     bs->bl.request_alignment = s->page_size;
787     timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000);
788 
789     ver = le32_to_cpu(regs->vs);
790     trace_nvme_controller_spec_version(extract32(ver, 16, 16),
791                                        extract32(ver, 8, 8),
792                                        extract32(ver, 0, 8));
793 
794     /* Reset device to get a clean state. */
795     regs->cc = cpu_to_le32(le32_to_cpu(regs->cc) & 0xFE);
796     /* Wait for CSTS.RDY = 0. */
797     deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS;
798     while (NVME_CSTS_RDY(le32_to_cpu(regs->csts))) {
799         if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
800             error_setg(errp, "Timeout while waiting for device to reset (%"
801                              PRId64 " ms)",
802                        timeout_ms);
803             ret = -ETIMEDOUT;
804             goto out;
805         }
806     }
807 
808     s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0,
809                                            sizeof(NvmeBar) + NVME_DOORBELL_SIZE,
810                                            PROT_WRITE, errp);
811     s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar));
812     if (!s->doorbells) {
813         ret = -EINVAL;
814         goto out;
815     }
816 
817     /* Set up admin queue. */
818     s->queues = g_new(NVMeQueuePair *, 1);
819     q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp);
820     if (!q) {
821         ret = -EINVAL;
822         goto out;
823     }
824     s->queues[INDEX_ADMIN] = q;
825     s->queue_count = 1;
826     QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000);
827     regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) |
828                             ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT));
829     regs->asq = cpu_to_le64(q->sq.iova);
830     regs->acq = cpu_to_le64(q->cq.iova);
831 
832     /* After setting up all control registers we can enable the device. */
833     regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) |
834                            (ctz32(NVME_SQ_ENTRY_BYTES) << CC_IOSQES_SHIFT) |
835                            CC_EN_MASK);
836     /* Wait for CSTS.RDY = 1. */
837     now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
838     deadline = now + timeout_ms * SCALE_MS;
839     while (!NVME_CSTS_RDY(le32_to_cpu(regs->csts))) {
840         if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
841             error_setg(errp, "Timeout while waiting for device to start (%"
842                              PRId64 " ms)",
843                        timeout_ms);
844             ret = -ETIMEDOUT;
845             goto out;
846         }
847     }
848 
849     ret = qemu_vfio_pci_init_irq(s->vfio, s->irq_notifier,
850                                  VFIO_PCI_MSIX_IRQ_INDEX, errp);
851     if (ret) {
852         goto out;
853     }
854     aio_set_event_notifier(bdrv_get_aio_context(bs),
855                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
856                            false, nvme_handle_event, nvme_poll_cb,
857                            nvme_poll_ready);
858 
859     if (!nvme_identify(bs, namespace, errp)) {
860         ret = -EIO;
861         goto out;
862     }
863 
864     /* Set up command queues. */
865     if (!nvme_add_io_queue(bs, errp)) {
866         ret = -EIO;
867     }
868 out:
869     if (regs) {
870         qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)regs, 0, sizeof(NvmeBar));
871     }
872 
873     /* Cleaning up is done in nvme_file_open() upon error. */
874     return ret;
875 }
876 
877 /* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example:
878  *
879  *     nvme://0000:44:00.0/1
880  *
881  * where "nvme://" is the fixed protocol prefix, the middle part is the PCI
882  * address, and the last part is the namespace number, starting from 1
883  * according to the NVMe spec. */
884 static void nvme_parse_filename(const char *filename, QDict *options,
885                                 Error **errp)
886 {
887     int pref = strlen("nvme://");
888 
889     if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) {
890         const char *tmp = filename + pref;
891         char *device;
892         const char *namespace;
893         unsigned long ns;
894         const char *slash = strchr(tmp, '/');
895         if (!slash) {
896             qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp);
897             return;
898         }
899         device = g_strndup(tmp, slash - tmp);
900         qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device);
901         g_free(device);
902         namespace = slash + 1;
903         if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) {
904             error_setg(errp, "Invalid namespace '%s', positive number expected",
905                        namespace);
906             return;
907         }
908         qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE,
909                       *namespace ? namespace : "1");
910     }
911 }
912 
913 static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
914                                            Error **errp)
915 {
916     int ret;
917     BDRVNVMeState *s = bs->opaque;
918     NvmeCmd cmd = {
919         .opcode = NVME_ADM_CMD_SET_FEATURES,
920         .nsid = cpu_to_le32(s->nsid),
921         .cdw10 = cpu_to_le32(0x06),
922         .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
923     };
924 
925     ret = nvme_admin_cmd_sync(bs, &cmd);
926     if (ret) {
927         error_setg(errp, "Failed to configure NVMe write cache");
928     }
929     return ret;
930 }
931 
932 static void nvme_close(BlockDriverState *bs)
933 {
934     BDRVNVMeState *s = bs->opaque;
935 
936     for (unsigned i = 0; i < s->queue_count; ++i) {
937         nvme_free_queue_pair(s->queues[i]);
938     }
939     g_free(s->queues);
940     aio_set_event_notifier(bdrv_get_aio_context(bs),
941                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
942                            false, NULL, NULL, NULL);
943     event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
944     qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map,
945                             0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE);
946     qemu_vfio_close(s->vfio);
947 
948     g_free(s->device);
949 }
950 
951 static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
952                           Error **errp)
953 {
954     const char *device;
955     QemuOpts *opts;
956     int namespace;
957     int ret;
958     BDRVNVMeState *s = bs->opaque;
959 
960     bs->supported_write_flags = BDRV_REQ_FUA;
961 
962     opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
963     qemu_opts_absorb_qdict(opts, options, &error_abort);
964     device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
965     if (!device) {
966         error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required");
967         qemu_opts_del(opts);
968         return -EINVAL;
969     }
970 
971     namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1);
972     ret = nvme_init(bs, device, namespace, errp);
973     qemu_opts_del(opts);
974     if (ret) {
975         goto fail;
976     }
977     if (flags & BDRV_O_NOCACHE) {
978         if (!s->write_cache_supported) {
979             error_setg(errp,
980                        "NVMe controller doesn't support write cache configuration");
981             ret = -EINVAL;
982         } else {
983             ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE),
984                                                   errp);
985         }
986         if (ret) {
987             goto fail;
988         }
989     }
990     return 0;
991 fail:
992     nvme_close(bs);
993     return ret;
994 }
995 
996 static int64_t nvme_getlength(BlockDriverState *bs)
997 {
998     BDRVNVMeState *s = bs->opaque;
999     return s->nsze << s->blkshift;
1000 }
1001 
1002 static uint32_t nvme_get_blocksize(BlockDriverState *bs)
1003 {
1004     BDRVNVMeState *s = bs->opaque;
1005     assert(s->blkshift >= BDRV_SECTOR_BITS && s->blkshift <= 12);
1006     return UINT32_C(1) << s->blkshift;
1007 }
1008 
1009 static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
1010 {
1011     uint32_t blocksize = nvme_get_blocksize(bs);
1012     bsz->phys = blocksize;
1013     bsz->log = blocksize;
1014     return 0;
1015 }
1016 
1017 /* Called with s->dma_map_lock */
1018 static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs,
1019                                             QEMUIOVector *qiov)
1020 {
1021     int r = 0;
1022     BDRVNVMeState *s = bs->opaque;
1023 
1024     s->dma_map_count -= qiov->size;
1025     if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) {
1026         r = qemu_vfio_dma_reset_temporary(s->vfio);
1027         if (!r) {
1028             qemu_co_queue_restart_all(&s->dma_flush_queue);
1029         }
1030     }
1031     return r;
1032 }
1033 
1034 /* Called with s->dma_map_lock */
1035 static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
1036                                           NVMeRequest *req, QEMUIOVector *qiov)
1037 {
1038     BDRVNVMeState *s = bs->opaque;
1039     uint64_t *pagelist = req->prp_list_page;
1040     int i, j, r;
1041     int entries = 0;
1042     Error *local_err = NULL, **errp = NULL;
1043 
1044     assert(qiov->size);
1045     assert(QEMU_IS_ALIGNED(qiov->size, s->page_size));
1046     assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t));
1047     for (i = 0; i < qiov->niov; ++i) {
1048         bool retry = true;
1049         uint64_t iova;
1050         size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len,
1051                                    qemu_real_host_page_size);
1052 try_map:
1053         r = qemu_vfio_dma_map(s->vfio,
1054                               qiov->iov[i].iov_base,
1055                               len, true, &iova, errp);
1056         if (r == -ENOSPC) {
1057             /*
1058              * In addition to the -ENOMEM error, the VFIO_IOMMU_MAP_DMA
1059              * ioctl returns -ENOSPC to signal the user exhausted the DMA
1060              * mappings available for a container since Linux kernel commit
1061              * 492855939bdb ("vfio/type1: Limit DMA mappings per container",
1062              * April 2019, see CVE-2019-3882).
1063              *
1064              * This block driver already handles this error path by checking
1065              * for the -ENOMEM error, so we directly replace -ENOSPC by
1066              * -ENOMEM. Besides, -ENOSPC has a specific meaning for blockdev
1067              * coroutines: it triggers BLOCKDEV_ON_ERROR_ENOSPC and
1068              * BLOCK_ERROR_ACTION_STOP which stops the VM, asking the operator
1069              * to add more storage to the blockdev. Not something we can do
1070              * easily with an IOMMU :)
1071              */
1072             r = -ENOMEM;
1073         }
1074         if (r == -ENOMEM && retry) {
1075             /*
1076              * We exhausted the DMA mappings available for our container:
1077              * recycle the volatile IOVA mappings.
1078              */
1079             retry = false;
1080             trace_nvme_dma_flush_queue_wait(s);
1081             if (s->dma_map_count) {
1082                 trace_nvme_dma_map_flush(s);
1083                 qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock);
1084             } else {
1085                 r = qemu_vfio_dma_reset_temporary(s->vfio);
1086                 if (r) {
1087                     goto fail;
1088                 }
1089             }
1090             errp = &local_err;
1091 
1092             goto try_map;
1093         }
1094         if (r) {
1095             goto fail;
1096         }
1097 
1098         for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) {
1099             pagelist[entries++] = cpu_to_le64(iova + j * s->page_size);
1100         }
1101         trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base,
1102                                     qiov->iov[i].iov_len / s->page_size);
1103     }
1104 
1105     s->dma_map_count += qiov->size;
1106 
1107     assert(entries <= s->page_size / sizeof(uint64_t));
1108     switch (entries) {
1109     case 0:
1110         abort();
1111     case 1:
1112         cmd->dptr.prp1 = pagelist[0];
1113         cmd->dptr.prp2 = 0;
1114         break;
1115     case 2:
1116         cmd->dptr.prp1 = pagelist[0];
1117         cmd->dptr.prp2 = pagelist[1];
1118         break;
1119     default:
1120         cmd->dptr.prp1 = pagelist[0];
1121         cmd->dptr.prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
1122         break;
1123     }
1124     trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
1125     for (i = 0; i < entries; ++i) {
1126         trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]);
1127     }
1128     return 0;
1129 fail:
1130     /* No need to unmap [0 - i) iovs even if we've failed, since we don't
1131      * increment s->dma_map_count. This is okay for fixed mapping memory areas
1132      * because they are already mapped before calling this function; for
1133      * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by
1134      * calling qemu_vfio_dma_reset_temporary when necessary. */
1135     if (local_err) {
1136         error_reportf_err(local_err, "Cannot map buffer for DMA: ");
1137     }
1138     return r;
1139 }
1140 
1141 typedef struct {
1142     Coroutine *co;
1143     int ret;
1144     AioContext *ctx;
1145 } NVMeCoData;
1146 
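/* Bottom half that re-enters the coroutine waiting for its I/O request to complete */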
1147 static void nvme_rw_cb_bh(void *opaque)
1148 {
1149     NVMeCoData *data = opaque;
1150     qemu_coroutine_enter(data->co);
1151 }
1152 
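/*
 * Request completion callback: store the result and, if the coroutine has
 * already registered itself in data->co, schedule a BH in its AioContext to
 * re-enter it once it yields.
 */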
1153 static void nvme_rw_cb(void *opaque, int ret)
1154 {
1155     NVMeCoData *data = opaque;
1156     data->ret = ret;
1157     if (!data->co) {
1158         /* The rw coroutine hasn't yielded, don't try to enter. */
1159         return;
1160     }
1161     replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
1162 }
1163 
1164 static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
1165                                             uint64_t offset, uint64_t bytes,
1166                                             QEMUIOVector *qiov,
1167                                             bool is_write,
1168                                             int flags)
1169 {
1170     int r;
1171     BDRVNVMeState *s = bs->opaque;
1172     NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
1173     NVMeRequest *req;
1174 
1175     uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) |
1176                        (flags & BDRV_REQ_FUA ? 1 << 30 : 0);
1177     NvmeCmd cmd = {
1178         .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ,
1179         .nsid = cpu_to_le32(s->nsid),
1180         .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
1181         .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
1182         .cdw12 = cpu_to_le32(cdw12),
1183     };
1184     NVMeCoData data = {
1185         .ctx = bdrv_get_aio_context(bs),
1186         .ret = -EINPROGRESS,
1187     };
1188 
1189     trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
1190     assert(s->queue_count > 1);
1191     req = nvme_get_free_req(ioq);
1192     assert(req);
1193 
1194     qemu_co_mutex_lock(&s->dma_map_lock);
1195     r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
1196     qemu_co_mutex_unlock(&s->dma_map_lock);
1197     if (r) {
1198         nvme_put_free_req_and_wake(ioq, req);
1199         return r;
1200     }
1201     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1202 
1203     data.co = qemu_coroutine_self();
1204     while (data.ret == -EINPROGRESS) {
1205         qemu_coroutine_yield();
1206     }
1207 
1208     qemu_co_mutex_lock(&s->dma_map_lock);
1209     r = nvme_cmd_unmap_qiov(bs, qiov);
1210     qemu_co_mutex_unlock(&s->dma_map_lock);
1211     if (r) {
1212         return r;
1213     }
1214 
1215     trace_nvme_rw_done(s, is_write, offset, bytes, data.ret);
1216     return data.ret;
1217 }
1218 
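/* Check that every buffer in @qiov has a page-aligned base address and length */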
1219 static inline bool nvme_qiov_aligned(BlockDriverState *bs,
1220                                      const QEMUIOVector *qiov)
1221 {
1222     int i;
1223     BDRVNVMeState *s = bs->opaque;
1224 
1225     for (i = 0; i < qiov->niov; ++i) {
1226         if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base,
1227                                  qemu_real_host_page_size) ||
1228             !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size)) {
1229             trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
1230                                       qiov->iov[i].iov_len, s->page_size);
1231             return false;
1232         }
1233     }
1234     return true;
1235 }
1236 
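/*
 * Common read/write path: use the caller's buffers directly when they are
 * page-aligned, otherwise bounce the request through an aligned buffer.
 */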
1237 static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
1238                        QEMUIOVector *qiov, bool is_write, int flags)
1239 {
1240     BDRVNVMeState *s = bs->opaque;
1241     int r;
1242     QEMU_AUTO_VFREE uint8_t *buf = NULL;
1243     QEMUIOVector local_qiov;
1244     size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size);
1245     assert(QEMU_IS_ALIGNED(offset, s->page_size));
1246     assert(QEMU_IS_ALIGNED(bytes, s->page_size));
1247     assert(bytes <= s->max_transfer);
1248     if (nvme_qiov_aligned(bs, qiov)) {
1249         s->stats.aligned_accesses++;
1250         return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
1251     }
1252     s->stats.unaligned_accesses++;
1253     trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
1254     buf = qemu_try_memalign(qemu_real_host_page_size, len);
1255 
1256     if (!buf) {
1257         return -ENOMEM;
1258     }
1259     qemu_iovec_init(&local_qiov, 1);
1260     if (is_write) {
1261         qemu_iovec_to_buf(qiov, 0, buf, bytes);
1262     }
1263     qemu_iovec_add(&local_qiov, buf, bytes);
1264     r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags);
1265     qemu_iovec_destroy(&local_qiov);
1266     if (!r && !is_write) {
1267         qemu_iovec_from_buf(qiov, 0, buf, bytes);
1268     }
1269     return r;
1270 }
1271 
1272 static coroutine_fn int nvme_co_preadv(BlockDriverState *bs,
1273                                        int64_t offset, int64_t bytes,
1274                                        QEMUIOVector *qiov,
1275                                        BdrvRequestFlags flags)
1276 {
1277     return nvme_co_prw(bs, offset, bytes, qiov, false, flags);
1278 }
1279 
1280 static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs,
1281                                         int64_t offset, int64_t bytes,
1282                                         QEMUIOVector *qiov,
1283                                         BdrvRequestFlags flags)
1284 {
1285     return nvme_co_prw(bs, offset, bytes, qiov, true, flags);
1286 }
1287 
1288 static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
1289 {
1290     BDRVNVMeState *s = bs->opaque;
1291     NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
1292     NVMeRequest *req;
1293     NvmeCmd cmd = {
1294         .opcode = NVME_CMD_FLUSH,
1295         .nsid = cpu_to_le32(s->nsid),
1296     };
1297     NVMeCoData data = {
1298         .ctx = bdrv_get_aio_context(bs),
1299         .ret = -EINPROGRESS,
1300     };
1301 
1302     assert(s->queue_count > 1);
1303     req = nvme_get_free_req(ioq);
1304     assert(req);
1305     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1306 
1307     data.co = qemu_coroutine_self();
1308     if (data.ret == -EINPROGRESS) {
1309         qemu_coroutine_yield();
1310     }
1311 
1312     return data.ret;
1313 }
1314 
1315 
1316 static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
1317                                               int64_t offset,
1318                                               int64_t bytes,
1319                                               BdrvRequestFlags flags)
1320 {
1321     BDRVNVMeState *s = bs->opaque;
1322     NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
1323     NVMeRequest *req;
1324     uint32_t cdw12;
1325 
1326     if (!s->supports_write_zeroes) {
1327         return -ENOTSUP;
1328     }
1329 
1330     if (bytes == 0) {
1331         return 0;
1332     }
1333 
1334     cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
1335     /*
1336      * We should not lose information. pwrite_zeroes_alignment and
1337      * max_pwrite_zeroes guarantee it.
1338      */
1339     assert(((cdw12 + 1) << s->blkshift) == bytes);
1340 
1341     NvmeCmd cmd = {
1342         .opcode = NVME_CMD_WRITE_ZEROES,
1343         .nsid = cpu_to_le32(s->nsid),
1344         .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
1345         .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
1346     };
1347 
1348     NVMeCoData data = {
1349         .ctx = bdrv_get_aio_context(bs),
1350         .ret = -EINPROGRESS,
1351     };
1352 
1353     if (flags & BDRV_REQ_MAY_UNMAP) {
1354         cdw12 |= (1 << 25);
1355     }
1356 
1357     if (flags & BDRV_REQ_FUA) {
1358         cdw12 |= (1 << 30);
1359     }
1360 
1361     cmd.cdw12 = cpu_to_le32(cdw12);
1362 
1363     trace_nvme_write_zeroes(s, offset, bytes, flags);
1364     assert(s->queue_count > 1);
1365     req = nvme_get_free_req(ioq);
1366     assert(req);
1367 
1368     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1369 
1370     data.co = qemu_coroutine_self();
1371     while (data.ret == -EINPROGRESS) {
1372         qemu_coroutine_yield();
1373     }
1374 
1375     trace_nvme_rw_done(s, true, offset, bytes, data.ret);
1376     return data.ret;
1377 }
1378 
1379 
1380 static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
1381                                          int64_t offset,
1382                                          int64_t bytes)
1383 {
1384     BDRVNVMeState *s = bs->opaque;
1385     NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
1386     NVMeRequest *req;
1387     QEMU_AUTO_VFREE NvmeDsmRange *buf = NULL;
1388     QEMUIOVector local_qiov;
1389     int ret;
1390 
1391     NvmeCmd cmd = {
1392         .opcode = NVME_CMD_DSM,
1393         .nsid = cpu_to_le32(s->nsid),
1394         .cdw10 = cpu_to_le32(0), /* number of ranges - 0 based */
1395         .cdw11 = cpu_to_le32(1 << 2), /* deallocate bit */
1396     };
1397 
1398     NVMeCoData data = {
1399         .ctx = bdrv_get_aio_context(bs),
1400         .ret = -EINPROGRESS,
1401     };
1402 
1403     if (!s->supports_discard) {
1404         return -ENOTSUP;
1405     }
1406 
1407     assert(s->queue_count > 1);
1408 
1409     /*
1410      * Filling the @buf requires @offset and @bytes to satisfy restrictions
1411      * defined in nvme_refresh_limits().
1412      */
1413     assert(QEMU_IS_ALIGNED(bytes, 1UL << s->blkshift));
1414     assert(QEMU_IS_ALIGNED(offset, 1UL << s->blkshift));
1415     assert((bytes >> s->blkshift) <= UINT32_MAX);
1416 
1417     buf = qemu_try_memalign(s->page_size, s->page_size);
1418     if (!buf) {
1419         return -ENOMEM;
1420     }
1421     memset(buf, 0, s->page_size);
1422     buf->nlb = cpu_to_le32(bytes >> s->blkshift);
1423     buf->slba = cpu_to_le64(offset >> s->blkshift);
1424     buf->cattr = 0;
1425 
1426     qemu_iovec_init(&local_qiov, 1);
1427     qemu_iovec_add(&local_qiov, buf, 4096);
1428 
1429     req = nvme_get_free_req(ioq);
1430     assert(req);
1431 
1432     qemu_co_mutex_lock(&s->dma_map_lock);
1433     ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
1434     qemu_co_mutex_unlock(&s->dma_map_lock);
1435 
1436     if (ret) {
1437         nvme_put_free_req_and_wake(ioq, req);
1438         goto out;
1439     }
1440 
1441     trace_nvme_dsm(s, offset, bytes);
1442 
1443     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1444 
1445     data.co = qemu_coroutine_self();
1446     while (data.ret == -EINPROGRESS) {
1447         qemu_coroutine_yield();
1448     }
1449 
1450     qemu_co_mutex_lock(&s->dma_map_lock);
1451     ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
1452     qemu_co_mutex_unlock(&s->dma_map_lock);
1453 
1454     if (ret) {
1455         goto out;
1456     }
1457 
1458     ret = data.ret;
1459     trace_nvme_dsm_done(s, offset, bytes, ret);
1460 out:
1461     qemu_iovec_destroy(&local_qiov);
1462     return ret;
1463 
1464 }
1465 
1466 static int coroutine_fn nvme_co_truncate(BlockDriverState *bs, int64_t offset,
1467                                          bool exact, PreallocMode prealloc,
1468                                          BdrvRequestFlags flags, Error **errp)
1469 {
1470     int64_t cur_length;
1471 
1472     if (prealloc != PREALLOC_MODE_OFF) {
1473         error_setg(errp, "Unsupported preallocation mode '%s'",
1474                    PreallocMode_str(prealloc));
1475         return -ENOTSUP;
1476     }
1477 
1478     cur_length = nvme_getlength(bs);
1479     if (offset != cur_length && exact) {
1480         error_setg(errp, "Cannot resize NVMe devices");
1481         return -ENOTSUP;
1482     } else if (offset > cur_length) {
1483         error_setg(errp, "Cannot grow NVMe devices");
1484         return -EINVAL;
1485     }
1486 
1487     return 0;
1488 }
1489 
1490 static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
1491                                BlockReopenQueue *queue, Error **errp)
1492 {
1493     return 0;
1494 }
1495 
1496 static void nvme_refresh_filename(BlockDriverState *bs)
1497 {
1498     BDRVNVMeState *s = bs->opaque;
1499 
1500     snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i",
1501              s->device, s->nsid);
1502 }
1503 
1504 static void nvme_refresh_limits(BlockDriverState *bs, Error **errp)
1505 {
1506     BDRVNVMeState *s = bs->opaque;
1507 
1508     bs->bl.opt_mem_alignment = s->page_size;
1509     bs->bl.request_alignment = s->page_size;
1510     bs->bl.max_transfer = s->max_transfer;
1511 
1512     /*
1513      * See nvme_co_pwrite_zeroes(): after shift and decrement we should get
1514      * at most 0xFFFF.
1515      */
1516     bs->bl.max_pwrite_zeroes = 1ULL << (s->blkshift + 16);
1517     bs->bl.pwrite_zeroes_alignment = MAX(bs->bl.request_alignment,
1518                                          1UL << s->blkshift);
1519 
1520     bs->bl.max_pdiscard = (uint64_t)UINT32_MAX << s->blkshift;
1521     bs->bl.pdiscard_alignment = MAX(bs->bl.request_alignment,
1522                                     1UL << s->blkshift);
1523 }
1524 
1525 static void nvme_detach_aio_context(BlockDriverState *bs)
1526 {
1527     BDRVNVMeState *s = bs->opaque;
1528 
1529     for (unsigned i = 0; i < s->queue_count; i++) {
1530         NVMeQueuePair *q = s->queues[i];
1531 
1532         qemu_bh_delete(q->completion_bh);
1533         q->completion_bh = NULL;
1534     }
1535 
1536     aio_set_event_notifier(bdrv_get_aio_context(bs),
1537                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
1538                            false, NULL, NULL, NULL);
1539 }
1540 
1541 static void nvme_attach_aio_context(BlockDriverState *bs,
1542                                     AioContext *new_context)
1543 {
1544     BDRVNVMeState *s = bs->opaque;
1545 
1546     s->aio_context = new_context;
1547     aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
1548                            false, nvme_handle_event, nvme_poll_cb,
1549                            nvme_poll_ready);
1550 
1551     for (unsigned i = 0; i < s->queue_count; i++) {
1552         NVMeQueuePair *q = s->queues[i];
1553 
1554         q->completion_bh =
1555             aio_bh_new(new_context, nvme_process_completion_bh, q);
1556     }
1557 }
1558 
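/*
 * While plugged, nvme_kick() is a no-op; submissions accumulate and the
 * doorbells are rung when nvme_aio_unplug() drains the I/O queues.
 */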
1559 static void nvme_aio_plug(BlockDriverState *bs)
1560 {
1561     BDRVNVMeState *s = bs->opaque;
1562     assert(!s->plugged);
1563     s->plugged = true;
1564 }
1565 
1566 static void nvme_aio_unplug(BlockDriverState *bs)
1567 {
1568     BDRVNVMeState *s = bs->opaque;
1569     assert(s->plugged);
1570     s->plugged = false;
1571     for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
1572         NVMeQueuePair *q = s->queues[i];
1573         qemu_mutex_lock(&q->lock);
1574         nvme_kick(q);
1575         nvme_process_completion(q);
1576         qemu_mutex_unlock(&q->lock);
1577     }
1578 }
1579 
1580 static void nvme_register_buf(BlockDriverState *bs, void *host, size_t size)
1581 {
1582     int ret;
1583     Error *local_err = NULL;
1584     BDRVNVMeState *s = bs->opaque;
1585 
1586     ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL, &local_err);
1587     if (ret) {
1588         /* FIXME: we may run out of IOVA addresses after repeated
1589          * bdrv_register_buf/bdrv_unregister_buf, because qemu_vfio_dma_unmap
1590          * doesn't reclaim addresses for fixed mappings. */
1591         error_reportf_err(local_err, "nvme_register_buf failed: ");
1592     }
1593 }
1594 
1595 static void nvme_unregister_buf(BlockDriverState *bs, void *host)
1596 {
1597     BDRVNVMeState *s = bs->opaque;
1598 
1599     qemu_vfio_dma_unmap(s->vfio, host);
1600 }
1601 
1602 static BlockStatsSpecific *nvme_get_specific_stats(BlockDriverState *bs)
1603 {
1604     BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
1605     BDRVNVMeState *s = bs->opaque;
1606 
1607     stats->driver = BLOCKDEV_DRIVER_NVME;
1608     stats->u.nvme = (BlockStatsSpecificNvme) {
1609         .completion_errors = s->stats.completion_errors,
1610         .aligned_accesses = s->stats.aligned_accesses,
1611         .unaligned_accesses = s->stats.unaligned_accesses,
1612     };
1613 
1614     return stats;
1615 }
1616 
1617 static const char *const nvme_strong_runtime_opts[] = {
1618     NVME_BLOCK_OPT_DEVICE,
1619     NVME_BLOCK_OPT_NAMESPACE,
1620 
1621     NULL
1622 };
1623 
1624 static BlockDriver bdrv_nvme = {
1625     .format_name              = "nvme",
1626     .protocol_name            = "nvme",
1627     .instance_size            = sizeof(BDRVNVMeState),
1628 
1629     .bdrv_co_create_opts      = bdrv_co_create_opts_simple,
1630     .create_opts              = &bdrv_create_opts_simple,
1631 
1632     .bdrv_parse_filename      = nvme_parse_filename,
1633     .bdrv_file_open           = nvme_file_open,
1634     .bdrv_close               = nvme_close,
1635     .bdrv_getlength           = nvme_getlength,
1636     .bdrv_probe_blocksizes    = nvme_probe_blocksizes,
1637     .bdrv_co_truncate         = nvme_co_truncate,
1638 
1639     .bdrv_co_preadv           = nvme_co_preadv,
1640     .bdrv_co_pwritev          = nvme_co_pwritev,
1641 
1642     .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
1643     .bdrv_co_pdiscard         = nvme_co_pdiscard,
1644 
1645     .bdrv_co_flush_to_disk    = nvme_co_flush,
1646     .bdrv_reopen_prepare      = nvme_reopen_prepare,
1647 
1648     .bdrv_refresh_filename    = nvme_refresh_filename,
1649     .bdrv_refresh_limits      = nvme_refresh_limits,
1650     .strong_runtime_opts      = nvme_strong_runtime_opts,
1651     .bdrv_get_specific_stats  = nvme_get_specific_stats,
1652 
1653     .bdrv_detach_aio_context  = nvme_detach_aio_context,
1654     .bdrv_attach_aio_context  = nvme_attach_aio_context,
1655 
1656     .bdrv_io_plug             = nvme_aio_plug,
1657     .bdrv_io_unplug           = nvme_aio_unplug,
1658 
1659     .bdrv_register_buf        = nvme_register_buf,
1660     .bdrv_unregister_buf      = nvme_unregister_buf,
1661 };
1662 
1663 static void bdrv_nvme_init(void)
1664 {
1665     bdrv_register(&bdrv_nvme);
1666 }
1667 
1668 block_init(bdrv_nvme_init);
1669