xref: /openbmc/qemu/block/nvme.c (revision 892609056ddff373f8c8c55525a53dd932ee403d)
1 /*
2  * NVMe block driver based on vfio
3  *
4  * Copyright 2016 - 2018 Red Hat, Inc.
5  *
6  * Authors:
7  *   Fam Zheng <famz@redhat.com>
8  *   Paolo Bonzini <pbonzini@redhat.com>
9  *
10  * This work is licensed under the terms of the GNU GPL, version 2 or later.
11  * See the COPYING file in the top-level directory.
12  */
13 
14 #include "qemu/osdep.h"
15 #include <linux/vfio.h>
16 #include "qapi/error.h"
17 #include "qapi/qmp/qdict.h"
18 #include "qapi/qmp/qstring.h"
19 #include "qemu/error-report.h"
20 #include "qemu/main-loop.h"
21 #include "qemu/module.h"
22 #include "qemu/cutils.h"
23 #include "qemu/option.h"
24 #include "qemu/vfio-helpers.h"
25 #include "block/block_int.h"
26 #include "trace.h"
27 
28 #include "block/nvme.h"
29 
/* Bytes per submission queue entry. */
#define NVME_SQ_ENTRY_BYTES 64
/* Bytes per completion queue entry. */
#define NVME_CQ_ENTRY_BYTES 16
/* Entries per queue; also the number of request slots per queue pair. */
#define NVME_QUEUE_SIZE 128
/* Number of bytes of BAR 0 that the driver maps. */
#define NVME_BAR_SIZE 8192
34 
/* A single ring (submission or completion) together with its doorbell. */
typedef struct {
    /* Ring indices, counted in entries */
    int32_t  head;
    int32_t  tail;
    /* Host buffer holding the ring entries */
    uint8_t  *queue;
    /* I/O virtual address of @queue as returned by qemu_vfio_dma_map() */
    uint64_t iova;
    /* Hardware MMIO register */
    volatile uint32_t *doorbell;
} NVMeQueue;
42 
43 typedef struct {
44     BlockCompletionFunc *cb;
45     void *opaque;
46     int cid;
47     void *prp_list_page;
48     uint64_t prp_list_iova;
49     bool busy;
50 } NVMeRequest;
51 
52 typedef struct {
53     CoQueue     free_req_queue;
54     QemuMutex   lock;
55 
56     /* Fields protected by BQL */
57     int         index;
58     uint8_t     *prp_list_pages;
59 
60     /* Fields protected by @lock */
61     NVMeQueue   sq, cq;
62     int         cq_phase;
63     NVMeRequest reqs[NVME_QUEUE_SIZE];
64     bool        busy;
65     int         need_kick;
66     int         inflight;
67 } NVMeQueuePair;
68 
69 /* Memory mapped registers */
70 typedef volatile struct {
71     uint64_t cap;
72     uint32_t vs;
73     uint32_t intms;
74     uint32_t intmc;
75     uint32_t cc;
76     uint32_t reserved0;
77     uint32_t csts;
78     uint32_t nssr;
79     uint32_t aqa;
80     uint64_t asq;
81     uint64_t acq;
82     uint32_t cmbloc;
83     uint32_t cmbsz;
84     uint8_t  reserved1[0xec0];
85     uint8_t  cmd_set_specfic[0x100];
86     uint32_t doorbells[];
87 } NVMeRegs;
88 
89 QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000);
90 
91 typedef struct {
92     AioContext *aio_context;
93     QEMUVFIOState *vfio;
94     NVMeRegs *regs;
95     /* The submission/completion queue pairs.
96      * [0]: admin queue.
97      * [1..]: io queues.
98      */
99     NVMeQueuePair **queues;
100     int nr_queues;
101     size_t page_size;
102     /* How many uint32_t elements does each doorbell entry take. */
103     size_t doorbell_scale;
104     bool write_cache_supported;
105     EventNotifier irq_notifier;
106 
107     uint64_t nsze; /* Namespace size reported by identify command */
108     int nsid;      /* The namespace id to read/write data. */
109     int blkshift;
110 
111     uint64_t max_transfer;
112     bool plugged;
113 
114     CoMutex dma_map_lock;
115     CoQueue dma_flush_queue;
116 
117     /* Total size of mapped qiov, accessed under dma_map_lock */
118     int dma_map_count;
119 
120     /* PCI address (required for nvme_refresh_filename()) */
121     char *device;
122 } BDRVNVMeState;
123 
124 #define NVME_BLOCK_OPT_DEVICE "device"
125 #define NVME_BLOCK_OPT_NAMESPACE "namespace"
126 
127 static QemuOptsList runtime_opts = {
128     .name = "nvme",
129     .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
130     .desc = {
131         {
132             .name = NVME_BLOCK_OPT_DEVICE,
133             .type = QEMU_OPT_STRING,
134             .help = "NVMe PCI device address",
135         },
136         {
137             .name = NVME_BLOCK_OPT_NAMESPACE,
138             .type = QEMU_OPT_NUMBER,
139             .help = "NVMe namespace",
140         },
141         { /* end of list */ }
142     },
143 };
144 
145 static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
146                             int nentries, int entry_bytes, Error **errp)
147 {
148     BDRVNVMeState *s = bs->opaque;
149     size_t bytes;
150     int r;
151 
152     bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
153     q->head = q->tail = 0;
154     q->queue = qemu_try_blockalign0(bs, bytes);
155 
156     if (!q->queue) {
157         error_setg(errp, "Cannot allocate queue");
158         return;
159     }
160     r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova);
161     if (r) {
162         error_setg(errp, "Cannot map queue");
163     }
164 }
165 
166 static void nvme_free_queue_pair(BlockDriverState *bs, NVMeQueuePair *q)
167 {
168     qemu_vfree(q->prp_list_pages);
169     qemu_vfree(q->sq.queue);
170     qemu_vfree(q->cq.queue);
171     qemu_mutex_destroy(&q->lock);
172     g_free(q);
173 }
174 
175 static void nvme_free_req_queue_cb(void *opaque)
176 {
177     NVMeQueuePair *q = opaque;
178 
179     qemu_mutex_lock(&q->lock);
180     while (qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
181         /* Retry all pending requests */
182     }
183     qemu_mutex_unlock(&q->lock);
184 }
185 
186 static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
187                                              int idx, int size,
188                                              Error **errp)
189 {
190     int i, r;
191     BDRVNVMeState *s = bs->opaque;
192     Error *local_err = NULL;
193     NVMeQueuePair *q = g_new0(NVMeQueuePair, 1);
194     uint64_t prp_list_iova;
195 
196     qemu_mutex_init(&q->lock);
197     q->index = idx;
198     qemu_co_queue_init(&q->free_req_queue);
199     q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE);
200     r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
201                           s->page_size * NVME_QUEUE_SIZE,
202                           false, &prp_list_iova);
203     if (r) {
204         goto fail;
205     }
206     for (i = 0; i < NVME_QUEUE_SIZE; i++) {
207         NVMeRequest *req = &q->reqs[i];
208         req->cid = i + 1;
209         req->prp_list_page = q->prp_list_pages + i * s->page_size;
210         req->prp_list_iova = prp_list_iova + i * s->page_size;
211     }
212     nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
213     if (local_err) {
214         error_propagate(errp, local_err);
215         goto fail;
216     }
217     q->sq.doorbell = &s->regs->doorbells[idx * 2 * s->doorbell_scale];
218 
219     nvme_init_queue(bs, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
220     if (local_err) {
221         error_propagate(errp, local_err);
222         goto fail;
223     }
224     q->cq.doorbell = &s->regs->doorbells[(idx * 2 + 1) * s->doorbell_scale];
225 
226     return q;
227 fail:
228     nvme_free_queue_pair(bs, q);
229     return NULL;
230 }
231 
232 /* With q->lock */
233 static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
234 {
235     if (s->plugged || !q->need_kick) {
236         return;
237     }
238     trace_nvme_kick(s, q->index);
239     assert(!(q->sq.tail & 0xFF00));
240     /* Fence the write to submission queue entry before notifying the device. */
241     smp_wmb();
242     *q->sq.doorbell = cpu_to_le32(q->sq.tail);
243     q->inflight += q->need_kick;
244     q->need_kick = 0;
245 }
246 
247 /* Find a free request element if any, otherwise:
248  * a) if in coroutine context, try to wait for one to become available;
249  * b) if not in coroutine, return NULL;
250  */
251 static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
252 {
253     int i;
254     NVMeRequest *req = NULL;
255 
256     qemu_mutex_lock(&q->lock);
257     while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) {
258         /* We have to leave one slot empty as that is the full queue case (head
259          * == tail + 1). */
260         if (qemu_in_coroutine()) {
261             trace_nvme_free_req_queue_wait(q);
262             qemu_co_queue_wait(&q->free_req_queue, &q->lock);
263         } else {
264             qemu_mutex_unlock(&q->lock);
265             return NULL;
266         }
267     }
268     for (i = 0; i < NVME_QUEUE_SIZE; i++) {
269         if (!q->reqs[i].busy) {
270             q->reqs[i].busy = true;
271             req = &q->reqs[i];
272             break;
273         }
274     }
275     /* We have checked inflight and need_kick while holding q->lock, so one
276      * free req must be available. */
277     assert(req);
278     qemu_mutex_unlock(&q->lock);
279     return req;
280 }
281 
282 static inline int nvme_translate_error(const NvmeCqe *c)
283 {
284     uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
285     if (status) {
286         trace_nvme_error(le32_to_cpu(c->result),
287                          le16_to_cpu(c->sq_head),
288                          le16_to_cpu(c->sq_id),
289                          le16_to_cpu(c->cid),
290                          le16_to_cpu(status));
291     }
292     switch (status) {
293     case 0:
294         return 0;
295     case 1:
296         return -ENOSYS;
297     case 2:
298         return -EINVAL;
299     default:
300         return -EIO;
301     }
302 }
303 
304 /* With q->lock */
305 static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
306 {
307     bool progress = false;
308     NVMeRequest *preq;
309     NVMeRequest req;
310     NvmeCqe *c;
311 
312     trace_nvme_process_completion(s, q->index, q->inflight);
313     if (q->busy || s->plugged) {
314         trace_nvme_process_completion_queue_busy(s, q->index);
315         return false;
316     }
317     q->busy = true;
318     assert(q->inflight >= 0);
319     while (q->inflight) {
320         int16_t cid;
321         c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
322         if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
323             break;
324         }
325         q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
326         if (!q->cq.head) {
327             q->cq_phase = !q->cq_phase;
328         }
329         cid = le16_to_cpu(c->cid);
330         if (cid == 0 || cid > NVME_QUEUE_SIZE) {
331             fprintf(stderr, "Unexpected CID in completion queue: %" PRIu32 "\n",
332                     cid);
333             continue;
334         }
335         assert(cid <= NVME_QUEUE_SIZE);
336         trace_nvme_complete_command(s, q->index, cid);
337         preq = &q->reqs[cid - 1];
338         req = *preq;
339         assert(req.cid == cid);
340         assert(req.cb);
341         preq->busy = false;
342         preq->cb = preq->opaque = NULL;
343         qemu_mutex_unlock(&q->lock);
344         req.cb(req.opaque, nvme_translate_error(c));
345         qemu_mutex_lock(&q->lock);
346         q->inflight--;
347         progress = true;
348     }
349     if (progress) {
350         /* Notify the device so it can post more completions. */
351         smp_mb_release();
352         *q->cq.doorbell = cpu_to_le32(q->cq.head);
353         if (!qemu_co_queue_empty(&q->free_req_queue)) {
354             aio_bh_schedule_oneshot(s->aio_context, nvme_free_req_queue_cb, q);
355         }
356     }
357     q->busy = false;
358     return progress;
359 }
360 
361 static void nvme_trace_command(const NvmeCmd *cmd)
362 {
363     int i;
364 
365     for (i = 0; i < 8; ++i) {
366         uint8_t *cmdp = (uint8_t *)cmd + i * 8;
367         trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3],
368                                       cmdp[4], cmdp[5], cmdp[6], cmdp[7]);
369     }
370 }
371 
372 static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q,
373                                 NVMeRequest *req,
374                                 NvmeCmd *cmd, BlockCompletionFunc cb,
375                                 void *opaque)
376 {
377     assert(!req->cb);
378     req->cb = cb;
379     req->opaque = opaque;
380     cmd->cid = cpu_to_le32(req->cid);
381 
382     trace_nvme_submit_command(s, q->index, req->cid);
383     nvme_trace_command(cmd);
384     qemu_mutex_lock(&q->lock);
385     memcpy((uint8_t *)q->sq.queue +
386            q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
387     q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
388     q->need_kick++;
389     nvme_kick(s, q);
390     nvme_process_completion(s, q);
391     qemu_mutex_unlock(&q->lock);
392 }
393 
/*
 * Completion callback for nvme_cmd_sync(): store the result in the int that
 * @opaque points to and kick the AIO wait loop so the poller re-evaluates
 * its condition.
 */
static void nvme_cmd_sync_cb(void *opaque, int ret)
{
    *(int *)opaque = ret;
    aio_wait_kick();
}
400 
401 static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
402                          NvmeCmd *cmd)
403 {
404     NVMeRequest *req;
405     BDRVNVMeState *s = bs->opaque;
406     int ret = -EINPROGRESS;
407     req = nvme_get_free_req(q);
408     if (!req) {
409         return -EBUSY;
410     }
411     nvme_submit_command(s, q, req, cmd, nvme_cmd_sync_cb, &ret);
412 
413     BDRV_POLL_WHILE(bs, ret == -EINPROGRESS);
414     return ret;
415 }
416 
417 static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
418 {
419     BDRVNVMeState *s = bs->opaque;
420     NvmeIdCtrl *idctrl;
421     NvmeIdNs *idns;
422     NvmeLBAF *lbaf;
423     uint8_t *resp;
424     int r;
425     uint64_t iova;
426     NvmeCmd cmd = {
427         .opcode = NVME_ADM_CMD_IDENTIFY,
428         .cdw10 = cpu_to_le32(0x1),
429     };
430 
431     resp = qemu_try_blockalign0(bs, sizeof(NvmeIdCtrl));
432     if (!resp) {
433         error_setg(errp, "Cannot allocate buffer for identify response");
434         goto out;
435     }
436     idctrl = (NvmeIdCtrl *)resp;
437     idns = (NvmeIdNs *)resp;
438     r = qemu_vfio_dma_map(s->vfio, resp, sizeof(NvmeIdCtrl), true, &iova);
439     if (r) {
440         error_setg(errp, "Cannot map buffer for DMA");
441         goto out;
442     }
443     cmd.prp1 = cpu_to_le64(iova);
444 
445     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
446         error_setg(errp, "Failed to identify controller");
447         goto out;
448     }
449 
450     if (le32_to_cpu(idctrl->nn) < namespace) {
451         error_setg(errp, "Invalid namespace");
452         goto out;
453     }
454     s->write_cache_supported = le32_to_cpu(idctrl->vwc) & 0x1;
455     s->max_transfer = (idctrl->mdts ? 1 << idctrl->mdts : 0) * s->page_size;
456     /* For now the page list buffer per command is one page, to hold at most
457      * s->page_size / sizeof(uint64_t) entries. */
458     s->max_transfer = MIN_NON_ZERO(s->max_transfer,
459                           s->page_size / sizeof(uint64_t) * s->page_size);
460 
461     memset(resp, 0, 4096);
462 
463     cmd.cdw10 = 0;
464     cmd.nsid = cpu_to_le32(namespace);
465     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
466         error_setg(errp, "Failed to identify namespace");
467         goto out;
468     }
469 
470     s->nsze = le64_to_cpu(idns->nsze);
471     lbaf = &idns->lbaf[NVME_ID_NS_FLBAS_INDEX(idns->flbas)];
472 
473     if (lbaf->ms) {
474         error_setg(errp, "Namespaces with metadata are not yet supported");
475         goto out;
476     }
477 
478     if (lbaf->ds < BDRV_SECTOR_BITS || lbaf->ds > 12 ||
479         (1 << lbaf->ds) > s->page_size)
480     {
481         error_setg(errp, "Namespace has unsupported block size (2^%d)",
482                    lbaf->ds);
483         goto out;
484     }
485 
486     s->blkshift = lbaf->ds;
487 out:
488     qemu_vfio_dma_unmap(s->vfio, resp);
489     qemu_vfree(resp);
490 }
491 
492 static bool nvme_poll_queues(BDRVNVMeState *s)
493 {
494     bool progress = false;
495     int i;
496 
497     for (i = 0; i < s->nr_queues; i++) {
498         NVMeQueuePair *q = s->queues[i];
499         qemu_mutex_lock(&q->lock);
500         while (nvme_process_completion(s, q)) {
501             /* Keep polling */
502             progress = true;
503         }
504         qemu_mutex_unlock(&q->lock);
505     }
506     return progress;
507 }
508 
509 static void nvme_handle_event(EventNotifier *n)
510 {
511     BDRVNVMeState *s = container_of(n, BDRVNVMeState, irq_notifier);
512 
513     trace_nvme_handle_event(s);
514     event_notifier_test_and_clear(n);
515     nvme_poll_queues(s);
516 }
517 
518 static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
519 {
520     BDRVNVMeState *s = bs->opaque;
521     int n = s->nr_queues;
522     NVMeQueuePair *q;
523     NvmeCmd cmd;
524     int queue_size = NVME_QUEUE_SIZE;
525 
526     q = nvme_create_queue_pair(bs, n, queue_size, errp);
527     if (!q) {
528         return false;
529     }
530     cmd = (NvmeCmd) {
531         .opcode = NVME_ADM_CMD_CREATE_CQ,
532         .prp1 = cpu_to_le64(q->cq.iova),
533         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
534         .cdw11 = cpu_to_le32(0x3),
535     };
536     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
537         error_setg(errp, "Failed to create io queue [%d]", n);
538         nvme_free_queue_pair(bs, q);
539         return false;
540     }
541     cmd = (NvmeCmd) {
542         .opcode = NVME_ADM_CMD_CREATE_SQ,
543         .prp1 = cpu_to_le64(q->sq.iova),
544         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
545         .cdw11 = cpu_to_le32(0x1 | (n << 16)),
546     };
547     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
548         error_setg(errp, "Failed to create io queue [%d]", n);
549         nvme_free_queue_pair(bs, q);
550         return false;
551     }
552     s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
553     s->queues[n] = q;
554     s->nr_queues++;
555     return true;
556 }
557 
558 static bool nvme_poll_cb(void *opaque)
559 {
560     EventNotifier *e = opaque;
561     BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier);
562     bool progress = false;
563 
564     trace_nvme_poll_cb(s);
565     progress = nvme_poll_queues(s);
566     return progress;
567 }
568 
569 static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
570                      Error **errp)
571 {
572     BDRVNVMeState *s = bs->opaque;
573     int ret;
574     uint64_t cap;
575     uint64_t timeout_ms;
576     uint64_t deadline, now;
577     Error *local_err = NULL;
578 
579     qemu_co_mutex_init(&s->dma_map_lock);
580     qemu_co_queue_init(&s->dma_flush_queue);
581     s->device = g_strdup(device);
582     s->nsid = namespace;
583     s->aio_context = bdrv_get_aio_context(bs);
584     ret = event_notifier_init(&s->irq_notifier, 0);
585     if (ret) {
586         error_setg(errp, "Failed to init event notifier");
587         return ret;
588     }
589 
590     s->vfio = qemu_vfio_open_pci(device, errp);
591     if (!s->vfio) {
592         ret = -EINVAL;
593         goto out;
594     }
595 
596     s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, NVME_BAR_SIZE, errp);
597     if (!s->regs) {
598         ret = -EINVAL;
599         goto out;
600     }
601 
602     /* Perform initialize sequence as described in NVMe spec "7.6.1
603      * Initialization". */
604 
605     cap = le64_to_cpu(s->regs->cap);
606     if (!(cap & (1ULL << 37))) {
607         error_setg(errp, "Device doesn't support NVMe command set");
608         ret = -EINVAL;
609         goto out;
610     }
611 
612     s->page_size = MAX(4096, 1 << (12 + ((cap >> 48) & 0xF)));
613     s->doorbell_scale = (4 << (((cap >> 32) & 0xF))) / sizeof(uint32_t);
614     bs->bl.opt_mem_alignment = s->page_size;
615     timeout_ms = MIN(500 * ((cap >> 24) & 0xFF), 30000);
616 
617     /* Reset device to get a clean state. */
618     s->regs->cc = cpu_to_le32(le32_to_cpu(s->regs->cc) & 0xFE);
619     /* Wait for CSTS.RDY = 0. */
620     deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * 1000000ULL;
621     while (le32_to_cpu(s->regs->csts) & 0x1) {
622         if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
623             error_setg(errp, "Timeout while waiting for device to reset (%"
624                              PRId64 " ms)",
625                        timeout_ms);
626             ret = -ETIMEDOUT;
627             goto out;
628         }
629     }
630 
631     /* Set up admin queue. */
632     s->queues = g_new(NVMeQueuePair *, 1);
633     s->queues[0] = nvme_create_queue_pair(bs, 0, NVME_QUEUE_SIZE, errp);
634     if (!s->queues[0]) {
635         ret = -EINVAL;
636         goto out;
637     }
638     s->nr_queues = 1;
639     QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
640     s->regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << 16) | NVME_QUEUE_SIZE);
641     s->regs->asq = cpu_to_le64(s->queues[0]->sq.iova);
642     s->regs->acq = cpu_to_le64(s->queues[0]->cq.iova);
643 
644     /* After setting up all control registers we can enable device now. */
645     s->regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) |
646                               (ctz32(NVME_SQ_ENTRY_BYTES) << 16) |
647                               0x1);
648     /* Wait for CSTS.RDY = 1. */
649     now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
650     deadline = now + timeout_ms * 1000000;
651     while (!(le32_to_cpu(s->regs->csts) & 0x1)) {
652         if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
653             error_setg(errp, "Timeout while waiting for device to start (%"
654                              PRId64 " ms)",
655                        timeout_ms);
656             ret = -ETIMEDOUT;
657             goto out;
658         }
659     }
660 
661     ret = qemu_vfio_pci_init_irq(s->vfio, &s->irq_notifier,
662                                  VFIO_PCI_MSIX_IRQ_INDEX, errp);
663     if (ret) {
664         goto out;
665     }
666     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
667                            false, nvme_handle_event, nvme_poll_cb);
668 
669     nvme_identify(bs, namespace, &local_err);
670     if (local_err) {
671         error_propagate(errp, local_err);
672         ret = -EIO;
673         goto out;
674     }
675 
676     /* Set up command queues. */
677     if (!nvme_add_io_queue(bs, errp)) {
678         ret = -EIO;
679     }
680 out:
681     /* Cleaning up is done in nvme_file_open() upon error. */
682     return ret;
683 }
684 
685 /* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example:
686  *
687  *     nvme://0000:44:00.0/1
688  *
689  * where the "nvme://" is a fixed form of the protocol prefix, the middle part
690  * is the PCI address, and the last part is the namespace number starting from
691  * 1 according to the NVMe spec. */
692 static void nvme_parse_filename(const char *filename, QDict *options,
693                                 Error **errp)
694 {
695     int pref = strlen("nvme://");
696 
697     if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) {
698         const char *tmp = filename + pref;
699         char *device;
700         const char *namespace;
701         unsigned long ns;
702         const char *slash = strchr(tmp, '/');
703         if (!slash) {
704             qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp);
705             return;
706         }
707         device = g_strndup(tmp, slash - tmp);
708         qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device);
709         g_free(device);
710         namespace = slash + 1;
711         if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) {
712             error_setg(errp, "Invalid namespace '%s', positive number expected",
713                        namespace);
714             return;
715         }
716         qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE,
717                       *namespace ? namespace : "1");
718     }
719 }
720 
721 static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
722                                            Error **errp)
723 {
724     int ret;
725     BDRVNVMeState *s = bs->opaque;
726     NvmeCmd cmd = {
727         .opcode = NVME_ADM_CMD_SET_FEATURES,
728         .nsid = cpu_to_le32(s->nsid),
729         .cdw10 = cpu_to_le32(0x06),
730         .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
731     };
732 
733     ret = nvme_cmd_sync(bs, s->queues[0], &cmd);
734     if (ret) {
735         error_setg(errp, "Failed to configure NVMe write cache");
736     }
737     return ret;
738 }
739 
740 static void nvme_close(BlockDriverState *bs)
741 {
742     int i;
743     BDRVNVMeState *s = bs->opaque;
744 
745     for (i = 0; i < s->nr_queues; ++i) {
746         nvme_free_queue_pair(bs, s->queues[i]);
747     }
748     g_free(s->queues);
749     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
750                            false, NULL, NULL);
751     event_notifier_cleanup(&s->irq_notifier);
752     qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE);
753     qemu_vfio_close(s->vfio);
754 
755     g_free(s->device);
756 }
757 
758 static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
759                           Error **errp)
760 {
761     const char *device;
762     QemuOpts *opts;
763     int namespace;
764     int ret;
765     BDRVNVMeState *s = bs->opaque;
766 
767     opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
768     qemu_opts_absorb_qdict(opts, options, &error_abort);
769     device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
770     if (!device) {
771         error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required");
772         qemu_opts_del(opts);
773         return -EINVAL;
774     }
775 
776     namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1);
777     ret = nvme_init(bs, device, namespace, errp);
778     qemu_opts_del(opts);
779     if (ret) {
780         goto fail;
781     }
782     if (flags & BDRV_O_NOCACHE) {
783         if (!s->write_cache_supported) {
784             error_setg(errp,
785                        "NVMe controller doesn't support write cache configuration");
786             ret = -EINVAL;
787         } else {
788             ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE),
789                                                   errp);
790         }
791         if (ret) {
792             goto fail;
793         }
794     }
795     bs->supported_write_flags = BDRV_REQ_FUA;
796     return 0;
797 fail:
798     nvme_close(bs);
799     return ret;
800 }
801 
802 static int64_t nvme_getlength(BlockDriverState *bs)
803 {
804     BDRVNVMeState *s = bs->opaque;
805     return s->nsze << s->blkshift;
806 }
807 
808 static uint32_t nvme_get_blocksize(BlockDriverState *bs)
809 {
810     BDRVNVMeState *s = bs->opaque;
811     assert(s->blkshift >= BDRV_SECTOR_BITS && s->blkshift <= 12);
812     return UINT32_C(1) << s->blkshift;
813 }
814 
815 static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
816 {
817     uint32_t blocksize = nvme_get_blocksize(bs);
818     bsz->phys = blocksize;
819     bsz->log = blocksize;
820     return 0;
821 }
822 
823 /* Called with s->dma_map_lock */
824 static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs,
825                                             QEMUIOVector *qiov)
826 {
827     int r = 0;
828     BDRVNVMeState *s = bs->opaque;
829 
830     s->dma_map_count -= qiov->size;
831     if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) {
832         r = qemu_vfio_dma_reset_temporary(s->vfio);
833         if (!r) {
834             qemu_co_queue_restart_all(&s->dma_flush_queue);
835         }
836     }
837     return r;
838 }
839 
840 /* Called with s->dma_map_lock */
841 static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
842                                           NVMeRequest *req, QEMUIOVector *qiov)
843 {
844     BDRVNVMeState *s = bs->opaque;
845     uint64_t *pagelist = req->prp_list_page;
846     int i, j, r;
847     int entries = 0;
848 
849     assert(qiov->size);
850     assert(QEMU_IS_ALIGNED(qiov->size, s->page_size));
851     assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t));
852     for (i = 0; i < qiov->niov; ++i) {
853         bool retry = true;
854         uint64_t iova;
855 try_map:
856         r = qemu_vfio_dma_map(s->vfio,
857                               qiov->iov[i].iov_base,
858                               qiov->iov[i].iov_len,
859                               true, &iova);
860         if (r == -ENOMEM && retry) {
861             retry = false;
862             trace_nvme_dma_flush_queue_wait(s);
863             if (s->dma_map_count) {
864                 trace_nvme_dma_map_flush(s);
865                 qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock);
866             } else {
867                 r = qemu_vfio_dma_reset_temporary(s->vfio);
868                 if (r) {
869                     goto fail;
870                 }
871             }
872             goto try_map;
873         }
874         if (r) {
875             goto fail;
876         }
877 
878         for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) {
879             pagelist[entries++] = cpu_to_le64(iova + j * s->page_size);
880         }
881         trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base,
882                                     qiov->iov[i].iov_len / s->page_size);
883     }
884 
885     s->dma_map_count += qiov->size;
886 
887     assert(entries <= s->page_size / sizeof(uint64_t));
888     switch (entries) {
889     case 0:
890         abort();
891     case 1:
892         cmd->prp1 = pagelist[0];
893         cmd->prp2 = 0;
894         break;
895     case 2:
896         cmd->prp1 = pagelist[0];
897         cmd->prp2 = pagelist[1];
898         break;
899     default:
900         cmd->prp1 = pagelist[0];
901         cmd->prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
902         break;
903     }
904     trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
905     for (i = 0; i < entries; ++i) {
906         trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]);
907     }
908     return 0;
909 fail:
910     /* No need to unmap [0 - i) iovs even if we've failed, since we don't
911      * increment s->dma_map_count. This is okay for fixed mapping memory areas
912      * because they are already mapped before calling this function; for
913      * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by
914      * calling qemu_vfio_dma_reset_temporary when necessary. */
915     return r;
916 }
917 
918 typedef struct {
919     Coroutine *co;
920     int ret;
921     AioContext *ctx;
922 } NVMeCoData;
923 
924 static void nvme_rw_cb_bh(void *opaque)
925 {
926     NVMeCoData *data = opaque;
927     qemu_coroutine_enter(data->co);
928 }
929 
930 static void nvme_rw_cb(void *opaque, int ret)
931 {
932     NVMeCoData *data = opaque;
933     data->ret = ret;
934     if (!data->co) {
935         /* The rw coroutine hasn't yielded, don't try to enter. */
936         return;
937     }
938     aio_bh_schedule_oneshot(data->ctx, nvme_rw_cb_bh, data);
939 }
940 
941 static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
942                                             uint64_t offset, uint64_t bytes,
943                                             QEMUIOVector *qiov,
944                                             bool is_write,
945                                             int flags)
946 {
947     int r;
948     BDRVNVMeState *s = bs->opaque;
949     NVMeQueuePair *ioq = s->queues[1];
950     NVMeRequest *req;
951 
952     uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) |
953                        (flags & BDRV_REQ_FUA ? 1 << 30 : 0);
954     NvmeCmd cmd = {
955         .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ,
956         .nsid = cpu_to_le32(s->nsid),
957         .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
958         .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
959         .cdw12 = cpu_to_le32(cdw12),
960     };
961     NVMeCoData data = {
962         .ctx = bdrv_get_aio_context(bs),
963         .ret = -EINPROGRESS,
964     };
965 
966     trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
967     assert(s->nr_queues > 1);
968     req = nvme_get_free_req(ioq);
969     assert(req);
970 
971     qemu_co_mutex_lock(&s->dma_map_lock);
972     r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
973     qemu_co_mutex_unlock(&s->dma_map_lock);
974     if (r) {
975         req->busy = false;
976         return r;
977     }
978     nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
979 
980     data.co = qemu_coroutine_self();
981     while (data.ret == -EINPROGRESS) {
982         qemu_coroutine_yield();
983     }
984 
985     qemu_co_mutex_lock(&s->dma_map_lock);
986     r = nvme_cmd_unmap_qiov(bs, qiov);
987     qemu_co_mutex_unlock(&s->dma_map_lock);
988     if (r) {
989         return r;
990     }
991 
992     trace_nvme_rw_done(s, is_write, offset, bytes, data.ret);
993     return data.ret;
994 }
995 
996 static inline bool nvme_qiov_aligned(BlockDriverState *bs,
997                                      const QEMUIOVector *qiov)
998 {
999     int i;
1000     BDRVNVMeState *s = bs->opaque;
1001 
1002     for (i = 0; i < qiov->niov; ++i) {
1003         if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) ||
1004             !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) {
1005             trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
1006                                       qiov->iov[i].iov_len, s->page_size);
1007             return false;
1008         }
1009     }
1010     return true;
1011 }
1012 
1013 static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
1014                        QEMUIOVector *qiov, bool is_write, int flags)
1015 {
1016     BDRVNVMeState *s = bs->opaque;
1017     int r;
1018     uint8_t *buf = NULL;
1019     QEMUIOVector local_qiov;
1020 
1021     assert(QEMU_IS_ALIGNED(offset, s->page_size));
1022     assert(QEMU_IS_ALIGNED(bytes, s->page_size));
1023     assert(bytes <= s->max_transfer);
1024     if (nvme_qiov_aligned(bs, qiov)) {
1025         return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
1026     }
1027     trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
1028     buf = qemu_try_blockalign(bs, bytes);
1029 
1030     if (!buf) {
1031         return -ENOMEM;
1032     }
1033     qemu_iovec_init(&local_qiov, 1);
1034     if (is_write) {
1035         qemu_iovec_to_buf(qiov, 0, buf, bytes);
1036     }
1037     qemu_iovec_add(&local_qiov, buf, bytes);
1038     r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags);
1039     qemu_iovec_destroy(&local_qiov);
1040     if (!r && !is_write) {
1041         qemu_iovec_from_buf(qiov, 0, buf, bytes);
1042     }
1043     qemu_vfree(buf);
1044     return r;
1045 }
1046 
1047 static coroutine_fn int nvme_co_preadv(BlockDriverState *bs,
1048                                        uint64_t offset, uint64_t bytes,
1049                                        QEMUIOVector *qiov, int flags)
1050 {
1051     return nvme_co_prw(bs, offset, bytes, qiov, false, flags);
1052 }
1053 
1054 static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs,
1055                                         uint64_t offset, uint64_t bytes,
1056                                         QEMUIOVector *qiov, int flags)
1057 {
1058     return nvme_co_prw(bs, offset, bytes, qiov, true, flags);
1059 }
1060 
1061 static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
1062 {
1063     BDRVNVMeState *s = bs->opaque;
1064     NVMeQueuePair *ioq = s->queues[1];
1065     NVMeRequest *req;
1066     NvmeCmd cmd = {
1067         .opcode = NVME_CMD_FLUSH,
1068         .nsid = cpu_to_le32(s->nsid),
1069     };
1070     NVMeCoData data = {
1071         .ctx = bdrv_get_aio_context(bs),
1072         .ret = -EINPROGRESS,
1073     };
1074 
1075     assert(s->nr_queues > 1);
1076     req = nvme_get_free_req(ioq);
1077     assert(req);
1078     nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
1079 
1080     data.co = qemu_coroutine_self();
1081     if (data.ret == -EINPROGRESS) {
1082         qemu_coroutine_yield();
1083     }
1084 
1085     return data.ret;
1086 }
1087 
1088 
1089 static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
1090                                BlockReopenQueue *queue, Error **errp)
1091 {
1092     return 0;
1093 }
1094 
1095 static void nvme_refresh_filename(BlockDriverState *bs)
1096 {
1097     BDRVNVMeState *s = bs->opaque;
1098 
1099     snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i",
1100              s->device, s->nsid);
1101 }
1102 
1103 static void nvme_refresh_limits(BlockDriverState *bs, Error **errp)
1104 {
1105     BDRVNVMeState *s = bs->opaque;
1106 
1107     bs->bl.opt_mem_alignment = s->page_size;
1108     bs->bl.request_alignment = s->page_size;
1109     bs->bl.max_transfer = s->max_transfer;
1110 }
1111 
1112 static void nvme_detach_aio_context(BlockDriverState *bs)
1113 {
1114     BDRVNVMeState *s = bs->opaque;
1115 
1116     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
1117                            false, NULL, NULL);
1118 }
1119 
1120 static void nvme_attach_aio_context(BlockDriverState *bs,
1121                                     AioContext *new_context)
1122 {
1123     BDRVNVMeState *s = bs->opaque;
1124 
1125     s->aio_context = new_context;
1126     aio_set_event_notifier(new_context, &s->irq_notifier,
1127                            false, nvme_handle_event, nvme_poll_cb);
1128 }
1129 
1130 static void nvme_aio_plug(BlockDriverState *bs)
1131 {
1132     BDRVNVMeState *s = bs->opaque;
1133     assert(!s->plugged);
1134     s->plugged = true;
1135 }
1136 
1137 static void nvme_aio_unplug(BlockDriverState *bs)
1138 {
1139     int i;
1140     BDRVNVMeState *s = bs->opaque;
1141     assert(s->plugged);
1142     s->plugged = false;
1143     for (i = 1; i < s->nr_queues; i++) {
1144         NVMeQueuePair *q = s->queues[i];
1145         qemu_mutex_lock(&q->lock);
1146         nvme_kick(s, q);
1147         nvme_process_completion(s, q);
1148         qemu_mutex_unlock(&q->lock);
1149     }
1150 }
1151 
1152 static void nvme_register_buf(BlockDriverState *bs, void *host, size_t size)
1153 {
1154     int ret;
1155     BDRVNVMeState *s = bs->opaque;
1156 
1157     ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL);
1158     if (ret) {
1159         /* FIXME: we may run out of IOVA addresses after repeated
1160          * bdrv_register_buf/bdrv_unregister_buf, because nvme_vfio_dma_unmap
1161          * doesn't reclaim addresses for fixed mappings. */
1162         error_report("nvme_register_buf failed: %s", strerror(-ret));
1163     }
1164 }
1165 
1166 static void nvme_unregister_buf(BlockDriverState *bs, void *host)
1167 {
1168     BDRVNVMeState *s = bs->opaque;
1169 
1170     qemu_vfio_dma_unmap(s->vfio, host);
1171 }
1172 
1173 static const char *const nvme_strong_runtime_opts[] = {
1174     NVME_BLOCK_OPT_DEVICE,
1175     NVME_BLOCK_OPT_NAMESPACE,
1176 
1177     NULL
1178 };
1179 
1180 static BlockDriver bdrv_nvme = {
1181     .format_name              = "nvme",
1182     .protocol_name            = "nvme",
1183     .instance_size            = sizeof(BDRVNVMeState),
1184 
1185     .bdrv_parse_filename      = nvme_parse_filename,
1186     .bdrv_file_open           = nvme_file_open,
1187     .bdrv_close               = nvme_close,
1188     .bdrv_getlength           = nvme_getlength,
1189     .bdrv_probe_blocksizes    = nvme_probe_blocksizes,
1190 
1191     .bdrv_co_preadv           = nvme_co_preadv,
1192     .bdrv_co_pwritev          = nvme_co_pwritev,
1193     .bdrv_co_flush_to_disk    = nvme_co_flush,
1194     .bdrv_reopen_prepare      = nvme_reopen_prepare,
1195 
1196     .bdrv_refresh_filename    = nvme_refresh_filename,
1197     .bdrv_refresh_limits      = nvme_refresh_limits,
1198     .strong_runtime_opts      = nvme_strong_runtime_opts,
1199 
1200     .bdrv_detach_aio_context  = nvme_detach_aio_context,
1201     .bdrv_attach_aio_context  = nvme_attach_aio_context,
1202 
1203     .bdrv_io_plug             = nvme_aio_plug,
1204     .bdrv_io_unplug           = nvme_aio_unplug,
1205 
1206     .bdrv_register_buf        = nvme_register_buf,
1207     .bdrv_unregister_buf      = nvme_unregister_buf,
1208 };
1209 
1210 static void bdrv_nvme_init(void)
1211 {
1212     bdrv_register(&bdrv_nvme);
1213 }
1214 
1215 block_init(bdrv_nvme_init);
1216