xref: /openbmc/qemu/block/nvme.c (revision 8e6fe6b8)
1 /*
2  * NVMe block driver based on vfio
3  *
4  * Copyright 2016 - 2018 Red Hat, Inc.
5  *
6  * Authors:
7  *   Fam Zheng <famz@redhat.com>
8  *   Paolo Bonzini <pbonzini@redhat.com>
9  *
10  * This work is licensed under the terms of the GNU GPL, version 2 or later.
11  * See the COPYING file in the top-level directory.
12  */
13 
14 #include "qemu/osdep.h"
15 #include <linux/vfio.h>
16 #include "qapi/error.h"
17 #include "qapi/qmp/qdict.h"
18 #include "qapi/qmp/qstring.h"
19 #include "qemu/error-report.h"
20 #include "qemu/module.h"
21 #include "qemu/cutils.h"
22 #include "qemu/option.h"
23 #include "qemu/vfio-helpers.h"
24 #include "block/block_int.h"
25 #include "trace.h"
26 
27 #include "block/nvme.h"
28 
29 #define NVME_SQ_ENTRY_BYTES 64
30 #define NVME_CQ_ENTRY_BYTES 16
31 #define NVME_QUEUE_SIZE 128
32 #define NVME_BAR_SIZE 8192
33 
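/*
 * One submission or completion queue ring: host memory for the entries, its
 * IOVA as seen by the controller, the current head/tail indices, and a
 * pointer to the matching doorbell register in BAR0.
 */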
34 typedef struct {
35     int32_t  head, tail;
36     uint8_t  *queue;
37     uint64_t iova;
38     /* Hardware MMIO register */
39     volatile uint32_t *doorbell;
40 } NVMeQueue;
41 
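/*
 * Per-command tracking state. Each slot owns one pre-mapped page of PRP list
 * memory and records the completion callback registered at submission time.
 */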
42 typedef struct {
43     BlockCompletionFunc *cb;
44     void *opaque;
45     int cid;
46     void *prp_list_page;
47     uint64_t prp_list_iova;
48     bool busy;
49 } NVMeRequest;
50 
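/* A paired submission/completion queue plus its pool of request slots. */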
51 typedef struct {
52     CoQueue     free_req_queue;
53     QemuMutex   lock;
54 
55     /* Fields protected by BQL */
56     int         index;
57     uint8_t     *prp_list_pages;
58 
59     /* Fields protected by @lock */
60     NVMeQueue   sq, cq;
61     int         cq_phase;
62     NVMeRequest reqs[NVME_QUEUE_SIZE];
63     bool        busy;
64     int         need_kick;
65     int         inflight;
66 } NVMeQueuePair;
67 
68 /* Memory mapped registers */
69 typedef volatile struct {
70     uint64_t cap;
71     uint32_t vs;
72     uint32_t intms;
73     uint32_t intmc;
74     uint32_t cc;
75     uint32_t reserved0;
76     uint32_t csts;
77     uint32_t nssr;
78     uint32_t aqa;
79     uint64_t asq;
80     uint64_t acq;
81     uint32_t cmbloc;
82     uint32_t cmbsz;
83     uint8_t  reserved1[0xec0];
84     uint8_t  cmd_set_specific[0x100];
85     uint32_t doorbells[];
86 } NVMeRegs;
87 
88 QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000);
89 
90 typedef struct {
91     AioContext *aio_context;
92     QEMUVFIOState *vfio;
93     NVMeRegs *regs;
94     /* The submission/completion queue pairs.
95      * [0]: admin queue.
96      * [1..]: io queues.
97      */
98     NVMeQueuePair **queues;
99     int nr_queues;
100     size_t page_size;
101     /* How many uint32_t elements each doorbell entry takes. */
102     size_t doorbell_scale;
103     bool write_cache_supported;
104     EventNotifier irq_notifier;
105     uint64_t nsze; /* Namespace size reported by identify command */
106     int nsid;      /* The namespace id to read/write data. */
107     uint64_t max_transfer;
108     bool plugged;
109 
110     CoMutex dma_map_lock;
111     CoQueue dma_flush_queue;
112 
113     /* Total size of mapped qiov, accessed under dma_map_lock */
114     int dma_map_count;
115 
116     /* PCI address (required for nvme_refresh_filename()) */
117     char *device;
118 } BDRVNVMeState;
119 
120 #define NVME_BLOCK_OPT_DEVICE "device"
121 #define NVME_BLOCK_OPT_NAMESPACE "namespace"
122 
123 static QemuOptsList runtime_opts = {
124     .name = "nvme",
125     .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
126     .desc = {
127         {
128             .name = NVME_BLOCK_OPT_DEVICE,
129             .type = QEMU_OPT_STRING,
130             .help = "NVMe PCI device address",
131         },
132         {
133             .name = NVME_BLOCK_OPT_NAMESPACE,
134             .type = QEMU_OPT_NUMBER,
135             .help = "NVMe namespace",
136         },
137         { /* end of list */ }
138     },
139 };
140 
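/*
 * Allocate a queue ring of @nentries entries of @entry_bytes each, rounded
 * up to the device page size, and map it for DMA so the controller can
 * access it at q->iova.
 */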
141 static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
142                             int nentries, int entry_bytes, Error **errp)
143 {
144     BDRVNVMeState *s = bs->opaque;
145     size_t bytes;
146     int r;
147 
148     bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
149     q->head = q->tail = 0;
150     q->queue = qemu_try_blockalign0(bs, bytes);
151 
152     if (!q->queue) {
153         error_setg(errp, "Cannot allocate queue");
154         return;
155     }
156     r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova);
157     if (r) {
158         error_setg(errp, "Cannot map queue");
159     }
160 }
161 
162 static void nvme_free_queue_pair(BlockDriverState *bs, NVMeQueuePair *q)
163 {
164     qemu_vfree(q->prp_list_pages);
165     qemu_vfree(q->sq.queue);
166     qemu_vfree(q->cq.queue);
167     qemu_mutex_destroy(&q->lock);
168     g_free(q);
169 }
170 
171 static void nvme_free_req_queue_cb(void *opaque)
172 {
173     NVMeQueuePair *q = opaque;
174 
175     qemu_mutex_lock(&q->lock);
176     while (qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
177         /* Retry all pending requests */
178     }
179     qemu_mutex_unlock(&q->lock);
180 }
181 
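/*
 * Create a submission/completion queue pair with @size entries each:
 * allocate and DMA-map one PRP list page per request slot, initialize both
 * rings, and point them at their doorbell registers (SQ tail doorbell at
 * index 2 * idx, CQ head doorbell at 2 * idx + 1, scaled by the doorbell
 * stride).
 */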
182 static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
183                                              int idx, int size,
184                                              Error **errp)
185 {
186     int i, r;
187     BDRVNVMeState *s = bs->opaque;
188     Error *local_err = NULL;
189     NVMeQueuePair *q = g_new0(NVMeQueuePair, 1);
190     uint64_t prp_list_iova;
191 
192     qemu_mutex_init(&q->lock);
193     q->index = idx;
194     qemu_co_queue_init(&q->free_req_queue);
195     q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE);
196     r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
197                           s->page_size * NVME_QUEUE_SIZE,
198                           false, &prp_list_iova);
199     if (r) {
200         goto fail;
201     }
202     for (i = 0; i < NVME_QUEUE_SIZE; i++) {
203         NVMeRequest *req = &q->reqs[i];
204         req->cid = i + 1;
205         req->prp_list_page = q->prp_list_pages + i * s->page_size;
206         req->prp_list_iova = prp_list_iova + i * s->page_size;
207     }
208     nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
209     if (local_err) {
210         error_propagate(errp, local_err);
211         goto fail;
212     }
213     q->sq.doorbell = &s->regs->doorbells[idx * 2 * s->doorbell_scale];
214 
215     nvme_init_queue(bs, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
216     if (local_err) {
217         error_propagate(errp, local_err);
218         goto fail;
219     }
220     q->cq.doorbell = &s->regs->doorbells[(idx * 2 + 1) * s->doorbell_scale];
221 
222     return q;
223 fail:
224     nvme_free_queue_pair(bs, q);
225     return NULL;
226 }
227 
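/*
 * Ring the submission queue tail doorbell for commands queued since the last
 * kick. Doorbell writes are suppressed while requests are plugged for
 * batching.
 */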
228 /* With q->lock */
229 static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
230 {
231     if (s->plugged || !q->need_kick) {
232         return;
233     }
234     trace_nvme_kick(s, q->index);
235     assert(!(q->sq.tail & 0xFF00));
236     /* Fence the write to submission queue entry before notifying the device. */
237     smp_wmb();
238     *q->sq.doorbell = cpu_to_le32(q->sq.tail);
239     q->inflight += q->need_kick;
240     q->need_kick = 0;
241 }
242 
243 /* Find a free request element if any, otherwise:
244  * a) if in coroutine context, wait for one to become available;
245  * b) if not in coroutine context, return NULL.
246  */
247 static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
248 {
249     int i;
250     NVMeRequest *req = NULL;
251 
252     qemu_mutex_lock(&q->lock);
253     while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) {
254         /* We have to leave one slot empty as that is the full queue case (head
255          * == tail + 1). */
256         if (qemu_in_coroutine()) {
257             trace_nvme_free_req_queue_wait(q);
258             qemu_co_queue_wait(&q->free_req_queue, &q->lock);
259         } else {
260             qemu_mutex_unlock(&q->lock);
261             return NULL;
262         }
263     }
264     for (i = 0; i < NVME_QUEUE_SIZE; i++) {
265         if (!q->reqs[i].busy) {
266             q->reqs[i].busy = true;
267             req = &q->reqs[i];
268             break;
269         }
270     }
271     /* We have checked inflight and need_kick while holding q->lock, so one
272      * free req must be available. */
273     assert(req);
274     qemu_mutex_unlock(&q->lock);
275     return req;
276 }
277 
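/*
 * Map the status code of a completion entry to a negative errno: 0 on
 * success, -ENOSYS for an invalid opcode, -EINVAL for an invalid field,
 * -EIO for anything else.
 */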
278 static inline int nvme_translate_error(const NvmeCqe *c)
279 {
280     uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
281     if (status) {
282         trace_nvme_error(le32_to_cpu(c->result),
283                          le16_to_cpu(c->sq_head),
284                          le16_to_cpu(c->sq_id),
285                          le16_to_cpu(c->cid),
286                          status);
287     }
288     switch (status) {
289     case 0:
290         return 0;
291     case 1:
292         return -ENOSYS;
293     case 2:
294         return -EINVAL;
295     default:
296         return -EIO;
297     }
298 }
299 
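/*
 * Reap completion queue entries. An entry is new when its phase tag bit
 * differs from q->cq_phase, which is flipped every time the head index wraps
 * around. Completion callbacks are invoked with q->lock temporarily dropped.
 */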
300 /* With q->lock */
301 static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
302 {
303     bool progress = false;
304     NVMeRequest *preq;
305     NVMeRequest req;
306     NvmeCqe *c;
307 
308     trace_nvme_process_completion(s, q->index, q->inflight);
309     if (q->busy || s->plugged) {
310         trace_nvme_process_completion_queue_busy(s, q->index);
311         return false;
312     }
313     q->busy = true;
314     assert(q->inflight >= 0);
315     while (q->inflight) {
316         int16_t cid;
317         c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
318         if (!c->cid || (le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
319             break;
320         }
321         q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
322         if (!q->cq.head) {
323             q->cq_phase = !q->cq_phase;
324         }
325         cid = le16_to_cpu(c->cid);
326         if (cid == 0 || cid > NVME_QUEUE_SIZE) {
327             fprintf(stderr, "Unexpected CID in completion queue: %" PRId16 "\n",
328                     cid);
329             continue;
330         }
331         assert(cid <= NVME_QUEUE_SIZE);
332         trace_nvme_complete_command(s, q->index, cid);
333         preq = &q->reqs[cid - 1];
334         req = *preq;
335         assert(req.cid == cid);
336         assert(req.cb);
337         preq->busy = false;
338         preq->cb = preq->opaque = NULL;
339         qemu_mutex_unlock(&q->lock);
340         req.cb(req.opaque, nvme_translate_error(c));
341         qemu_mutex_lock(&q->lock);
342         c->cid = cpu_to_le16(0);
343         q->inflight--;
344         /* Flip Phase Tag bit. */
345         c->status = cpu_to_le16(le16_to_cpu(c->status) ^ 0x1);
346         progress = true;
347     }
348     if (progress) {
349         /* Notify the device so it can post more completions. */
350         smp_mb_release();
351         *q->cq.doorbell = cpu_to_le32(q->cq.head);
352         if (!qemu_co_queue_empty(&q->free_req_queue)) {
353             aio_bh_schedule_oneshot(s->aio_context, nvme_free_req_queue_cb, q);
354         }
355     }
356     q->busy = false;
357     return progress;
358 }
359 
360 static void nvme_trace_command(const NvmeCmd *cmd)
361 {
362     int i;
363 
364     for (i = 0; i < 8; ++i) {
365         uint8_t *cmdp = (uint8_t *)cmd + i * 8;
366         trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3],
367                                       cmdp[4], cmdp[5], cmdp[6], cmdp[7]);
368     }
369 }
370 
371 static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q,
372                                 NVMeRequest *req,
373                                 NvmeCmd *cmd, BlockCompletionFunc cb,
374                                 void *opaque)
375 {
376     assert(!req->cb);
377     req->cb = cb;
378     req->opaque = opaque;
379     cmd->cid = cpu_to_le16(req->cid);
380 
381     trace_nvme_submit_command(s, q->index, req->cid);
382     nvme_trace_command(cmd);
383     qemu_mutex_lock(&q->lock);
384     memcpy((uint8_t *)q->sq.queue +
385            q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
386     q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
387     q->need_kick++;
388     nvme_kick(s, q);
389     nvme_process_completion(s, q);
390     qemu_mutex_unlock(&q->lock);
391 }
392 
393 static void nvme_cmd_sync_cb(void *opaque, int ret)
394 {
395     int *pret = opaque;
396     *pret = ret;
397     aio_wait_kick();
398 }
399 
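/*
 * Submit @cmd on queue @q and poll the AioContext until it completes.
 * Returns the translated errno, or -EBUSY if no free request slot is
 * available outside coroutine context.
 */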
400 static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
401                          NvmeCmd *cmd)
402 {
403     NVMeRequest *req;
404     BDRVNVMeState *s = bs->opaque;
405     int ret = -EINPROGRESS;
406     req = nvme_get_free_req(q);
407     if (!req) {
408         return -EBUSY;
409     }
410     nvme_submit_command(s, q, req, cmd, nvme_cmd_sync_cb, &ret);
411 
412     BDRV_POLL_WHILE(bs, ret == -EINPROGRESS);
413     return ret;
414 }
415 
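/*
 * Issue Identify Controller (CNS 0x1) followed by Identify Namespace to
 * discover write cache support, the maximum transfer size (capped by the
 * one-page PRP list) and the namespace size, which nvme_getlength() later
 * converts to bytes assuming 512-byte sectors.
 */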
416 static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
417 {
418     BDRVNVMeState *s = bs->opaque;
419     NvmeIdCtrl *idctrl;
420     NvmeIdNs *idns;
421     uint8_t *resp;
422     int r;
423     uint64_t iova;
424     NvmeCmd cmd = {
425         .opcode = NVME_ADM_CMD_IDENTIFY,
426         .cdw10 = cpu_to_le32(0x1),
427     };
428 
429     resp = qemu_try_blockalign0(bs, sizeof(NvmeIdCtrl));
430     if (!resp) {
431         error_setg(errp, "Cannot allocate buffer for identify response");
432         goto out;
433     }
434     idctrl = (NvmeIdCtrl *)resp;
435     idns = (NvmeIdNs *)resp;
436     r = qemu_vfio_dma_map(s->vfio, resp, sizeof(NvmeIdCtrl), true, &iova);
437     if (r) {
438         error_setg(errp, "Cannot map buffer for DMA");
439         goto out;
440     }
441     cmd.prp1 = cpu_to_le64(iova);
442 
443     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
444         error_setg(errp, "Failed to identify controller");
445         goto out;
446     }
447 
448     if (le32_to_cpu(idctrl->nn) < namespace) {
449         error_setg(errp, "Invalid namespace");
450         goto out;
451     }
452     s->write_cache_supported = le32_to_cpu(idctrl->vwc) & 0x1;
453     s->max_transfer = (idctrl->mdts ? 1 << idctrl->mdts : 0) * s->page_size;
454     /* For now the page list buffer per command is one page, to hold at most
455      * s->page_size / sizeof(uint64_t) entries. */
456     s->max_transfer = MIN_NON_ZERO(s->max_transfer,
457                           s->page_size / sizeof(uint64_t) * s->page_size);
458 
459     memset(resp, 0, 4096);
460 
461     cmd.cdw10 = 0;
462     cmd.nsid = cpu_to_le32(namespace);
463     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
464         error_setg(errp, "Failed to identify namespace");
465         goto out;
466     }
467 
468     s->nsze = le64_to_cpu(idns->nsze);
469 
470 out:
471     qemu_vfio_dma_unmap(s->vfio, resp);
472     qemu_vfree(resp);
473 }
474 
475 static bool nvme_poll_queues(BDRVNVMeState *s)
476 {
477     bool progress = false;
478     int i;
479 
480     for (i = 0; i < s->nr_queues; i++) {
481         NVMeQueuePair *q = s->queues[i];
482         qemu_mutex_lock(&q->lock);
483         while (nvme_process_completion(s, q)) {
484             /* Keep polling */
485             progress = true;
486         }
487         qemu_mutex_unlock(&q->lock);
488     }
489     return progress;
490 }
491 
492 static void nvme_handle_event(EventNotifier *n)
493 {
494     BDRVNVMeState *s = container_of(n, BDRVNVMeState, irq_notifier);
495 
496     trace_nvme_handle_event(s);
497     event_notifier_test_and_clear(n);
498     nvme_poll_queues(s);
499 }
500 
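/*
 * Create an additional I/O queue pair and register it with the controller.
 * The completion queue must be created before the submission queue that
 * posts to it, hence the Create CQ / Create SQ ordering below.
 */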
501 static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
502 {
503     BDRVNVMeState *s = bs->opaque;
504     int n = s->nr_queues;
505     NVMeQueuePair *q;
506     NvmeCmd cmd;
507     int queue_size = NVME_QUEUE_SIZE;
508 
509     q = nvme_create_queue_pair(bs, n, queue_size, errp);
510     if (!q) {
511         return false;
512     }
513     cmd = (NvmeCmd) {
514         .opcode = NVME_ADM_CMD_CREATE_CQ,
515         .prp1 = cpu_to_le64(q->cq.iova),
516         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
517         .cdw11 = cpu_to_le32(0x3),
518     };
519     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
520         error_setg(errp, "Failed to create io queue [%d]", n);
521         nvme_free_queue_pair(bs, q);
522         return false;
523     }
524     cmd = (NvmeCmd) {
525         .opcode = NVME_ADM_CMD_CREATE_SQ,
526         .prp1 = cpu_to_le64(q->sq.iova),
527         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
528         .cdw11 = cpu_to_le32(0x1 | (n << 16)),
529     };
530     if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
531         error_setg(errp, "Failed to create io queue [%d]", n);
532         nvme_free_queue_pair(bs, q);
533         return false;
534     }
535     s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
536     s->queues[n] = q;
537     s->nr_queues++;
538     return true;
539 }
540 
541 static bool nvme_poll_cb(void *opaque)
542 {
543     EventNotifier *e = opaque;
544     BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier);
545     bool progress = false;
546 
547     trace_nvme_poll_cb(s);
548     progress = nvme_poll_queues(s);
549     return progress;
550 }
551 
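/*
 * Controller bring-up: open the PCI device through VFIO, map BAR0, reset the
 * controller (CC.EN = 0), program the admin queue registers (the AQA depth
 * fields are zero-based), enable the controller, wire up the MSI-X interrupt
 * notifier, identify the controller and namespace, and create one I/O queue
 * pair.
 */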
552 static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
553                      Error **errp)
554 {
555     BDRVNVMeState *s = bs->opaque;
556     int ret;
557     uint64_t cap;
558     uint64_t timeout_ms;
559     uint64_t deadline, now;
560     Error *local_err = NULL;
561 
562     qemu_co_mutex_init(&s->dma_map_lock);
563     qemu_co_queue_init(&s->dma_flush_queue);
564     s->device = g_strdup(device);
565     s->nsid = namespace;
566     s->aio_context = bdrv_get_aio_context(bs);
567     ret = event_notifier_init(&s->irq_notifier, 0);
568     if (ret) {
569         error_setg(errp, "Failed to init event notifier");
570         return ret;
571     }
572 
573     s->vfio = qemu_vfio_open_pci(device, errp);
574     if (!s->vfio) {
575         ret = -EINVAL;
576         goto out;
577     }
578 
579     s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, NVME_BAR_SIZE, errp);
580     if (!s->regs) {
581         ret = -EINVAL;
582         goto out;
583     }
584 
585     /* Perform the initialization sequence as described in NVMe spec section
586      * 7.6.1 "Initialization". */
587 
588     cap = le64_to_cpu(s->regs->cap);
589     if (!(cap & (1ULL << 37))) {
590         error_setg(errp, "Device doesn't support NVMe command set");
591         ret = -EINVAL;
592         goto out;
593     }
594 
595     s->page_size = MAX(4096, 1 << (12 + ((cap >> 48) & 0xF)));
596     s->doorbell_scale = (4 << (((cap >> 32) & 0xF))) / sizeof(uint32_t);
597     bs->bl.opt_mem_alignment = s->page_size;
598     timeout_ms = MIN(500 * ((cap >> 24) & 0xFF), 30000);
599 
600     /* Reset device to get a clean state. */
601     s->regs->cc = cpu_to_le32(le32_to_cpu(s->regs->cc) & 0xFE);
602     /* Wait for CSTS.RDY = 0. */
603     deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * 1000000ULL;
604     while (le32_to_cpu(s->regs->csts) & 0x1) {
605         if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
606             error_setg(errp, "Timeout while waiting for device to reset (%"
607                              PRIu64 " ms)",
608                        timeout_ms);
609             ret = -ETIMEDOUT;
610             goto out;
611         }
612     }
613 
614     /* Set up admin queue. */
615     s->queues = g_new(NVMeQueuePair *, 1);
616     s->nr_queues = 1;
617     s->queues[0] = nvme_create_queue_pair(bs, 0, NVME_QUEUE_SIZE, errp);
618     if (!s->queues[0]) {
619         ret = -EINVAL;
620         goto out;
621     }
622     QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
623     s->regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << 16) | (NVME_QUEUE_SIZE - 1));
624     s->regs->asq = cpu_to_le64(s->queues[0]->sq.iova);
625     s->regs->acq = cpu_to_le64(s->queues[0]->cq.iova);
626 
627     /* After setting up all control registers we can enable the device now. */
628     s->regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) |
629                               (ctz32(NVME_SQ_ENTRY_BYTES) << 16) |
630                               0x1);
631     /* Wait for CSTS.RDY = 1. */
632     now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
633     deadline = now + timeout_ms * 1000000;
634     while (!(le32_to_cpu(s->regs->csts) & 0x1)) {
635         if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
636             error_setg(errp, "Timeout while waiting for device to start (%"
637                              PRIu64 " ms)",
638                        timeout_ms);
639             ret = -ETIMEDOUT;
640             goto out;
641         }
642     }
643 
644     ret = qemu_vfio_pci_init_irq(s->vfio, &s->irq_notifier,
645                                  VFIO_PCI_MSIX_IRQ_INDEX, errp);
646     if (ret) {
647         goto out;
648     }
649     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
650                            false, nvme_handle_event, nvme_poll_cb);
651 
652     nvme_identify(bs, namespace, &local_err);
653     if (local_err) {
654         error_propagate(errp, local_err);
655         ret = -EIO;
656         goto out;
657     }
658 
659     /* Set up command queues. */
660     if (!nvme_add_io_queue(bs, errp)) {
661         ret = -EIO;
662     }
663 out:
664     /* Cleaning up is done in nvme_file_open() upon error. */
665     return ret;
666 }
667 
668 /* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example:
669  *
670  *     nvme://0000:44:00.0/1
671  *
672  * where "nvme://" is the fixed protocol prefix, the middle part is the PCI
673  * address, and the last part is the namespace number, which starts from 1
674  * according to the NVMe spec. */
675 static void nvme_parse_filename(const char *filename, QDict *options,
676                                 Error **errp)
677 {
678     int pref = strlen("nvme://");
679 
680     if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) {
681         const char *tmp = filename + pref;
682         char *device;
683         const char *namespace;
684         unsigned long ns;
685         const char *slash = strchr(tmp, '/');
686         if (!slash) {
687             qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp);
688             return;
689         }
690         device = g_strndup(tmp, slash - tmp);
691         qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device);
692         g_free(device);
693         namespace = slash + 1;
694         if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) {
695             error_setg(errp, "Invalid namespace '%s', positive number expected",
696                        namespace);
697             return;
698         }
699         qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE,
700                       *namespace ? namespace : "1");
701     }
702 }
703 
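/* Toggle the controller's volatile write cache with Set Features (FID 0x06). */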
704 static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
705                                            Error **errp)
706 {
707     int ret;
708     BDRVNVMeState *s = bs->opaque;
709     NvmeCmd cmd = {
710         .opcode = NVME_ADM_CMD_SET_FEATURES,
711         .nsid = cpu_to_le32(s->nsid),
712         .cdw10 = cpu_to_le32(0x06),
713         .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
714     };
715 
716     ret = nvme_cmd_sync(bs, s->queues[0], &cmd);
717     if (ret) {
718         error_setg(errp, "Failed to configure NVMe write cache");
719     }
720     return ret;
721 }
722 
723 static void nvme_close(BlockDriverState *bs)
724 {
725     int i;
726     BDRVNVMeState *s = bs->opaque;
727 
728     for (i = 0; i < s->nr_queues; ++i) {
729         nvme_free_queue_pair(bs, s->queues[i]);
730     }
731     g_free(s->queues);
732     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
733                            false, NULL, NULL);
734     event_notifier_cleanup(&s->irq_notifier);
735     qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE);
736     qemu_vfio_close(s->vfio);
737 
738     g_free(s->device);
739 }
740 
741 static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
742                           Error **errp)
743 {
744     const char *device;
745     QemuOpts *opts;
746     int namespace;
747     int ret;
748     BDRVNVMeState *s = bs->opaque;
749 
750     opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
751     qemu_opts_absorb_qdict(opts, options, &error_abort);
752     device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
753     if (!device) {
754         error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required");
755         qemu_opts_del(opts);
756         return -EINVAL;
757     }
758 
759     namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1);
760     ret = nvme_init(bs, device, namespace, errp);
761     qemu_opts_del(opts);
762     if (ret) {
763         goto fail;
764     }
765     if (flags & BDRV_O_NOCACHE) {
766         if (!s->write_cache_supported) {
767             error_setg(errp,
768                        "NVMe controller doesn't support write cache configuration");
769             ret = -EINVAL;
770         } else {
771             ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE),
772                                                   errp);
773         }
774         if (ret) {
775             goto fail;
776         }
777     }
778     bs->supported_write_flags = BDRV_REQ_FUA;
779     return 0;
780 fail:
781     nvme_close(bs);
782     return ret;
783 }
784 
785 static int64_t nvme_getlength(BlockDriverState *bs)
786 {
787     BDRVNVMeState *s = bs->opaque;
788 
789     return s->nsze << BDRV_SECTOR_BITS;
790 }
791 
792 /* Called with s->dma_map_lock */
793 static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs,
794                                             QEMUIOVector *qiov)
795 {
796     int r = 0;
797     BDRVNVMeState *s = bs->opaque;
798 
799     s->dma_map_count -= qiov->size;
800     if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) {
801         r = qemu_vfio_dma_reset_temporary(s->vfio);
802         if (!r) {
803             qemu_co_queue_restart_all(&s->dma_flush_queue);
804         }
805     }
806     return r;
807 }
808 
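/*
 * DMA-map every iovec of @qiov and fill in the command's data pointers: one
 * page goes in PRP1, a second page in PRP2, and larger transfers point PRP2
 * at the request's pre-mapped PRP list page. On -ENOMEM the function waits
 * for in-flight mappings to drain (or resets temporary mappings) and retries
 * the mapping once.
 */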
809 /* Called with s->dma_map_lock */
810 static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
811                                           NVMeRequest *req, QEMUIOVector *qiov)
812 {
813     BDRVNVMeState *s = bs->opaque;
814     uint64_t *pagelist = req->prp_list_page;
815     int i, j, r;
816     int entries = 0;
817 
818     assert(qiov->size);
819     assert(QEMU_IS_ALIGNED(qiov->size, s->page_size));
820     assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t));
821     for (i = 0; i < qiov->niov; ++i) {
822         bool retry = true;
823         uint64_t iova;
824 try_map:
825         r = qemu_vfio_dma_map(s->vfio,
826                               qiov->iov[i].iov_base,
827                               qiov->iov[i].iov_len,
828                               true, &iova);
829         if (r == -ENOMEM && retry) {
830             retry = false;
831             trace_nvme_dma_flush_queue_wait(s);
832             if (s->dma_map_count) {
833                 trace_nvme_dma_map_flush(s);
834                 qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock);
835             } else {
836                 r = qemu_vfio_dma_reset_temporary(s->vfio);
837                 if (r) {
838                     goto fail;
839                 }
840             }
841             goto try_map;
842         }
843         if (r) {
844             goto fail;
845         }
846 
847         for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) {
848             pagelist[entries++] = cpu_to_le64(iova + j * s->page_size);
849         }
850         trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base,
851                                     qiov->iov[i].iov_len / s->page_size);
852     }
853 
854     s->dma_map_count += qiov->size;
855 
856     assert(entries <= s->page_size / sizeof(uint64_t));
857     switch (entries) {
858     case 0:
859         abort();
860     case 1:
861         cmd->prp1 = pagelist[0];
862         cmd->prp2 = 0;
863         break;
864     case 2:
865         cmd->prp1 = pagelist[0];
866         cmd->prp2 = pagelist[1];
867         break;
868     default:
869         cmd->prp1 = pagelist[0];
870         cmd->prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
871         break;
872     }
873     trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
874     for (i = 0; i < entries; ++i) {
875         trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]);
876     }
877     return 0;
878 fail:
879     /* No need to unmap [0 - i) iovs even if we've failed, since we don't
880      * increment s->dma_map_count. This is okay for fixed mapping memory areas
881      * because they are already mapped before calling this function; for
882      * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by
883      * calling qemu_vfio_dma_reset_temporary when necessary. */
884     return r;
885 }
886 
887 typedef struct {
888     Coroutine *co;
889     int ret;
890     AioContext *ctx;
891 } NVMeCoData;
892 
893 static void nvme_rw_cb_bh(void *opaque)
894 {
895     NVMeCoData *data = opaque;
896     qemu_coroutine_enter(data->co);
897 }
898 
899 static void nvme_rw_cb(void *opaque, int ret)
900 {
901     NVMeCoData *data = opaque;
902     data->ret = ret;
903     if (!data->co) {
904         /* The rw coroutine hasn't yielded yet; don't try to enter. */
905         return;
906     }
907     aio_bh_schedule_oneshot(data->ctx, nvme_rw_cb_bh, data);
908 }
909 
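/*
 * Issue a single aligned read or write on the first I/O queue. cdw10/cdw11
 * carry the starting LBA, cdw12 carries the zero-based block count and the
 * FUA flag; the coroutine yields until nvme_rw_cb() signals completion.
 */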
910 static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
911                                             uint64_t offset, uint64_t bytes,
912                                             QEMUIOVector *qiov,
913                                             bool is_write,
914                                             int flags)
915 {
916     int r;
917     BDRVNVMeState *s = bs->opaque;
918     NVMeQueuePair *ioq = s->queues[1];
919     NVMeRequest *req;
920     uint32_t cdw12 = (((bytes >> BDRV_SECTOR_BITS) - 1) & 0xFFFF) |
921                        (flags & BDRV_REQ_FUA ? 1 << 30 : 0);
922     NvmeCmd cmd = {
923         .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ,
924         .nsid = cpu_to_le32(s->nsid),
925         .cdw10 = cpu_to_le32((offset >> BDRV_SECTOR_BITS) & 0xFFFFFFFF),
926         .cdw11 = cpu_to_le32(((offset >> BDRV_SECTOR_BITS) >> 32) & 0xFFFFFFFF),
927         .cdw12 = cpu_to_le32(cdw12),
928     };
929     NVMeCoData data = {
930         .ctx = bdrv_get_aio_context(bs),
931         .ret = -EINPROGRESS,
932     };
933 
934     trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
935     assert(s->nr_queues > 1);
936     req = nvme_get_free_req(ioq);
937     assert(req);
938 
939     qemu_co_mutex_lock(&s->dma_map_lock);
940     r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
941     qemu_co_mutex_unlock(&s->dma_map_lock);
942     if (r) {
943         req->busy = false;
944         return r;
945     }
946     nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
947 
948     data.co = qemu_coroutine_self();
949     while (data.ret == -EINPROGRESS) {
950         qemu_coroutine_yield();
951     }
952 
953     qemu_co_mutex_lock(&s->dma_map_lock);
954     r = nvme_cmd_unmap_qiov(bs, qiov);
955     qemu_co_mutex_unlock(&s->dma_map_lock);
956     if (r) {
957         return r;
958     }
959 
960     trace_nvme_rw_done(s, is_write, offset, bytes, data.ret);
961     return data.ret;
962 }
963 
964 static inline bool nvme_qiov_aligned(BlockDriverState *bs,
965                                      const QEMUIOVector *qiov)
966 {
967     int i;
968     BDRVNVMeState *s = bs->opaque;
969 
970     for (i = 0; i < qiov->niov; ++i) {
971         if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) ||
972             !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) {
973             trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
974                                       qiov->iov[i].iov_len, s->page_size);
975             return false;
976         }
977     }
978     return true;
979 }
980 
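/*
 * Read/write entry point: use the caller's qiov directly if every iovec is
 * page-aligned, otherwise bounce the data through a temporary aligned buffer.
 */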
981 static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
982                        QEMUIOVector *qiov, bool is_write, int flags)
983 {
984     BDRVNVMeState *s = bs->opaque;
985     int r;
986     uint8_t *buf = NULL;
987     QEMUIOVector local_qiov;
988 
989     assert(QEMU_IS_ALIGNED(offset, s->page_size));
990     assert(QEMU_IS_ALIGNED(bytes, s->page_size));
991     assert(bytes <= s->max_transfer);
992     if (nvme_qiov_aligned(bs, qiov)) {
993         return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
994     }
995     trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
996     buf = qemu_try_blockalign(bs, bytes);
997 
998     if (!buf) {
999         return -ENOMEM;
1000     }
1001     qemu_iovec_init(&local_qiov, 1);
1002     if (is_write) {
1003         qemu_iovec_to_buf(qiov, 0, buf, bytes);
1004     }
1005     qemu_iovec_add(&local_qiov, buf, bytes);
1006     r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags);
1007     qemu_iovec_destroy(&local_qiov);
1008     if (!r && !is_write) {
1009         qemu_iovec_from_buf(qiov, 0, buf, bytes);
1010     }
1011     qemu_vfree(buf);
1012     return r;
1013 }
1014 
1015 static coroutine_fn int nvme_co_preadv(BlockDriverState *bs,
1016                                        uint64_t offset, uint64_t bytes,
1017                                        QEMUIOVector *qiov, int flags)
1018 {
1019     return nvme_co_prw(bs, offset, bytes, qiov, false, flags);
1020 }
1021 
1022 static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs,
1023                                         uint64_t offset, uint64_t bytes,
1024                                         QEMUIOVector *qiov, int flags)
1025 {
1026     return nvme_co_prw(bs, offset, bytes, qiov, true, flags);
1027 }
1028 
1029 static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
1030 {
1031     BDRVNVMeState *s = bs->opaque;
1032     NVMeQueuePair *ioq = s->queues[1];
1033     NVMeRequest *req;
1034     NvmeCmd cmd = {
1035         .opcode = NVME_CMD_FLUSH,
1036         .nsid = cpu_to_le32(s->nsid),
1037     };
1038     NVMeCoData data = {
1039         .ctx = bdrv_get_aio_context(bs),
1040         .ret = -EINPROGRESS,
1041     };
1042 
1043     assert(s->nr_queues > 1);
1044     req = nvme_get_free_req(ioq);
1045     assert(req);
1046     nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
1047 
1048     data.co = qemu_coroutine_self();
1049     if (data.ret == -EINPROGRESS) {
1050         qemu_coroutine_yield();
1051     }
1052 
1053     return data.ret;
1054 }
1055 
1056 
1057 static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
1058                                BlockReopenQueue *queue, Error **errp)
1059 {
1060     return 0;
1061 }
1062 
1063 static void nvme_refresh_filename(BlockDriverState *bs)
1064 {
1065     BDRVNVMeState *s = bs->opaque;
1066 
1067     snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i",
1068              s->device, s->nsid);
1069 }
1070 
1071 static void nvme_refresh_limits(BlockDriverState *bs, Error **errp)
1072 {
1073     BDRVNVMeState *s = bs->opaque;
1074 
1075     bs->bl.opt_mem_alignment = s->page_size;
1076     bs->bl.request_alignment = s->page_size;
1077     bs->bl.max_transfer = s->max_transfer;
1078 }
1079 
1080 static void nvme_detach_aio_context(BlockDriverState *bs)
1081 {
1082     BDRVNVMeState *s = bs->opaque;
1083 
1084     aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
1085                            false, NULL, NULL);
1086 }
1087 
1088 static void nvme_attach_aio_context(BlockDriverState *bs,
1089                                     AioContext *new_context)
1090 {
1091     BDRVNVMeState *s = bs->opaque;
1092 
1093     s->aio_context = new_context;
1094     aio_set_event_notifier(new_context, &s->irq_notifier,
1095                            false, nvme_handle_event, nvme_poll_cb);
1096 }
1097 
1098 static void nvme_aio_plug(BlockDriverState *bs)
1099 {
1100     BDRVNVMeState *s = bs->opaque;
1101     assert(!s->plugged);
1102     s->plugged = true;
1103 }
1104 
1105 static void nvme_aio_unplug(BlockDriverState *bs)
1106 {
1107     int i;
1108     BDRVNVMeState *s = bs->opaque;
1109     assert(s->plugged);
1110     s->plugged = false;
1111     for (i = 1; i < s->nr_queues; i++) {
1112         NVMeQueuePair *q = s->queues[i];
1113         qemu_mutex_lock(&q->lock);
1114         nvme_kick(s, q);
1115         nvme_process_completion(s, q);
1116         qemu_mutex_unlock(&q->lock);
1117     }
1118 }
1119 
1120 static void nvme_register_buf(BlockDriverState *bs, void *host, size_t size)
1121 {
1122     int ret;
1123     BDRVNVMeState *s = bs->opaque;
1124 
1125     ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL);
1126     if (ret) {
1127         /* FIXME: we may run out of IOVA addresses after repeated
1128          * bdrv_register_buf/bdrv_unregister_buf, because nvme_vfio_dma_unmap
1129          * doesn't reclaim addresses for fixed mappings. */
1130         error_report("nvme_register_buf failed: %s", strerror(-ret));
1131     }
1132 }
1133 
1134 static void nvme_unregister_buf(BlockDriverState *bs, void *host)
1135 {
1136     BDRVNVMeState *s = bs->opaque;
1137 
1138     qemu_vfio_dma_unmap(s->vfio, host);
1139 }
1140 
1141 static const char *const nvme_strong_runtime_opts[] = {
1142     NVME_BLOCK_OPT_DEVICE,
1143     NVME_BLOCK_OPT_NAMESPACE,
1144 
1145     NULL
1146 };
1147 
1148 static BlockDriver bdrv_nvme = {
1149     .format_name              = "nvme",
1150     .protocol_name            = "nvme",
1151     .instance_size            = sizeof(BDRVNVMeState),
1152 
1153     .bdrv_parse_filename      = nvme_parse_filename,
1154     .bdrv_file_open           = nvme_file_open,
1155     .bdrv_close               = nvme_close,
1156     .bdrv_getlength           = nvme_getlength,
1157 
1158     .bdrv_co_preadv           = nvme_co_preadv,
1159     .bdrv_co_pwritev          = nvme_co_pwritev,
1160     .bdrv_co_flush_to_disk    = nvme_co_flush,
1161     .bdrv_reopen_prepare      = nvme_reopen_prepare,
1162 
1163     .bdrv_refresh_filename    = nvme_refresh_filename,
1164     .bdrv_refresh_limits      = nvme_refresh_limits,
1165     .strong_runtime_opts      = nvme_strong_runtime_opts,
1166 
1167     .bdrv_detach_aio_context  = nvme_detach_aio_context,
1168     .bdrv_attach_aio_context  = nvme_attach_aio_context,
1169 
1170     .bdrv_io_plug             = nvme_aio_plug,
1171     .bdrv_io_unplug           = nvme_aio_unplug,
1172 
1173     .bdrv_register_buf        = nvme_register_buf,
1174     .bdrv_unregister_buf      = nvme_unregister_buf,
1175 };
1176 
1177 static void bdrv_nvme_init(void)
1178 {
1179     bdrv_register(&bdrv_nvme);
1180 }
1181 
1182 block_init(bdrv_nvme_init);
1183