xref: /openbmc/qemu/block/nvme.c (revision a75ed3c43064528f3409f0be286b62b9c3a47218)
1  /*
2   * NVMe block driver based on vfio
3   *
4   * Copyright 2016 - 2018 Red Hat, Inc.
5   *
6   * Authors:
7   *   Fam Zheng <famz@redhat.com>
8   *   Paolo Bonzini <pbonzini@redhat.com>
9   *
10   * This work is licensed under the terms of the GNU GPL, version 2 or later.
11   * See the COPYING file in the top-level directory.
12   */
13  
14  #include "qemu/osdep.h"
15  #include <linux/vfio.h>
16  #include "qapi/error.h"
17  #include "qapi/qmp/qdict.h"
18  #include "qapi/qmp/qstring.h"
19  #include "qemu/error-report.h"
20  #include "qemu/main-loop.h"
21  #include "qemu/module.h"
22  #include "qemu/cutils.h"
23  #include "qemu/option.h"
24  #include "qemu/memalign.h"
25  #include "qemu/vfio-helpers.h"
26  #include "block/block_int.h"
27  #include "sysemu/replay.h"
28  #include "trace.h"
29  
30  #include "block/nvme.h"
31  
32  #define NVME_SQ_ENTRY_BYTES 64
33  #define NVME_CQ_ENTRY_BYTES 16
34  #define NVME_QUEUE_SIZE 128
35  #define NVME_DOORBELL_SIZE 4096
36  
37  /*
38   * We have to leave one slot empty as that is the full queue case where
39   * head == tail + 1.
40   */
41  #define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
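/* With NVME_QUEUE_SIZE of 128 this leaves 127 usable request slots per queue pair. */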
42  
43  typedef struct BDRVNVMeState BDRVNVMeState;
44  
45  /* Same index is used for queues and IRQs */
46  #define INDEX_ADMIN     0
47  #define INDEX_IO(n)     (1 + n)
48  
49  /* This driver shares a single MSIX IRQ for the admin and I/O queues */
50  enum {
51      MSIX_SHARED_IRQ_IDX = 0,
52      MSIX_IRQ_COUNT = 1
53  };
54  
55  typedef struct {
56      int32_t  head, tail;
57      uint8_t  *queue;
58      uint64_t iova;
59      /* Hardware MMIO register */
60      volatile uint32_t *doorbell;
61  } NVMeQueue;
62  
63  typedef struct {
64      BlockCompletionFunc *cb;
65      void *opaque;
66      int cid;
67      void *prp_list_page;
68      uint64_t prp_list_iova;
69      int free_req_next; /* q->reqs[] index of next free req */
70  } NVMeRequest;
71  
72  typedef struct {
73      QemuMutex   lock;
74  
75      /* Read from I/O code path, initialized under BQL */
76      BDRVNVMeState   *s;
77      int             index;
78  
79      /* Fields protected by BQL */
80      uint8_t     *prp_list_pages;
81  
82      /* Fields protected by @lock */
83      CoQueue     free_req_queue;
84      NVMeQueue   sq, cq;
85      int         cq_phase;
86      int         free_req_head;
87      NVMeRequest reqs[NVME_NUM_REQS];
88      int         need_kick;
89      int         inflight;
90  
91      /* Thread-safe, no lock necessary */
92      QEMUBH      *completion_bh;
93  } NVMeQueuePair;
94  
95  struct BDRVNVMeState {
96      AioContext *aio_context;
97      QEMUVFIOState *vfio;
98      void *bar0_wo_map;
99      /* Memory-mapped doorbell registers */
100      volatile struct {
101          uint32_t sq_tail;
102          uint32_t cq_head;
103      } *doorbells;
104      /* The submission/completion queue pairs.
105       * [0]: admin queue.
106       * [1..]: io queues.
107       */
108      NVMeQueuePair **queues;
109      unsigned queue_count;
110      size_t page_size;
111      /* Number of uint32_t elements each doorbell entry takes. */
112      size_t doorbell_scale;
113      bool write_cache_supported;
114      EventNotifier irq_notifier[MSIX_IRQ_COUNT];
115  
116      uint64_t nsze; /* Namespace size reported by identify command */
117      int nsid;      /* The namespace id to read/write data. */
118      int blkshift;
119  
120      uint64_t max_transfer;
121      bool plugged;
122  
123      bool supports_write_zeroes;
124      bool supports_discard;
125  
126      CoMutex dma_map_lock;
127      CoQueue dma_flush_queue;
128  
129      /* Total size of mapped qiov, accessed under dma_map_lock */
130      int dma_map_count;
131  
132      /* PCI address (required for nvme_refresh_filename()) */
133      char *device;
134  
135      struct {
136          uint64_t completion_errors;
137          uint64_t aligned_accesses;
138          uint64_t unaligned_accesses;
139      } stats;
140  };
141  
142  #define NVME_BLOCK_OPT_DEVICE "device"
143  #define NVME_BLOCK_OPT_NAMESPACE "namespace"
144  
145  static void nvme_process_completion_bh(void *opaque);
146  
147  static QemuOptsList runtime_opts = {
148      .name = "nvme",
149      .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
150      .desc = {
151          {
152              .name = NVME_BLOCK_OPT_DEVICE,
153              .type = QEMU_OPT_STRING,
154              .help = "NVMe PCI device address",
155          },
156          {
157              .name = NVME_BLOCK_OPT_NAMESPACE,
158              .type = QEMU_OPT_NUMBER,
159              .help = "NVMe namespace",
160          },
161          { /* end of list */ }
162      },
163  };
164  
165  /* Returns true on success, false on failure. */
166  static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
167                              unsigned nentries, size_t entry_bytes, Error **errp)
168  {
169      size_t bytes;
170      int r;
171  
172      bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size());
173      q->head = q->tail = 0;
174      q->queue = qemu_try_memalign(qemu_real_host_page_size(), bytes);
175      if (!q->queue) {
176          error_setg(errp, "Cannot allocate queue");
177          return false;
178      }
179      memset(q->queue, 0, bytes);
180      r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova, errp);
181      if (r) {
182          error_prepend(errp, "Cannot map queue: ");
183      }
184      return r == 0;
185  }
186  
187  static void nvme_free_queue(NVMeQueue *q)
188  {
189      qemu_vfree(q->queue);
190  }
191  
192  static void nvme_free_queue_pair(NVMeQueuePair *q)
193  {
194      trace_nvme_free_queue_pair(q->index, q, &q->cq, &q->sq);
195      if (q->completion_bh) {
196          qemu_bh_delete(q->completion_bh);
197      }
198      nvme_free_queue(&q->sq);
199      nvme_free_queue(&q->cq);
200      qemu_vfree(q->prp_list_pages);
201      qemu_mutex_destroy(&q->lock);
202      g_free(q);
203  }
204  
205  static void nvme_free_req_queue_cb(void *opaque)
206  {
207      NVMeQueuePair *q = opaque;
208  
209      qemu_mutex_lock(&q->lock);
210      while (q->free_req_head != -1 &&
211             qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
212          /* Retry waiting requests */
213      }
214      qemu_mutex_unlock(&q->lock);
215  }
216  
217  static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
218                                               AioContext *aio_context,
219                                               unsigned idx, size_t size,
220                                               Error **errp)
221  {
222      int i, r;
223      NVMeQueuePair *q;
224      uint64_t prp_list_iova;
225      size_t bytes;
226  
227      q = g_try_new0(NVMeQueuePair, 1);
228      if (!q) {
229          error_setg(errp, "Cannot allocate queue pair");
230          return NULL;
231      }
232      trace_nvme_create_queue_pair(idx, q, size, aio_context,
233                                   event_notifier_get_fd(s->irq_notifier));
234      bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS,
235                            qemu_real_host_page_size());
236      q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size(), bytes);
237      if (!q->prp_list_pages) {
238          error_setg(errp, "Cannot allocate PRP page list");
239          goto fail;
240      }
241      memset(q->prp_list_pages, 0, bytes);
242      qemu_mutex_init(&q->lock);
243      q->s = s;
244      q->index = idx;
245      qemu_co_queue_init(&q->free_req_queue);
246      q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q);
247      r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes,
248                            false, &prp_list_iova, errp);
249      if (r) {
250          error_prepend(errp, "Cannot map buffer for DMA: ");
251          goto fail;
252      }
253      q->free_req_head = -1;
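    /*
     * Build the free-request list. CIDs are assigned starting at 1;
     * nvme_process_completion() treats a CID of 0 as invalid.
     */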
254      for (i = 0; i < NVME_NUM_REQS; i++) {
255          NVMeRequest *req = &q->reqs[i];
256          req->cid = i + 1;
257          req->free_req_next = q->free_req_head;
258          q->free_req_head = i;
259          req->prp_list_page = q->prp_list_pages + i * s->page_size;
260          req->prp_list_iova = prp_list_iova + i * s->page_size;
261      }
262  
263      if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) {
264          goto fail;
265      }
266      q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
267  
268      if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) {
269          goto fail;
270      }
271      q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
272  
273      return q;
274  fail:
275      nvme_free_queue_pair(q);
276      return NULL;
277  }
278  
279  /* With q->lock */
280  static void nvme_kick(NVMeQueuePair *q)
281  {
282      BDRVNVMeState *s = q->s;
283  
284      if (s->plugged || !q->need_kick) {
285          return;
286      }
287      trace_nvme_kick(s, q->index);
288      assert(!(q->sq.tail & 0xFF00));
289      /* Fence the write to the submission queue entry before notifying the device. */
290      smp_wmb();
291      *q->sq.doorbell = cpu_to_le32(q->sq.tail);
292      q->inflight += q->need_kick;
293      q->need_kick = 0;
294  }
295  
296  static NVMeRequest *nvme_get_free_req_nofail_locked(NVMeQueuePair *q)
297  {
298      NVMeRequest *req;
299  
300      req = &q->reqs[q->free_req_head];
301      q->free_req_head = req->free_req_next;
302      req->free_req_next = -1;
303      return req;
304  }
305  
306  /* Return a free request element if any, otherwise return NULL.  */
307  static NVMeRequest *nvme_get_free_req_nowait(NVMeQueuePair *q)
308  {
309      QEMU_LOCK_GUARD(&q->lock);
310      if (q->free_req_head == -1) {
311          return NULL;
312      }
313      return nvme_get_free_req_nofail_locked(q);
314  }
315  
316  /*
317   * Wait for a free request to become available if necessary, then
318   * return it.
319   */
320  static coroutine_fn NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
321  {
322      QEMU_LOCK_GUARD(&q->lock);
323  
324      while (q->free_req_head == -1) {
325          trace_nvme_free_req_queue_wait(q->s, q->index);
326          qemu_co_queue_wait(&q->free_req_queue, &q->lock);
327      }
328  
329      return nvme_get_free_req_nofail_locked(q);
330  }
331  
332  /* With q->lock */
333  static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
334  {
335      req->free_req_next = q->free_req_head;
336      q->free_req_head = req - q->reqs;
337  }
338  
339  /* With q->lock */
340  static void nvme_wake_free_req_locked(NVMeQueuePair *q)
341  {
342      if (!qemu_co_queue_empty(&q->free_req_queue)) {
343          replay_bh_schedule_oneshot_event(q->s->aio_context,
344                  nvme_free_req_queue_cb, q);
345      }
346  }
347  
348  /* Insert a request in the freelist and wake waiters */
349  static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req)
350  {
351      qemu_mutex_lock(&q->lock);
352      nvme_put_free_req_locked(q, req);
353      nvme_wake_free_req_locked(q);
354      qemu_mutex_unlock(&q->lock);
355  }
356  
357  static inline int nvme_translate_error(const NvmeCqe *c)
358  {
359      uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
360      if (status) {
361          trace_nvme_error(le32_to_cpu(c->result),
362                           le16_to_cpu(c->sq_head),
363                           le16_to_cpu(c->sq_id),
364                           le16_to_cpu(c->cid),
365                           le16_to_cpu(status));
366      }
367      switch (status) {
368      case 0:
369          return 0;
370      case 1:
371          return -ENOSYS;
372      case 2:
373          return -EINVAL;
374      default:
375          return -EIO;
376      }
377  }
378  
379  /* With q->lock */
380  static bool nvme_process_completion(NVMeQueuePair *q)
381  {
382      BDRVNVMeState *s = q->s;
383      bool progress = false;
384      NVMeRequest *preq;
385      NVMeRequest req;
386      NvmeCqe *c;
387  
388      trace_nvme_process_completion(s, q->index, q->inflight);
389      if (s->plugged) {
390          trace_nvme_process_completion_queue_plugged(s, q->index);
391          return false;
392      }
393  
394      /*
395       * Support re-entrancy when a request cb() function invokes aio_poll().
396       * Pending completions must be visible to aio_poll() so that a cb()
397       * function can wait for the completion of another request.
398       *
399       * The aio_poll() loop will execute our BH and we'll resume completion
400       * processing there.
401       */
402      qemu_bh_schedule(q->completion_bh);
403  
404      assert(q->inflight >= 0);
405      while (q->inflight) {
406          int ret;
407          uint16_t cid;
408  
409          c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
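        /*
         * A completion entry has been written by the device only while its
         * phase bit differs from q->cq_phase; once they match again there is
         * nothing further to consume on this pass.
         */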
410          if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
411              break;
412          }
413          ret = nvme_translate_error(c);
414          if (ret) {
415              s->stats.completion_errors++;
416          }
417          q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
418          if (!q->cq.head) {
419              q->cq_phase = !q->cq_phase;
420          }
421          cid = le16_to_cpu(c->cid);
422          if (cid == 0 || cid > NVME_NUM_REQS) {
423              warn_report("NVMe: Unexpected CID in completion queue: %"PRIu16", "
424                          "queue size: %u", cid, NVME_QUEUE_SIZE);
425              continue;
426          }
427          trace_nvme_complete_command(s, q->index, cid);
428          preq = &q->reqs[cid - 1];
429          req = *preq;
430          assert(req.cid == cid);
431          assert(req.cb);
432          nvme_put_free_req_locked(q, preq);
433          preq->cb = preq->opaque = NULL;
434          q->inflight--;
435          qemu_mutex_unlock(&q->lock);
436          req.cb(req.opaque, ret);
437          qemu_mutex_lock(&q->lock);
438          progress = true;
439      }
440      if (progress) {
441          /* Notify the device so it can post more completions. */
442          smp_mb_release();
443          *q->cq.doorbell = cpu_to_le32(q->cq.head);
444          nvme_wake_free_req_locked(q);
445      }
446  
447      qemu_bh_cancel(q->completion_bh);
448  
449      return progress;
450  }
451  
452  static void nvme_process_completion_bh(void *opaque)
453  {
454      NVMeQueuePair *q = opaque;
455  
456      /*
457       * We're being invoked because an nvme_process_completion() cb() function
458       * called aio_poll(). The callback may be waiting for further completions,
459       * so notify the device that it has space to fill in more completions now.
460       */
461      smp_mb_release();
462      *q->cq.doorbell = cpu_to_le32(q->cq.head);
463      nvme_wake_free_req_locked(q);
464  
465      nvme_process_completion(q);
466  }
467  
468  static void nvme_trace_command(const NvmeCmd *cmd)
469  {
470      int i;
471  
472      if (!trace_event_get_state_backends(TRACE_NVME_SUBMIT_COMMAND_RAW)) {
473          return;
474      }
475      for (i = 0; i < 8; ++i) {
476          uint8_t *cmdp = (uint8_t *)cmd + i * 8;
477          trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3],
478                                        cmdp[4], cmdp[5], cmdp[6], cmdp[7]);
479      }
480  }
481  
482  static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
483                                  NvmeCmd *cmd, BlockCompletionFunc cb,
484                                  void *opaque)
485  {
486      assert(!req->cb);
487      req->cb = cb;
488      req->opaque = opaque;
489      cmd->cid = cpu_to_le16(req->cid);
490  
491      trace_nvme_submit_command(q->s, q->index, req->cid);
492      nvme_trace_command(cmd);
493      qemu_mutex_lock(&q->lock);
494      memcpy((uint8_t *)q->sq.queue +
495             q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
496      q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
497      q->need_kick++;
498      nvme_kick(q);
499      nvme_process_completion(q);
500      qemu_mutex_unlock(&q->lock);
501  }
502  
503  static void nvme_admin_cmd_sync_cb(void *opaque, int ret)
504  {
505      int *pret = opaque;
506      *pret = ret;
507      aio_wait_kick();
508  }
509  
510  static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd)
511  {
512      BDRVNVMeState *s = bs->opaque;
513      NVMeQueuePair *q = s->queues[INDEX_ADMIN];
514      AioContext *aio_context = bdrv_get_aio_context(bs);
515      NVMeRequest *req;
516      int ret = -EINPROGRESS;
517      req = nvme_get_free_req_nowait(q);
518      if (!req) {
519          return -EBUSY;
520      }
521      nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret);
522  
523      AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
524      return ret;
525  }
526  
527  /* Returns true on success, false on failure. */
528  static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
529  {
530      BDRVNVMeState *s = bs->opaque;
531      bool ret = false;
532      QEMU_AUTO_VFREE union {
533          NvmeIdCtrl ctrl;
534          NvmeIdNs ns;
535      } *id = NULL;
536      NvmeLBAF *lbaf;
537      uint16_t oncs;
538      int r;
539      uint64_t iova;
540      NvmeCmd cmd = {
541          .opcode = NVME_ADM_CMD_IDENTIFY,
542          .cdw10 = cpu_to_le32(0x1),
543      };
544      size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size());
545  
546      id = qemu_try_memalign(qemu_real_host_page_size(), id_size);
547      if (!id) {
548          error_setg(errp, "Cannot allocate buffer for identify response");
549          goto out;
550      }
551      r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova, errp);
552      if (r) {
553          error_prepend(errp, "Cannot map buffer for DMA: ");
554          goto out;
555      }
556  
557      memset(id, 0, id_size);
558      cmd.dptr.prp1 = cpu_to_le64(iova);
559      if (nvme_admin_cmd_sync(bs, &cmd)) {
560          error_setg(errp, "Failed to identify controller");
561          goto out;
562      }
563  
564      if (le32_to_cpu(id->ctrl.nn) < namespace) {
565          error_setg(errp, "Invalid namespace");
566          goto out;
567      }
568      s->write_cache_supported = le32_to_cpu(id->ctrl.vwc) & 0x1;
569      s->max_transfer = (id->ctrl.mdts ? 1 << id->ctrl.mdts : 0) * s->page_size;
570      /* For now the page list buffer per command is one page, to hold at most
571       * s->page_size / sizeof(uint64_t) entries. */
572      s->max_transfer = MIN_NON_ZERO(s->max_transfer,
573                            s->page_size / sizeof(uint64_t) * s->page_size);
574  
575      oncs = le16_to_cpu(id->ctrl.oncs);
576      s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
577      s->supports_discard = !!(oncs & NVME_ONCS_DSM);
578  
579      memset(id, 0, id_size);
580      cmd.cdw10 = 0;
581      cmd.nsid = cpu_to_le32(namespace);
582      if (nvme_admin_cmd_sync(bs, &cmd)) {
583          error_setg(errp, "Failed to identify namespace");
584          goto out;
585      }
586  
587      s->nsze = le64_to_cpu(id->ns.nsze);
588      lbaf = &id->ns.lbaf[NVME_ID_NS_FLBAS_INDEX(id->ns.flbas)];
589  
590      if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(id->ns.dlfeat) &&
591              NVME_ID_NS_DLFEAT_READ_BEHAVIOR(id->ns.dlfeat) ==
592                      NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) {
593          bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
594      }
595  
596      if (lbaf->ms) {
597          error_setg(errp, "Namespaces with metadata are not yet supported");
598          goto out;
599      }
600  
601      if (lbaf->ds < BDRV_SECTOR_BITS || lbaf->ds > 12 ||
602          (1 << lbaf->ds) > s->page_size)
603      {
604          error_setg(errp, "Namespace has unsupported block size (2^%d)",
605                     lbaf->ds);
606          goto out;
607      }
608  
609      ret = true;
610      s->blkshift = lbaf->ds;
611  out:
612      qemu_vfio_dma_unmap(s->vfio, id);
613  
614      return ret;
615  }
616  
617  static void nvme_poll_queue(NVMeQueuePair *q)
618  {
619      const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
620      NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
621  
622      trace_nvme_poll_queue(q->s, q->index);
623      /*
624       * Do an early check for completions. q->lock isn't needed because
625       * nvme_process_completion() only runs in the event loop thread and
626       * cannot race with itself.
627       */
628      if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
629          return;
630      }
631  
632      qemu_mutex_lock(&q->lock);
633      while (nvme_process_completion(q)) {
634          /* Keep polling */
635      }
636      qemu_mutex_unlock(&q->lock);
637  }
638  
639  static void nvme_poll_queues(BDRVNVMeState *s)
640  {
641      int i;
642  
643      for (i = 0; i < s->queue_count; i++) {
644          nvme_poll_queue(s->queues[i]);
645      }
646  }
647  
648  static void nvme_handle_event(EventNotifier *n)
649  {
650      BDRVNVMeState *s = container_of(n, BDRVNVMeState,
651                                      irq_notifier[MSIX_SHARED_IRQ_IDX]);
652  
653      trace_nvme_handle_event(s);
654      event_notifier_test_and_clear(n);
655      nvme_poll_queues(s);
656  }
657  
658  static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
659  {
660      BDRVNVMeState *s = bs->opaque;
661      unsigned n = s->queue_count;
662      NVMeQueuePair *q;
663      NvmeCmd cmd;
664      unsigned queue_size = NVME_QUEUE_SIZE;
665  
666      assert(n <= UINT16_MAX);
667      q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
668                                 n, queue_size, errp);
669      if (!q) {
670          return false;
671      }
672      cmd = (NvmeCmd) {
673          .opcode = NVME_ADM_CMD_CREATE_CQ,
674          .dptr.prp1 = cpu_to_le64(q->cq.iova),
675          .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
676          .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC),
677      };
678      if (nvme_admin_cmd_sync(bs, &cmd)) {
679          error_setg(errp, "Failed to create CQ io queue [%u]", n);
680          goto out_error;
681      }
682      cmd = (NvmeCmd) {
683          .opcode = NVME_ADM_CMD_CREATE_SQ,
684          .dptr.prp1 = cpu_to_le64(q->sq.iova),
685          .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
686          .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)),
687      };
688      if (nvme_admin_cmd_sync(bs, &cmd)) {
689          error_setg(errp, "Failed to create SQ io queue [%u]", n);
690          goto out_error;
691      }
692      s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
693      s->queues[n] = q;
694      s->queue_count++;
695      return true;
696  out_error:
697      nvme_free_queue_pair(q);
698      return false;
699  }
700  
701  static bool nvme_poll_cb(void *opaque)
702  {
703      EventNotifier *e = opaque;
704      BDRVNVMeState *s = container_of(e, BDRVNVMeState,
705                                      irq_notifier[MSIX_SHARED_IRQ_IDX]);
706      int i;
707  
708      for (i = 0; i < s->queue_count; i++) {
709          NVMeQueuePair *q = s->queues[i];
710          const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
711          NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
712  
713          /*
714           * q->lock isn't needed because nvme_process_completion() only runs in
715           * the event loop thread and cannot race with itself.
716           */
717          if ((le16_to_cpu(cqe->status) & 0x1) != q->cq_phase) {
718              return true;
719          }
720      }
721      return false;
722  }
723  
724  static void nvme_poll_ready(EventNotifier *e)
725  {
726      BDRVNVMeState *s = container_of(e, BDRVNVMeState,
727                                      irq_notifier[MSIX_SHARED_IRQ_IDX]);
728  
729      nvme_poll_queues(s);
730  }
731  
732  static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
733                       Error **errp)
734  {
735      BDRVNVMeState *s = bs->opaque;
736      NVMeQueuePair *q;
737      AioContext *aio_context = bdrv_get_aio_context(bs);
738      int ret;
739      uint64_t cap;
740      uint32_t ver;
741      uint64_t timeout_ms;
742      uint64_t deadline, now;
743      volatile NvmeBar *regs = NULL;
744  
745      qemu_co_mutex_init(&s->dma_map_lock);
746      qemu_co_queue_init(&s->dma_flush_queue);
747      s->device = g_strdup(device);
748      s->nsid = namespace;
749      s->aio_context = bdrv_get_aio_context(bs);
750      ret = event_notifier_init(&s->irq_notifier[MSIX_SHARED_IRQ_IDX], 0);
751      if (ret) {
752          error_setg(errp, "Failed to init event notifier");
753          return ret;
754      }
755  
756      s->vfio = qemu_vfio_open_pci(device, errp);
757      if (!s->vfio) {
758          ret = -EINVAL;
759          goto out;
760      }
761  
762      regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar),
763                                   PROT_READ | PROT_WRITE, errp);
764      if (!regs) {
765          ret = -EINVAL;
766          goto out;
767      }
768      /* Perform the initialization sequence as described in NVMe spec section
769       * "7.6.1 Initialization". */
770  
771      cap = le64_to_cpu(regs->cap);
772      trace_nvme_controller_capability_raw(cap);
773      trace_nvme_controller_capability("Maximum Queue Entries Supported",
774                                       1 + NVME_CAP_MQES(cap));
775      trace_nvme_controller_capability("Contiguous Queues Required",
776                                       NVME_CAP_CQR(cap));
777      trace_nvme_controller_capability("Doorbell Stride",
778                                       1 << (2 + NVME_CAP_DSTRD(cap)));
779      trace_nvme_controller_capability("Subsystem Reset Supported",
780                                       NVME_CAP_NSSRS(cap));
781      trace_nvme_controller_capability("Memory Page Size Minimum",
782                                       1 << (12 + NVME_CAP_MPSMIN(cap)));
783      trace_nvme_controller_capability("Memory Page Size Maximum",
784                                       1 << (12 + NVME_CAP_MPSMAX(cap)));
785      if (!NVME_CAP_CSS(cap)) {
786          error_setg(errp, "Device doesn't support NVMe command set");
787          ret = -EINVAL;
788          goto out;
789      }
790  
791      s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap));
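    /*
     * CAP.DSTRD gives the doorbell stride as 2^(2 + DSTRD) bytes; store it as
     * a count of uint32_t register slots for indexing s->doorbells.
     */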
792      s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
793      bs->bl.opt_mem_alignment = s->page_size;
794      bs->bl.request_alignment = s->page_size;
795      timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000);
796  
797      ver = le32_to_cpu(regs->vs);
798      trace_nvme_controller_spec_version(extract32(ver, 16, 16),
799                                         extract32(ver, 8, 8),
800                                         extract32(ver, 0, 8));
801  
802      /* Reset the device to get a clean state by clearing CC.EN (bit 0). */
803      regs->cc = cpu_to_le32(le32_to_cpu(regs->cc) & 0xFE);
804      /* Wait for CSTS.RDY = 0. */
805      deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS;
806      while (NVME_CSTS_RDY(le32_to_cpu(regs->csts))) {
807          if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
808              error_setg(errp, "Timeout while waiting for device to reset (%"
809                               PRIu64 " ms)",
810                         timeout_ms);
811              ret = -ETIMEDOUT;
812              goto out;
813          }
814      }
815  
816      s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0,
817                                             sizeof(NvmeBar) + NVME_DOORBELL_SIZE,
818                                             PROT_WRITE, errp);
819      s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar));
820      if (!s->doorbells) {
821          ret = -EINVAL;
822          goto out;
823      }
824  
825      /* Set up admin queue. */
826      s->queues = g_new(NVMeQueuePair *, 1);
827      q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp);
828      if (!q) {
829          ret = -EINVAL;
830          goto out;
831      }
832      s->queues[INDEX_ADMIN] = q;
833      s->queue_count = 1;
834      QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000);
835      regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) |
836                              ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT));
837      regs->asq = cpu_to_le64(q->sq.iova);
838      regs->acq = cpu_to_le64(q->cq.iova);
839  
840      /* After setting up all control registers we can enable the device. */
841      regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) |
842                             (ctz32(NVME_SQ_ENTRY_BYTES) << CC_IOSQES_SHIFT) |
843                             CC_EN_MASK);
844      /* Wait for CSTS.RDY = 1. */
845      now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
846      deadline = now + timeout_ms * SCALE_MS;
847      while (!NVME_CSTS_RDY(le32_to_cpu(regs->csts))) {
848          if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
849              error_setg(errp, "Timeout while waiting for device to start (%"
850                               PRIu64 " ms)",
851                         timeout_ms);
852              ret = -ETIMEDOUT;
853              goto out;
854          }
855      }
856  
857      ret = qemu_vfio_pci_init_irq(s->vfio, s->irq_notifier,
858                                   VFIO_PCI_MSIX_IRQ_INDEX, errp);
859      if (ret) {
860          goto out;
861      }
862      aio_set_event_notifier(bdrv_get_aio_context(bs),
863                             &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
864                             false, nvme_handle_event, nvme_poll_cb,
865                             nvme_poll_ready);
866  
867      if (!nvme_identify(bs, namespace, errp)) {
868          ret = -EIO;
869          goto out;
870      }
871  
872      /* Set up command queues. */
873      if (!nvme_add_io_queue(bs, errp)) {
874          ret = -EIO;
875      }
876  out:
877      if (regs) {
878          qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)regs, 0, sizeof(NvmeBar));
879      }
880  
881      /* Cleaning up is done in nvme_file_open() upon error. */
882      return ret;
883  }
884  
885  /* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example:
886   *
887   *     nvme://0000:44:00.0/1
888   *
889   * where "nvme://" is the fixed protocol prefix, the middle part is the PCI
890   * address, and the last part is the namespace number, which starts from 1
891   * according to the NVMe spec. */
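/*
 * A minimal usage sketch (reusing the example address above): such a device can
 * be opened either through the filename form, e.g.
 * "-drive file=nvme://0000:44:00.0/1,if=none,id=drive0", or through the
 * structured options declared in runtime_opts ("device" and "namespace").
 */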
892  static void nvme_parse_filename(const char *filename, QDict *options,
893                                  Error **errp)
894  {
895      int pref = strlen("nvme://");
896  
897      if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) {
898          const char *tmp = filename + pref;
899          char *device;
900          const char *namespace;
901          unsigned long ns;
902          const char *slash = strchr(tmp, '/');
903          if (!slash) {
904              qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp);
905              return;
906          }
907          device = g_strndup(tmp, slash - tmp);
908          qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device);
909          g_free(device);
910          namespace = slash + 1;
911          if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) {
912              error_setg(errp, "Invalid namespace '%s', positive number expected",
913                         namespace);
914              return;
915          }
916          qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE,
917                        *namespace ? namespace : "1");
918      }
919  }
920  
921  static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
922                                             Error **errp)
923  {
924      int ret;
925      BDRVNVMeState *s = bs->opaque;
926      NvmeCmd cmd = {
927          .opcode = NVME_ADM_CMD_SET_FEATURES,
928          .nsid = cpu_to_le32(s->nsid),
929          .cdw10 = cpu_to_le32(0x06),
930          .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
931      };
932  
933      ret = nvme_admin_cmd_sync(bs, &cmd);
934      if (ret) {
935          error_setg(errp, "Failed to configure NVMe write cache");
936      }
937      return ret;
938  }
939  
940  static void nvme_close(BlockDriverState *bs)
941  {
942      BDRVNVMeState *s = bs->opaque;
943  
944      for (unsigned i = 0; i < s->queue_count; ++i) {
945          nvme_free_queue_pair(s->queues[i]);
946      }
947      g_free(s->queues);
948      aio_set_event_notifier(bdrv_get_aio_context(bs),
949                             &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
950                             false, NULL, NULL, NULL);
951      event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
952      qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map,
953                              0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE);
954      qemu_vfio_close(s->vfio);
955  
956      g_free(s->device);
957  }
958  
959  static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
960                            Error **errp)
961  {
962      const char *device;
963      QemuOpts *opts;
964      int namespace;
965      int ret;
966      BDRVNVMeState *s = bs->opaque;
967  
968      bs->supported_write_flags = BDRV_REQ_FUA;
969  
970      opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
971      qemu_opts_absorb_qdict(opts, options, &error_abort);
972      device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
973      if (!device) {
974          error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required");
975          qemu_opts_del(opts);
976          return -EINVAL;
977      }
978  
979      namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1);
980      ret = nvme_init(bs, device, namespace, errp);
981      qemu_opts_del(opts);
982      if (ret) {
983          goto fail;
984      }
985      if (flags & BDRV_O_NOCACHE) {
986          if (!s->write_cache_supported) {
987              error_setg(errp,
988                         "NVMe controller doesn't support write cache configuration");
989              ret = -EINVAL;
990          } else {
991              ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE),
992                                                    errp);
993          }
994          if (ret) {
995              goto fail;
996          }
997      }
998      return 0;
999  fail:
1000      nvme_close(bs);
1001      return ret;
1002  }
1003  
1004  static int64_t nvme_getlength(BlockDriverState *bs)
1005  {
1006      BDRVNVMeState *s = bs->opaque;
1007      return s->nsze << s->blkshift;
1008  }
1009  
1010  static uint32_t nvme_get_blocksize(BlockDriverState *bs)
1011  {
1012      BDRVNVMeState *s = bs->opaque;
1013      assert(s->blkshift >= BDRV_SECTOR_BITS && s->blkshift <= 12);
1014      return UINT32_C(1) << s->blkshift;
1015  }
1016  
1017  static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
1018  {
1019      uint32_t blocksize = nvme_get_blocksize(bs);
1020      bsz->phys = blocksize;
1021      bsz->log = blocksize;
1022      return 0;
1023  }
1024  
1025  /* Called with s->dma_map_lock */
1026  static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs,
1027                                              QEMUIOVector *qiov)
1028  {
1029      int r = 0;
1030      BDRVNVMeState *s = bs->opaque;
1031  
1032      s->dma_map_count -= qiov->size;
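    /*
     * Temporary IOVA mappings can only be recycled when no request is using
     * them, so the reset is deferred until the map count drops to zero; the
     * coroutines queued in nvme_cmd_map_qiov() are then restarted.
     */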
1033      if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) {
1034          r = qemu_vfio_dma_reset_temporary(s->vfio);
1035          if (!r) {
1036              qemu_co_queue_restart_all(&s->dma_flush_queue);
1037          }
1038      }
1039      return r;
1040  }
1041  
1042  /* Called with s->dma_map_lock */
1043  static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
1044                                            NVMeRequest *req, QEMUIOVector *qiov)
1045  {
1046      BDRVNVMeState *s = bs->opaque;
1047      uint64_t *pagelist = req->prp_list_page;
1048      int i, j, r;
1049      int entries = 0;
1050      Error *local_err = NULL, **errp = NULL;
1051  
1052      assert(qiov->size);
1053      assert(QEMU_IS_ALIGNED(qiov->size, s->page_size));
1054      assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t));
1055      for (i = 0; i < qiov->niov; ++i) {
1056          bool retry = true;
1057          uint64_t iova;
1058          size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len,
1059                                     qemu_real_host_page_size());
1060  try_map:
1061          r = qemu_vfio_dma_map(s->vfio,
1062                                qiov->iov[i].iov_base,
1063                                len, true, &iova, errp);
1064          if (r == -ENOSPC) {
1065              /*
1066               * In addition to the -ENOMEM error, the VFIO_IOMMU_MAP_DMA
1067               * ioctl returns -ENOSPC to signal the user exhausted the DMA
1068               * mappings available for a container since Linux kernel commit
1069               * 492855939bdb ("vfio/type1: Limit DMA mappings per container",
1070               * April 2019, see CVE-2019-3882).
1071               *
1072               * This block driver already handles this error path by checking
1073               * for the -ENOMEM error, so we directly replace -ENOSPC with
1074               * -ENOMEM. Besides, -ENOSPC has a specific meaning for blockdev
1075               * coroutines: it triggers BLOCKDEV_ON_ERROR_ENOSPC and
1076               * BLOCK_ERROR_ACTION_STOP which stops the VM, asking the operator
1077               * to add more storage to the blockdev. Not something we can do
1078               * easily with an IOMMU :)
1079               */
1080              r = -ENOMEM;
1081          }
1082          if (r == -ENOMEM && retry) {
1083              /*
1084               * We exhausted the DMA mappings available for our container:
1085               * recycle the volatile IOVA mappings.
1086               */
1087              retry = false;
1088              trace_nvme_dma_flush_queue_wait(s);
1089              if (s->dma_map_count) {
1090                  trace_nvme_dma_map_flush(s);
1091                  qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock);
1092              } else {
1093                  r = qemu_vfio_dma_reset_temporary(s->vfio);
1094                  if (r) {
1095                      goto fail;
1096                  }
1097              }
1098              errp = &local_err;
1099  
1100              goto try_map;
1101          }
1102          if (r) {
1103              goto fail;
1104          }
1105  
1106          for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) {
1107              pagelist[entries++] = cpu_to_le64(iova + j * s->page_size);
1108          }
1109          trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base,
1110                                      qiov->iov[i].iov_len / s->page_size);
1111      }
1112  
1113      s->dma_map_count += qiov->size;
1114  
1115      assert(entries <= s->page_size / sizeof(uint64_t));
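    /*
     * PRP layout as used below: one page goes in PRP1 alone; two pages use
     * PRP1 and PRP2 directly; with more than two, PRP1 holds the first page
     * and PRP2 points at the remaining entries of the PRP list, which start
     * at the second slot of this request's PRP page (hence the
     * sizeof(uint64_t) offset).
     */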
1116      switch (entries) {
1117      case 0:
1118          abort();
1119      case 1:
1120          cmd->dptr.prp1 = pagelist[0];
1121          cmd->dptr.prp2 = 0;
1122          break;
1123      case 2:
1124          cmd->dptr.prp1 = pagelist[0];
1125          cmd->dptr.prp2 = pagelist[1];
1126          break;
1127      default:
1128          cmd->dptr.prp1 = pagelist[0];
1129          cmd->dptr.prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
1130          break;
1131      }
1132      trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
1133      for (i = 0; i < entries; ++i) {
1134          trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]);
1135      }
1136      return 0;
1137  fail:
1138      /* No need to unmap [0 - i) iovs even if we've failed, since we don't
1139       * increment s->dma_map_count. This is okay for fixed mapping memory areas
1140       * because they are already mapped before calling this function; for
1141       * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by
1142       * calling qemu_vfio_dma_reset_temporary when necessary. */
1143      if (local_err) {
1144          error_reportf_err(local_err, "Cannot map buffer for DMA: ");
1145      }
1146      return r;
1147  }
1148  
1149  typedef struct {
1150      Coroutine *co;
1151      int ret;
1152      AioContext *ctx;
1153  } NVMeCoData;
1154  
1155  static void nvme_rw_cb_bh(void *opaque)
1156  {
1157      NVMeCoData *data = opaque;
1158      qemu_coroutine_enter(data->co);
1159  }
1160  
1161  static void nvme_rw_cb(void *opaque, int ret)
1162  {
1163      NVMeCoData *data = opaque;
1164      data->ret = ret;
1165      if (!data->co) {
1166          /* The rw coroutine hasn't yielded, don't try to enter. */
1167          return;
1168      }
1169      replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
1170  }
1171  
1172  static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
1173                                              uint64_t offset, uint64_t bytes,
1174                                              QEMUIOVector *qiov,
1175                                              bool is_write,
1176                                              int flags)
1177  {
1178      int r;
1179      BDRVNVMeState *s = bs->opaque;
1180      NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
1181      NVMeRequest *req;
1182  
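    /*
     * cdw12 for NVMe read/write: bits 15:0 carry the number of logical blocks
     * minus one (0-based), and bit 30 requests Force Unit Access when
     * BDRV_REQ_FUA is set.
     */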
1183      uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) |
1184                         (flags & BDRV_REQ_FUA ? 1 << 30 : 0);
1185      NvmeCmd cmd = {
1186          .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ,
1187          .nsid = cpu_to_le32(s->nsid),
1188          .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
1189          .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
1190          .cdw12 = cpu_to_le32(cdw12),
1191      };
1192      NVMeCoData data = {
1193          .ctx = bdrv_get_aio_context(bs),
1194          .ret = -EINPROGRESS,
1195      };
1196  
1197      trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
1198      assert(s->queue_count > 1);
1199      req = nvme_get_free_req(ioq);
1200      assert(req);
1201  
1202      qemu_co_mutex_lock(&s->dma_map_lock);
1203      r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
1204      qemu_co_mutex_unlock(&s->dma_map_lock);
1205      if (r) {
1206          nvme_put_free_req_and_wake(ioq, req);
1207          return r;
1208      }
1209      nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1210  
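    /*
     * If the command has already completed, nvme_rw_cb() saw data.co == NULL
     * and only stored the result, so the loop below will not yield.
     */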
1211      data.co = qemu_coroutine_self();
1212      while (data.ret == -EINPROGRESS) {
1213          qemu_coroutine_yield();
1214      }
1215  
1216      qemu_co_mutex_lock(&s->dma_map_lock);
1217      r = nvme_cmd_unmap_qiov(bs, qiov);
1218      qemu_co_mutex_unlock(&s->dma_map_lock);
1219      if (r) {
1220          return r;
1221      }
1222  
1223      trace_nvme_rw_done(s, is_write, offset, bytes, data.ret);
1224      return data.ret;
1225  }
1226  
1227  static inline bool nvme_qiov_aligned(BlockDriverState *bs,
1228                                       const QEMUIOVector *qiov)
1229  {
1230      int i;
1231      BDRVNVMeState *s = bs->opaque;
1232  
1233      for (i = 0; i < qiov->niov; ++i) {
1234          if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base,
1235                                   qemu_real_host_page_size()) ||
1236              !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size())) {
1237              trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
1238                                        qiov->iov[i].iov_len, s->page_size);
1239              return false;
1240          }
1241      }
1242      return true;
1243  }
1244  
1245  static coroutine_fn int nvme_co_prw(BlockDriverState *bs,
1246                                      uint64_t offset, uint64_t bytes,
1247                                      QEMUIOVector *qiov, bool is_write,
1248                                      int flags)
1249  {
1250      BDRVNVMeState *s = bs->opaque;
1251      int r;
1252      QEMU_AUTO_VFREE uint8_t *buf = NULL;
1253      QEMUIOVector local_qiov;
1254      size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size());
1255      assert(QEMU_IS_ALIGNED(offset, s->page_size));
1256      assert(QEMU_IS_ALIGNED(bytes, s->page_size));
1257      assert(bytes <= s->max_transfer);
1258      if (nvme_qiov_aligned(bs, qiov)) {
1259          s->stats.aligned_accesses++;
1260          return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
1261      }
1262      s->stats.unaligned_accesses++;
1263      trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
1264      buf = qemu_try_memalign(qemu_real_host_page_size(), len);
1265  
1266      if (!buf) {
1267          return -ENOMEM;
1268      }
1269      qemu_iovec_init(&local_qiov, 1);
1270      if (is_write) {
1271          qemu_iovec_to_buf(qiov, 0, buf, bytes);
1272      }
1273      qemu_iovec_add(&local_qiov, buf, bytes);
1274      r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags);
1275      qemu_iovec_destroy(&local_qiov);
1276      if (!r && !is_write) {
1277          qemu_iovec_from_buf(qiov, 0, buf, bytes);
1278      }
1279      return r;
1280  }
1281  
1282  static coroutine_fn int nvme_co_preadv(BlockDriverState *bs,
1283                                         int64_t offset, int64_t bytes,
1284                                         QEMUIOVector *qiov,
1285                                         BdrvRequestFlags flags)
1286  {
1287      return nvme_co_prw(bs, offset, bytes, qiov, false, flags);
1288  }
1289  
1290  static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs,
1291                                          int64_t offset, int64_t bytes,
1292                                          QEMUIOVector *qiov,
1293                                          BdrvRequestFlags flags)
1294  {
1295      return nvme_co_prw(bs, offset, bytes, qiov, true, flags);
1296  }
1297  
1298  static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
1299  {
1300      BDRVNVMeState *s = bs->opaque;
1301      NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
1302      NVMeRequest *req;
1303      NvmeCmd cmd = {
1304          .opcode = NVME_CMD_FLUSH,
1305          .nsid = cpu_to_le32(s->nsid),
1306      };
1307      NVMeCoData data = {
1308          .ctx = bdrv_get_aio_context(bs),
1309          .ret = -EINPROGRESS,
1310      };
1311  
1312      assert(s->queue_count > 1);
1313      req = nvme_get_free_req(ioq);
1314      assert(req);
1315      nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1316  
1317      data.co = qemu_coroutine_self();
1318      if (data.ret == -EINPROGRESS) {
1319          qemu_coroutine_yield();
1320      }
1321  
1322      return data.ret;
1323  }
1324  
1325  
1326  static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
1327                                                int64_t offset,
1328                                                int64_t bytes,
1329                                                BdrvRequestFlags flags)
1330  {
1331      BDRVNVMeState *s = bs->opaque;
1332      NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
1333      NVMeRequest *req;
1334      uint32_t cdw12;
1335  
1336      if (!s->supports_write_zeroes) {
1337          return -ENOTSUP;
1338      }
1339  
1340      if (bytes == 0) {
1341          return 0;
1342      }
1343  
1344      cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
1345      /*
1346       * We should not lose information. pwrite_zeroes_alignment and
1347       * max_pwrite_zeroes guarantee it.
1348       */
1349      assert(((cdw12 + 1) << s->blkshift) == bytes);
1350  
1351      NvmeCmd cmd = {
1352          .opcode = NVME_CMD_WRITE_ZEROES,
1353          .nsid = cpu_to_le32(s->nsid),
1354          .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
1355          .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
1356      };
1357  
1358      NVMeCoData data = {
1359          .ctx = bdrv_get_aio_context(bs),
1360          .ret = -EINPROGRESS,
1361      };
1362  
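    /* Write Zeroes cdw12: bit 25 is Deallocate, bit 30 is Force Unit Access. */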
1363      if (flags & BDRV_REQ_MAY_UNMAP) {
1364          cdw12 |= (1 << 25);
1365      }
1366  
1367      if (flags & BDRV_REQ_FUA) {
1368          cdw12 |= (1 << 30);
1369      }
1370  
1371      cmd.cdw12 = cpu_to_le32(cdw12);
1372  
1373      trace_nvme_write_zeroes(s, offset, bytes, flags);
1374      assert(s->queue_count > 1);
1375      req = nvme_get_free_req(ioq);
1376      assert(req);
1377  
1378      nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1379  
1380      data.co = qemu_coroutine_self();
1381      while (data.ret == -EINPROGRESS) {
1382          qemu_coroutine_yield();
1383      }
1384  
1385      trace_nvme_rw_done(s, true, offset, bytes, data.ret);
1386      return data.ret;
1387  }
1388  
1389  
1390  static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
1391                                           int64_t offset,
1392                                           int64_t bytes)
1393  {
1394      BDRVNVMeState *s = bs->opaque;
1395      NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
1396      NVMeRequest *req;
1397      QEMU_AUTO_VFREE NvmeDsmRange *buf = NULL;
1398      QEMUIOVector local_qiov;
1399      int ret;
1400  
1401      NvmeCmd cmd = {
1402          .opcode = NVME_CMD_DSM,
1403          .nsid = cpu_to_le32(s->nsid),
1404          .cdw10 = cpu_to_le32(0), /* number of ranges (0-based) */
1405          .cdw11 = cpu_to_le32(1 << 2), /* deallocate bit */
1406      };
1407  
1408      NVMeCoData data = {
1409          .ctx = bdrv_get_aio_context(bs),
1410          .ret = -EINPROGRESS,
1411      };
1412  
1413      if (!s->supports_discard) {
1414          return -ENOTSUP;
1415      }
1416  
1417      assert(s->queue_count > 1);
1418  
1419      /*
1420       * Filling the @buf requires @offset and @bytes to satisfy restrictions
1421       * defined in nvme_refresh_limits().
1422       */
1423      assert(QEMU_IS_ALIGNED(bytes, 1UL << s->blkshift));
1424      assert(QEMU_IS_ALIGNED(offset, 1UL << s->blkshift));
1425      assert((bytes >> s->blkshift) <= UINT32_MAX);
1426  
1427      buf = qemu_try_memalign(s->page_size, s->page_size);
1428      if (!buf) {
1429          return -ENOMEM;
1430      }
1431      memset(buf, 0, s->page_size);
1432      buf->nlb = cpu_to_le32(bytes >> s->blkshift);
1433      buf->slba = cpu_to_le64(offset >> s->blkshift);
1434      buf->cattr = 0;
1435  
1436      qemu_iovec_init(&local_qiov, 1);
1437      qemu_iovec_add(&local_qiov, buf, s->page_size);
1438  
1439      req = nvme_get_free_req(ioq);
1440      assert(req);
1441  
1442      qemu_co_mutex_lock(&s->dma_map_lock);
1443      ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
1444      qemu_co_mutex_unlock(&s->dma_map_lock);
1445  
1446      if (ret) {
1447          nvme_put_free_req_and_wake(ioq, req);
1448          goto out;
1449      }
1450  
1451      trace_nvme_dsm(s, offset, bytes);
1452  
1453      nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1454  
1455      data.co = qemu_coroutine_self();
1456      while (data.ret == -EINPROGRESS) {
1457          qemu_coroutine_yield();
1458      }
1459  
1460      qemu_co_mutex_lock(&s->dma_map_lock);
1461      ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
1462      qemu_co_mutex_unlock(&s->dma_map_lock);
1463  
1464      if (ret) {
1465          goto out;
1466      }
1467  
1468      ret = data.ret;
1469      trace_nvme_dsm_done(s, offset, bytes, ret);
1470  out:
1471      qemu_iovec_destroy(&local_qiov);
1472      return ret;
1473  
1474  }
1475  
1476  static int coroutine_fn nvme_co_truncate(BlockDriverState *bs, int64_t offset,
1477                                           bool exact, PreallocMode prealloc,
1478                                           BdrvRequestFlags flags, Error **errp)
1479  {
1480      int64_t cur_length;
1481  
1482      if (prealloc != PREALLOC_MODE_OFF) {
1483          error_setg(errp, "Unsupported preallocation mode '%s'",
1484                     PreallocMode_str(prealloc));
1485          return -ENOTSUP;
1486      }
1487  
1488      cur_length = nvme_getlength(bs);
1489      if (offset != cur_length && exact) {
1490          error_setg(errp, "Cannot resize NVMe devices");
1491          return -ENOTSUP;
1492      } else if (offset > cur_length) {
1493          error_setg(errp, "Cannot grow NVMe devices");
1494          return -EINVAL;
1495      }
1496  
1497      return 0;
1498  }
1499  
1500  static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
1501                                 BlockReopenQueue *queue, Error **errp)
1502  {
1503      return 0;
1504  }
1505  
1506  static void nvme_refresh_filename(BlockDriverState *bs)
1507  {
1508      BDRVNVMeState *s = bs->opaque;
1509  
1510      snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i",
1511               s->device, s->nsid);
1512  }
1513  
1514  static void nvme_refresh_limits(BlockDriverState *bs, Error **errp)
1515  {
1516      BDRVNVMeState *s = bs->opaque;
1517  
1518      bs->bl.opt_mem_alignment = s->page_size;
1519      bs->bl.request_alignment = s->page_size;
1520      bs->bl.max_transfer = s->max_transfer;
1521  
1522      /*
1523       * See nvme_co_pwrite_zeroes(): after the shift and decrement, the block
1524       * count must fit in 16 bits (at most 0xFFFF).
1525       */
1526      bs->bl.max_pwrite_zeroes = 1ULL << (s->blkshift + 16);
1527      bs->bl.pwrite_zeroes_alignment = MAX(bs->bl.request_alignment,
1528                                           1UL << s->blkshift);
1529  
1530      bs->bl.max_pdiscard = (uint64_t)UINT32_MAX << s->blkshift;
1531      bs->bl.pdiscard_alignment = MAX(bs->bl.request_alignment,
1532                                      1UL << s->blkshift);
1533  }
1534  
1535  static void nvme_detach_aio_context(BlockDriverState *bs)
1536  {
1537      BDRVNVMeState *s = bs->opaque;
1538  
1539      for (unsigned i = 0; i < s->queue_count; i++) {
1540          NVMeQueuePair *q = s->queues[i];
1541  
1542          qemu_bh_delete(q->completion_bh);
1543          q->completion_bh = NULL;
1544      }
1545  
1546      aio_set_event_notifier(bdrv_get_aio_context(bs),
1547                             &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
1548                             false, NULL, NULL, NULL);
1549  }
1550  
1551  static void nvme_attach_aio_context(BlockDriverState *bs,
1552                                      AioContext *new_context)
1553  {
1554      BDRVNVMeState *s = bs->opaque;
1555  
1556      s->aio_context = new_context;
1557      aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
1558                             false, nvme_handle_event, nvme_poll_cb,
1559                             nvme_poll_ready);
1560  
1561      for (unsigned i = 0; i < s->queue_count; i++) {
1562          NVMeQueuePair *q = s->queues[i];
1563  
1564          q->completion_bh =
1565              aio_bh_new(new_context, nvme_process_completion_bh, q);
1566      }
1567  }
1568  
1569  static void nvme_aio_plug(BlockDriverState *bs)
1570  {
1571      BDRVNVMeState *s = bs->opaque;
1572      assert(!s->plugged);
1573      s->plugged = true;
1574  }
1575  
1576  static void nvme_aio_unplug(BlockDriverState *bs)
1577  {
1578      BDRVNVMeState *s = bs->opaque;
1579      assert(s->plugged);
1580      s->plugged = false;
1581      for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
1582          NVMeQueuePair *q = s->queues[i];
1583          qemu_mutex_lock(&q->lock);
1584          nvme_kick(q);
1585          nvme_process_completion(q);
1586          qemu_mutex_unlock(&q->lock);
1587      }
1588  }
1589  
1590  static bool nvme_register_buf(BlockDriverState *bs, void *host, size_t size,
1591                                Error **errp)
1592  {
1593      int ret;
1594      BDRVNVMeState *s = bs->opaque;
1595  
1596      /*
1597       * FIXME: we may run out of IOVA addresses after repeated
1598       * bdrv_register_buf/bdrv_unregister_buf, because qemu_vfio_dma_unmap
1599       * doesn't reclaim addresses for fixed mappings.
1600       */
1601      ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL, errp);
1602      return ret == 0;
1603  }
1604  
1605  static void nvme_unregister_buf(BlockDriverState *bs, void *host, size_t size)
1606  {
1607      BDRVNVMeState *s = bs->opaque;
1608  
1609      qemu_vfio_dma_unmap(s->vfio, host);
1610  }
1611  
1612  static BlockStatsSpecific *nvme_get_specific_stats(BlockDriverState *bs)
1613  {
1614      BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
1615      BDRVNVMeState *s = bs->opaque;
1616  
1617      stats->driver = BLOCKDEV_DRIVER_NVME;
1618      stats->u.nvme = (BlockStatsSpecificNvme) {
1619          .completion_errors = s->stats.completion_errors,
1620          .aligned_accesses = s->stats.aligned_accesses,
1621          .unaligned_accesses = s->stats.unaligned_accesses,
1622      };
1623  
1624      return stats;
1625  }
1626  
1627  static const char *const nvme_strong_runtime_opts[] = {
1628      NVME_BLOCK_OPT_DEVICE,
1629      NVME_BLOCK_OPT_NAMESPACE,
1630  
1631      NULL
1632  };
1633  
1634  static BlockDriver bdrv_nvme = {
1635      .format_name              = "nvme",
1636      .protocol_name            = "nvme",
1637      .instance_size            = sizeof(BDRVNVMeState),
1638  
1639      .bdrv_co_create_opts      = bdrv_co_create_opts_simple,
1640      .create_opts              = &bdrv_create_opts_simple,
1641  
1642      .bdrv_parse_filename      = nvme_parse_filename,
1643      .bdrv_file_open           = nvme_file_open,
1644      .bdrv_close               = nvme_close,
1645      .bdrv_getlength           = nvme_getlength,
1646      .bdrv_probe_blocksizes    = nvme_probe_blocksizes,
1647      .bdrv_co_truncate         = nvme_co_truncate,
1648  
1649      .bdrv_co_preadv           = nvme_co_preadv,
1650      .bdrv_co_pwritev          = nvme_co_pwritev,
1651  
1652      .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
1653      .bdrv_co_pdiscard         = nvme_co_pdiscard,
1654  
1655      .bdrv_co_flush_to_disk    = nvme_co_flush,
1656      .bdrv_reopen_prepare      = nvme_reopen_prepare,
1657  
1658      .bdrv_refresh_filename    = nvme_refresh_filename,
1659      .bdrv_refresh_limits      = nvme_refresh_limits,
1660      .strong_runtime_opts      = nvme_strong_runtime_opts,
1661      .bdrv_get_specific_stats  = nvme_get_specific_stats,
1662  
1663      .bdrv_detach_aio_context  = nvme_detach_aio_context,
1664      .bdrv_attach_aio_context  = nvme_attach_aio_context,
1665  
1666      .bdrv_io_plug             = nvme_aio_plug,
1667      .bdrv_io_unplug           = nvme_aio_unplug,
1668  
1669      .bdrv_register_buf        = nvme_register_buf,
1670      .bdrv_unregister_buf      = nvme_unregister_buf,
1671  };
1672  
1673  static void bdrv_nvme_init(void)
1674  {
1675      bdrv_register(&bdrv_nvme);
1676  }
1677  
1678  block_init(bdrv_nvme_init);
1679