xref: /openbmc/qemu/hw/virtio/vhost.c (revision 66210a1a30f2384bb59f9dad8d769dba56dd30f1)
1  /*
2   * vhost support
3   *
4   * Copyright Red Hat, Inc. 2010
5   *
6   * Authors:
7   *  Michael S. Tsirkin <mst@redhat.com>
8   *
9   * This work is licensed under the terms of the GNU GPL, version 2.  See
10   * the COPYING file in the top-level directory.
11   *
12   * Contributions after 2012-01-13 are licensed under the terms of the
13   * GNU GPL, version 2 or (at your option) any later version.
14   */
15  
16  #include "qemu/osdep.h"
17  #include "qapi/error.h"
18  #include "hw/virtio/vhost.h"
19  #include "qemu/atomic.h"
20  #include "qemu/range.h"
21  #include "qemu/error-report.h"
22  #include "qemu/memfd.h"
23  #include "qemu/log.h"
24  #include "standard-headers/linux/vhost_types.h"
25  #include "hw/virtio/virtio-bus.h"
26  #include "hw/mem/memory-device.h"
27  #include "migration/blocker.h"
28  #include "migration/qemu-file-types.h"
29  #include "sysemu/dma.h"
30  #include "trace.h"
31  
32  /* Debug output stays enabled until handling of disconnected backends stabilizes */
33  #define _VHOST_DEBUG 1
34  
35  #ifdef _VHOST_DEBUG
36  #define VHOST_OPS_DEBUG(retval, fmt, ...) \
37      do { \
38          error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
39                       strerror(-retval), -retval); \
40      } while (0)
41  #else
42  #define VHOST_OPS_DEBUG(retval, fmt, ...) \
43      do { } while (0)
44  #endif
45  
46  static struct vhost_log *vhost_log;
47  static struct vhost_log *vhost_log_shm;
48  
49  /* Memslots used by backends that support private memslots (without an fd). */
50  static unsigned int used_memslots;
51  
52  /* Memslots used by backends that only support shared memslots (with an fd). */
53  static unsigned int used_shared_memslots;
54  
55  static QLIST_HEAD(, vhost_dev) vhost_devices =
56      QLIST_HEAD_INITIALIZER(vhost_devices);
57  
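    /*
     * Return the smallest memslot limit reported by any registered vhost
     * device's backend (UINT_MAX if no vhost device is registered).
     */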
58  unsigned int vhost_get_max_memslots(void)
59  {
60      unsigned int max = UINT_MAX;
61      struct vhost_dev *hdev;
62  
63      QLIST_FOREACH(hdev, &vhost_devices, entry) {
64          max = MIN(max, hdev->vhost_ops->vhost_backend_memslots_limit(hdev));
65      }
66      return max;
67  }
68  
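    /*
     * Return the smallest number of still-free memslots across all registered
     * vhost devices, comparing each backend's limit against the shared or
     * private memslot count it actually uses.
     */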
69  unsigned int vhost_get_free_memslots(void)
70  {
71      unsigned int free = UINT_MAX;
72      struct vhost_dev *hdev;
73  
74      QLIST_FOREACH(hdev, &vhost_devices, entry) {
75          unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
76          unsigned int cur_free;
77  
78          if (hdev->vhost_ops->vhost_backend_no_private_memslots &&
79              hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) {
80              cur_free = r - used_shared_memslots;
81          } else {
82              cur_free = r - used_memslots;
83          }
84          free = MIN(free, cur_free);
85      }
86      return free;
87  }
88  
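    /*
     * Walk the dirty-log chunks covering the intersection of [mfirst, mlast]
     * and [rfirst, rlast], atomically consume the logged bits and mark the
     * corresponding pages dirty in the section's MemoryRegion.
     */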
89  static void vhost_dev_sync_region(struct vhost_dev *dev,
90                                    MemoryRegionSection *section,
91                                    uint64_t mfirst, uint64_t mlast,
92                                    uint64_t rfirst, uint64_t rlast)
93  {
94      vhost_log_chunk_t *dev_log = dev->log->log;
95  
96      uint64_t start = MAX(mfirst, rfirst);
97      uint64_t end = MIN(mlast, rlast);
98      vhost_log_chunk_t *from = dev_log + start / VHOST_LOG_CHUNK;
99      vhost_log_chunk_t *to = dev_log + end / VHOST_LOG_CHUNK + 1;
100      uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);
101  
102      if (end < start) {
103          return;
104      }
105      assert(end / VHOST_LOG_CHUNK < dev->log_size);
106      assert(start / VHOST_LOG_CHUNK < dev->log_size);
107  
108      for (;from < to; ++from) {
109          vhost_log_chunk_t log;
110          /* We first check with a non-atomic load: it is much cheaper,
111           * and we expect non-dirty to be the common case. */
112          if (!*from) {
113              addr += VHOST_LOG_CHUNK;
114              continue;
115          }
116          /* Data must be read atomically. We don't really need barrier semantics
117           * but it's easier to use atomic_* than to roll our own. */
118          log = qatomic_xchg(from, 0);
119          while (log) {
120              int bit = ctzl(log);
121              hwaddr page_addr;
122              hwaddr section_offset;
123              hwaddr mr_offset;
124              page_addr = addr + bit * VHOST_LOG_PAGE;
125              section_offset = page_addr - section->offset_within_address_space;
126              mr_offset = section_offset + section->offset_within_region;
127              memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
128              log &= ~(0x1ull << bit);
129          }
130          addr += VHOST_LOG_CHUNK;
131      }
132  }
133  
134  bool vhost_dev_has_iommu(struct vhost_dev *dev)
135  {
136      VirtIODevice *vdev = dev->vdev;
137  
138      /*
139       * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend supports an
140       * incremental memory mapping API via the IOTLB API. For platforms
141       * that do not have an IOMMU, there is no need to enable this feature,
142       * which may cause unnecessary IOTLB miss/update transactions.
143       */
144      if (vdev) {
145          return virtio_bus_device_iommu_enabled(vdev) &&
146              virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
147      } else {
148          return false;
149      }
150  }
151  
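     /*
      * Sync dirty pages logged by the backend into QEMU's dirty bitmap for
      * every memory region and used ring that intersects the given section,
      * clamped to [first, last]. Only acts while the device is started with
      * logging enabled.
      */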
152  static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
153                                     MemoryRegionSection *section,
154                                     hwaddr first,
155                                     hwaddr last)
156  {
157      int i;
158      hwaddr start_addr;
159      hwaddr end_addr;
160  
161      if (!dev->log_enabled || !dev->started) {
162          return 0;
163      }
164      start_addr = section->offset_within_address_space;
165      end_addr = range_get_last(start_addr, int128_get64(section->size));
166      start_addr = MAX(first, start_addr);
167      end_addr = MIN(last, end_addr);
168  
169      for (i = 0; i < dev->mem->nregions; ++i) {
170          struct vhost_memory_region *reg = dev->mem->regions + i;
171          vhost_dev_sync_region(dev, section, start_addr, end_addr,
172                                reg->guest_phys_addr,
173                                range_get_last(reg->guest_phys_addr,
174                                               reg->memory_size));
175      }
176      for (i = 0; i < dev->nvqs; ++i) {
177          struct vhost_virtqueue *vq = dev->vqs + i;
178  
179          if (!vq->used_phys && !vq->used_size) {
180              continue;
181          }
182  
183          if (vhost_dev_has_iommu(dev)) {
184              IOMMUTLBEntry iotlb;
185              hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
186              hwaddr phys, s, offset;
187  
188              while (used_size) {
189                  rcu_read_lock();
190                  iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
191                                                        used_phys,
192                                                        true,
193                                                        MEMTXATTRS_UNSPECIFIED);
194                  rcu_read_unlock();
195  
196                  if (!iotlb.target_as) {
197                      qemu_log_mask(LOG_GUEST_ERROR, "translation "
198                                    "failure for used_iova %"PRIx64"\n",
199                                    used_phys);
200                      return -EINVAL;
201                  }
202  
203                  offset = used_phys & iotlb.addr_mask;
204                  phys = iotlb.translated_addr + offset;
205  
206                  /*
207                   * Distance from start of used ring until last byte of
208                   * IOMMU page.
209                   */
210                  s = iotlb.addr_mask - offset;
211                  /*
212                   * Size of the used ring, or of the part of it up to the
213                   * end of the IOMMU page. To avoid a zero result, do the
214                   * addition outside of MIN().
215                   */
216                  s = MIN(s, used_size - 1) + 1;
217  
218                  vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
219                                        range_get_last(phys, s));
220                  used_size -= s;
221                  used_phys += s;
222              }
223          } else {
224              vhost_dev_sync_region(dev, section, start_addr,
225                                    end_addr, vq->used_phys,
226                                    range_get_last(vq->used_phys, vq->used_size));
227          }
228      }
229      return 0;
230  }
231  
232  static void vhost_log_sync(MemoryListener *listener,
233                            MemoryRegionSection *section)
234  {
235      struct vhost_dev *dev = container_of(listener, struct vhost_dev,
236                                           memory_listener);
237      vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
238  }
239  
240  static void vhost_log_sync_range(struct vhost_dev *dev,
241                                   hwaddr first, hwaddr last)
242  {
243      int i;
244      /* FIXME: this is N^2 in number of sections */
245      for (i = 0; i < dev->n_mem_sections; ++i) {
246          MemoryRegionSection *section = &dev->mem_sections[i];
247          vhost_sync_dirty_bitmap(dev, section, first, last);
248      }
249  }
250  
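     /*
      * Compute the dirty-log size (in chunks) needed to cover the highest
      * guest physical address present in the current memory table.
      */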
251  static uint64_t vhost_get_log_size(struct vhost_dev *dev)
252  {
253      uint64_t log_size = 0;
254      int i;
255      for (i = 0; i < dev->mem->nregions; ++i) {
256          struct vhost_memory_region *reg = dev->mem->regions + i;
257          uint64_t last = range_get_last(reg->guest_phys_addr,
258                                         reg->memory_size);
259          log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
260      }
261      return log_size;
262  }
263  
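     /* Select the vhost_ops table matching the requested backend type. */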
264  static int vhost_set_backend_type(struct vhost_dev *dev,
265                                    VhostBackendType backend_type)
266  {
267      int r = 0;
268  
269      switch (backend_type) {
270  #ifdef CONFIG_VHOST_KERNEL
271      case VHOST_BACKEND_TYPE_KERNEL:
272          dev->vhost_ops = &kernel_ops;
273          break;
274  #endif
275  #ifdef CONFIG_VHOST_USER
276      case VHOST_BACKEND_TYPE_USER:
277          dev->vhost_ops = &user_ops;
278          break;
279  #endif
280  #ifdef CONFIG_VHOST_VDPA
281      case VHOST_BACKEND_TYPE_VDPA:
282          dev->vhost_ops = &vdpa_ops;
283          break;
284  #endif
285      default:
286          error_report("Unknown vhost backend type");
287          r = -1;
288      }
289  
290      return r;
291  }
292  
293  static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
294  {
295      Error *err = NULL;
296      struct vhost_log *log;
297      uint64_t logsize = size * sizeof(*(log->log));
298      int fd = -1;
299  
300      log = g_new0(struct vhost_log, 1);
301      if (share) {
302          log->log = qemu_memfd_alloc("vhost-log", logsize,
303                                      F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
304                                      &fd, &err);
305          if (err) {
306              error_report_err(err);
307              g_free(log);
308              return NULL;
309          }
310          memset(log->log, 0, logsize);
311      } else {
312          log->log = g_malloc0(logsize);
313      }
314  
315      log->size = size;
316      log->refcnt = 1;
317      log->fd = fd;
318  
319      return log;
320  }
321  
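     /*
      * Return the cached global (or shared) log if it already has the
      * requested size, taking a reference; otherwise allocate a fresh one
      * and cache it.
      */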
322  static struct vhost_log *vhost_log_get(uint64_t size, bool share)
323  {
324      struct vhost_log *log = share ? vhost_log_shm : vhost_log;
325  
326      if (!log || log->size != size) {
327          log = vhost_log_alloc(size, share);
328          if (share) {
329              vhost_log_shm = log;
330          } else {
331              vhost_log = log;
332          }
333      } else {
334          ++log->refcnt;
335      }
336  
337      return log;
338  }
339  
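     /*
      * Drop the device's reference on its log. When the last reference goes
      * away, optionally sync the range the old log covered and free it.
      */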
340  static void vhost_log_put(struct vhost_dev *dev, bool sync)
341  {
342      struct vhost_log *log = dev->log;
343  
344      if (!log) {
345          return;
346      }
347  
348      --log->refcnt;
349      if (log->refcnt == 0) {
350          /* Sync only the range covered by the old log */
351          if (dev->log_size && sync) {
352              vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
353          }
354  
355          if (vhost_log == log) {
356              g_free(log->log);
357              vhost_log = NULL;
358          } else if (vhost_log_shm == log) {
359              qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
360                              log->fd);
361              vhost_log_shm = NULL;
362          }
363  
364          g_free(log);
365      }
366  
367      dev->log = NULL;
368      dev->log_size = 0;
369  }
370  
371  static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
372  {
373      return dev->vhost_ops->vhost_requires_shm_log &&
374             dev->vhost_ops->vhost_requires_shm_log(dev);
375  }
376  
377  static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
378  {
379      struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
380      uint64_t log_base = (uintptr_t)log->log;
381      int r;
382  
383      /* Inform the backend of the log switch. This must be done before
384         releasing the current log, to ensure no logging is lost. */
385      r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
386      if (r < 0) {
387          VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
388      }
389  
390      vhost_log_put(dev, true);
391      dev->log = log;
392      dev->log_size = size;
393  }
394  
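     /*
      * Without a vIOMMU, map the guest physical address into our address
      * space; with a vIOMMU the address is an IOVA that the backend resolves
      * itself via the IOTLB, so pass it through untranslated.
      */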
395  static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
396                                hwaddr *plen, bool is_write)
397  {
398      if (!vhost_dev_has_iommu(dev)) {
399          return cpu_physical_memory_map(addr, plen, is_write);
400      } else {
401          return (void *)(uintptr_t)addr;
402      }
403  }
404  
405  static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
406                                 hwaddr len, int is_write,
407                                 hwaddr access_len)
408  {
409      if (!vhost_dev_has_iommu(dev)) {
410          cpu_physical_memory_unmap(buffer, len, is_write, access_len);
411      }
412  }
413  
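     /*
      * If the ring part overlaps [reg_gpa, reg_gpa + reg_size), check that it
      * is still fully contained in the region and still mapped at the same
      * host address. Returns 0 if the ring does not overlap or is intact,
      * -ENOMEM if it is no longer fully covered and -EBUSY if it moved.
      */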
414  static int vhost_verify_ring_part_mapping(void *ring_hva,
415                                            uint64_t ring_gpa,
416                                            uint64_t ring_size,
417                                            void *reg_hva,
418                                            uint64_t reg_gpa,
419                                            uint64_t reg_size)
420  {
421      uint64_t hva_ring_offset;
422      uint64_t ring_last = range_get_last(ring_gpa, ring_size);
423      uint64_t reg_last = range_get_last(reg_gpa, reg_size);
424  
425      if (ring_last < reg_gpa || ring_gpa > reg_last) {
426          return 0;
427      }
428      /* check that the whole ring is mapped */
429      if (ring_last > reg_last) {
430          return -ENOMEM;
431      }
432      /* check that the ring's MemoryRegion wasn't replaced */
433      hva_ring_offset = ring_gpa - reg_gpa;
434      if (ring_hva != reg_hva + hva_ring_offset) {
435          return -EBUSY;
436      }
437  
438      return 0;
439  }
440  
441  static int vhost_verify_ring_mappings(struct vhost_dev *dev,
442                                        void *reg_hva,
443                                        uint64_t reg_gpa,
444                                        uint64_t reg_size)
445  {
446      int i, j;
447      int r = 0;
448      const char *part_name[] = {
449          "descriptor table",
450          "available ring",
451          "used ring"
452      };
453  
454      if (vhost_dev_has_iommu(dev)) {
455          return 0;
456      }
457  
458      for (i = 0; i < dev->nvqs; ++i) {
459          struct vhost_virtqueue *vq = dev->vqs + i;
460  
461          if (vq->desc_phys == 0) {
462              continue;
463          }
464  
465          j = 0;
466          r = vhost_verify_ring_part_mapping(
467                  vq->desc, vq->desc_phys, vq->desc_size,
468                  reg_hva, reg_gpa, reg_size);
469          if (r) {
470              break;
471          }
472  
473          j++;
474          r = vhost_verify_ring_part_mapping(
475                  vq->avail, vq->avail_phys, vq->avail_size,
476                  reg_hva, reg_gpa, reg_size);
477          if (r) {
478              break;
479          }
480  
481          j++;
482          r = vhost_verify_ring_part_mapping(
483                  vq->used, vq->used_phys, vq->used_size,
484                  reg_hva, reg_gpa, reg_size);
485          if (r) {
486              break;
487          }
488      }
489  
490      if (r == -ENOMEM) {
491          error_report("Unable to map %s for ring %d", part_name[j], i);
492      } else if (r == -EBUSY) {
493          error_report("%s relocated for ring %d", part_name[j], i);
494      }
495      return r;
496  }
497  
498  /*
499   * vhost_section: identify sections needed for vhost access
500   *
501   * We only care about RAM sections here (where virtqueue and guest
502   * internals accessed by virtio might live).
503   */
504  static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
505  {
506      MemoryRegion *mr = section->mr;
507  
508      if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {
509          uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);
510          uint8_t handled_dirty;
511  
512          /*
513           * Kernel-based vhost doesn't handle any block that is doing
514           * dirty tracking other than migration, for which it has
515           * specific logging support. For TCG the kernel never gets
516           * involved anyway, so we can also ignore its self-modifying
517           * code detection flags. However, a vhost-user client could
518           * still confuse a TCG guest if it rewrites executable memory
519           * that has already been translated.
520           */
521          handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) |
522              (1 << DIRTY_MEMORY_CODE);
523  
524          if (dirty_mask & ~handled_dirty) {
525              trace_vhost_reject_section(mr->name, 1);
526              return false;
527          }
528  
529          /*
530           * Some backends (like vhost-user) can only handle memory regions
531           * that have an fd (can be mapped into a different process). Filter
532           * the ones without an fd out, if requested.
533           *
534           * TODO: we might have to limit to MAP_SHARED as well.
535           */
536          if (memory_region_get_fd(section->mr) < 0 &&
537              dev->vhost_ops->vhost_backend_no_private_memslots &&
538              dev->vhost_ops->vhost_backend_no_private_memslots(dev)) {
539              trace_vhost_reject_section(mr->name, 2);
540              return false;
541          }
542  
543          trace_vhost_section(mr->name);
544          return true;
545      } else {
546          trace_vhost_reject_section(mr->name, 3);
547          return false;
548      }
549  }
550  
551  static void vhost_begin(MemoryListener *listener)
552  {
553      struct vhost_dev *dev = container_of(listener, struct vhost_dev,
554                                           memory_listener);
555      dev->tmp_sections = NULL;
556      dev->n_tmp_sections = 0;
557  }
558  
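     /*
      * End of a memory transaction: if the section list built during this
      * transaction differs from the previous one, rebuild the vhost memory
      * table, grow the dirty log if needed and push the new table to the
      * backend.
      */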
559  static void vhost_commit(MemoryListener *listener)
560  {
561      struct vhost_dev *dev = container_of(listener, struct vhost_dev,
562                                           memory_listener);
563      MemoryRegionSection *old_sections;
564      int n_old_sections;
565      uint64_t log_size;
566      size_t regions_size;
567      int r;
568      int i;
569      bool changed = false;
570  
571      /* Note we can be called before the device is started, but then
572       * starting the device calls set_mem_table, so we need to have
573       * built the data structures.
574       */
575      old_sections = dev->mem_sections;
576      n_old_sections = dev->n_mem_sections;
577      dev->mem_sections = dev->tmp_sections;
578      dev->n_mem_sections = dev->n_tmp_sections;
579  
580      if (dev->n_mem_sections != n_old_sections) {
581          changed = true;
582      } else {
583          /* Same size, let's check the contents */
584          for (i = 0; i < n_old_sections; i++) {
585              if (!MemoryRegionSection_eq(&old_sections[i],
586                                          &dev->mem_sections[i])) {
587                  changed = true;
588                  break;
589              }
590          }
591      }
592  
593      trace_vhost_commit(dev->started, changed);
594      if (!changed) {
595          goto out;
596      }
597  
598      /* Rebuild the regions list from the new sections list */
599      regions_size = offsetof(struct vhost_memory, regions) +
600                         dev->n_mem_sections * sizeof dev->mem->regions[0];
601      dev->mem = g_realloc(dev->mem, regions_size);
602      dev->mem->nregions = dev->n_mem_sections;
603  
604      if (dev->vhost_ops->vhost_backend_no_private_memslots &&
605          dev->vhost_ops->vhost_backend_no_private_memslots(dev)) {
606          used_shared_memslots = dev->mem->nregions;
607      } else {
608          used_memslots = dev->mem->nregions;
609      }
610  
611      for (i = 0; i < dev->n_mem_sections; i++) {
612          struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
613          struct MemoryRegionSection *mrs = dev->mem_sections + i;
614  
615          cur_vmr->guest_phys_addr = mrs->offset_within_address_space;
616          cur_vmr->memory_size     = int128_get64(mrs->size);
617          cur_vmr->userspace_addr  =
618              (uintptr_t)memory_region_get_ram_ptr(mrs->mr) +
619              mrs->offset_within_region;
620          cur_vmr->flags_padding   = 0;
621      }
622  
623      if (!dev->started) {
624          goto out;
625      }
626  
627      for (i = 0; i < dev->mem->nregions; i++) {
628          if (vhost_verify_ring_mappings(dev,
629                         (void *)(uintptr_t)dev->mem->regions[i].userspace_addr,
630                         dev->mem->regions[i].guest_phys_addr,
631                         dev->mem->regions[i].memory_size)) {
632              error_report("Verify ring failure on region %d", i);
633              abort();
634          }
635      }
636  
637      if (!dev->log_enabled) {
638          r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
639          if (r < 0) {
640              VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
641          }
642          goto out;
643      }
644      log_size = vhost_get_log_size(dev);
645      /* We allocate an extra 4K bytes for the log
646       * to reduce the number of reallocations. */
647  #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
648      /* To log more, must increase log size before table update. */
649      if (dev->log_size < log_size) {
650          vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
651      }
652      r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
653      if (r < 0) {
654          VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
655      }
656      /* To log less, can only decrease log size after table update. */
657      if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
658          vhost_dev_log_resize(dev, log_size);
659      }
660  
661  out:
662      /* Unref the old list of sections; this must happen _after_ the
663       * vhost_set_mem_table call to ensure the client isn't still using a
664       * section we're about to unref.
665       */
666      while (n_old_sections--) {
667          memory_region_unref(old_sections[n_old_sections].mr);
668      }
669      g_free(old_sections);
670      return;
671  }
672  
673  /* Adds the section data to the tmp_sections list.
674   * It relies on the listener calling us in memory address order,
675   * and on being called for each region (via the _add and _nop
676   * methods), so that neighbouring sections can be joined.
677   */
678  static void vhost_region_add_section(struct vhost_dev *dev,
679                                       MemoryRegionSection *section)
680  {
681      bool need_add = true;
682      uint64_t mrs_size = int128_get64(section->size);
683      uint64_t mrs_gpa = section->offset_within_address_space;
684      uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
685                           section->offset_within_region;
686      RAMBlock *mrs_rb = section->mr->ram_block;
687  
688      trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
689                                     mrs_host);
690  
691      if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) {
692          /* Round the section to its page size */
693          /* First align the start down to a page boundary */
694          size_t mrs_page = qemu_ram_pagesize(mrs_rb);
695          uint64_t alignage = mrs_host & (mrs_page - 1);
696          if (alignage) {
697              mrs_host -= alignage;
698              mrs_size += alignage;
699              mrs_gpa  -= alignage;
700          }
701          /* Now align the size up to a page boundary */
702          alignage = mrs_size & (mrs_page - 1);
703          if (alignage) {
704              mrs_size += mrs_page - alignage;
705          }
706          trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa,
707                                                 mrs_size, mrs_host);
708      }
709  
710      if (dev->n_tmp_sections && !section->unmergeable) {
711          /* Since we already have at least one section, let's see if
712           * this extends it; since we're scanning in order, we only
713           * have to look at the last one, and the FlatView that calls
714           * us shouldn't have overlaps.
715           */
716          MemoryRegionSection *prev_sec = dev->tmp_sections +
717                                                 (dev->n_tmp_sections - 1);
718          uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
719          uint64_t prev_size = int128_get64(prev_sec->size);
720          uint64_t prev_gpa_end   = range_get_last(prev_gpa_start, prev_size);
721          uint64_t prev_host_start =
722                          (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
723                          prev_sec->offset_within_region;
724          uint64_t prev_host_end   = range_get_last(prev_host_start, prev_size);
725  
726          if (mrs_gpa <= (prev_gpa_end + 1)) {
727              /* OK, looks like overlapping/intersecting - it's possible that
728               * the rounding to page sizes has made them overlap, but they should
729               * match up in the same RAMBlock if they do.
730               */
731              if (mrs_gpa < prev_gpa_start) {
732                  error_report("%s: Section '%s' rounded to %"PRIx64
733                               " prior to previous '%s' %"PRIx64,
734                               __func__, section->mr->name, mrs_gpa,
735                               prev_sec->mr->name, prev_gpa_start);
736                  /* A way to cleanly fail here would be better */
737                  return;
738              }
739              /* Offset from the start of the previous GPA to this GPA */
740              size_t offset = mrs_gpa - prev_gpa_start;
741  
742              if (prev_host_start + offset == mrs_host &&
743                  section->mr == prev_sec->mr && !prev_sec->unmergeable) {
744                  uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
745                  need_add = false;
746                  prev_sec->offset_within_address_space =
747                      MIN(prev_gpa_start, mrs_gpa);
748                  prev_sec->offset_within_region =
749                      MIN(prev_host_start, mrs_host) -
750                      (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
751                  prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
752                                                 mrs_host));
753                  trace_vhost_region_add_section_merge(section->mr->name,
754                                          int128_get64(prev_sec->size),
755                                          prev_sec->offset_within_address_space,
756                                          prev_sec->offset_within_region);
757              } else {
758                  /* adjoining regions are fine, but overlapping ones with
759                   * different blocks/offsets shouldn't happen
760                   */
761                  if (mrs_gpa != prev_gpa_end + 1) {
762                      error_report("%s: Overlapping but not coherent sections "
763                                   "at %"PRIx64,
764                                   __func__, mrs_gpa);
765                      return;
766                  }
767              }
768          }
769      }
770  
771      if (need_add) {
772          ++dev->n_tmp_sections;
773          dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections,
774                                      dev->n_tmp_sections);
775          dev->tmp_sections[dev->n_tmp_sections - 1] = *section;
776          /* The flatview isn't stable and we don't use it; making it NULL
777           * means we can memcmp the list.
778           */
779          dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL;
780          memory_region_ref(section->mr);
781      }
782  }
783  
784  /* Used for both add and nop callbacks */
785  static void vhost_region_addnop(MemoryListener *listener,
786                                  MemoryRegionSection *section)
787  {
788      struct vhost_dev *dev = container_of(listener, struct vhost_dev,
789                                           memory_listener);
790  
791      if (!vhost_section(dev, section)) {
792          return;
793      }
794      vhost_region_add_section(dev, section);
795  }
796  
797  static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
798  {
799      struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
800      struct vhost_dev *hdev = iommu->hdev;
801      hwaddr iova = iotlb->iova + iommu->iommu_offset;
802  
803      if (vhost_backend_invalidate_device_iotlb(hdev, iova,
804                                                iotlb->addr_mask + 1)) {
805          error_report("Failed to invalidate device IOTLB");
806      }
807  }
808  
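     /*
      * Register an unmap notifier on the IOMMU memory region so that (device)
      * IOTLB invalidations are forwarded to the vhost backend.
      */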
809  static void vhost_iommu_region_add(MemoryListener *listener,
810                                     MemoryRegionSection *section)
811  {
812      struct vhost_dev *dev = container_of(listener, struct vhost_dev,
813                                           iommu_listener);
814      struct vhost_iommu *iommu;
815      Int128 end;
816      int iommu_idx;
817      IOMMUMemoryRegion *iommu_mr;
818  
819      if (!memory_region_is_iommu(section->mr)) {
820          return;
821      }
822  
823      iommu_mr = IOMMU_MEMORY_REGION(section->mr);
824  
825      iommu = g_malloc0(sizeof(*iommu));
826      end = int128_add(int128_make64(section->offset_within_region),
827                       section->size);
828      end = int128_sub(end, int128_one());
829      iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
830                                                     MEMTXATTRS_UNSPECIFIED);
831      iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
832                          dev->vdev->device_iotlb_enabled ?
833                              IOMMU_NOTIFIER_DEVIOTLB_UNMAP :
834                              IOMMU_NOTIFIER_UNMAP,
835                          section->offset_within_region,
836                          int128_get64(end),
837                          iommu_idx);
838      iommu->mr = section->mr;
839      iommu->iommu_offset = section->offset_within_address_space -
840                            section->offset_within_region;
841      iommu->hdev = dev;
842      memory_region_register_iommu_notifier(section->mr, &iommu->n,
843                                            &error_fatal);
844      QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
845      /* TODO: can replay help performance here? */
846  }
847  
848  static void vhost_iommu_region_del(MemoryListener *listener,
849                                     MemoryRegionSection *section)
850  {
851      struct vhost_dev *dev = container_of(listener, struct vhost_dev,
852                                           iommu_listener);
853      struct vhost_iommu *iommu;
854  
855      if (!memory_region_is_iommu(section->mr)) {
856          return;
857      }
858  
859      QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
860          if (iommu->mr == section->mr &&
861              iommu->n.start == section->offset_within_region) {
862              memory_region_unregister_iommu_notifier(iommu->mr,
863                                                      &iommu->n);
864              QLIST_REMOVE(iommu, iommu_next);
865              g_free(iommu);
866              break;
867          }
868      }
869  }
870  
871  void vhost_toggle_device_iotlb(VirtIODevice *vdev)
872  {
873      VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
874      struct vhost_dev *dev;
875      struct vhost_iommu *iommu;
876  
877      if (vdev->vhost_started) {
878          dev = vdc->get_vhost(vdev);
879      } else {
880          return;
881      }
882  
883      QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
884          memory_region_unregister_iommu_notifier(iommu->mr, &iommu->n);
885          iommu->n.notifier_flags = vdev->device_iotlb_enabled ?
886                  IOMMU_NOTIFIER_DEVIOTLB_UNMAP : IOMMU_NOTIFIER_UNMAP;
887          memory_region_register_iommu_notifier(iommu->mr, &iommu->n,
888                                                &error_fatal);
889      }
890  }
891  
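     /*
      * Tell the backend where the descriptor table, avail ring and used ring
      * of this virtqueue live, and whether writes to the used ring must be
      * logged.
      */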
892  static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
893                                      struct vhost_virtqueue *vq,
894                                      unsigned idx, bool enable_log)
895  {
896      struct vhost_vring_addr addr;
897      int r;
898      memset(&addr, 0, sizeof(struct vhost_vring_addr));
899  
900      if (dev->vhost_ops->vhost_vq_get_addr) {
901          r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq);
902          if (r < 0) {
903              VHOST_OPS_DEBUG(r, "vhost_vq_get_addr failed");
904              return r;
905          }
906      } else {
907          addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc;
908          addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail;
909          addr.used_user_addr = (uint64_t)(unsigned long)vq->used;
910      }
911      addr.index = idx;
912      addr.log_guest_addr = vq->used_phys;
913      addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;
914      r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
915      if (r < 0) {
916          VHOST_OPS_DEBUG(r, "vhost_set_vring_addr failed");
917      }
918      return r;
919  }
920  
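     /*
      * Push the acked feature bits to the backend, optionally adding
      * VHOST_F_LOG_ALL for dirty logging and masking VIRTIO_F_IOMMU_PLATFORM
      * when no vIOMMU is in use.
      */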
921  static int vhost_dev_set_features(struct vhost_dev *dev,
922                                    bool enable_log)
923  {
924      uint64_t features = dev->acked_features;
925      int r;
926      if (enable_log) {
927          features |= 0x1ULL << VHOST_F_LOG_ALL;
928      }
929      if (!vhost_dev_has_iommu(dev)) {
930          features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM);
931      }
932      if (dev->vhost_ops->vhost_force_iommu) {
933          if (dev->vhost_ops->vhost_force_iommu(dev) == true) {
934              features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
935          }
936      }
937      r = dev->vhost_ops->vhost_set_features(dev, features);
938      if (r < 0) {
939          VHOST_OPS_DEBUG(r, "vhost_set_features failed");
940          goto out;
941      }
942      if (dev->vhost_ops->vhost_set_backend_cap) {
943          r = dev->vhost_ops->vhost_set_backend_cap(dev);
944          if (r < 0) {
945              VHOST_OPS_DEBUG(r, "vhost_set_backend_cap failed");
946              goto out;
947          }
948      }
949  
950  out:
951      return r;
952  }
953  
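     /*
      * Re-negotiate features and re-program the ring addresses of all started
      * virtqueues with logging enabled or disabled; on failure, roll the
      * virtqueues back to the previous logging state.
      */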
954  static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
955  {
956      int r, i, idx;
957      hwaddr addr;
958  
959      r = vhost_dev_set_features(dev, enable_log);
960      if (r < 0) {
961          goto err_features;
962      }
963      for (i = 0; i < dev->nvqs; ++i) {
964          idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
965          addr = virtio_queue_get_desc_addr(dev->vdev, idx);
966          if (!addr) {
967              /*
968               * The queue might not be ready for start. If this
969               * is the case, there is no reason to continue the process.
970               * Similar logic is used by the vhost_virtqueue_start()
971               * routine.
972               */
973              continue;
974          }
975          r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
976                                       enable_log);
977          if (r < 0) {
978              goto err_vq;
979          }
980      }
981      return 0;
982  err_vq:
983      for (; i >= 0; --i) {
984          idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
985          addr = virtio_queue_get_desc_addr(dev->vdev, idx);
986          if (!addr) {
987              continue;
988          }
989          vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
990                                   dev->log_enabled);
991      }
992      vhost_dev_set_features(dev, dev->log_enabled);
993  err_features:
994      return r;
995  }
996  
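     /*
      * Toggle dirty logging when migration starts or stops. For a running
      * device this (re)sizes the log and reprograms the backend; a device
      * that stopped meanwhile (e.g. vhost-user disconnect) is not an error.
      */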
997  static int vhost_migration_log(MemoryListener *listener, bool enable)
998  {
999      struct vhost_dev *dev = container_of(listener, struct vhost_dev,
1000                                           memory_listener);
1001      int r;
1002      if (enable == dev->log_enabled) {
1003          return 0;
1004      }
1005      if (!dev->started) {
1006          dev->log_enabled = enable;
1007          return 0;
1008      }
1009  
1010      r = 0;
1011      if (!enable) {
1012          r = vhost_dev_set_log(dev, false);
1013          if (r < 0) {
1014              goto check_dev_state;
1015          }
1016          vhost_log_put(dev, false);
1017      } else {
1018          vhost_dev_log_resize(dev, vhost_get_log_size(dev));
1019          r = vhost_dev_set_log(dev, true);
1020          if (r < 0) {
1021              goto check_dev_state;
1022          }
1023      }
1024  
1025  check_dev_state:
1026      dev->log_enabled = enable;
1027      /*
1028       * vhost-user-* devices could change their state during log
1029       * initialization due to disconnect. So check dev state after
1030       * vhost communication.
1031       */
1032      if (!dev->started) {
1033          /*
1034           * Since the device is in the stopped state, it is okay for
1035           * migration. Return success.
1036           */
1037          r = 0;
1038      }
1039      if (r) {
1040          /* An error occurred. */
1041          dev->log_enabled = false;
1042      }
1043  
1044      return r;
1045  }
1046  
1047  static void vhost_log_global_start(MemoryListener *listener)
1048  {
1049      int r;
1050  
1051      r = vhost_migration_log(listener, true);
1052      if (r < 0) {
1053          abort();
1054      }
1055  }
1056  
1057  static void vhost_log_global_stop(MemoryListener *listener)
1058  {
1059      int r;
1060  
1061      r = vhost_migration_log(listener, false);
1062      if (r < 0) {
1063          abort();
1064      }
1065  }
1066  
1067  static void vhost_log_start(MemoryListener *listener,
1068                              MemoryRegionSection *section,
1069                              int old, int new)
1070  {
1071      /* FIXME: implement */
1072  }
1073  
1074  static void vhost_log_stop(MemoryListener *listener,
1075                             MemoryRegionSection *section,
1076                             int old, int new)
1077  {
1078      /* FIXME: implement */
1079  }
1080  
1081  /* The vhost driver natively knows how to handle the vrings of
1082   * non-cross-endian legacy devices and modern devices. Only legacy devices
1083   * exposed to a bi-endian guest may require the vhost driver to use a
1084   * specific endianness.
1085   */
1086  static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
1087  {
1088      if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1089          return false;
1090      }
1091  #if HOST_BIG_ENDIAN
1092      return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
1093  #else
1094      return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
1095  #endif
1096  }
1097  
1098  static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
1099                                                     bool is_big_endian,
1100                                                     int vhost_vq_index)
1101  {
1102      int r;
1103      struct vhost_vring_state s = {
1104          .index = vhost_vq_index,
1105          .num = is_big_endian
1106      };
1107  
1108      r = dev->vhost_ops->vhost_set_vring_endian(dev, &s);
1109      if (r < 0) {
1110          VHOST_OPS_DEBUG(r, "vhost_set_vring_endian failed");
1111      }
1112      return r;
1113  }
1114  
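      /*
       * Translate a guest physical address into the backend's userspace
       * address using the current memory table; *len is set to the bytes
       * remaining in the containing region. Returns -EFAULT if no region
       * covers gpa.
       */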
1115  static int vhost_memory_region_lookup(struct vhost_dev *hdev,
1116                                        uint64_t gpa, uint64_t *uaddr,
1117                                        uint64_t *len)
1118  {
1119      int i;
1120  
1121      for (i = 0; i < hdev->mem->nregions; i++) {
1122          struct vhost_memory_region *reg = hdev->mem->regions + i;
1123  
1124          if (gpa >= reg->guest_phys_addr &&
1125              reg->guest_phys_addr + reg->memory_size > gpa) {
1126              *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
1127              *len = reg->guest_phys_addr + reg->memory_size - gpa;
1128              return 0;
1129          }
1130      }
1131  
1132      return -EFAULT;
1133  }
1134  
1135  int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
1136  {
1137      IOMMUTLBEntry iotlb;
1138      uint64_t uaddr, len;
1139      int ret = -EFAULT;
1140  
1141      RCU_READ_LOCK_GUARD();
1142  
1143      trace_vhost_iotlb_miss(dev, 1);
1144  
1145      iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
1146                                            iova, write,
1147                                            MEMTXATTRS_UNSPECIFIED);
1148      if (iotlb.target_as != NULL) {
1149          ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
1150                                           &uaddr, &len);
1151          if (ret) {
1152              trace_vhost_iotlb_miss(dev, 3);
1153              error_report("Failed to look up the translated address "
1154                           "%"PRIx64, iotlb.translated_addr);
1155              goto out;
1156          }
1157  
1158          len = MIN(iotlb.addr_mask + 1, len);
1159          iova = iova & ~iotlb.addr_mask;
1160  
1161          ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
1162                                                  len, iotlb.perm);
1163          if (ret) {
1164              trace_vhost_iotlb_miss(dev, 4);
1165              error_report("Failed to update device IOTLB");
1166              goto out;
1167          }
1168      }
1169  
1170      trace_vhost_iotlb_miss(dev, 2);
1171  
1172  out:
1173      return ret;
1174  }
1175  
1176  int vhost_virtqueue_start(struct vhost_dev *dev,
1177                            struct VirtIODevice *vdev,
1178                            struct vhost_virtqueue *vq,
1179                            unsigned idx)
1180  {
1181      BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1182      VirtioBusState *vbus = VIRTIO_BUS(qbus);
1183      VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1184      hwaddr s, l, a;
1185      int r;
1186      int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1187      struct vhost_vring_file file = {
1188          .index = vhost_vq_index
1189      };
1190      struct vhost_vring_state state = {
1191          .index = vhost_vq_index
1192      };
1193      struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
1194  
1195      a = virtio_queue_get_desc_addr(vdev, idx);
1196      if (a == 0) {
1197          /* Queue might not be ready for start */
1198          return 0;
1199      }
1200  
1201      vq->num = state.num = virtio_queue_get_num(vdev, idx);
1202      r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
1203      if (r) {
1204          VHOST_OPS_DEBUG(r, "vhost_set_vring_num failed");
1205          return r;
1206      }
1207  
1208      state.num = virtio_queue_get_last_avail_idx(vdev, idx);
1209      r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
1210      if (r) {
1211          VHOST_OPS_DEBUG(r, "vhost_set_vring_base failed");
1212          return r;
1213      }
1214  
1215      if (vhost_needs_vring_endian(vdev)) {
1216          r = vhost_virtqueue_set_vring_endian_legacy(dev,
1217                                                      virtio_is_big_endian(vdev),
1218                                                      vhost_vq_index);
1219          if (r) {
1220              return r;
1221          }
1222      }
1223  
1224      vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
1225      vq->desc_phys = a;
1226      vq->desc = vhost_memory_map(dev, a, &l, false);
1227      if (!vq->desc || l != s) {
1228          r = -ENOMEM;
1229          goto fail_alloc_desc;
1230      }
1231      vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
1232      vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
1233      vq->avail = vhost_memory_map(dev, a, &l, false);
1234      if (!vq->avail || l != s) {
1235          r = -ENOMEM;
1236          goto fail_alloc_avail;
1237      }
1238      vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
1239      vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
1240      vq->used = vhost_memory_map(dev, a, &l, true);
1241      if (!vq->used || l != s) {
1242          r = -ENOMEM;
1243          goto fail_alloc_used;
1244      }
1245  
1246      r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
1247      if (r < 0) {
1248          goto fail_alloc;
1249      }
1250  
1251      file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
1252      r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
1253      if (r) {
1254          VHOST_OPS_DEBUG(r, "vhost_set_vring_kick failed");
1255          goto fail_kick;
1256      }
1257  
1258      /* Clear and discard previous events if any. */
1259      event_notifier_test_and_clear(&vq->masked_notifier);
1260  
1261      /* Init vring in unmasked state, unless guest_notifier_mask
1262       * will do it later.
1263       */
1264      if (!vdev->use_guest_notifier_mask) {
1265          /* TODO: check and handle errors. */
1266          vhost_virtqueue_mask(dev, vdev, idx, false);
1267      }
1268  
1269      if (k->query_guest_notifiers &&
1270          k->query_guest_notifiers(qbus->parent) &&
1271          virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
1272          file.fd = -1;
1273          r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1274          if (r) {
1275              goto fail_vector;
1276          }
1277      }
1278  
1279      return 0;
1280  
1281  fail_vector:
1282  fail_kick:
1283  fail_alloc:
1284      vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1285                         0, 0);
1286  fail_alloc_used:
1287      vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1288                         0, 0);
1289  fail_alloc_avail:
1290      vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1291                         0, 0);
1292  fail_alloc_desc:
1293      return r;
1294  }
1295  
1296  void vhost_virtqueue_stop(struct vhost_dev *dev,
1297                            struct VirtIODevice *vdev,
1298                            struct vhost_virtqueue *vq,
1299                            unsigned idx)
1300  {
1301      int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1302      struct vhost_vring_state state = {
1303          .index = vhost_vq_index,
1304      };
1305      int r;
1306  
1307      if (virtio_queue_get_desc_addr(vdev, idx) == 0) {
1308          /* Don't stop a virtqueue that might not have been started */
1309          return;
1310      }
1311  
1312      r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
1313      if (r < 0) {
1314          VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r);
1315          /* The connection to the backend is broken, so let's sync the
1316           * internal last avail idx to the device's used idx.
1317           */
1318          virtio_queue_restore_last_avail_idx(vdev, idx);
1319      } else {
1320          virtio_queue_set_last_avail_idx(vdev, idx, state.num);
1321      }
1322      virtio_queue_invalidate_signalled_used(vdev, idx);
1323      virtio_queue_update_used_idx(vdev, idx);
1324  
1325      /* In the cross-endian case, we need to reset the vring endianness to
1326       * native, as legacy devices expect by default.
1327       */
1328      if (vhost_needs_vring_endian(vdev)) {
1329          vhost_virtqueue_set_vring_endian_legacy(dev,
1330                                                  !virtio_is_big_endian(vdev),
1331                                                  vhost_vq_index);
1332      }
1333  
1334      vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1335                         1, virtio_queue_get_used_size(vdev, idx));
1336      vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1337                         0, virtio_queue_get_avail_size(vdev, idx));
1338      vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1339                         0, virtio_queue_get_desc_size(vdev, idx));
1340  }
1341  
1342  static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
1343                                                  int n, uint32_t timeout)
1344  {
1345      int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1346      struct vhost_vring_state state = {
1347          .index = vhost_vq_index,
1348          .num = timeout,
1349      };
1350      int r;
1351  
1352      if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
1353          return -EINVAL;
1354      }
1355  
1356      r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
1357      if (r) {
1358          VHOST_OPS_DEBUG(r, "vhost_set_vring_busyloop_timeout failed");
1359          return r;
1360      }
1361  
1362      return 0;
1363  }
1364  
1365  static void vhost_virtqueue_error_notifier(EventNotifier *n)
1366  {
1367      struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue,
1368                                                error_notifier);
1369      struct vhost_dev *dev = vq->dev;
1370      int index = vq - dev->vqs;
1371  
1372      if (event_notifier_test_and_clear(n) && dev->vdev) {
1373          VHOST_OPS_DEBUG(-EINVAL,  "vhost vring error in virtqueue %d",
1374                          dev->vq_index + index);
1375      }
1376  }
1377  
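      /*
       * One-time virtqueue setup at device init: create the masked call
       * notifier (and, if the backend supports it, the error notifier) and
       * hand their file descriptors to the backend.
       */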
1378  static int vhost_virtqueue_init(struct vhost_dev *dev,
1379                                  struct vhost_virtqueue *vq, int n)
1380  {
1381      int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1382      struct vhost_vring_file file = {
1383          .index = vhost_vq_index,
1384      };
1385      int r = event_notifier_init(&vq->masked_notifier, 0);
1386      if (r < 0) {
1387          return r;
1388      }
1389  
1390      file.fd = event_notifier_get_wfd(&vq->masked_notifier);
1391      r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1392      if (r) {
1393          VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed");
1394          goto fail_call;
1395      }
1396  
1397      vq->dev = dev;
1398  
1399      if (dev->vhost_ops->vhost_set_vring_err) {
1400          r = event_notifier_init(&vq->error_notifier, 0);
1401          if (r < 0) {
1402              goto fail_call;
1403          }
1404  
1405          file.fd = event_notifier_get_fd(&vq->error_notifier);
1406          r = dev->vhost_ops->vhost_set_vring_err(dev, &file);
1407          if (r) {
1408              VHOST_OPS_DEBUG(r, "vhost_set_vring_err failed");
1409              goto fail_err;
1410          }
1411  
1412          event_notifier_set_handler(&vq->error_notifier,
1413                                     vhost_virtqueue_error_notifier);
1414      }
1415  
1416      return 0;
1417  
1418  fail_err:
1419      event_notifier_cleanup(&vq->error_notifier);
1420  fail_call:
1421      event_notifier_cleanup(&vq->masked_notifier);
1422      return r;
1423  }
1424  
1425  static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
1426  {
1427      event_notifier_cleanup(&vq->masked_notifier);
1428      if (vq->dev->vhost_ops->vhost_set_vring_err) {
1429          event_notifier_set_handler(&vq->error_notifier, NULL);
1430          event_notifier_cleanup(&vq->error_notifier);
1431      }
1432  }
1433  
1434  int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
1435                     VhostBackendType backend_type, uint32_t busyloop_timeout,
1436                     Error **errp)
1437  {
1438      unsigned int used, reserved, limit;
1439      uint64_t features;
1440      int i, r, n_initialized_vqs = 0;
1441  
1442      hdev->vdev = NULL;
1443      hdev->migration_blocker = NULL;
1444  
1445      r = vhost_set_backend_type(hdev, backend_type);
1446      assert(r >= 0);
1447  
1448      r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp);
1449      if (r < 0) {
1450          goto fail;
1451      }
1452  
1453      r = hdev->vhost_ops->vhost_set_owner(hdev);
1454      if (r < 0) {
1455          error_setg_errno(errp, -r, "vhost_set_owner failed");
1456          goto fail;
1457      }
1458  
1459      r = hdev->vhost_ops->vhost_get_features(hdev, &features);
1460      if (r < 0) {
1461          error_setg_errno(errp, -r, "vhost_get_features failed");
1462          goto fail;
1463      }
1464  
1465      limit = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
1466      if (limit < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS &&
1467          memory_devices_memslot_auto_decision_active()) {
1468          error_setg(errp, "some memory device (like virtio-mem)"
1469              " decided how many memory slots to use based on the overall"
1470              " number of memory slots; this vhost backend would further"
1471              " restrict the overall number of memory slots");
1472          error_append_hint(errp, "Try plugging this vhost backend before"
1473              " plugging such memory devices.\n");
1474          r = -EINVAL;
1475          goto fail;
1476      }
1477  
1478      for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
1479          r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
1480          if (r < 0) {
1481              error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i);
1482              goto fail;
1483          }
1484      }
1485  
1486      if (busyloop_timeout) {
1487          for (i = 0; i < hdev->nvqs; ++i) {
1488              r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
1489                                                       busyloop_timeout);
1490              if (r < 0) {
1491                  error_setg_errno(errp, -r, "Failed to set busyloop timeout");
1492                  goto fail_busyloop;
1493              }
1494          }
1495      }
1496  
1497      hdev->features = features;
1498  
1499      hdev->memory_listener = (MemoryListener) {
1500          .name = "vhost",
1501          .begin = vhost_begin,
1502          .commit = vhost_commit,
1503          .region_add = vhost_region_addnop,
1504          .region_nop = vhost_region_addnop,
1505          .log_start = vhost_log_start,
1506          .log_stop = vhost_log_stop,
1507          .log_sync = vhost_log_sync,
1508          .log_global_start = vhost_log_global_start,
1509          .log_global_stop = vhost_log_global_stop,
1510          .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND
1511      };
1512  
1513      hdev->iommu_listener = (MemoryListener) {
1514          .name = "vhost-iommu",
1515          .region_add = vhost_iommu_region_add,
1516          .region_del = vhost_iommu_region_del,
1517      };
1518  
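      /*
       * Migration requires the backend to log dirty pages (VHOST_F_LOG_ALL)
       * and, for shared logs, a working memfd allocation; otherwise register
       * a migration blocker.
       */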
1519      if (hdev->migration_blocker == NULL) {
1520          if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
1521              error_setg(&hdev->migration_blocker,
1522                         "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
1523          } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) {
1524              error_setg(&hdev->migration_blocker,
1525                         "Migration disabled: failed to allocate shared memory");
1526          }
1527      }
1528  
1529      if (hdev->migration_blocker != NULL) {
1530          r = migrate_add_blocker_normal(&hdev->migration_blocker, errp);
1531          if (r < 0) {
1532              goto fail_busyloop;
1533          }
1534      }
1535  
1536      hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
1537      hdev->n_mem_sections = 0;
1538      hdev->mem_sections = NULL;
1539      hdev->log = NULL;
1540      hdev->log_size = 0;
1541      hdev->log_enabled = false;
1542      hdev->started = false;
1543      memory_listener_register(&hdev->memory_listener, &address_space_memory);
1544      QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
1545  
1546      /*
1547       * The memory listener registered above has already updated the
1548       * corresponding used-memslot counter, so the value read here is accurate.
1549       */
1550      if (hdev->vhost_ops->vhost_backend_no_private_memslots &&
1551          hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) {
1552          used = used_shared_memslots;
1553      } else {
1554          used = used_memslots;
1555      }
1556      /*
1557       * We assume that all reserved memslots actually require a real memslot
1558       * in our vhost backend. This might not be true, for example, if the
1559       * memslot would be ROM. If ever relevant, we can optimize for that --
1560       * but we'll need additional information about the reservations.
1561       */
1562      reserved = memory_devices_get_reserved_memslots();
1563      if (used + reserved > limit) {
1564          error_setg(errp, "vhost backend memory slots limit (%d) is less"
1565                     " than current number of used (%d) and reserved (%d)"
1566                     " memory slots for memory devices.", limit, used, reserved);
1567          r = -EINVAL;
1568          goto fail_busyloop;
1569      }
1570  
1571      return 0;
1572  
1573  fail_busyloop:
1574      if (busyloop_timeout) {
1575          while (--i >= 0) {
1576              vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
1577          }
1578      }
1579  fail:
1580      hdev->nvqs = n_initialized_vqs;
1581      vhost_dev_cleanup(hdev);
1582      return r;
1583  }
1584  
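      /*
       * Undo vhost_dev_init().  Also used on its failure path, where
       * hdev->nvqs has been trimmed to the number of virtqueues that were
       * actually initialized.
       */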
1585  void vhost_dev_cleanup(struct vhost_dev *hdev)
1586  {
1587      int i;
1588  
1589      trace_vhost_dev_cleanup(hdev);
1590  
1591      for (i = 0; i < hdev->nvqs; ++i) {
1592          vhost_virtqueue_cleanup(hdev->vqs + i);
1593      }
1594      if (hdev->mem) {
1595          /* These are only safe to undo after a successful init. */
1596          memory_listener_unregister(&hdev->memory_listener);
1597          QLIST_REMOVE(hdev, entry);
1598      }
1599      migrate_del_blocker(&hdev->migration_blocker);
1600      g_free(hdev->mem);
1601      g_free(hdev->mem_sections);
1602      if (hdev->vhost_ops) {
1603          hdev->vhost_ops->vhost_backend_cleanup(hdev);
1604      }
1605      assert(!hdev->log);
1606  
1607      memset(hdev, 0, sizeof(struct vhost_dev));
1608  }
1609  
1610  static void vhost_dev_disable_notifiers_nvqs(struct vhost_dev *hdev,
1611                                               VirtIODevice *vdev,
1612                                               unsigned int nvqs)
1613  {
1614      BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1615      int i, r;
1616  
1617      /*
1618       * Batch all the host notifiers in a single transaction to avoid
1619       * quadratic time complexity in address_space_update_ioeventfds().
1620       */
1621      memory_region_transaction_begin();
1622  
1623      for (i = 0; i < nvqs; ++i) {
1624          r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1625                                           false);
1626          if (r < 0) {
1627              error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
1628          }
1629          assert(r >= 0);
1630      }
1631  
1632      /*
1633       * The transaction expects the ioeventfds to be open when it
1634       * commits. Do it now, before the cleanup loop.
1635       */
1636      memory_region_transaction_commit();
1637  
1638      for (i = 0; i < nvqs; ++i) {
1639          virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
1640      }
1641      virtio_device_release_ioeventfd(vdev);
1642  }
1643  
1644  /* Stop processing guest IO notifications in qemu.
1645   * Start processing them in the vhost backend instead.
1646   */
1647  int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1648  {
1649      BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1650      int i, r;
1651  
1652      /* We will pass the notifiers to the vhost backend; make sure that
1653       * QEMU doesn't interfere.
1654       */
1655      r = virtio_device_grab_ioeventfd(vdev);
1656      if (r < 0) {
1657          error_report("binding does not support host notifiers");
1658          return r;
1659      }
1660  
1661      /*
1662       * Batch all the host notifiers in a single transaction to avoid
1663       * quadratic time complexity in address_space_update_ioeventfds().
1664       */
1665      memory_region_transaction_begin();
1666  
1667      for (i = 0; i < hdev->nvqs; ++i) {
1668          r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1669                                           true);
1670          if (r < 0) {
1671              error_report("vhost VQ %d notifier binding failed: %d", i, -r);
1672              memory_region_transaction_commit();
1673              vhost_dev_disable_notifiers_nvqs(hdev, vdev, i);
1674              return r;
1675          }
1676      }
1677  
1678      memory_region_transaction_commit();
1679  
1680      return 0;
1681  }
1682  
1683  /* Stop processing guest IO notifications in vhost.
1684   * Start processing them in qemu.
1685   * This might actually run the qemu handlers right away,
1686   * so virtio in qemu must be completely set up when this is called.
1687   */
1688  void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1689  {
1690      vhost_dev_disable_notifiers_nvqs(hdev, vdev, hdev->nvqs);
1691  }
1692  
1693  /* Test and clear event pending status.
1694   * Should be called after unmask to avoid losing events.
1695   */
1696  bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
1697  {
1698      struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
1699      assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
1700      return event_notifier_test_and_clear(&vq->masked_notifier);
1701  }
1702  
1703  /* Mask/unmask events from this vq. */
1704  void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
1705                           bool mask)
1706  {
1707      struct VirtQueue *vvq = virtio_get_queue(vdev, n);
1708      int r, index = n - hdev->vq_index;
1709      struct vhost_vring_file file;
1710  
1711      /* should only be called after backend is connected */
1712      assert(hdev->vhost_ops);
1713  
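          /*
           * While masked, call notifications are routed to the local
           * masked_notifier, which vhost_virtqueue_pending() tests and clears;
           * when unmasked, they go straight to the guest notifier.
           */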
1714      if (mask) {
1715          assert(vdev->use_guest_notifier_mask);
1716          file.fd = event_notifier_get_wfd(&hdev->vqs[index].masked_notifier);
1717      } else {
1718          file.fd = event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq));
1719      }
1720  
1721      file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
1722      r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
1723      if (r < 0) {
1724          error_report("vhost_set_vring_call failed %d", -r);
1725      }
1726  }
1727  
1728  bool vhost_config_pending(struct vhost_dev *hdev)
1729  {
1730      assert(hdev->vhost_ops);
1731      if (!hdev->started ||
1732          hdev->vhost_ops->vhost_set_config_call == NULL) {
1733          return false;
1734      }
1735  
1736      EventNotifier *notifier =
1737          &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
1738      return event_notifier_test_and_clear(notifier);
1739  }
1740  
1741  void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask)
1742  {
1743      int fd;
1744      int r;
1745      EventNotifier *notifier =
1746          &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
1747      EventNotifier *config_notifier = &vdev->config_notifier;
1748      assert(hdev->vhost_ops);
1749  
1750      if (!hdev->started ||
1751          hdev->vhost_ops->vhost_set_config_call == NULL) {
1752          return;
1753      }
1754      if (mask) {
1755          assert(vdev->use_guest_notifier_mask);
1756          fd = event_notifier_get_fd(notifier);
1757      } else {
1758          fd = event_notifier_get_fd(config_notifier);
1759      }
1760      r = hdev->vhost_ops->vhost_set_config_call(hdev, fd);
1761      if (r < 0) {
1762          error_report("vhost_set_config_call failed %d", -r);
1763      }
1764  }
1765  
1766  static void vhost_stop_config_intr(struct vhost_dev *dev)
1767  {
1768      int fd = -1;
1769      assert(dev->vhost_ops);
1770      if (dev->vhost_ops->vhost_set_config_call) {
1771          dev->vhost_ops->vhost_set_config_call(dev, fd);
1772      }
1773  }
1774  
1775  static void vhost_start_config_intr(struct vhost_dev *dev)
1776  {
1777      int r;
1778  
1779      assert(dev->vhost_ops);
1780      int fd = event_notifier_get_fd(&dev->vdev->config_notifier);
1781      if (dev->vhost_ops->vhost_set_config_call) {
1782          r = dev->vhost_ops->vhost_set_config_call(dev, fd);
1783          if (!r) {
1784              event_notifier_set(&dev->vdev->config_notifier);
1785          }
1786      }
1787  }
1788  
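      /*
       * Clear every bit listed in @feature_bits that the backend did not
       * advertise in hdev->features; the remaining @features are returned.
       */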
1789  uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
1790                              uint64_t features)
1791  {
1792      const int *bit = feature_bits;
1793      while (*bit != VHOST_INVALID_FEATURE_BIT) {
1794          uint64_t bit_mask = (1ULL << *bit);
1795          if (!(hdev->features & bit_mask)) {
1796              features &= ~bit_mask;
1797          }
1798          bit++;
1799      }
1800      return features;
1801  }
1802  
1803  void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
1804                          uint64_t features)
1805  {
1806      const int *bit = feature_bits;
1807      while (*bit != VHOST_INVALID_FEATURE_BIT) {
1808          uint64_t bit_mask = (1ULL << *bit);
1809          if (features & bit_mask) {
1810              hdev->acked_features |= bit_mask;
1811          }
1812          bit++;
1813      }
1814  }
1815  
1816  int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
1817                           uint32_t config_len, Error **errp)
1818  {
1819      assert(hdev->vhost_ops);
1820  
1821      if (hdev->vhost_ops->vhost_get_config) {
1822          return hdev->vhost_ops->vhost_get_config(hdev, config, config_len,
1823                                                   errp);
1824      }
1825  
1826      error_setg(errp, "vhost_get_config not implemented");
1827      return -ENOSYS;
1828  }
1829  
1830  int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
1831                           uint32_t offset, uint32_t size, uint32_t flags)
1832  {
1833      assert(hdev->vhost_ops);
1834  
1835      if (hdev->vhost_ops->vhost_set_config) {
1836          return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
1837                                                   size, flags);
1838      }
1839  
1840      return -ENOSYS;
1841  }
1842  
1843  void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
1844                                     const VhostDevConfigOps *ops)
1845  {
1846      hdev->config_ops = ops;
1847  }
1848  
1849  void vhost_dev_free_inflight(struct vhost_inflight *inflight)
1850  {
1851      if (inflight && inflight->addr) {
1852          qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
1853          inflight->addr = NULL;
1854          inflight->fd = -1;
1855      }
1856  }
1857  
1858  static int vhost_dev_resize_inflight(struct vhost_inflight *inflight,
1859                                       uint64_t new_size)
1860  {
1861      Error *err = NULL;
1862      int fd = -1;
1863      void *addr = qemu_memfd_alloc("vhost-inflight", new_size,
1864                                    F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
1865                                    &fd, &err);
1866  
1867      if (err) {
1868          error_report_err(err);
1869          return -ENOMEM;
1870      }
1871  
1872      vhost_dev_free_inflight(inflight);
1873      inflight->offset = 0;
1874      inflight->addr = addr;
1875      inflight->fd = fd;
1876      inflight->size = new_size;
1877  
1878      return 0;
1879  }
1880  
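      /*
       * Stream format: be64 buffer size (0 when there is no inflight region),
       * be16 queue size, then the raw inflight buffer.
       */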
1881  void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f)
1882  {
1883      if (inflight->addr) {
1884          qemu_put_be64(f, inflight->size);
1885          qemu_put_be16(f, inflight->queue_size);
1886          qemu_put_buffer(f, inflight->addr, inflight->size);
1887      } else {
1888          qemu_put_be64(f, 0);
1889      }
1890  }
1891  
1892  int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f)
1893  {
1894      uint64_t size;
1895  
1896      size = qemu_get_be64(f);
1897      if (!size) {
1898          return 0;
1899      }
1900  
1901      if (inflight->size != size) {
1902          int ret = vhost_dev_resize_inflight(inflight, size);
1903          if (ret < 0) {
1904              return ret;
1905          }
1906      }
1907      inflight->queue_size = qemu_get_be16(f);
1908  
1909      qemu_get_buffer(f, inflight->addr, size);
1910  
1911      return 0;
1912  }
1913  
1914  int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev)
1915  {
1916      int r;
1917  
1918      if (hdev->vhost_ops->vhost_get_inflight_fd == NULL ||
1919          hdev->vhost_ops->vhost_set_inflight_fd == NULL) {
1920          return 0;
1921      }
1922  
1923      hdev->vdev = vdev;
1924  
1925      r = vhost_dev_set_features(hdev, hdev->log_enabled);
1926      if (r < 0) {
1927          VHOST_OPS_DEBUG(r, "vhost_dev_prepare_inflight failed");
1928          return r;
1929      }
1930  
1931      return 0;
1932  }
1933  
1934  int vhost_dev_set_inflight(struct vhost_dev *dev,
1935                             struct vhost_inflight *inflight)
1936  {
1937      int r;
1938  
1939      if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) {
1940          r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight);
1941          if (r) {
1942              VHOST_OPS_DEBUG(r, "vhost_set_inflight_fd failed");
1943              return r;
1944          }
1945      }
1946  
1947      return 0;
1948  }
1949  
1950  int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
1951                             struct vhost_inflight *inflight)
1952  {
1953      int r;
1954  
1955      if (dev->vhost_ops->vhost_get_inflight_fd) {
1956          r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight);
1957          if (r) {
1958              VHOST_OPS_DEBUG(r, "vhost_get_inflight_fd failed");
1959              return r;
1960          }
1961      }
1962  
1963      return 0;
1964  }
1965  
1966  static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
1967  {
1968      if (!hdev->vhost_ops->vhost_set_vring_enable) {
1969          return 0;
1970      }
1971  
1972      /*
1973       * For vhost-user devices, if VHOST_USER_F_PROTOCOL_FEATURES has not
1974       * been negotiated, the rings start directly in the enabled state, and
1975       * .vhost_set_vring_enable callback will fail since
1976       * VHOST_USER_SET_VRING_ENABLE is not supported.
1977       */
1978      if (hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER &&
1979          !virtio_has_feature(hdev->backend_features,
1980                              VHOST_USER_F_PROTOCOL_FEATURES)) {
1981          return 0;
1982      }
1983  
1984      return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable);
1985  }
1986  
1987  /*
1988   * Host notifiers must be enabled at this point.
1989   *
1990   * If @vrings is true, this function will enable all vrings before starting the
1991   * device. If it is false, the vring initialization is left to be done by the
1992   * caller.
1993   */
1994  int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
1995  {
1996      int i, r;
1997  
1998      /* should only be called after backend is connected */
1999      assert(hdev->vhost_ops);
2000  
2001      trace_vhost_dev_start(hdev, vdev->name, vrings);
2002  
2003      vdev->vhost_started = true;
2004      hdev->started = true;
2005      hdev->vdev = vdev;
2006  
2007      r = vhost_dev_set_features(hdev, hdev->log_enabled);
2008      if (r < 0) {
2009          goto fail_features;
2010      }
2011  
2012      if (vhost_dev_has_iommu(hdev)) {
2013          memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
2014      }
2015  
2016      r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
2017      if (r < 0) {
2018          VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
2019          goto fail_mem;
2020      }
2021      for (i = 0; i < hdev->nvqs; ++i) {
2022          r = vhost_virtqueue_start(hdev,
2023                                    vdev,
2024                                    hdev->vqs + i,
2025                                    hdev->vq_index + i);
2026          if (r < 0) {
2027              goto fail_vq;
2028          }
2029      }
2030  
2031      r = event_notifier_init(
2032          &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier, 0);
2033      if (r < 0) {
2034          VHOST_OPS_DEBUG(r, "event_notifier_init failed");
2035          goto fail_vq;
2036      }
2037      event_notifier_test_and_clear(
2038          &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
2039      if (!vdev->use_guest_notifier_mask) {
2040          vhost_config_mask(hdev, vdev, true);
2041      }
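          /*
           * Dirty logging is enabled (e.g. for migration): allocate the dirty
           * log and hand its base address to the backend.
           */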
2042      if (hdev->log_enabled) {
2043          uint64_t log_base;
2044  
2045          hdev->log_size = vhost_get_log_size(hdev);
2046          hdev->log = vhost_log_get(hdev->log_size,
2047                                    vhost_dev_log_is_shared(hdev));
2048          log_base = (uintptr_t)hdev->log->log;
2049          r = hdev->vhost_ops->vhost_set_log_base(hdev,
2050                                                  hdev->log_size ? log_base : 0,
2051                                                  hdev->log);
2052          if (r < 0) {
2053              VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
2054              goto fail_log;
2055          }
2056      }
2057      if (vrings) {
2058          r = vhost_dev_set_vring_enable(hdev, true);
2059          if (r) {
2060              goto fail_log;
2061          }
2062      }
2063      if (hdev->vhost_ops->vhost_dev_start) {
2064          r = hdev->vhost_ops->vhost_dev_start(hdev, true);
2065          if (r) {
2066              goto fail_start;
2067          }
2068      }
2069      if (vhost_dev_has_iommu(hdev) &&
2070          hdev->vhost_ops->vhost_set_iotlb_callback) {
2071          hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
2072  
2073          /* Update used ring information for IOTLB to work correctly;
2074           * the vhost-kernel code requires this. */
2075          for (i = 0; i < hdev->nvqs; ++i) {
2076              struct vhost_virtqueue *vq = hdev->vqs + i;
2077              vhost_device_iotlb_miss(hdev, vq->used_phys, true);
2078          }
2079      }
2080      vhost_start_config_intr(hdev);
2081      return 0;
2082  fail_start:
2083      if (vrings) {
2084          vhost_dev_set_vring_enable(hdev, false);
2085      }
2086  fail_log:
2087      vhost_log_put(hdev, false);
2088  fail_vq:
2089      while (--i >= 0) {
2090          vhost_virtqueue_stop(hdev,
2091                               vdev,
2092                               hdev->vqs + i,
2093                               hdev->vq_index + i);
2094      }
2095  
2096  fail_mem:
2097      if (vhost_dev_has_iommu(hdev)) {
2098          memory_listener_unregister(&hdev->iommu_listener);
2099      }
2100  fail_features:
2101      vdev->vhost_started = false;
2102      hdev->started = false;
2103      return r;
2104  }
2105  
2106  /* Host notifiers must be enabled at this point. */
2107  void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
2108  {
2109      int i;
2110  
2111      /* should only be called after backend is connected */
2112      assert(hdev->vhost_ops);
2113      event_notifier_test_and_clear(
2114          &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
2115      event_notifier_test_and_clear(&vdev->config_notifier);
2116      event_notifier_cleanup(
2117          &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
2118  
2119      trace_vhost_dev_stop(hdev, vdev->name, vrings);
2120  
2121      if (hdev->vhost_ops->vhost_dev_start) {
2122          hdev->vhost_ops->vhost_dev_start(hdev, false);
2123      }
2124      if (vrings) {
2125          vhost_dev_set_vring_enable(hdev, false);
2126      }
2127      for (i = 0; i < hdev->nvqs; ++i) {
2128          vhost_virtqueue_stop(hdev,
2129                               vdev,
2130                               hdev->vqs + i,
2131                               hdev->vq_index + i);
2132      }
2133      if (hdev->vhost_ops->vhost_reset_status) {
2134          hdev->vhost_ops->vhost_reset_status(hdev);
2135      }
2136  
2137      if (vhost_dev_has_iommu(hdev)) {
2138          if (hdev->vhost_ops->vhost_set_iotlb_callback) {
2139              hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
2140          }
2141          memory_listener_unregister(&hdev->iommu_listener);
2142      }
2143      vhost_stop_config_intr(hdev);
2144      vhost_log_put(hdev, true);
2145      hdev->started = false;
2146      vdev->vhost_started = false;
2147      hdev->vdev = NULL;
2148  }
2149  
2150  int vhost_net_set_backend(struct vhost_dev *hdev,
2151                            struct vhost_vring_file *file)
2152  {
2153      if (hdev->vhost_ops->vhost_net_set_backend) {
2154          return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
2155      }
2156  
2157      return -ENOSYS;
2158  }
2159  
2160  int vhost_reset_device(struct vhost_dev *hdev)
2161  {
2162      if (hdev->vhost_ops->vhost_reset_device) {
2163          return hdev->vhost_ops->vhost_reset_device(hdev);
2164      }
2165  
2166      return -ENOSYS;
2167  }
2168  
2169  bool vhost_supports_device_state(struct vhost_dev *dev)
2170  {
2171      if (dev->vhost_ops->vhost_supports_device_state) {
2172          return dev->vhost_ops->vhost_supports_device_state(dev);
2173      }
2174  
2175      return false;
2176  }
2177  
2178  int vhost_set_device_state_fd(struct vhost_dev *dev,
2179                                VhostDeviceStateDirection direction,
2180                                VhostDeviceStatePhase phase,
2181                                int fd,
2182                                int *reply_fd,
2183                                Error **errp)
2184  {
2185      if (dev->vhost_ops->vhost_set_device_state_fd) {
2186          return dev->vhost_ops->vhost_set_device_state_fd(dev, direction, phase,
2187                                                           fd, reply_fd, errp);
2188      }
2189  
2190      error_setg(errp,
2191                 "vhost transport does not support migration state transfer");
2192      return -ENOSYS;
2193  }
2194  
2195  int vhost_check_device_state(struct vhost_dev *dev, Error **errp)
2196  {
2197      if (dev->vhost_ops->vhost_check_device_state) {
2198          return dev->vhost_ops->vhost_check_device_state(dev, errp);
2199      }
2200  
2201      error_setg(errp,
2202                 "vhost transport does not support migration state transfer");
2203      return -ENOSYS;
2204  }
2205  
2206  int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
2207  {
2208      ERRP_GUARD();
2209      /* Maximum chunk size in which to transfer the state */
2210      const size_t chunk_size = 1 * 1024 * 1024;
2211      g_autofree void *transfer_buf = NULL;
2212      g_autoptr(GError) g_err = NULL;
2213      int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
2214      int ret;
2215  
2216      /* [0] for reading (our end), [1] for writing (back-end's end) */
2217      if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
2218          error_setg(errp, "Failed to set up state transfer pipe: %s",
2219                     g_err->message);
2220          ret = -EINVAL;
2221          goto fail;
2222      }
2223  
2224      read_fd = pipe_fds[0];
2225      write_fd = pipe_fds[1];
2226  
2227      /*
2228       * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
2229       * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for
2230       * vhost-user, so just check that it is stopped at all.
2231       */
2232      assert(!dev->started);
2233  
2234      /* Transfer ownership of write_fd to the back-end */
2235      ret = vhost_set_device_state_fd(dev,
2236                                      VHOST_TRANSFER_STATE_DIRECTION_SAVE,
2237                                      VHOST_TRANSFER_STATE_PHASE_STOPPED,
2238                                      write_fd,
2239                                      &reply_fd,
2240                                      errp);
2241      if (ret < 0) {
2242          error_prepend(errp, "Failed to initiate state transfer: ");
2243          goto fail;
2244      }
2245  
2246      /* If the back-end wishes to use a different pipe, switch over */
2247      if (reply_fd >= 0) {
2248          close(read_fd);
2249          read_fd = reply_fd;
2250      }
2251  
2252      transfer_buf = g_malloc(chunk_size);
2253  
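          /*
           * Each chunk is written to the migration stream with a be32 length
           * prefix; a zero-length chunk terminates the state.
           */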
2254      while (true) {
2255          ssize_t read_ret;
2256  
2257          read_ret = RETRY_ON_EINTR(read(read_fd, transfer_buf, chunk_size));
2258          if (read_ret < 0) {
2259              ret = -errno;
2260              error_setg_errno(errp, -ret, "Failed to receive state");
2261              goto fail;
2262          }
2263  
2264          assert(read_ret <= chunk_size);
2265          qemu_put_be32(f, read_ret);
2266  
2267          if (read_ret == 0) {
2268              /* EOF */
2269              break;
2270          }
2271  
2272          qemu_put_buffer(f, transfer_buf, read_ret);
2273      }
2274  
2275      /*
2276       * Back-end will not really care, but be clean and close our end of the pipe
2277       * before inquiring the back-end about whether transfer was successful
2278       */
2279      close(read_fd);
2280      read_fd = -1;
2281  
2282      /* Also, verify that the device is still stopped */
2283      assert(!dev->started);
2284  
2285      ret = vhost_check_device_state(dev, errp);
2286      if (ret < 0) {
2287          goto fail;
2288      }
2289  
2290      ret = 0;
2291  fail:
2292      if (read_fd >= 0) {
2293          close(read_fd);
2294      }
2295  
2296      return ret;
2297  }
2298  
2299  int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
2300  {
2301      ERRP_GUARD();
2302      size_t transfer_buf_size = 0;
2303      g_autofree void *transfer_buf = NULL;
2304      g_autoptr(GError) g_err = NULL;
2305      int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
2306      int ret;
2307  
2308      /* [0] for reading (back-end's end), [1] for writing (our end) */
2309      if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
2310          error_setg(errp, "Failed to set up state transfer pipe: %s",
2311                     g_err->message);
2312          ret = -EINVAL;
2313          goto fail;
2314      }
2315  
2316      read_fd = pipe_fds[0];
2317      write_fd = pipe_fds[1];
2318  
2319      /*
2320       * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
2321       * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for
2322       * vhost-user, so just check that it is stopped at all.
2323       */
2324      assert(!dev->started);
2325  
2326      /* Transfer ownership of read_fd to the back-end */
2327      ret = vhost_set_device_state_fd(dev,
2328                                      VHOST_TRANSFER_STATE_DIRECTION_LOAD,
2329                                      VHOST_TRANSFER_STATE_PHASE_STOPPED,
2330                                      read_fd,
2331                                      &reply_fd,
2332                                      errp);
2333      if (ret < 0) {
2334          error_prepend(errp, "Failed to initiate state transfer: ");
2335          goto fail;
2336      }
2337  
2338      /* If the back-end wishes to use a different pipe, switch over */
2339      if (reply_fd >= 0) {
2340          close(write_fd);
2341          write_fd = reply_fd;
2342      }
2343  
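          /*
           * Read the length-prefixed chunks written by
           * vhost_save_backend_state() and forward them to the back-end over
           * the pipe; a zero length ends the stream.
           */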
2344      while (true) {
2345          size_t this_chunk_size = qemu_get_be32(f);
2346          ssize_t write_ret;
2347          const uint8_t *transfer_pointer;
2348  
2349          if (this_chunk_size == 0) {
2350              /* End of state */
2351              break;
2352          }
2353  
2354          if (transfer_buf_size < this_chunk_size) {
2355              transfer_buf = g_realloc(transfer_buf, this_chunk_size);
2356              transfer_buf_size = this_chunk_size;
2357          }
2358  
2359          if (qemu_get_buffer(f, transfer_buf, this_chunk_size) <
2360                  this_chunk_size)
2361          {
2362              error_setg(errp, "Failed to read state");
2363              ret = -EINVAL;
2364              goto fail;
2365          }
2366  
2367          transfer_pointer = transfer_buf;
2368          while (this_chunk_size > 0) {
2369              write_ret = RETRY_ON_EINTR(
2370                  write(write_fd, transfer_pointer, this_chunk_size)
2371              );
2372              if (write_ret < 0) {
2373                  ret = -errno;
2374                  error_setg_errno(errp, -ret, "Failed to send state");
2375                  goto fail;
2376              } else if (write_ret == 0) {
2377                  error_setg(errp, "Failed to send state: Connection is closed");
2378                  ret = -ECONNRESET;
2379                  goto fail;
2380              }
2381  
2382              assert(write_ret <= this_chunk_size);
2383              this_chunk_size -= write_ret;
2384              transfer_pointer += write_ret;
2385          }
2386      }
2387  
2388      /*
2389       * Close our end, thus ending transfer, before inquiring the back-end about
2390       * whether transfer was successful
2391       */
2392      close(write_fd);
2393      write_fd = -1;
2394  
2395      /* Also, verify that the device is still stopped */
2396      assert(!dev->started);
2397  
2398      ret = vhost_check_device_state(dev, errp);
2399      if (ret < 0) {
2400          goto fail;
2401      }
2402  
2403      ret = 0;
2404  fail:
2405      if (write_fd >= 0) {
2406          close(write_fd);
2407      }
2408  
2409      return ret;
2410  }
2411