xref: /openbmc/qemu/accel/kvm/kvm-all.c (revision 474487611b7d14be6bd864e8b2a2a642bad9f720)
1  /*
2   * QEMU KVM support
3   *
4   * Copyright IBM, Corp. 2008
5   *           Red Hat, Inc. 2008
6   *
7   * Authors:
8   *  Anthony Liguori   <aliguori@us.ibm.com>
9   *  Glauber Costa     <gcosta@redhat.com>
10   *
11   * This work is licensed under the terms of the GNU GPL, version 2 or later.
12   * See the COPYING file in the top-level directory.
13   *
14   */
15  
16  #include "qemu/osdep.h"
17  #include <sys/ioctl.h>
18  #include <poll.h>
19  
20  #include <linux/kvm.h>
21  
22  #include "qemu/atomic.h"
23  #include "qemu/option.h"
24  #include "qemu/config-file.h"
25  #include "qemu/error-report.h"
26  #include "qapi/error.h"
27  #include "hw/pci/msi.h"
28  #include "hw/pci/msix.h"
29  #include "hw/s390x/adapter.h"
30  #include "gdbstub/enums.h"
31  #include "sysemu/kvm_int.h"
32  #include "sysemu/runstate.h"
33  #include "sysemu/cpus.h"
34  #include "sysemu/accel-blocker.h"
35  #include "qemu/bswap.h"
36  #include "exec/memory.h"
37  #include "exec/ram_addr.h"
38  #include "qemu/event_notifier.h"
39  #include "qemu/main-loop.h"
40  #include "trace.h"
41  #include "hw/irq.h"
42  #include "qapi/visitor.h"
43  #include "qapi/qapi-types-common.h"
44  #include "qapi/qapi-visit-common.h"
45  #include "sysemu/reset.h"
46  #include "qemu/guest-random.h"
47  #include "sysemu/hw_accel.h"
48  #include "kvm-cpus.h"
49  #include "sysemu/dirtylimit.h"
50  #include "qemu/range.h"
51  
52  #include "hw/boards.h"
53  #include "sysemu/stats.h"
54  
55  /* This check must be after config-host.h is included */
56  #ifdef CONFIG_EVENTFD
57  #include <sys/eventfd.h>
58  #endif
59  
60  /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
61   * need to use the real host PAGE_SIZE, as that's what KVM will use.
62   */
63  #ifdef PAGE_SIZE
64  #undef PAGE_SIZE
65  #endif
66  #define PAGE_SIZE qemu_real_host_page_size()
67  
68  #ifndef KVM_GUESTDBG_BLOCKIRQ
69  #define KVM_GUESTDBG_BLOCKIRQ 0
70  #endif
71  
72  /* Default number of memslots to allocate when the VM starts */
73  #define  KVM_MEMSLOTS_NR_ALLOC_DEFAULT                      16
74  
75  struct KVMParkedVcpu {
76      unsigned long vcpu_id;
77      int kvm_fd;
78      QLIST_ENTRY(KVMParkedVcpu) node;
79  };
80  
81  KVMState *kvm_state;
82  bool kvm_kernel_irqchip;
83  bool kvm_split_irqchip;
84  bool kvm_async_interrupts_allowed;
85  bool kvm_halt_in_kernel_allowed;
86  bool kvm_resamplefds_allowed;
87  bool kvm_msi_via_irqfd_allowed;
88  bool kvm_gsi_routing_allowed;
89  bool kvm_gsi_direct_mapping;
90  bool kvm_allowed;
91  bool kvm_readonly_mem_allowed;
92  bool kvm_vm_attributes_allowed;
93  bool kvm_msi_use_devid;
94  static bool kvm_has_guest_debug;
95  static int kvm_sstep_flags;
96  static bool kvm_immediate_exit;
97  static uint64_t kvm_supported_memory_attributes;
98  static bool kvm_guest_memfd_supported;
99  static hwaddr kvm_max_slot_size = ~0;
100  
101  static const KVMCapabilityInfo kvm_required_capabilites[] = {
102      KVM_CAP_INFO(USER_MEMORY),
103      KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
104      KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
105      KVM_CAP_INFO(INTERNAL_ERROR_DATA),
106      KVM_CAP_INFO(IOEVENTFD),
107      KVM_CAP_INFO(IOEVENTFD_ANY_LENGTH),
108      KVM_CAP_LAST_INFO
109  };
110  
111  static NotifierList kvm_irqchip_change_notifiers =
112      NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
113  
114  struct KVMResampleFd {
115      int gsi;
116      EventNotifier *resample_event;
117      QLIST_ENTRY(KVMResampleFd) node;
118  };
119  typedef struct KVMResampleFd KVMResampleFd;
120  
121  /*
122   * Only used with split irqchip where we need to do the resample fd
123   * kick for the kernel from userspace.
124   */
125  static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
126      QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
127  
128  static QemuMutex kml_slots_lock;
129  
130  #define kvm_slots_lock()    qemu_mutex_lock(&kml_slots_lock)
131  #define kvm_slots_unlock()  qemu_mutex_unlock(&kml_slots_lock)
132  
133  static void kvm_slot_init_dirty_bitmap(KVMSlot *mem);
134  
135  static inline void kvm_resample_fd_remove(int gsi)
136  {
137      KVMResampleFd *rfd;
138  
139      QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
140          if (rfd->gsi == gsi) {
141              QLIST_REMOVE(rfd, node);
142              g_free(rfd);
143              break;
144          }
145      }
146  }
147  
148  static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
149  {
150      KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);
151  
152      rfd->gsi = gsi;
153      rfd->resample_event = event;
154  
155      QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
156  }
157  
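/*
 * Signal the resample EventNotifier registered for @gsi, if any.  Only
 * used with split irqchip, where userspace (rather than the kernel) has
 * to perform the resample kick when the guest acknowledges the interrupt.
 */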
158  void kvm_resample_fd_notify(int gsi)
159  {
160      KVMResampleFd *rfd;
161  
162      QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
163          if (rfd->gsi == gsi) {
164              event_notifier_set(rfd->resample_event);
165              trace_kvm_resample_fd_notify(gsi);
166              return;
167          }
168      }
169  }
170  
171  /**
172   * kvm_slots_grow(): Grow the slots[] array in the KVMMemoryListener
173   *
174   * @kml: The KVMMemoryListener* whose slots[] array to grow
175   * @nr_slots_new: The new size of the slots[] array
176   *
177   * Returns: True if the array grows larger, false otherwise.
178   */
179  static bool kvm_slots_grow(KVMMemoryListener *kml, unsigned int nr_slots_new)
180  {
181      unsigned int i, cur = kml->nr_slots_allocated;
182      KVMSlot *slots;
183  
184      if (nr_slots_new > kvm_state->nr_slots) {
185          nr_slots_new = kvm_state->nr_slots;
186      }
187  
188      if (cur >= nr_slots_new) {
189          /* Big enough, no need to grow, or we reached max */
190          return false;
191      }
192  
193      if (cur == 0) {
194          slots = g_new0(KVMSlot, nr_slots_new);
195      } else {
196          assert(kml->slots);
197          slots = g_renew(KVMSlot, kml->slots, nr_slots_new);
198          /*
199           * g_renew() doesn't initialize the extended buffer, but KVM
200           * memslots require their fields (e.g. pointers, the memory_size
201           * field) to be zero-initialized.
202           */
203          memset(&slots[cur], 0x0, sizeof(slots[0]) * (nr_slots_new - cur));
204      }
205  
206      for (i = cur; i < nr_slots_new; i++) {
207          slots[i].slot = i;
208      }
209  
210      kml->slots = slots;
211      kml->nr_slots_allocated = nr_slots_new;
212      trace_kvm_slots_grow(cur, nr_slots_new);
213  
214      return true;
215  }
216  
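/* Double the slots[] array, capped at the maximum number of KVM memslots. */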
217  static bool kvm_slots_double(KVMMemoryListener *kml)
218  {
219      return kvm_slots_grow(kml, kml->nr_slots_allocated * 2);
220  }
221  
222  unsigned int kvm_get_max_memslots(void)
223  {
224      KVMState *s = KVM_STATE(current_accel());
225  
226      return s->nr_slots;
227  }
228  
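/*
 * Return the number of memslots still free, based on the address space
 * that currently uses the most slots.
 */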
229  unsigned int kvm_get_free_memslots(void)
230  {
231      unsigned int used_slots = 0;
232      KVMState *s = kvm_state;
233      int i;
234  
235      kvm_slots_lock();
236      for (i = 0; i < s->nr_as; i++) {
237          if (!s->as[i].ml) {
238              continue;
239          }
240          used_slots = MAX(used_slots, s->as[i].ml->nr_used_slots);
241      }
242      kvm_slots_unlock();
243  
244      return s->nr_slots - used_slots;
245  }
246  
247  /* Called with KVMMemoryListener.slots_lock held */
248  static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
249  {
250      unsigned int n;
251      int i;
252  
253      for (i = 0; i < kml->nr_slots_allocated; i++) {
254          if (kml->slots[i].memory_size == 0) {
255              return &kml->slots[i];
256          }
257      }
258  
259      /*
260       * If no free slots, try to grow first by doubling.  Cache the old size
261       * here to avoid another round of search: if the grow succeeded, it
262       * means slots[] now must have the existing "n" slots occupied,
263       * followed by one or more free slots starting from slots[n].
264       */
265      n = kml->nr_slots_allocated;
266      if (kvm_slots_double(kml)) {
267          return &kml->slots[n];
268      }
269  
270      return NULL;
271  }
272  
273  /* Called with KVMMemoryListener.slots_lock held */
274  static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
275  {
276      KVMSlot *slot = kvm_get_free_slot(kml);
277  
278      if (slot) {
279          return slot;
280      }
281  
282      fprintf(stderr, "%s: no free slot available\n", __func__);
283      abort();
284  }
285  
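/*
 * Find the slot that exactly matches [start_addr, start_addr + size), or
 * NULL if there is none.  Called with the slots lock held.
 */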
286  static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
287                                           hwaddr start_addr,
288                                           hwaddr size)
289  {
290      int i;
291  
292      for (i = 0; i < kml->nr_slots_allocated; i++) {
293          KVMSlot *mem = &kml->slots[i];
294  
295          if (start_addr == mem->start_addr && size == mem->memory_size) {
296              return mem;
297          }
298      }
299  
300      return NULL;
301  }
302  
303  /*
304   * Calculate and align the start address and the size of the section.
305   * Return the size. If the size is 0, the aligned section is empty.
306   */
307  static hwaddr kvm_align_section(MemoryRegionSection *section,
308                                  hwaddr *start)
309  {
310      hwaddr size = int128_get64(section->size);
311      hwaddr delta, aligned;
312  
313      /* KVM works in page-size chunks, but the function may be called
314         with a sub-page size and an unaligned start address. Round the
315         start address up and truncate the size down to page boundaries. */
316      aligned = ROUND_UP(section->offset_within_address_space,
317                         qemu_real_host_page_size());
318      delta = aligned - section->offset_within_address_space;
319      *start = aligned;
320      if (delta > size) {
321          return 0;
322      }
323  
324      return (size - delta) & qemu_real_host_page_mask();
325  }
326  
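/*
 * Translate a host virtual address within a registered memslot back to the
 * guest physical address.  Returns 1 and fills @phys_addr on success, or 0
 * if the address is not covered by any slot.
 */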
327  int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
328                                         hwaddr *phys_addr)
329  {
330      KVMMemoryListener *kml = &s->memory_listener;
331      int i, ret = 0;
332  
333      kvm_slots_lock();
334      for (i = 0; i < kml->nr_slots_allocated; i++) {
335          KVMSlot *mem = &kml->slots[i];
336  
337          if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
338              *phys_addr = mem->start_addr + (ram - mem->ram);
339              ret = 1;
340              break;
341          }
342      }
343      kvm_slots_unlock();
344  
345      return ret;
346  }
347  
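/*
 * Push one slot to the kernel via KVM_SET_USER_MEMORY_REGION(2).  When an
 * existing slot toggles KVM_MEM_READONLY, the slot is first deleted (by
 * setting its size to 0) and then re-created, as required since KVM commit
 * 75d61fbc.
 */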
348  static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
349  {
350      KVMState *s = kvm_state;
351      struct kvm_userspace_memory_region2 mem;
352      int ret;
353  
354      mem.slot = slot->slot | (kml->as_id << 16);
355      mem.guest_phys_addr = slot->start_addr;
356      mem.userspace_addr = (unsigned long)slot->ram;
357      mem.flags = slot->flags;
358      mem.guest_memfd = slot->guest_memfd;
359      mem.guest_memfd_offset = slot->guest_memfd_offset;
360  
361      if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
362          /* Set the slot size to 0 before setting the slot to the desired
363           * value. This is needed based on KVM commit 75d61fbc. */
364          mem.memory_size = 0;
365  
366          if (kvm_guest_memfd_supported) {
367              ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, &mem);
368          } else {
369              ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
370          }
371          if (ret < 0) {
372              goto err;
373          }
374      }
375      mem.memory_size = slot->memory_size;
376      if (kvm_guest_memfd_supported) {
377          ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, &mem);
378      } else {
379          ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
380      }
381      slot->old_flags = mem.flags;
382  err:
383      trace_kvm_set_user_memory(mem.slot >> 16, (uint16_t)mem.slot, mem.flags,
384                                mem.guest_phys_addr, mem.memory_size,
385                                mem.userspace_addr, mem.guest_memfd,
386                                mem.guest_memfd_offset, ret);
387      if (ret < 0) {
388          if (kvm_guest_memfd_supported) {
389                  error_report("%s: KVM_SET_USER_MEMORY_REGION2 failed, slot=%d,"
390                          " start=0x%" PRIx64 ", size=0x%" PRIx64 ","
391                          " flags=0x%" PRIx32 ", guest_memfd=%" PRId32 ","
392                          " guest_memfd_offset=0x%" PRIx64 ": %s",
393                          __func__, mem.slot, slot->start_addr,
394                          (uint64_t)mem.memory_size, mem.flags,
395                          mem.guest_memfd, (uint64_t)mem.guest_memfd_offset,
396                          strerror(errno));
397          } else {
398                  error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
399                              " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
400                              __func__, mem.slot, slot->start_addr,
401                              (uint64_t)mem.memory_size, strerror(errno));
402          }
403      }
404      return ret;
405  }
406  
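/*
 * Keep the vCPU fd on a parked list instead of closing it, so that a later
 * kvm_create_vcpu() for the same vcpu_id can reuse it (a KVM vCPU cannot be
 * individually destroyed and re-created).
 */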
407  void kvm_park_vcpu(CPUState *cpu)
408  {
409      struct KVMParkedVcpu *vcpu;
410  
411      trace_kvm_park_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
412  
413      vcpu = g_malloc0(sizeof(*vcpu));
414      vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
415      vcpu->kvm_fd = cpu->kvm_fd;
416      QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
417  }
418  
419  int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id)
420  {
421      struct KVMParkedVcpu *cpu;
422      int kvm_fd = -ENOENT;
423  
424      QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
425          if (cpu->vcpu_id == vcpu_id) {
426              QLIST_REMOVE(cpu, node);
427              kvm_fd = cpu->kvm_fd;
428              g_free(cpu);
429              break;
430          }
431      }
432  
433      trace_kvm_unpark_vcpu(vcpu_id, kvm_fd > 0 ? "unparked" : "!found parked");
434  
435      return kvm_fd;
436  }
437  
438  int kvm_create_vcpu(CPUState *cpu)
439  {
440      unsigned long vcpu_id = kvm_arch_vcpu_id(cpu);
441      KVMState *s = kvm_state;
442      int kvm_fd;
443  
444      /* check if the KVM vCPU already exists but is parked */
445      kvm_fd = kvm_unpark_vcpu(s, vcpu_id);
446      if (kvm_fd < 0) {
447          /* vCPU not parked: create a new KVM vCPU */
448          kvm_fd = kvm_vm_ioctl(s, KVM_CREATE_VCPU, vcpu_id);
449          if (kvm_fd < 0) {
450              error_report("KVM_CREATE_VCPU IOCTL failed for vCPU %lu", vcpu_id);
451              return kvm_fd;
452          }
453      }
454  
455      cpu->kvm_fd = kvm_fd;
456      cpu->kvm_state = s;
457      cpu->vcpu_dirty = true;
458      cpu->dirty_pages = 0;
459      cpu->throttle_us_per_full = 0;
460  
461      trace_kvm_create_vcpu(cpu->cpu_index, vcpu_id, kvm_fd);
462  
463      return 0;
464  }
465  
466  int kvm_create_and_park_vcpu(CPUState *cpu)
467  {
468      int ret = 0;
469  
470      ret = kvm_create_vcpu(cpu);
471      if (!ret) {
472          kvm_park_vcpu(cpu);
473      }
474  
475      return ret;
476  }
477  
478  static int do_kvm_destroy_vcpu(CPUState *cpu)
479  {
480      KVMState *s = kvm_state;
481      long mmap_size;
482      int ret = 0;
483  
484      trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
485  
486      ret = kvm_arch_destroy_vcpu(cpu);
487      if (ret < 0) {
488          goto err;
489      }
490  
491      mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
492      if (mmap_size < 0) {
493          ret = mmap_size;
494          trace_kvm_failed_get_vcpu_mmap_size();
495          goto err;
496      }
497  
498      ret = munmap(cpu->kvm_run, mmap_size);
499      if (ret < 0) {
500          goto err;
501      }
502  
503      if (cpu->kvm_dirty_gfns) {
504          ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes);
505          if (ret < 0) {
506              goto err;
507          }
508      }
509  
510      kvm_park_vcpu(cpu);
511  err:
512      return ret;
513  }
514  
515  void kvm_destroy_vcpu(CPUState *cpu)
516  {
517      if (do_kvm_destroy_vcpu(cpu) < 0) {
518          error_report("kvm_destroy_vcpu failed");
519          exit(EXIT_FAILURE);
520      }
521  }
522  
523  int kvm_init_vcpu(CPUState *cpu, Error **errp)
524  {
525      KVMState *s = kvm_state;
526      long mmap_size;
527      int ret;
528  
529      trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
530  
531      ret = kvm_create_vcpu(cpu);
532      if (ret < 0) {
533          error_setg_errno(errp, -ret,
534                           "kvm_init_vcpu: kvm_create_vcpu failed (%lu)",
535                           kvm_arch_vcpu_id(cpu));
536          goto err;
537      }
538  
539      mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
540      if (mmap_size < 0) {
541          ret = mmap_size;
542          error_setg_errno(errp, -mmap_size,
543                           "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed");
544          goto err;
545      }
546  
547      cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
548                          cpu->kvm_fd, 0);
549      if (cpu->kvm_run == MAP_FAILED) {
550          ret = -errno;
551          error_setg_errno(errp, ret,
552                           "kvm_init_vcpu: mmap'ing vcpu state failed (%lu)",
553                           kvm_arch_vcpu_id(cpu));
554          goto err;
555      }
556  
557      if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
558          s->coalesced_mmio_ring =
559              (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
560      }
561  
562      if (s->kvm_dirty_ring_size) {
563          /* Use MAP_SHARED to share pages with the kernel */
564          cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_bytes,
565                                     PROT_READ | PROT_WRITE, MAP_SHARED,
566                                     cpu->kvm_fd,
567                                     PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
568          if (cpu->kvm_dirty_gfns == MAP_FAILED) {
569              ret = -errno;
570              goto err;
571          }
572      }
573  
574      ret = kvm_arch_init_vcpu(cpu);
575      if (ret < 0) {
576          error_setg_errno(errp, -ret,
577                           "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)",
578                           kvm_arch_vcpu_id(cpu));
579      }
580      cpu->kvm_vcpu_stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL);
581  
582  err:
583      return ret;
584  }
585  
586  /*
587   * dirty pages logging control
588   */
589  
590  static int kvm_mem_flags(MemoryRegion *mr)
591  {
592      bool readonly = mr->readonly || memory_region_is_romd(mr);
593      int flags = 0;
594  
595      if (memory_region_get_dirty_log_mask(mr) != 0) {
596          flags |= KVM_MEM_LOG_DIRTY_PAGES;
597      }
598      if (readonly && kvm_readonly_mem_allowed) {
599          flags |= KVM_MEM_READONLY;
600      }
601      if (memory_region_has_guest_memfd(mr)) {
602          assert(kvm_guest_memfd_supported);
603          flags |= KVM_MEM_GUEST_MEMFD;
604      }
605      return flags;
606  }
607  
608  /* Called with KVMMemoryListener.slots_lock held */
609  static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
610                                   MemoryRegion *mr)
611  {
612      mem->flags = kvm_mem_flags(mr);
613  
614      /* If nothing changed effectively, no need to issue ioctl */
615      if (mem->flags == mem->old_flags) {
616          return 0;
617      }
618  
619      kvm_slot_init_dirty_bitmap(mem);
620      return kvm_set_user_memory_region(kml, mem, false);
621  }
622  
623  static int kvm_section_update_flags(KVMMemoryListener *kml,
624                                      MemoryRegionSection *section)
625  {
626      hwaddr start_addr, size, slot_size;
627      KVMSlot *mem;
628      int ret = 0;
629  
630      size = kvm_align_section(section, &start_addr);
631      if (!size) {
632          return 0;
633      }
634  
635      kvm_slots_lock();
636  
637      while (size && !ret) {
638          slot_size = MIN(kvm_max_slot_size, size);
639          mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
640          if (!mem) {
641              /* We don't have a slot if we want to trap every access. */
642              goto out;
643          }
644  
645          ret = kvm_slot_update_flags(kml, mem, section->mr);
646          start_addr += slot_size;
647          size -= slot_size;
648      }
649  
650  out:
651      kvm_slots_unlock();
652      return ret;
653  }
654  
655  static void kvm_log_start(MemoryListener *listener,
656                            MemoryRegionSection *section,
657                            int old, int new)
658  {
659      KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
660      int r;
661  
662      if (old != 0) {
663          return;
664      }
665  
666      r = kvm_section_update_flags(kml, section);
667      if (r < 0) {
668          abort();
669      }
670  }
671  
672  static void kvm_log_stop(MemoryListener *listener,
673                            MemoryRegionSection *section,
674                            int old, int new)
675  {
676      KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
677      int r;
678  
679      if (new != 0) {
680          return;
681      }
682  
683      r = kvm_section_update_flags(kml, section);
684      if (r < 0) {
685          abort();
686      }
687  }
688  
689  /* Get KVM's dirty page bitmap and update QEMU's. */
690  static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
691  {
692      ram_addr_t start = slot->ram_start_offset;
693      ram_addr_t pages = slot->memory_size / qemu_real_host_page_size();
694  
695      cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
696  }
697  
698  static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
699  {
700      memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
701  }
702  
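/* Round @x up to a multiple of @y; @y must be a power of two. */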
703  #define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
704  
705  /* Allocate the dirty bitmap for a slot  */
706  static void kvm_slot_init_dirty_bitmap(KVMSlot *mem)
707  {
708      if (!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) || mem->dirty_bmap) {
709          return;
710      }
711  
712      /*
713       * XXX bad kernel interface alert
714       * For dirty bitmap, kernel allocates array of size aligned to
715       * bits-per-long.  But for the case when the kernel is 64-bit and
716       * the userspace is 32-bit, userspace can't align to the same
717       * bits-per-long, since sizeof(long) is different between kernel
718       * and user space.  This way, userspace will provide a buffer which
719       * may be 4 bytes smaller than the kernel will use, resulting in
720       * userspace memory corruption (which is not detectable by valgrind
721       * either, in most cases).
722       * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
723       * the hope that sizeof(long) won't become >8 any time soon.
724       *
725       * Note: the granule of kvm dirty log is qemu_real_host_page_size.
726       * And mem->memory_size is aligned to it (otherwise this mem can't
727       * be registered to KVM).
728       */
729      hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size(),
730                                          /*HOST_LONG_BITS*/ 64) / 8;
731      mem->dirty_bmap = g_malloc0(bitmap_size);
732      mem->dirty_bmap_size = bitmap_size;
733  }
734  
735  /*
736   * Sync dirty bitmap from kernel to KVMSlot.dirty_bmap, return true if
737   * succeeded, false otherwise
738   */
739  static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
740  {
741      struct kvm_dirty_log d = {};
742      int ret;
743  
744      d.dirty_bitmap = slot->dirty_bmap;
745      d.slot = slot->slot | (slot->as_id << 16);
746      ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);
747  
748      if (ret == -ENOENT) {
749          /* kernel does not have dirty bitmap in this slot */
750          ret = 0;
751      }
752      if (ret) {
753          error_report_once("%s: KVM_GET_DIRTY_LOG failed with %d",
754                            __func__, ret);
755      }
756      return ret == 0;
757  }
758  
759  /* Should be called with the slots_lock held for all address spaces. */
760  static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
761                                       uint32_t slot_id, uint64_t offset)
762  {
763      KVMMemoryListener *kml;
764      KVMSlot *mem;
765  
766      if (as_id >= s->nr_as) {
767          return;
768      }
769  
770      kml = s->as[as_id].ml;
771      mem = &kml->slots[slot_id];
772  
773      if (!mem->memory_size || offset >=
774          (mem->memory_size / qemu_real_host_page_size())) {
775          return;
776      }
777  
778      set_bit(offset, mem->dirty_bmap);
779  }
780  
781  static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
782  {
783      /*
784       * Read the flags before the value.  Pairs with barrier in
785       * KVM's kvm_dirty_ring_push() function.
786       */
787      return qatomic_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY;
788  }
789  
790  static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
791  {
792      /*
793       * Use a store-release so that the CPU that executes KVM_RESET_DIRTY_RINGS
794       * sees the full content of the ring:
795       *
796       * CPU0                     CPU1                         CPU2
797       * ------------------------------------------------------------------------------
798       *                                                       fill gfn0
799       *                                                       store-rel flags for gfn0
800       * load-acq flags for gfn0
801       * store-rel RESET for gfn0
802       *                          ioctl(RESET_RINGS)
803       *                            load-acq flags for gfn0
804       *                            check if flags have RESET
805       *
806       * The synchronization goes from CPU2 to CPU0 to CPU1.
807       */
808      qatomic_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
809  }
810  
811  /*
812   * Should be called with the slots_lock held for all address spaces.  It
813   * returns the number of dirty pages collected from this vcpu's dirty ring.
814   */
815  static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
816  {
817      struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
818      uint32_t ring_size = s->kvm_dirty_ring_size;
819      uint32_t count = 0, fetch = cpu->kvm_fetch_index;
820  
821      /*
822       * It's possible that we race with the vcpu creation code, where the
823       * vcpu has been put onto the vcpus list but its dirty ring structures
824       * have not been initialized yet.  If so, skip it.
825       */
826      if (!cpu->created) {
827          return 0;
828      }
829  
830      assert(dirty_gfns && ring_size);
831      trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);
832  
833      while (true) {
834          cur = &dirty_gfns[fetch % ring_size];
835          if (!dirty_gfn_is_dirtied(cur)) {
836              break;
837          }
838          kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
839                                   cur->offset);
840          dirty_gfn_set_collected(cur);
841          trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
842          fetch++;
843          count++;
844      }
845      cpu->kvm_fetch_index = fetch;
846      cpu->dirty_pages += count;
847  
848      return count;
849  }
850  
851  /* Must be with slots_lock held */
852  static uint64_t kvm_dirty_ring_reap_locked(KVMState *s, CPUState* cpu)
853  {
854      int ret;
855      uint64_t total = 0;
856      int64_t stamp;
857  
858      stamp = get_clock();
859  
860      if (cpu) {
861          total = kvm_dirty_ring_reap_one(s, cpu);
862      } else {
863          CPU_FOREACH(cpu) {
864              total += kvm_dirty_ring_reap_one(s, cpu);
865          }
866      }
867  
868      if (total) {
869          ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
870          assert(ret == total);
871      }
872  
873      stamp = get_clock() - stamp;
874  
875      if (total) {
876          trace_kvm_dirty_ring_reap(total, stamp / 1000);
877      }
878  
879      return total;
880  }
881  
882  /*
883   * Currently, for simplicity, we must hold the BQL before calling this.  We
884   * can consider dropping the BQL once we're clear about all the race conditions.
885   */
886  static uint64_t kvm_dirty_ring_reap(KVMState *s, CPUState *cpu)
887  {
888      uint64_t total;
889  
890      /*
891       * We need to lock all kvm slots for all address spaces here,
892       * because:
893       *
894       * (1) We need to mark dirty for dirty bitmaps in multiple slots
895       *     and for tons of pages, so it's better to take the lock here
896       *     once rather than once per page.  And more importantly,
897       *
898       * (2) We must _NOT_ publish dirty bits to the other threads
899       *     (e.g., the migration thread) via the kvm memory slot dirty
900       *     bitmaps before correctly re-protect those dirtied pages.
901       *     Otherwise we can have potential risk of data corruption if
902       *     the page data is read in the other thread before we do
903       *     reset below.
904       */
905      kvm_slots_lock();
906      total = kvm_dirty_ring_reap_locked(s, cpu);
907      kvm_slots_unlock();
908  
909      return total;
910  }
911  
912  static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
913  {
914      /* No need to do anything */
915  }
916  
917  /*
918   * Kick all vcpus out in a synchronized way.  When returned, we
919   * guarantee that every vcpu has been kicked and at least returned to
920   * userspace once.
921   */
922  static void kvm_cpu_synchronize_kick_all(void)
923  {
924      CPUState *cpu;
925  
926      CPU_FOREACH(cpu) {
927          run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
928      }
929  }
930  
931  /*
932   * Flush all the existing dirty pages to the KVM slot buffers.  When
933   * this call returns, we guarantee that all the touched dirty pages
934   * before calling this function have been put into the per-kvmslot
935   * dirty bitmap.
936   *
937   * This function must be called with BQL held.
938   */
939  static void kvm_dirty_ring_flush(void)
940  {
941      trace_kvm_dirty_ring_flush(0);
942      /*
943       * Calls to this function need to be serialized.  Since it should
944       * always be called with the BQL held, serialization is guaranteed.
945       * However, let's assert it to be sure.
946       */
947      assert(bql_locked());
948      /*
949       * First make sure to flush the hardware buffers by kicking all
950       * vcpus out in a synchronous way.
951       */
952      kvm_cpu_synchronize_kick_all();
953      kvm_dirty_ring_reap(kvm_state, NULL);
954      trace_kvm_dirty_ring_flush(1);
955  }
956  
957  /**
958   * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
959   *
960   * This function will first try to fetch the dirty bitmap from the kernel,
961   * and then update QEMU's dirty bitmap.
962   *
963   * NOTE: the caller must hold kml->slots_lock.
964   *
965   * @kml: the KVM memory listener object
966   * @section: the memory section to sync the dirty bitmap with
967   */
968  static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
969                                             MemoryRegionSection *section)
970  {
971      KVMState *s = kvm_state;
972      KVMSlot *mem;
973      hwaddr start_addr, size;
974      hwaddr slot_size;
975  
976      size = kvm_align_section(section, &start_addr);
977      while (size) {
978          slot_size = MIN(kvm_max_slot_size, size);
979          mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
980          if (!mem) {
981              /* We don't have a slot if we want to trap every access. */
982              return;
983          }
984          if (kvm_slot_get_dirty_log(s, mem)) {
985              kvm_slot_sync_dirty_pages(mem);
986          }
987          start_addr += slot_size;
988          size -= slot_size;
989      }
990  }
991  
992  /* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */
993  #define KVM_CLEAR_LOG_SHIFT  6
994  #define KVM_CLEAR_LOG_ALIGN  (qemu_real_host_page_size() << KVM_CLEAR_LOG_SHIFT)
995  #define KVM_CLEAR_LOG_MASK   (-KVM_CLEAR_LOG_ALIGN)
996  
997  static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
998                                    uint64_t size)
999  {
1000      KVMState *s = kvm_state;
1001      uint64_t end, bmap_start, start_delta, bmap_npages;
1002      struct kvm_clear_dirty_log d;
1003      unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size();
1004      int ret;
1005  
1006      /*
1007       * We need to extend either the start or the size or both to
1008       * satisfy the KVM interface requirement.  First, align the start
1009       * address down to a 64-host-page boundary.
1010       */
1011      bmap_start = start & KVM_CLEAR_LOG_MASK;
1012      start_delta = start - bmap_start;
1013      bmap_start /= psize;
1014  
1015      /*
1016       * The kernel interface has restriction on the size too, that either:
1017       *
1018       * (1) the size is 64 host pages aligned (just like the start), or
1019       * (2) the size fills up until the end of the KVM memslot.
1020       */
1021      bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN)
1022          << KVM_CLEAR_LOG_SHIFT;
1023      end = mem->memory_size / psize;
1024      if (bmap_npages > end - bmap_start) {
1025          bmap_npages = end - bmap_start;
1026      }
1027      start_delta /= psize;
1028  
1029      /*
1030       * Prepare the bitmap to clear dirty bits.  Here we must guarantee
1031       * that we won't clear any unknown dirty bits otherwise we might
1032       * accidentally clear some set bits which are not yet synced from
1033       * the kernel into QEMU's bitmap, then we'll lose track of the
1034       * guest modifications upon those pages (which can directly lead
1035       * to guest data loss or panic after migration).
1036       *
1037       * Layout of the KVMSlot.dirty_bmap:
1038       *
1039       *                   |<-------- bmap_npages -----------..>|
1040       *                                                     [1]
1041       *                     start_delta         size
1042       *  |----------------|-------------|------------------|------------|
1043       *  ^                ^             ^                               ^
1044       *  |                |             |                               |
1045       * start          bmap_start     (start)                         end
1046       * of memslot                                             of memslot
1047       *
1048       * [1] bmap_npages can be aligned to either 64 pages or the end of slot
1049       */
1050  
1051      assert(bmap_start % BITS_PER_LONG == 0);
1052      /* We should never do log_clear before log_sync */
1053      assert(mem->dirty_bmap);
1054      if (start_delta || bmap_npages - size / psize) {
1055          /* Slow path - we need to manipulate a temp bitmap */
1056          bmap_clear = bitmap_new(bmap_npages);
1057          bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap,
1058                                      bmap_start, start_delta + size / psize);
1059          /*
1060           * Clear the bits at the start: they were not requested by the
1061           * caller and exist only because we extended the range for the
1062           * 64-page alignment.
1063           */
1064          bitmap_clear(bmap_clear, 0, start_delta);
1065          d.dirty_bitmap = bmap_clear;
1066      } else {
1067          /*
1068           * Fast path - both start and size align well with BITS_PER_LONG
1069           * (or the end of memory slot)
1070           */
1071          d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start);
1072      }
1073  
1074      d.first_page = bmap_start;
1075      /* It should never overflow.  If it happens, say something */
1076      assert(bmap_npages <= UINT32_MAX);
1077      d.num_pages = bmap_npages;
1078      d.slot = mem->slot | (as_id << 16);
1079  
1080      ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d);
1081      if (ret < 0 && ret != -ENOENT) {
1082          error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
1083                       "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d",
1084                       __func__, d.slot, (uint64_t)d.first_page,
1085                       (uint32_t)d.num_pages, ret);
1086      } else {
1087          ret = 0;
1088          trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages);
1089      }
1090  
1091      /*
1092       * After we have updated the remote dirty bitmap, we update the
1093       * cached bitmap as well for the memslot, then if another user
1094       * clears the same region we know we shouldn't clear it again on
1095       * the remote otherwise it's data loss as well.
1096       */
1097      bitmap_clear(mem->dirty_bmap, bmap_start + start_delta,
1098                   size / psize);
1099      /* This handles the NULL case well */
1100      g_free(bmap_clear);
1101      return ret;
1102  }
1103  
1104  
1105  /**
1106   * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
1107   *
1108   * NOTE: this will be a no-op if we haven't enabled manual dirty log
1109   * protection in the host kernel because in that case this operation
1110   * will be done within log_sync().
1111   *
1112   * @kml:     the kvm memory listener
1113   * @section: the memory range to clear dirty bitmap
1114   */
1115  static int kvm_physical_log_clear(KVMMemoryListener *kml,
1116                                    MemoryRegionSection *section)
1117  {
1118      KVMState *s = kvm_state;
1119      uint64_t start, size, offset, count;
1120      KVMSlot *mem;
1121      int ret = 0, i;
1122  
1123      if (!s->manual_dirty_log_protect) {
1124          /* No need to do explicit clear */
1125          return ret;
1126      }
1127  
1128      start = section->offset_within_address_space;
1129      size = int128_get64(section->size);
1130  
1131      if (!size) {
1132          /* Nothing more we can do... */
1133          return ret;
1134      }
1135  
1136      kvm_slots_lock();
1137  
1138      for (i = 0; i < kml->nr_slots_allocated; i++) {
1139          mem = &kml->slots[i];
1140          /* Discard slots that are empty or do not overlap the section */
1141          if (!mem->memory_size ||
1142              mem->start_addr > start + size - 1 ||
1143              start > mem->start_addr + mem->memory_size - 1) {
1144              continue;
1145          }
1146  
1147          if (start >= mem->start_addr) {
1148              /* The slot starts before the section or is aligned to it.  */
1149              offset = start - mem->start_addr;
1150              count = MIN(mem->memory_size - offset, size);
1151          } else {
1152              /* The slot starts after the section.  */
1153              offset = 0;
1154              count = MIN(mem->memory_size, size - (mem->start_addr - start));
1155          }
1156          ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count);
1157          if (ret < 0) {
1158              break;
1159          }
1160      }
1161  
1162      kvm_slots_unlock();
1163  
1164      return ret;
1165  }
1166  
1167  static void kvm_coalesce_mmio_region(MemoryListener *listener,
1168                                       MemoryRegionSection *section,
1169                                       hwaddr start, hwaddr size)
1170  {
1171      KVMState *s = kvm_state;
1172  
1173      if (s->coalesced_mmio) {
1174          struct kvm_coalesced_mmio_zone zone;
1175  
1176          zone.addr = start;
1177          zone.size = size;
1178          zone.pad = 0;
1179  
1180          (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1181      }
1182  }
1183  
1184  static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
1185                                         MemoryRegionSection *section,
1186                                         hwaddr start, hwaddr size)
1187  {
1188      KVMState *s = kvm_state;
1189  
1190      if (s->coalesced_mmio) {
1191          struct kvm_coalesced_mmio_zone zone;
1192  
1193          zone.addr = start;
1194          zone.size = size;
1195          zone.pad = 0;
1196  
1197          (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1198      }
1199  }
1200  
1201  static void kvm_coalesce_pio_add(MemoryListener *listener,
1202                                  MemoryRegionSection *section,
1203                                  hwaddr start, hwaddr size)
1204  {
1205      KVMState *s = kvm_state;
1206  
1207      if (s->coalesced_pio) {
1208          struct kvm_coalesced_mmio_zone zone;
1209  
1210          zone.addr = start;
1211          zone.size = size;
1212          zone.pio = 1;
1213  
1214          (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1215      }
1216  }
1217  
1218  static void kvm_coalesce_pio_del(MemoryListener *listener,
1219                                  MemoryRegionSection *section,
1220                                  hwaddr start, hwaddr size)
1221  {
1222      KVMState *s = kvm_state;
1223  
1224      if (s->coalesced_pio) {
1225          struct kvm_coalesced_mmio_zone zone;
1226  
1227          zone.addr = start;
1228          zone.size = size;
1229          zone.pio = 1;
1230  
1231          (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1232      }
1233  }
1234  
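/*
 * Return the value reported by KVM_CHECK_EXTENSION (0 means the extension
 * is unsupported); errors are also mapped to 0.
 */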
1235  int kvm_check_extension(KVMState *s, unsigned int extension)
1236  {
1237      int ret;
1238  
1239      ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
1240      if (ret < 0) {
1241          ret = 0;
1242      }
1243  
1244      return ret;
1245  }
1246  
1247  int kvm_vm_check_extension(KVMState *s, unsigned int extension)
1248  {
1249      int ret;
1250  
1251      ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
1252      if (ret < 0) {
1253          /* VM wide version not implemented, use global one instead */
1254          ret = kvm_check_extension(s, extension);
1255      }
1256  
1257      return ret;
1258  }
1259  
1260  /*
1261   * We track the poisoned pages to be able to:
1262   * - replace them on VM reset
1263   * - block a migration for a VM with a poisoned page
1264   */
1265  typedef struct HWPoisonPage {
1266      ram_addr_t ram_addr;
1267      QLIST_ENTRY(HWPoisonPage) list;
1268  } HWPoisonPage;
1269  
1270  static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
1271      QLIST_HEAD_INITIALIZER(hwpoison_page_list);
1272  
1273  static void kvm_unpoison_all(void *param)
1274  {
1275      HWPoisonPage *page, *next_page;
1276  
1277      QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
1278          QLIST_REMOVE(page, list);
1279          qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
1280          g_free(page);
1281      }
1282  }
1283  
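/*
 * Record a hardware-poisoned guest page (deduplicated by ram_addr) so that
 * it can be remapped on VM reset and so that migration can be blocked.
 */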
1284  void kvm_hwpoison_page_add(ram_addr_t ram_addr)
1285  {
1286      HWPoisonPage *page;
1287  
1288      QLIST_FOREACH(page, &hwpoison_page_list, list) {
1289          if (page->ram_addr == ram_addr) {
1290              return;
1291          }
1292      }
1293      page = g_new(HWPoisonPage, 1);
1294      page->ram_addr = ram_addr;
1295      QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
1296  }
1297  
1298  bool kvm_hwpoisoned_mem(void)
1299  {
1300      return !QLIST_EMPTY(&hwpoison_page_list);
1301  }
1302  
1303  static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
1304  {
1305  #if HOST_BIG_ENDIAN != TARGET_BIG_ENDIAN
1306      /* The kernel expects ioeventfd values in HOST_BIG_ENDIAN
1307       * endianness, but the memory core hands them in target endianness.
1308       * For example, PPC is always treated as big-endian even if running
1309       * on KVM and on PPC64LE.  Correct here.
1310       */
1311      switch (size) {
1312      case 2:
1313          val = bswap16(val);
1314          break;
1315      case 4:
1316          val = bswap32(val);
1317          break;
1318      }
1319  #endif
1320      return val;
1321  }
1322  
1323  static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
1324                                    bool assign, uint32_t size, bool datamatch)
1325  {
1326      int ret;
1327      struct kvm_ioeventfd iofd = {
1328          .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
1329          .addr = addr,
1330          .len = size,
1331          .flags = 0,
1332          .fd = fd,
1333      };
1334  
1335      trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size,
1336                                   datamatch);
1337      if (!kvm_enabled()) {
1338          return -ENOSYS;
1339      }
1340  
1341      if (datamatch) {
1342          iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
1343      }
1344      if (!assign) {
1345          iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1346      }
1347  
1348      ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
1349  
1350      if (ret < 0) {
1351          return -errno;
1352      }
1353  
1354      return 0;
1355  }
1356  
1357  static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
1358                                   bool assign, uint32_t size, bool datamatch)
1359  {
1360      struct kvm_ioeventfd kick = {
1361          .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
1362          .addr = addr,
1363          .flags = KVM_IOEVENTFD_FLAG_PIO,
1364          .len = size,
1365          .fd = fd,
1366      };
1367      int r;
1368      trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch);
1369      if (!kvm_enabled()) {
1370          return -ENOSYS;
1371      }
1372      if (datamatch) {
1373          kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
1374      }
1375      if (!assign) {
1376          kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1377      }
1378      r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
1379      if (r < 0) {
1380          return r;
1381      }
1382      return 0;
1383  }
1384  
1385  
1386  static const KVMCapabilityInfo *
1387  kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
1388  {
1389      while (list->name) {
1390          if (!kvm_check_extension(s, list->value)) {
1391              return list;
1392          }
1393          list++;
1394      }
1395      return NULL;
1396  }
1397  
1398  void kvm_set_max_memslot_size(hwaddr max_slot_size)
1399  {
1400      g_assert(
1401          ROUND_UP(max_slot_size, qemu_real_host_page_size()) == max_slot_size
1402      );
1403      kvm_max_slot_size = max_slot_size;
1404  }
1405  
1406  static int kvm_set_memory_attributes(hwaddr start, uint64_t size, uint64_t attr)
1407  {
1408      struct kvm_memory_attributes attrs;
1409      int r;
1410  
1411      assert((attr & kvm_supported_memory_attributes) == attr);
1412      attrs.attributes = attr;
1413      attrs.address = start;
1414      attrs.size = size;
1415      attrs.flags = 0;
1416  
1417      r = kvm_vm_ioctl(kvm_state, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
1418      if (r) {
1419          error_report("failed to set memory (0x%" HWADDR_PRIx "+0x%" PRIx64 ") "
1420                       "with attr 0x%" PRIx64 " error '%s'",
1421                       start, size, attr, strerror(errno));
1422      }
1423      return r;
1424  }
1425  
1426  int kvm_set_memory_attributes_private(hwaddr start, uint64_t size)
1427  {
1428      return kvm_set_memory_attributes(start, size, KVM_MEMORY_ATTRIBUTE_PRIVATE);
1429  }
1430  
1431  int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size)
1432  {
1433      return kvm_set_memory_attributes(start, size, 0);
1434  }
1435  
1436  /* Called with KVMMemoryListener.slots_lock held */
1437  static void kvm_set_phys_mem(KVMMemoryListener *kml,
1438                               MemoryRegionSection *section, bool add)
1439  {
1440      KVMSlot *mem;
1441      int err;
1442      MemoryRegion *mr = section->mr;
1443      bool writable = !mr->readonly && !mr->rom_device;
1444      hwaddr start_addr, size, slot_size, mr_offset;
1445      ram_addr_t ram_start_offset;
1446      void *ram;
1447  
1448      if (!memory_region_is_ram(mr)) {
1449          if (writable || !kvm_readonly_mem_allowed) {
1450              return;
1451          } else if (!mr->romd_mode) {
1452              /* If the memory device is not in romd_mode, then we actually want
1453               * to remove the kvm memory slot so all accesses will trap. */
1454              add = false;
1455          }
1456      }
1457  
1458      size = kvm_align_section(section, &start_addr);
1459      if (!size) {
1460          return;
1461      }
1462  
1463      /* The offset of the kvmslot within the memory region */
1464      mr_offset = section->offset_within_region + start_addr -
1465          section->offset_within_address_space;
1466  
1467      /* use aligned delta to align the ram address and offset */
1468      ram = memory_region_get_ram_ptr(mr) + mr_offset;
1469      ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;
1470  
1471      if (!add) {
1472          do {
1473              slot_size = MIN(kvm_max_slot_size, size);
1474              mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
1475              if (!mem) {
1476                  return;
1477              }
1478              if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1479                  /*
1480                   * NOTE: we only make a best effort here to sync the dirty
1481                   * bits.  Whether we're using the dirty log or the dirty
1482                   * ring, we ignore two facts:
1483                   *
1484                   * (1) dirty bits can reside in hardware buffers (PML)
1485                   *
1486                   * (2) after we collect dirty bits here, pages can be dirtied
1487                   * again before we do the final KVM_SET_USER_MEMORY_REGION to
1488                   * remove the slot.
1489                   *
1490                   * Not easy.  Let's cross our fingers until it's fixed.
1491                   */
1492                  if (kvm_state->kvm_dirty_ring_size) {
1493                      kvm_dirty_ring_reap_locked(kvm_state, NULL);
1494                      if (kvm_state->kvm_dirty_ring_with_bitmap) {
1495                          kvm_slot_sync_dirty_pages(mem);
1496                          kvm_slot_get_dirty_log(kvm_state, mem);
1497                      }
1498                  } else {
1499                      kvm_slot_get_dirty_log(kvm_state, mem);
1500                  }
1501                  kvm_slot_sync_dirty_pages(mem);
1502              }
1503  
1504              /* unregister the slot */
1505              g_free(mem->dirty_bmap);
1506              mem->dirty_bmap = NULL;
1507              mem->memory_size = 0;
1508              mem->flags = 0;
1509              err = kvm_set_user_memory_region(kml, mem, false);
1510              if (err) {
1511                  fprintf(stderr, "%s: error unregistering slot: %s\n",
1512                          __func__, strerror(-err));
1513                  abort();
1514              }
1515              start_addr += slot_size;
1516              size -= slot_size;
1517              kml->nr_used_slots--;
1518          } while (size);
1519          return;
1520      }
1521  
1522      /* register the new slot */
1523      do {
1524          slot_size = MIN(kvm_max_slot_size, size);
1525          mem = kvm_alloc_slot(kml);
1526          mem->as_id = kml->as_id;
1527          mem->memory_size = slot_size;
1528          mem->start_addr = start_addr;
1529          mem->ram_start_offset = ram_start_offset;
1530          mem->ram = ram;
1531          mem->flags = kvm_mem_flags(mr);
1532          mem->guest_memfd = mr->ram_block->guest_memfd;
1533          mem->guest_memfd_offset = (uint8_t*)ram - mr->ram_block->host;
1534  
1535          kvm_slot_init_dirty_bitmap(mem);
1536          err = kvm_set_user_memory_region(kml, mem, true);
1537          if (err) {
1538              fprintf(stderr, "%s: error registering slot: %s\n", __func__,
1539                      strerror(-err));
1540              abort();
1541          }
1542  
1543          if (memory_region_has_guest_memfd(mr)) {
1544              err = kvm_set_memory_attributes_private(start_addr, slot_size);
1545              if (err) {
1546                  error_report("%s: failed to set memory attribute private: %s",
1547                               __func__, strerror(-err));
1548                  exit(1);
1549              }
1550          }
1551  
1552          start_addr += slot_size;
1553          ram_start_offset += slot_size;
1554          ram += slot_size;
1555          size -= slot_size;
1556          kml->nr_used_slots++;
1557      } while (size);
1558  }
1559  
1560  static void *kvm_dirty_ring_reaper_thread(void *data)
1561  {
1562      KVMState *s = data;
1563      struct KVMDirtyRingReaper *r = &s->reaper;
1564  
1565      rcu_register_thread();
1566  
1567      trace_kvm_dirty_ring_reaper("init");
1568  
1569      while (true) {
1570          r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
1571          trace_kvm_dirty_ring_reaper("wait");
1572          /*
1573           * TODO: provide a smarter timeout rather than a constant?
1574           */
1575          sleep(1);
1576  
1577          /* keep sleeping so that the reaper does not interfere with dirtylimit */
1578          if (dirtylimit_in_service()) {
1579              continue;
1580          }
1581  
1582          trace_kvm_dirty_ring_reaper("wakeup");
1583          r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
1584  
1585          bql_lock();
1586          kvm_dirty_ring_reap(s, NULL);
1587          bql_unlock();
1588  
1589          r->reaper_iteration++;
1590      }
1591  
1592      trace_kvm_dirty_ring_reaper("exit");
1593  
1594      rcu_unregister_thread();
1595  
1596      return NULL;
1597  }
1598  
1599  static void kvm_dirty_ring_reaper_init(KVMState *s)
1600  {
1601      struct KVMDirtyRingReaper *r = &s->reaper;
1602  
1603      qemu_thread_create(&r->reaper_thr, "kvm-reaper",
1604                         kvm_dirty_ring_reaper_thread,
1605                         s, QEMU_THREAD_JOINABLE);
1606  }
1607  
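/*
 * Probe and enable the KVM dirty ring (plus the optional backup bitmap).
 * Returns 0 on success or when falling back to the dirty log, and a
 * negative errno on failure.
 */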
1608  static int kvm_dirty_ring_init(KVMState *s)
1609  {
1610      uint32_t ring_size = s->kvm_dirty_ring_size;
1611      uint64_t ring_bytes = ring_size * sizeof(struct kvm_dirty_gfn);
1612      unsigned int capability = KVM_CAP_DIRTY_LOG_RING;
1613      int ret;
1614  
1615      s->kvm_dirty_ring_size = 0;
1616      s->kvm_dirty_ring_bytes = 0;
1617  
1618      /* Bail if the dirty ring size isn't specified */
1619      if (!ring_size) {
1620          return 0;
1621      }
1622  
1623      /*
1624       * Read the max supported pages. Fall back to dirty logging mode
1625       * if the dirty ring isn't supported.
1626       */
1627      ret = kvm_vm_check_extension(s, capability);
1628      if (ret <= 0) {
1629          capability = KVM_CAP_DIRTY_LOG_RING_ACQ_REL;
1630          ret = kvm_vm_check_extension(s, capability);
1631      }
1632  
1633      if (ret <= 0) {
1634          warn_report("KVM dirty ring not available, using bitmap method");
1635          return 0;
1636      }
1637  
1638      if (ring_bytes > ret) {
1639          error_report("KVM dirty ring size %" PRIu32 " too big "
1640                       "(maximum is %ld).  Please use a smaller value.",
1641                       ring_size, (long)ret / sizeof(struct kvm_dirty_gfn));
1642          return -EINVAL;
1643      }
1644  
1645      ret = kvm_vm_enable_cap(s, capability, 0, ring_bytes);
1646      if (ret) {
1647          error_report("Enabling of KVM dirty ring failed: %s. "
1648                       "Suggested minimum value is 1024.", strerror(-ret));
1649          return -EIO;
1650      }
1651  
1652      /* Enable the backup bitmap if it is supported */
1653      ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP);
1654      if (ret > 0) {
1655          ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP, 0);
1656          if (ret) {
1657              error_report("Enabling of KVM dirty ring's backup bitmap failed: "
1658                           "%s. ", strerror(-ret));
1659              return -EIO;
1660          }
1661  
1662          s->kvm_dirty_ring_with_bitmap = true;
1663      }
1664  
1665      s->kvm_dirty_ring_size = ring_size;
1666      s->kvm_dirty_ring_bytes = ring_bytes;
1667  
1668      return 0;
1669  }
1670  
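/*
 * region_add/region_del only queue the update; the actual memslot changes
 * are applied in kvm_region_commit(), so that overlapping del+add pairs
 * can be handled as one (quasi-)atomic update.
 */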
1671  static void kvm_region_add(MemoryListener *listener,
1672                             MemoryRegionSection *section)
1673  {
1674      KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1675      KVMMemoryUpdate *update;
1676  
1677      update = g_new0(KVMMemoryUpdate, 1);
1678      update->section = *section;
1679  
1680      QSIMPLEQ_INSERT_TAIL(&kml->transaction_add, update, next);
1681  }
1682  
1683  static void kvm_region_del(MemoryListener *listener,
1684                             MemoryRegionSection *section)
1685  {
1686      KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1687      KVMMemoryUpdate *update;
1688  
1689      update = g_new0(KVMMemoryUpdate, 1);
1690      update->section = *section;
1691  
1692      QSIMPLEQ_INSERT_TAIL(&kml->transaction_del, update, next);
1693  }
1694  
1695  static void kvm_region_commit(MemoryListener *listener)
1696  {
1697      KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
1698                                            listener);
1699      KVMMemoryUpdate *u1, *u2;
1700      bool need_inhibit = false;
1701  
1702      if (QSIMPLEQ_EMPTY(&kml->transaction_add) &&
1703          QSIMPLEQ_EMPTY(&kml->transaction_del)) {
1704          return;
1705      }
1706  
1707      /*
1708       * We have to be careful when regions to add overlap with ranges to remove.
1709       * We have to simulate atomic KVM memslot updates by making sure no ioctl()
1710       * is currently active.
1711       *
1712       * The lists are ordered by address, so it's easy to find overlaps.
1713       */
1714      u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
1715      u2 = QSIMPLEQ_FIRST(&kml->transaction_add);
1716      while (u1 && u2) {
1717          Range r1, r2;
1718  
1719          range_init_nofail(&r1, u1->section.offset_within_address_space,
1720                            int128_get64(u1->section.size));
1721          range_init_nofail(&r2, u2->section.offset_within_address_space,
1722                            int128_get64(u2->section.size));
1723  
1724          if (range_overlaps_range(&r1, &r2)) {
1725              need_inhibit = true;
1726              break;
1727          }
1728          if (range_lob(&r1) < range_lob(&r2)) {
1729              u1 = QSIMPLEQ_NEXT(u1, next);
1730          } else {
1731              u2 = QSIMPLEQ_NEXT(u2, next);
1732          }
1733      }
1734  
1735      kvm_slots_lock();
1736      if (need_inhibit) {
1737          accel_ioctl_inhibit_begin();
1738      }
1739  
1740      /* Remove all memslots before adding the new ones. */
1741      while (!QSIMPLEQ_EMPTY(&kml->transaction_del)) {
1742          u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
1743          QSIMPLEQ_REMOVE_HEAD(&kml->transaction_del, next);
1744  
1745          kvm_set_phys_mem(kml, &u1->section, false);
1746          memory_region_unref(u1->section.mr);
1747  
1748          g_free(u1);
1749      }
1750      while (!QSIMPLEQ_EMPTY(&kml->transaction_add)) {
1751          u1 = QSIMPLEQ_FIRST(&kml->transaction_add);
1752          QSIMPLEQ_REMOVE_HEAD(&kml->transaction_add, next);
1753  
1754          memory_region_ref(u1->section.mr);
1755          kvm_set_phys_mem(kml, &u1->section, true);
1756  
1757          g_free(u1);
1758      }
1759  
1760      if (need_inhibit) {
1761          accel_ioctl_inhibit_end();
1762      }
1763      kvm_slots_unlock();
1764  }
1765  
1766  static void kvm_log_sync(MemoryListener *listener,
1767                           MemoryRegionSection *section)
1768  {
1769      KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1770  
1771      kvm_slots_lock();
1772      kvm_physical_sync_dirty_bitmap(kml, section);
1773      kvm_slots_unlock();
1774  }
1775  
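/*
 * Global dirty log sync, used when the dirty ring is enabled.  It flushes
 * all per-vCPU dirty rings first, then folds the harvested bits into each
 * memslot's dirty bitmap.  When the backup bitmap is enabled, the final
 * stage also fetches the kernel bitmap via kvm_slot_get_dirty_log() to
 * pick up dirty pages that never went through the ring.
 */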
1776  static void kvm_log_sync_global(MemoryListener *l, bool last_stage)
1777  {
1778      KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
1779      KVMState *s = kvm_state;
1780      KVMSlot *mem;
1781      int i;
1782  
1783      /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
1784      kvm_dirty_ring_flush();
1785  
1786      kvm_slots_lock();
1787      for (i = 0; i < kml->nr_slots_allocated; i++) {
1788          mem = &kml->slots[i];
1789          if (mem->memory_size && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
1790              kvm_slot_sync_dirty_pages(mem);
1791  
1792              if (s->kvm_dirty_ring_with_bitmap && last_stage &&
1793                  kvm_slot_get_dirty_log(s, mem)) {
1794                  kvm_slot_sync_dirty_pages(mem);
1795              }
1796  
1797              /*
1798               * This is not needed with KVM_GET_DIRTY_LOG because that
1799               * ioctl unconditionally overwrites the whole region.
1800               * However, the kvm dirty ring has no such side effect.
1801               */
1802              kvm_slot_reset_dirty_pages(mem);
1803          }
1804      }
1805      kvm_slots_unlock();
1806  }
1807  
1808  static void kvm_log_clear(MemoryListener *listener,
1809                            MemoryRegionSection *section)
1810  {
1811      KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1812      int r;
1813  
1814      r = kvm_physical_log_clear(kml, section);
1815      if (r < 0) {
1816          error_report_once("%s: kvm log clear failed: mr=%s "
1817                            "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__,
1818                            section->mr->name, section->offset_within_region,
1819                            int128_get64(section->size));
1820          abort();
1821      }
1822  }
1823  
1824  static void kvm_mem_ioeventfd_add(MemoryListener *listener,
1825                                    MemoryRegionSection *section,
1826                                    bool match_data, uint64_t data,
1827                                    EventNotifier *e)
1828  {
1829      int fd = event_notifier_get_fd(e);
1830      int r;
1831  
1832      r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1833                                 data, true, int128_get64(section->size),
1834                                 match_data);
1835      if (r < 0) {
1836          fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1837                  __func__, strerror(-r), -r);
1838          abort();
1839      }
1840  }
1841  
1842  static void kvm_mem_ioeventfd_del(MemoryListener *listener,
1843                                    MemoryRegionSection *section,
1844                                    bool match_data, uint64_t data,
1845                                    EventNotifier *e)
1846  {
1847      int fd = event_notifier_get_fd(e);
1848      int r;
1849  
1850      r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1851                                 data, false, int128_get64(section->size),
1852                                 match_data);
1853      if (r < 0) {
1854          fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1855                  __func__, strerror(-r), -r);
1856          abort();
1857      }
1858  }
1859  
1860  static void kvm_io_ioeventfd_add(MemoryListener *listener,
1861                                   MemoryRegionSection *section,
1862                                   bool match_data, uint64_t data,
1863                                   EventNotifier *e)
1864  {
1865      int fd = event_notifier_get_fd(e);
1866      int r;
1867  
1868      r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1869                                data, true, int128_get64(section->size),
1870                                match_data);
1871      if (r < 0) {
1872          fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1873                  __func__, strerror(-r), -r);
1874          abort();
1875      }
1876  }
1877  
1878  static void kvm_io_ioeventfd_del(MemoryListener *listener,
1879                                   MemoryRegionSection *section,
1880                                   bool match_data, uint64_t data,
1881                                   EventNotifier *e)
1883  {
1884      int fd = event_notifier_get_fd(e);
1885      int r;
1886  
1887      r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1888                                data, false, int128_get64(section->size),
1889                                match_data);
1890      if (r < 0) {
1891          fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1892                  __func__, strerror(-r), -r);
1893          abort();
1894      }
1895  }
1896  
1897  void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
1898                                    AddressSpace *as, int as_id, const char *name)
1899  {
1900      int i;
1901  
1902      kml->as_id = as_id;
1903  
1904      kvm_slots_grow(kml, KVM_MEMSLOTS_NR_ALLOC_DEFAULT);
1905  
1906      QSIMPLEQ_INIT(&kml->transaction_add);
1907      QSIMPLEQ_INIT(&kml->transaction_del);
1908  
1909      kml->listener.region_add = kvm_region_add;
1910      kml->listener.region_del = kvm_region_del;
1911      kml->listener.commit = kvm_region_commit;
1912      kml->listener.log_start = kvm_log_start;
1913      kml->listener.log_stop = kvm_log_stop;
1914      kml->listener.priority = MEMORY_LISTENER_PRIORITY_ACCEL;
1915      kml->listener.name = name;
1916  
1917      if (s->kvm_dirty_ring_size) {
1918          kml->listener.log_sync_global = kvm_log_sync_global;
1919      } else {
1920          kml->listener.log_sync = kvm_log_sync;
1921          kml->listener.log_clear = kvm_log_clear;
1922      }
1923  
1924      memory_listener_register(&kml->listener, as);
1925  
1926      for (i = 0; i < s->nr_as; ++i) {
1927          if (!s->as[i].as) {
1928              s->as[i].as = as;
1929              s->as[i].ml = kml;
1930              break;
1931          }
1932      }
1933  }
1934  
1935  static MemoryListener kvm_io_listener = {
1936      .name = "kvm-io",
1937      .coalesced_io_add = kvm_coalesce_pio_add,
1938      .coalesced_io_del = kvm_coalesce_pio_del,
1939      .eventfd_add = kvm_io_ioeventfd_add,
1940      .eventfd_del = kvm_io_ioeventfd_del,
1941      .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND,
1942  };
1943  
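/*
 * Set the level of an in-kernel irqchip input line.  Returns 1 when the
 * plain KVM_IRQ_LINE ioctl is used; with KVM_IRQ_LINE_STATUS the
 * kernel-reported delivery status is returned instead.
 */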
1944  int kvm_set_irq(KVMState *s, int irq, int level)
1945  {
1946      struct kvm_irq_level event;
1947      int ret;
1948  
1949      assert(kvm_async_interrupts_enabled());
1950  
1951      event.level = level;
1952      event.irq = irq;
1953      ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
1954      if (ret < 0) {
1955          perror("kvm_set_irq");
1956          abort();
1957      }
1958  
1959      return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
1960  }
1961  
1962  #ifdef KVM_CAP_IRQ_ROUTING
1963  typedef struct KVMMSIRoute {
1964      struct kvm_irq_routing_entry kroute;
1965      QTAILQ_ENTRY(KVMMSIRoute) entry;
1966  } KVMMSIRoute;
1967  
1968  static void set_gsi(KVMState *s, unsigned int gsi)
1969  {
1970      set_bit(gsi, s->used_gsi_bitmap);
1971  }
1972  
1973  static void clear_gsi(KVMState *s, unsigned int gsi)
1974  {
1975      clear_bit(gsi, s->used_gsi_bitmap);
1976  }
1977  
1978  void kvm_init_irq_routing(KVMState *s)
1979  {
1980      int gsi_count;
1981  
1982      gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
1983      if (gsi_count > 0) {
1984          /* Round up so we can search ints using ffs */
1985          s->used_gsi_bitmap = bitmap_new(gsi_count);
1986          s->gsi_count = gsi_count;
1987      }
1988  
1989      s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
1990      s->nr_allocated_irq_routes = 0;
1991  
1992      kvm_arch_init_irq_routing(s);
1993  }
1994  
1995  void kvm_irqchip_commit_routes(KVMState *s)
1996  {
1997      int ret;
1998  
1999      if (kvm_gsi_direct_mapping()) {
2000          return;
2001      }
2002  
2003      if (!kvm_gsi_routing_enabled()) {
2004          return;
2005      }
2006  
2007      s->irq_routes->flags = 0;
2008      trace_kvm_irqchip_commit_routes();
2009      ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
2010      assert(ret == 0);
2011  }
2012  
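/*
 * Append an entry to the cached GSI routing table, growing the table
 * geometrically (minimum 64 entries) as needed and marking the GSI as
 * used.  New routes only reach the kernel on the next
 * kvm_irqchip_commit_routes().
 */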
2013  void kvm_add_routing_entry(KVMState *s,
2014                             struct kvm_irq_routing_entry *entry)
2015  {
2016      struct kvm_irq_routing_entry *new;
2017      int n, size;
2018  
2019      if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
2020          n = s->nr_allocated_irq_routes * 2;
2021          if (n < 64) {
2022              n = 64;
2023          }
2024          size = sizeof(struct kvm_irq_routing);
2025          size += n * sizeof(*new);
2026          s->irq_routes = g_realloc(s->irq_routes, size);
2027          s->nr_allocated_irq_routes = n;
2028      }
2029      n = s->irq_routes->nr++;
2030      new = &s->irq_routes->entries[n];
2031  
2032      *new = *entry;
2033  
2034      set_gsi(s, entry->gsi);
2035  }
2036  
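/*
 * Replace the cached routing entry with a matching GSI.  Returns 0 on
 * success (or if the entry is already identical), -ESRCH if no route with
 * that GSI exists.  As above, the change must still be committed.
 */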
2037  static int kvm_update_routing_entry(KVMState *s,
2038                                      struct kvm_irq_routing_entry *new_entry)
2039  {
2040      struct kvm_irq_routing_entry *entry;
2041      int n;
2042  
2043      for (n = 0; n < s->irq_routes->nr; n++) {
2044          entry = &s->irq_routes->entries[n];
2045          if (entry->gsi != new_entry->gsi) {
2046              continue;
2047          }
2048  
2049          if (!memcmp(entry, new_entry, sizeof(*entry))) {
2050              return 0;
2051          }
2052  
2053          *entry = *new_entry;
2054  
2055          return 0;
2056      }
2057  
2058      return -ESRCH;
2059  }
2060  
2061  void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
2062  {
2063      struct kvm_irq_routing_entry e = {};
2064  
2065      assert(pin < s->gsi_count);
2066  
2067      e.gsi = irq;
2068      e.type = KVM_IRQ_ROUTING_IRQCHIP;
2069      e.flags = 0;
2070      e.u.irqchip.irqchip = irqchip;
2071      e.u.irqchip.pin = pin;
2072      kvm_add_routing_entry(s, &e);
2073  }
2074  
2075  void kvm_irqchip_release_virq(KVMState *s, int virq)
2076  {
2077      struct kvm_irq_routing_entry *e;
2078      int i;
2079  
2080      if (kvm_gsi_direct_mapping()) {
2081          return;
2082      }
2083  
2084      for (i = 0; i < s->irq_routes->nr; i++) {
2085          e = &s->irq_routes->entries[i];
2086          if (e->gsi == virq) {
2087              s->irq_routes->nr--;
2088              *e = s->irq_routes->entries[s->irq_routes->nr];
2089          }
2090      }
2091      clear_gsi(s, virq);
2092      kvm_arch_release_virq_post(virq);
2093      trace_kvm_irqchip_release_virq(virq);
2094  }
2095  
2096  void kvm_irqchip_add_change_notifier(Notifier *n)
2097  {
2098      notifier_list_add(&kvm_irqchip_change_notifiers, n);
2099  }
2100  
2101  void kvm_irqchip_remove_change_notifier(Notifier *n)
2102  {
2103      notifier_remove(n);
2104  }
2105  
2106  void kvm_irqchip_change_notify(void)
2107  {
2108      notifier_list_notify(&kvm_irqchip_change_notifiers, NULL);
2109  }
2110  
2111  int kvm_irqchip_get_virq(KVMState *s)
2112  {
2113      int next_virq;
2114  
2115      /* Return the lowest unused GSI in the bitmap */
2116      next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
2117      if (next_virq >= s->gsi_count) {
2118          return -ENOSPC;
2119      } else {
2120          return next_virq;
2121      }
2122  }
2123  
2124  int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
2125  {
2126      struct kvm_msi msi;
2127  
2128      msi.address_lo = (uint32_t)msg.address;
2129      msi.address_hi = msg.address >> 32;
2130      msi.data = le32_to_cpu(msg.data);
2131      msi.flags = 0;
2132      memset(msi.pad, 0, sizeof(msi.pad));
2133  
2134      return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
2135  }
2136  
2137  int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
2138  {
2139      struct kvm_irq_routing_entry kroute = {};
2140      int virq;
2141      KVMState *s = c->s;
2142      MSIMessage msg = {0, 0};
2143  
2144      if (pci_available && dev) {
2145          msg = pci_get_msi_message(dev, vector);
2146      }
2147  
2148      if (kvm_gsi_direct_mapping()) {
2149          return kvm_arch_msi_data_to_gsi(msg.data);
2150      }
2151  
2152      if (!kvm_gsi_routing_enabled()) {
2153          return -ENOSYS;
2154      }
2155  
2156      virq = kvm_irqchip_get_virq(s);
2157      if (virq < 0) {
2158          return virq;
2159      }
2160  
2161      kroute.gsi = virq;
2162      kroute.type = KVM_IRQ_ROUTING_MSI;
2163      kroute.flags = 0;
2164      kroute.u.msi.address_lo = (uint32_t)msg.address;
2165      kroute.u.msi.address_hi = msg.address >> 32;
2166      kroute.u.msi.data = le32_to_cpu(msg.data);
2167      if (pci_available && kvm_msi_devid_required()) {
2168          kroute.flags = KVM_MSI_VALID_DEVID;
2169          kroute.u.msi.devid = pci_requester_id(dev);
2170      }
2171      if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
2172          kvm_irqchip_release_virq(s, virq);
2173          return -EINVAL;
2174      }
2175  
2176      if (s->irq_routes->nr < s->gsi_count) {
2177          trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
2178                                          vector, virq);
2179  
2180          kvm_add_routing_entry(s, &kroute);
2181          kvm_arch_add_msi_route_post(&kroute, vector, dev);
2182          c->changes++;
2183      } else {
2184          kvm_irqchip_release_virq(s, virq);
2185          return -ENOSPC;
2186      }
2187  
2188      return virq;
2189  }
2190  
2191  int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
2192                                   PCIDevice *dev)
2193  {
2194      struct kvm_irq_routing_entry kroute = {};
2195  
2196      if (kvm_gsi_direct_mapping()) {
2197          return 0;
2198      }
2199  
2200      if (!kvm_irqchip_in_kernel()) {
2201          return -ENOSYS;
2202      }
2203  
2204      kroute.gsi = virq;
2205      kroute.type = KVM_IRQ_ROUTING_MSI;
2206      kroute.flags = 0;
2207      kroute.u.msi.address_lo = (uint32_t)msg.address;
2208      kroute.u.msi.address_hi = msg.address >> 32;
2209      kroute.u.msi.data = le32_to_cpu(msg.data);
2210      if (pci_available && kvm_msi_devid_required()) {
2211          kroute.flags = KVM_MSI_VALID_DEVID;
2212          kroute.u.msi.devid = pci_requester_id(dev);
2213      }
2214      if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
2215          return -EINVAL;
2216      }
2217  
2218      trace_kvm_irqchip_update_msi_route(virq);
2219  
2220      return kvm_update_routing_entry(s, &kroute);
2221  }
2222  
2223  static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2224                                      EventNotifier *resample, int virq,
2225                                      bool assign)
2226  {
2227      int fd = event_notifier_get_fd(event);
2228      int rfd = resample ? event_notifier_get_fd(resample) : -1;
2229  
2230      struct kvm_irqfd irqfd = {
2231          .fd = fd,
2232          .gsi = virq,
2233          .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
2234      };
2235  
2236      if (rfd != -1) {
2237          assert(assign);
2238          if (kvm_irqchip_is_split()) {
2239              /*
2240               * When the slow irqchip (e.g. IOAPIC) lives in userspace,
2241               * the in-kernel resamplefd will not work, because the EOI
2242               * of the interrupt is delivered to userspace instead, so
2243               * the kernel resamplefd kick is skipped.  Userspace here
2244               * mimics what the kernel provides with resamplefd: we
2245               * remember the resamplefd and kick it when we receive the
2246               * EOI for this IRQ.
2247               *
2248               * This is hackery because the IOAPIC is mostly bypassed
2249               * (except for EOI broadcasts) when irqfd is used.  However,
2250               * it brings back much of the performance for split irqchip
2251               * with INTx IRQs (for VFIO, this gives 93% of the perf of
2252               * the full fast path, which is a 46% boost compared to the
2253               * INTx slow path).
2254               */
2255              kvm_resample_fd_insert(virq, resample);
2256          } else {
2257              irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
2258              irqfd.resamplefd = rfd;
2259          }
2260      } else if (!assign) {
2261          if (kvm_irqchip_is_split()) {
2262              kvm_resample_fd_remove(virq);
2263          }
2264      }
2265  
2266      return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
2267  }
2268  
2269  #else /* !KVM_CAP_IRQ_ROUTING */
2270  
2271  void kvm_init_irq_routing(KVMState *s)
2272  {
2273  }
2274  
2275  void kvm_irqchip_release_virq(KVMState *s, int virq)
2276  {
2277  }
2278  
2279  int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
2280  {
2281      abort();
2282  }
2283  
2284  int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
2285  {
2286      return -ENOSYS;
2287  }
2288  
2289  int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
2290  {
2291      return -ENOSYS;
2292  }
2293  
2294  int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
2295  {
2296      return -ENOSYS;
2297  }
2298  
2299  static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2300                                      EventNotifier *resample, int virq,
2301                                      bool assign)
2302  {
2303      abort();
2304  }
2305  
2306  int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
2307  {
2308      return -ENOSYS;
2309  }
2310  #endif /* !KVM_CAP_IRQ_ROUTING */
2311  
2312  int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
2313                                         EventNotifier *rn, int virq)
2314  {
2315      return kvm_irqchip_assign_irqfd(s, n, rn, virq, true);
2316  }
2317  
2318  int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
2319                                            int virq)
2320  {
2321      return kvm_irqchip_assign_irqfd(s, n, NULL, virq, false);
2322  }
2323  
2324  int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
2325                                     EventNotifier *rn, qemu_irq irq)
2326  {
2327      gpointer key, gsi;
2328      gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
2329  
2330      if (!found) {
2331          return -ENXIO;
2332      }
2333      return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
2334  }
2335  
2336  int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
2337                                        qemu_irq irq)
2338  {
2339      gpointer key, gsi;
2340      gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
2341  
2342      if (!found) {
2343          return -ENXIO;
2344      }
2345      return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
2346  }
2347  
2348  void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
2349  {
2350      g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
2351  }
2352  
2353  static void kvm_irqchip_create(KVMState *s)
2354  {
2355      int ret;
2356  
2357      assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO);
2358      if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
2359          ;
2360      } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
2361          ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
2362          if (ret < 0) {
2363              fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
2364              exit(1);
2365          }
2366      } else {
2367          return;
2368      }
2369  
2370      if (kvm_check_extension(s, KVM_CAP_IRQFD) <= 0) {
2371          fprintf(stderr, "kvm: irqfd not implemented\n");
2372          exit(1);
2373      }
2374  
2375      /* First probe and see if there's an arch-specific hook to create the
2376       * in-kernel irqchip for us */
2377      ret = kvm_arch_irqchip_create(s);
2378      if (ret == 0) {
2379          if (s->kernel_irqchip_split == ON_OFF_AUTO_ON) {
2380              error_report("Split IRQ chip mode not supported.");
2381              exit(1);
2382          } else {
2383              ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
2384          }
2385      }
2386      if (ret < 0) {
2387          fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
2388          exit(1);
2389      }
2390  
2391      kvm_kernel_irqchip = true;
2392      /* If we have an in-kernel IRQ chip then we must have asynchronous
2393       * interrupt delivery (though the reverse is not necessarily true)
2394       */
2395      kvm_async_interrupts_allowed = true;
2396      kvm_halt_in_kernel_allowed = true;
2397  
2398      kvm_init_irq_routing(s);
2399  
2400      s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
2401  }
2402  
2403  /* Find number of supported CPUs using the recommended
2404   * procedure from the kernel API documentation to cope with
2405   * older kernels that may be missing capabilities.
2406   */
2407  static int kvm_recommended_vcpus(KVMState *s)
2408  {
2409      int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
2410      return (ret) ? ret : 4;
2411  }
2412  
2413  static int kvm_max_vcpus(KVMState *s)
2414  {
2415      int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
2416      return (ret) ? ret : kvm_recommended_vcpus(s);
2417  }
2418  
2419  static int kvm_max_vcpu_id(KVMState *s)
2420  {
2421      int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
2422      return (ret) ? ret : kvm_max_vcpus(s);
2423  }
2424  
2425  bool kvm_vcpu_id_is_valid(int vcpu_id)
2426  {
2427      KVMState *s = KVM_STATE(current_accel());
2428      return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
2429  }
2430  
2431  bool kvm_dirty_ring_enabled(void)
2432  {
2433      return kvm_state && kvm_state->kvm_dirty_ring_size;
2434  }
2435  
2436  static void query_stats_cb(StatsResultList **result, StatsTarget target,
2437                             strList *names, strList *targets, Error **errp);
2438  static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp);
2439  
2440  uint32_t kvm_dirty_ring_size(void)
2441  {
2442      return kvm_state->kvm_dirty_ring_size;
2443  }
2444  
2445  static int kvm_init(MachineState *ms)
2446  {
2447      MachineClass *mc = MACHINE_GET_CLASS(ms);
2448      static const char upgrade_note[] =
2449          "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
2450          "(see http://sourceforge.net/projects/kvm).\n";
2451      const struct {
2452          const char *name;
2453          int num;
2454      } num_cpus[] = {
2455          { "SMP",          ms->smp.cpus },
2456          { "hotpluggable", ms->smp.max_cpus },
2457          { /* end of list */ }
2458      }, *nc = num_cpus;
2459      int soft_vcpus_limit, hard_vcpus_limit;
2460      KVMState *s;
2461      const KVMCapabilityInfo *missing_cap;
2462      int ret;
2463      int type;
2464      uint64_t dirty_log_manual_caps;
2465  
2466      qemu_mutex_init(&kml_slots_lock);
2467  
2468      s = KVM_STATE(ms->accelerator);
2469  
2470      /*
2471       * On systems where the kernel can support different base page
2472       * sizes, host page size may be different from TARGET_PAGE_SIZE,
2473       * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
2474       * page size for the system though.
2475       */
2476      assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size());
2477  
2478      s->sigmask_len = 8;
2479      accel_blocker_init();
2480  
2481  #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
2482      QTAILQ_INIT(&s->kvm_sw_breakpoints);
2483  #endif
2484      QLIST_INIT(&s->kvm_parked_vcpus);
2485      s->fd = qemu_open_old(s->device ?: "/dev/kvm", O_RDWR);
2486      if (s->fd == -1) {
2487          fprintf(stderr, "Could not access KVM kernel module: %m\n");
2488          ret = -errno;
2489          goto err;
2490      }
2491  
2492      ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
2493      if (ret < KVM_API_VERSION) {
2494          if (ret >= 0) {
2495              ret = -EINVAL;
2496          }
2497          fprintf(stderr, "kvm version too old\n");
2498          goto err;
2499      }
2500  
2501      if (ret > KVM_API_VERSION) {
2502          ret = -EINVAL;
2503          fprintf(stderr, "kvm version not supported\n");
2504          goto err;
2505      }
2506  
2507      kvm_supported_memory_attributes = kvm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES);
2508      kvm_guest_memfd_supported =
2509          kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) &&
2510          kvm_check_extension(s, KVM_CAP_USER_MEMORY2) &&
2511          (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE);
2512  
2513      kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
2514      s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
2515  
2516      /* If unspecified, use the default value */
2517      if (!s->nr_slots) {
2518          s->nr_slots = 32;
2519      }
2520  
2521      s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
2522      if (s->nr_as <= 1) {
2523          s->nr_as = 1;
2524      }
2525      s->as = g_new0(struct KVMAs, s->nr_as);
2526  
2527      if (object_property_find(OBJECT(current_machine), "kvm-type")) {
2528          g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine),
2529                                                              "kvm-type",
2530                                                              &error_abort);
2531          type = mc->kvm_type(ms, kvm_type);
2532      } else if (mc->kvm_type) {
2533          type = mc->kvm_type(ms, NULL);
2534      } else {
2535          type = kvm_arch_get_default_type(ms);
2536      }
2537  
2538      if (type < 0) {
2539          ret = -EINVAL;
2540          goto err;
2541      }
2542  
2543      do {
2544          ret = kvm_ioctl(s, KVM_CREATE_VM, type);
2545      } while (ret == -EINTR);
2546  
2547      if (ret < 0) {
2548          fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
2549                  strerror(-ret));
2550  
2551  #ifdef TARGET_S390X
2552          if (ret == -EINVAL) {
2553              fprintf(stderr,
2554                      "Host kernel setup problem detected. Please verify:\n");
2555              fprintf(stderr, "- for kernels supporting the switch_amode or"
2556                      " user_mode parameters, whether\n");
2557              fprintf(stderr,
2558                      "  user space is running in primary address space\n");
2559              fprintf(stderr,
2560                      "- for kernels supporting the vm.allocate_pgste sysctl, "
2561                      "whether it is enabled\n");
2562          }
2563  #elif defined(TARGET_PPC)
2564          if (ret == -EINVAL) {
2565              fprintf(stderr,
2566                      "PPC KVM module is not loaded. Try modprobe kvm_%s.\n",
2567                      (type == 2) ? "pr" : "hv");
2568          }
2569  #endif
2570          goto err;
2571      }
2572  
2573      s->vmfd = ret;
2574  
2575      /* check the vcpu limits */
2576      soft_vcpus_limit = kvm_recommended_vcpus(s);
2577      hard_vcpus_limit = kvm_max_vcpus(s);
2578  
2579      while (nc->name) {
2580          if (nc->num > soft_vcpus_limit) {
2581              warn_report("Number of %s cpus requested (%d) exceeds "
2582                          "the recommended cpus supported by KVM (%d)",
2583                          nc->name, nc->num, soft_vcpus_limit);
2584  
2585              if (nc->num > hard_vcpus_limit) {
2586                  fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
2587                          "the maximum cpus supported by KVM (%d)\n",
2588                          nc->name, nc->num, hard_vcpus_limit);
2589                  exit(1);
2590              }
2591          }
2592          nc++;
2593      }
2594  
2595      missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
2596      if (!missing_cap) {
2597          missing_cap =
2598              kvm_check_extension_list(s, kvm_arch_required_capabilities);
2599      }
2600      if (missing_cap) {
2601          ret = -EINVAL;
2602          fprintf(stderr, "kvm does not support %s\n%s",
2603                  missing_cap->name, upgrade_note);
2604          goto err;
2605      }
2606  
2607      s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
2608      s->coalesced_pio = s->coalesced_mmio &&
2609                         kvm_check_extension(s, KVM_CAP_COALESCED_PIO);
2610  
2611      /*
2612       * Enable KVM dirty ring if supported, otherwise fall back to
2613       * dirty logging mode
2614       */
2615      ret = kvm_dirty_ring_init(s);
2616      if (ret < 0) {
2617          goto err;
2618      }
2619  
2620      /*
2621       * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
2622       * enabled.  More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
2623       * page is wr-protected initially, which is against how the kvm dirty
2624       * ring is used - it requires all pages to be wr-protected at the
2625       * very beginning.  Enabling it for the dirty ring causes data corruption.
2626       *
2627       * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log,
2628       * we may expect a higher stall time when starting the migration.  In the
2629       * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too:
2630       * instead of clearing dirty bit, it can be a way to explicitly wr-protect
2631       * guest pages.
2632       */
2633      if (!s->kvm_dirty_ring_size) {
2634          dirty_log_manual_caps =
2635              kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
2636          dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
2637                                    KVM_DIRTY_LOG_INITIALLY_SET);
2638          s->manual_dirty_log_protect = dirty_log_manual_caps;
2639          if (dirty_log_manual_caps) {
2640              ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
2641                                      dirty_log_manual_caps);
2642              if (ret) {
2643                  warn_report("Trying to enable capability %"PRIu64" of "
2644                              "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. "
2645                              "Falling back to the legacy mode. ",
2646                              dirty_log_manual_caps);
2647                  s->manual_dirty_log_protect = 0;
2648              }
2649          }
2650      }
2651  
2652  #ifdef KVM_CAP_VCPU_EVENTS
2653      s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
2654  #endif
2655      s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
2656  
2657      s->irq_set_ioctl = KVM_IRQ_LINE;
2658      if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
2659          s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
2660      }
2661  
2662      kvm_readonly_mem_allowed =
2663          (kvm_vm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
2664  
2665      kvm_resamplefds_allowed =
2666          (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
2667  
2668      kvm_vm_attributes_allowed =
2669          (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);
2670  
2671  #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
2672      kvm_has_guest_debug =
2673          (kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG) > 0);
2674  #endif
2675  
2676      kvm_sstep_flags = 0;
2677      if (kvm_has_guest_debug) {
2678          kvm_sstep_flags = SSTEP_ENABLE;
2679  
2680  #if defined TARGET_KVM_HAVE_GUEST_DEBUG
2681          int guest_debug_flags =
2682              kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG2);
2683  
2684          if (guest_debug_flags & KVM_GUESTDBG_BLOCKIRQ) {
2685              kvm_sstep_flags |= SSTEP_NOIRQ;
2686          }
2687  #endif
2688      }
2689  
2690      kvm_state = s;
2691  
2692      ret = kvm_arch_init(ms, s);
2693      if (ret < 0) {
2694          goto err;
2695      }
2696  
2697      if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
2698          s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
2699      }
2700  
2701      qemu_register_reset(kvm_unpoison_all, NULL);
2702  
2703      if (s->kernel_irqchip_allowed) {
2704          kvm_irqchip_create(s);
2705      }
2706  
2707      s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
2708      s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
2709      s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region;
2710      s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region;
2711  
2712      kvm_memory_listener_register(s, &s->memory_listener,
2713                                   &address_space_memory, 0, "kvm-memory");
2714      memory_listener_register(&kvm_io_listener,
2715                               &address_space_io);
2716  
2717      s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
2718      if (!s->sync_mmu) {
2719          ret = ram_block_discard_disable(true);
2720          assert(!ret);
2721      }
2722  
2723      if (s->kvm_dirty_ring_size) {
2724          kvm_dirty_ring_reaper_init(s);
2725      }
2726  
2727      if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) {
2728          add_stats_callbacks(STATS_PROVIDER_KVM, query_stats_cb,
2729                              query_stats_schemas_cb);
2730      }
2731  
2732      return 0;
2733  
2734  err:
2735      assert(ret < 0);
2736      if (s->vmfd >= 0) {
2737          close(s->vmfd);
2738      }
2739      if (s->fd != -1) {
2740          close(s->fd);
2741      }
2742      g_free(s->as);
2743      g_free(s->memory_listener.slots);
2744  
2745      return ret;
2746  }
2747  
2748  void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
2749  {
2750      s->sigmask_len = sigmask_len;
2751  }
2752  
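/*
 * Replay the port I/O requested by a KVM_EXIT_IO exit: 'count' accesses of
 * 'size' bytes each against the I/O address space.
 */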
2753  static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
2754                            int size, uint32_t count)
2755  {
2756      int i;
2757      uint8_t *ptr = data;
2758  
2759      for (i = 0; i < count; i++) {
2760          address_space_rw(&address_space_io, port, attrs,
2761                           ptr, size,
2762                           direction == KVM_EXIT_IO_OUT);
2763          ptr += size;
2764      }
2765  }
2766  
2767  static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
2768  {
2769      int i;
2770  
2771      fprintf(stderr, "KVM internal error. Suberror: %d\n",
2772              run->internal.suberror);
2773  
2774      for (i = 0; i < run->internal.ndata; ++i) {
2775          fprintf(stderr, "extra data[%d]: 0x%016"PRIx64"\n",
2776                  i, (uint64_t)run->internal.data[i]);
2777      }
2778      if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
2779          fprintf(stderr, "emulation failure\n");
2780          if (!kvm_arch_stop_on_emulation_error(cpu)) {
2781              cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2782              return EXCP_INTERRUPT;
2783          }
2784      }
2785      /* FIXME: Should trigger a qmp message to let management know
2786       * something went wrong.
2787       */
2788      return -1;
2789  }
2790  
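/*
 * Drain the coalesced MMIO/PIO ring shared with the kernel, replaying each
 * entry as a write to the I/O or system memory address space.  The
 * coalesced_flush_in_progress flag guards against re-entering this
 * function from within the replayed writes.
 */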
2791  void kvm_flush_coalesced_mmio_buffer(void)
2792  {
2793      KVMState *s = kvm_state;
2794  
2795      if (!s || s->coalesced_flush_in_progress) {
2796          return;
2797      }
2798  
2799      s->coalesced_flush_in_progress = true;
2800  
2801      if (s->coalesced_mmio_ring) {
2802          struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
2803          while (ring->first != ring->last) {
2804              struct kvm_coalesced_mmio *ent;
2805  
2806              ent = &ring->coalesced_mmio[ring->first];
2807  
2808              if (ent->pio == 1) {
2809                  address_space_write(&address_space_io, ent->phys_addr,
2810                                      MEMTXATTRS_UNSPECIFIED, ent->data,
2811                                      ent->len);
2812              } else {
2813                  cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
2814              }
2815              smp_wmb();
2816              ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
2817          }
2818      }
2819  
2820      s->coalesced_flush_in_progress = false;
2821  }
2822  
2823  static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2824  {
2825      if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
2826          int ret = kvm_arch_get_registers(cpu);
2827          if (ret) {
2828              error_report("Failed to get registers: %s", strerror(-ret));
2829              cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2830              vm_stop(RUN_STATE_INTERNAL_ERROR);
2831          }
2832  
2833          cpu->vcpu_dirty = true;
2834      }
2835  }
2836  
2837  void kvm_cpu_synchronize_state(CPUState *cpu)
2838  {
2839      if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
2840          run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
2841      }
2842  }
2843  
2844  static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
2845  {
2846      int ret = kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
2847      if (ret) {
2848          error_report("Failed to put registers after reset: %s", strerror(-ret));
2849          cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2850          vm_stop(RUN_STATE_INTERNAL_ERROR);
2851      }
2852  
2853      cpu->vcpu_dirty = false;
2854  }
2855  
2856  void kvm_cpu_synchronize_post_reset(CPUState *cpu)
2857  {
2858      run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
2859  }
2860  
2861  static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
2862  {
2863      int ret = kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
2864      if (ret) {
2865          error_report("Failed to put registers after init: %s", strerror(-ret));
2866          exit(1);
2867      }
2868  
2869      cpu->vcpu_dirty = false;
2870  }
2871  
2872  void kvm_cpu_synchronize_post_init(CPUState *cpu)
2873  {
2874      if (!kvm_state->guest_state_protected) {
2875          /*
2876           * This runs before the machine_init_done notifiers, and is the last
2877           * opportunity to synchronize the state of confidential guests.
2878           */
2879          run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
2880      }
2881  }
2882  
2883  static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
2884  {
2885      cpu->vcpu_dirty = true;
2886  }
2887  
2888  void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
2889  {
2890      run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
2891  }
2892  
2893  #ifdef KVM_HAVE_MCE_INJECTION
2894  static __thread void *pending_sigbus_addr;
2895  static __thread int pending_sigbus_code;
2896  static __thread bool have_sigbus_pending;
2897  #endif
2898  
2899  static void kvm_cpu_kick(CPUState *cpu)
2900  {
2901      qatomic_set(&cpu->kvm_run->immediate_exit, 1);
2902  }
2903  
2904  static void kvm_cpu_kick_self(void)
2905  {
2906      if (kvm_immediate_exit) {
2907          kvm_cpu_kick(current_cpu);
2908      } else {
2909          qemu_cpu_kick_self();
2910      }
2911  }
2912  
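/*
 * Consume any pending vCPU kick.  With KVM_CAP_IMMEDIATE_EXIT this just
 * clears kvm_run->immediate_exit; otherwise any pending SIG_IPI signals
 * are drained with sigtimedwait() so a stale kick does not immediately
 * interrupt the next KVM_RUN.
 */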
2913  static void kvm_eat_signals(CPUState *cpu)
2914  {
2915      struct timespec ts = { 0, 0 };
2916      siginfo_t siginfo;
2917      sigset_t waitset;
2918      sigset_t chkset;
2919      int r;
2920  
2921      if (kvm_immediate_exit) {
2922          qatomic_set(&cpu->kvm_run->immediate_exit, 0);
2923          /* Write kvm_run->immediate_exit before the cpu->exit_request
2924           * write in kvm_cpu_exec.
2925           */
2926          smp_wmb();
2927          return;
2928      }
2929  
2930      sigemptyset(&waitset);
2931      sigaddset(&waitset, SIG_IPI);
2932  
2933      do {
2934          r = sigtimedwait(&waitset, &siginfo, &ts);
2935          if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
2936              perror("sigtimedwait");
2937              exit(1);
2938          }
2939  
2940          r = sigpending(&chkset);
2941          if (r == -1) {
2942              perror("sigpending");
2943              exit(1);
2944          }
2945      } while (sigismember(&chkset, SIG_IPI));
2946  }
2947  
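/*
 * Convert a range of guest_memfd-backed memory between private and shared.
 * The range must be host-page aligned.  After updating the KVM memory
 * attributes, the now-unused backing is discarded: the normal RAM backing
 * when converting to private (unless it is hugetlb-backed), or the
 * guest_memfd backing when converting to shared.
 */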
2948  int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
2949  {
2950      MemoryRegionSection section;
2951      ram_addr_t offset;
2952      MemoryRegion *mr;
2953      RAMBlock *rb;
2954      void *addr;
2955      int ret = -1;
2956  
2957      trace_kvm_convert_memory(start, size, to_private ? "shared_to_private" : "private_to_shared");
2958  
2959      if (!QEMU_PTR_IS_ALIGNED(start, qemu_real_host_page_size()) ||
2960          !QEMU_PTR_IS_ALIGNED(size, qemu_real_host_page_size())) {
2961          return -1;
2962      }
2963  
2964      if (!size) {
2965          return -1;
2966      }
2967  
2968      section = memory_region_find(get_system_memory(), start, size);
2969      mr = section.mr;
2970      if (!mr) {
2971          /*
2972           * Ignore a conversion of a non-assigned region to shared.
2973           *
2974           * TDX requires vMMIO regions to be shared in order to inject #VE
2975           * into the guest.  OVMF conservatively issues MapGPA(shared) on
2976           * the 32bit PCI MMIO region and on the vIO-APIC 4K page at
2977           * 0xFEC00000.  OVMF assigns the 32bit PCI MMIO region to
2978           * [top of low memory (typically 2GB = 0x80000000), 0xFC000000).
2979           */
2980          if (!to_private) {
2981              return 0;
2982          }
2983          return -1;
2984      }
2985  
2986      if (!memory_region_has_guest_memfd(mr)) {
2987          /*
2988           * Because vMMIO regions must be shared, a guest TD may explicitly
2989           * convert a vMMIO region to shared.  Don't complain in that case.
2990           * See memory_region_type() for checking whether a region is MMIO.
2991           */
2992          if (!to_private &&
2993              !memory_region_is_ram(mr) &&
2994              !memory_region_is_ram_device(mr) &&
2995              !memory_region_is_rom(mr) &&
2996              !memory_region_is_romd(mr)) {
2997              ret = 0;
2998          } else {
2999              error_report("Convert non guest_memfd backed memory region "
3000                          "(0x%"HWADDR_PRIx" ,+ 0x%"HWADDR_PRIx") to %s",
3001                          start, size, to_private ? "private" : "shared");
3002          }
3003          goto out_unref;
3004      }
3005  
3006      if (to_private) {
3007          ret = kvm_set_memory_attributes_private(start, size);
3008      } else {
3009          ret = kvm_set_memory_attributes_shared(start, size);
3010      }
3011      if (ret) {
3012          goto out_unref;
3013      }
3014  
3015      addr = memory_region_get_ram_ptr(mr) + section.offset_within_region;
3016      rb = qemu_ram_block_from_host(addr, false, &offset);
3017  
3018      if (to_private) {
3019          if (rb->page_size != qemu_real_host_page_size()) {
3020              /*
3021               * shared memory is backed by hugetlb, which is supposed to be
3022               * pre-allocated and doesn't need to be discarded
3023               */
3024              goto out_unref;
3025          }
3026          ret = ram_block_discard_range(rb, offset, size);
3027      } else {
3028          ret = ram_block_discard_guest_memfd_range(rb, offset, size);
3029      }
3030  
3031  out_unref:
3032      memory_region_unref(mr);
3033      return ret;
3034  }
3035  
3036  int kvm_cpu_exec(CPUState *cpu)
3037  {
3038      struct kvm_run *run = cpu->kvm_run;
3039      int ret, run_ret;
3040  
3041      trace_kvm_cpu_exec();
3042  
3043      if (kvm_arch_process_async_events(cpu)) {
3044          qatomic_set(&cpu->exit_request, 0);
3045          return EXCP_HLT;
3046      }
3047  
3048      bql_unlock();
3049      cpu_exec_start(cpu);
3050  
3051      do {
3052          MemTxAttrs attrs;
3053  
3054          if (cpu->vcpu_dirty) {
3055              ret = kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
3056              if (ret) {
3057                  error_report("Failed to put registers after init: %s",
3058                               strerror(-ret));
3059                  ret = -1;
3060                  break;
3061              }
3062  
3063              cpu->vcpu_dirty = false;
3064          }
3065  
3066          kvm_arch_pre_run(cpu, run);
3067          if (qatomic_read(&cpu->exit_request)) {
3068              trace_kvm_interrupt_exit_request();
3069              /*
3070               * KVM requires us to reenter the kernel after IO exits to complete
3071               * instruction emulation. This self-signal will ensure that we
3072               * leave ASAP again.
3073               */
3074              kvm_cpu_kick_self();
3075          }
3076  
3077          /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
3078           * Matching barrier in kvm_eat_signals.
3079           */
3080          smp_rmb();
3081  
3082          run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
3083  
3084          attrs = kvm_arch_post_run(cpu, run);
3085  
3086  #ifdef KVM_HAVE_MCE_INJECTION
3087          if (unlikely(have_sigbus_pending)) {
3088              bql_lock();
3089              kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
3090                                      pending_sigbus_addr);
3091              have_sigbus_pending = false;
3092              bql_unlock();
3093          }
3094  #endif
3095  
3096          if (run_ret < 0) {
3097              if (run_ret == -EINTR || run_ret == -EAGAIN) {
3098                  trace_kvm_io_window_exit();
3099                  kvm_eat_signals(cpu);
3100                  ret = EXCP_INTERRUPT;
3101                  break;
3102              }
3103              if (!(run_ret == -EFAULT && run->exit_reason == KVM_EXIT_MEMORY_FAULT)) {
3104                  fprintf(stderr, "error: kvm run failed %s\n",
3105                          strerror(-run_ret));
3106  #ifdef TARGET_PPC
3107                  if (run_ret == -EBUSY) {
3108                      fprintf(stderr,
3109                              "This is probably because your SMT is enabled.\n"
3110                              "VCPU can only run on primary threads with all "
3111                              "secondary threads offline.\n");
3112                  }
3113  #endif
3114                  ret = -1;
3115                  break;
3116              }
3117          }
3118  
3119          trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
3120          switch (run->exit_reason) {
3121          case KVM_EXIT_IO:
3122              /* Called outside BQL */
3123              kvm_handle_io(run->io.port, attrs,
3124                            (uint8_t *)run + run->io.data_offset,
3125                            run->io.direction,
3126                            run->io.size,
3127                            run->io.count);
3128              ret = 0;
3129              break;
3130          case KVM_EXIT_MMIO:
3131              /* Called outside BQL */
3132              address_space_rw(&address_space_memory,
3133                               run->mmio.phys_addr, attrs,
3134                               run->mmio.data,
3135                               run->mmio.len,
3136                               run->mmio.is_write);
3137              ret = 0;
3138              break;
3139          case KVM_EXIT_IRQ_WINDOW_OPEN:
3140              ret = EXCP_INTERRUPT;
3141              break;
3142          case KVM_EXIT_SHUTDOWN:
3143              qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
3144              ret = EXCP_INTERRUPT;
3145              break;
3146          case KVM_EXIT_UNKNOWN:
3147              fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
3148                      (uint64_t)run->hw.hardware_exit_reason);
3149              ret = -1;
3150              break;
3151          case KVM_EXIT_INTERNAL_ERROR:
3152              ret = kvm_handle_internal_error(cpu, run);
3153              break;
3154          case KVM_EXIT_DIRTY_RING_FULL:
3155              /*
3156               * We shouldn't continue if the dirty ring of this vcpu is
3157               * still full.  Got kicked by KVM_RESET_DIRTY_RINGS.
3158               */
3159              trace_kvm_dirty_ring_full(cpu->cpu_index);
3160              bql_lock();
3161              /*
3162               * We throttle the vCPU by making it sleep once it exits the
3163               * kernel due to a full dirty ring. In the dirtylimit scenario,
3164               * reaping all vCPUs after a single vCPU's ring gets full would
3165               * miss that sleep, so only reap the vCPU whose ring is full.
3166               */
3167              if (dirtylimit_in_service()) {
3168                  kvm_dirty_ring_reap(kvm_state, cpu);
3169              } else {
3170                  kvm_dirty_ring_reap(kvm_state, NULL);
3171              }
3172              bql_unlock();
3173              dirtylimit_vcpu_execute(cpu);
3174              ret = 0;
3175              break;
3176          case KVM_EXIT_SYSTEM_EVENT:
3177              trace_kvm_run_exit_system_event(cpu->cpu_index, run->system_event.type);
3178              switch (run->system_event.type) {
3179              case KVM_SYSTEM_EVENT_SHUTDOWN:
3180                  qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
3181                  ret = EXCP_INTERRUPT;
3182                  break;
3183              case KVM_SYSTEM_EVENT_RESET:
3184                  qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
3185                  ret = EXCP_INTERRUPT;
3186                  break;
3187              case KVM_SYSTEM_EVENT_CRASH:
3188                  kvm_cpu_synchronize_state(cpu);
3189                  bql_lock();
3190                  qemu_system_guest_panicked(cpu_get_crash_info(cpu));
3191                  bql_unlock();
3192                  ret = 0;
3193                  break;
3194              default:
3195                  ret = kvm_arch_handle_exit(cpu, run);
3196                  break;
3197              }
3198              break;
3199          case KVM_EXIT_MEMORY_FAULT:
3200              trace_kvm_memory_fault(run->memory_fault.gpa,
3201                                     run->memory_fault.size,
3202                                     run->memory_fault.flags);
3203              if (run->memory_fault.flags & ~KVM_MEMORY_EXIT_FLAG_PRIVATE) {
3204                  error_report("KVM_EXIT_MEMORY_FAULT: Unknown flag 0x%" PRIx64,
3205                               (uint64_t)run->memory_fault.flags);
3206                  ret = -1;
3207                  break;
3208              }
3209              ret = kvm_convert_memory(run->memory_fault.gpa, run->memory_fault.size,
3210                                       run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE);
3211              break;
3212          default:
3213              ret = kvm_arch_handle_exit(cpu, run);
3214              break;
3215          }
3216      } while (ret == 0);
3217  
3218      cpu_exec_end(cpu);
3219      bql_lock();
3220  
3221      if (ret < 0) {
3222          cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
3223          vm_stop(RUN_STATE_INTERNAL_ERROR);
3224      }
3225  
3226      qatomic_set(&cpu->exit_request, 0);
3227      return ret;
3228  }
3229  
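/*
 * Thin wrappers around ioctl(2) on the /dev/kvm, VM, vCPU and device file
 * descriptors.  They return -errno instead of -1 on failure.  The VM and
 * device variants bracket the call with accel_ioctl_begin()/end(), and the
 * vCPU variant with accel_cpu_ioctl_begin()/end(), so that memslot
 * transactions can temporarily inhibit in-flight ioctls.
 */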
3230  int kvm_ioctl(KVMState *s, int type, ...)
3231  {
3232      int ret;
3233      void *arg;
3234      va_list ap;
3235  
3236      va_start(ap, type);
3237      arg = va_arg(ap, void *);
3238      va_end(ap);
3239  
3240      trace_kvm_ioctl(type, arg);
3241      ret = ioctl(s->fd, type, arg);
3242      if (ret == -1) {
3243          ret = -errno;
3244      }
3245      return ret;
3246  }
3247  
3248  int kvm_vm_ioctl(KVMState *s, int type, ...)
3249  {
3250      int ret;
3251      void *arg;
3252      va_list ap;
3253  
3254      va_start(ap, type);
3255      arg = va_arg(ap, void *);
3256      va_end(ap);
3257  
3258      trace_kvm_vm_ioctl(type, arg);
3259      accel_ioctl_begin();
3260      ret = ioctl(s->vmfd, type, arg);
3261      accel_ioctl_end();
3262      if (ret == -1) {
3263          ret = -errno;
3264      }
3265      return ret;
3266  }
3267  
3268  int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
3269  {
3270      int ret;
3271      void *arg;
3272      va_list ap;
3273  
3274      va_start(ap, type);
3275      arg = va_arg(ap, void *);
3276      va_end(ap);
3277  
3278      trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
3279      accel_cpu_ioctl_begin(cpu);
3280      ret = ioctl(cpu->kvm_fd, type, arg);
3281      accel_cpu_ioctl_end(cpu);
3282      if (ret == -1) {
3283          ret = -errno;
3284      }
3285      return ret;
3286  }
3287  
3288  int kvm_device_ioctl(int fd, int type, ...)
3289  {
3290      int ret;
3291      void *arg;
3292      va_list ap;
3293  
3294      va_start(ap, type);
3295      arg = va_arg(ap, void *);
3296      va_end(ap);
3297  
3298      trace_kvm_device_ioctl(fd, type, arg);
3299      accel_ioctl_begin();
3300      ret = ioctl(fd, type, arg);
3301      accel_ioctl_end();
3302      if (ret == -1) {
3303          ret = -errno;
3304      }
3305      return ret;
3306  }
3307  
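/*
 * Returns 1 if the VM-scoped device attribute identified by 'group'/'attr'
 * exists, 0 otherwise (including when KVM_CAP_VM_ATTRIBUTES is unavailable).
 */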
3308  int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
3309  {
3310      int ret;
3311      struct kvm_device_attr attribute = {
3312          .group = group,
3313          .attr = attr,
3314      };
3315  
3316      if (!kvm_vm_attributes_allowed) {
3317          return 0;
3318      }
3319  
3320      ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
3321      /* KVM returns 0 on success for KVM_HAS_DEVICE_ATTR */
3322      return ret ? 0 : 1;
3323  }
3324  
3325  int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
3326  {
3327      struct kvm_device_attr attribute = {
3328          .group = group,
3329          .attr = attr,
3330          .flags = 0,
3331      };
3332  
3333      return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
3334  }
3335  
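/*
 * Read ('write' == false) or write ('write' == true) the device attribute
 * identified by 'group'/'attr' through the buffer at 'val'.  Returns a
 * negative errno and sets 'errp' on failure.
 *
 * A minimal usage sketch (the fd/group/attr names are illustrative only):
 *
 *     uint32_t nr_irqs;
 *     if (kvm_device_access(dev_fd, group, attr, &nr_irqs, false, &err) < 0) {
 *         ... report 'err' ...
 *     }
 */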
3336  int kvm_device_access(int fd, int group, uint64_t attr,
3337                        void *val, bool write, Error **errp)
3338  {
3339      struct kvm_device_attr kvmattr;
3340      int err;
3341  
3342      kvmattr.flags = 0;
3343      kvmattr.group = group;
3344      kvmattr.attr = attr;
3345      kvmattr.addr = (uintptr_t)val;
3346  
3347      err = kvm_device_ioctl(fd,
3348                             write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
3349                             &kvmattr);
3350      if (err < 0) {
3351          error_setg_errno(errp, -err,
3352                           "KVM_%s_DEVICE_ATTR failed: Group %d "
3353                           "attr 0x%016" PRIx64,
3354                           write ? "SET" : "GET", group, attr);
3355      }
3356      return err;
3357  }
3358  
3359  bool kvm_has_sync_mmu(void)
3360  {
3361      return kvm_state->sync_mmu;
3362  }
3363  
3364  int kvm_has_vcpu_events(void)
3365  {
3366      return kvm_state->vcpu_events;
3367  }
3368  
3369  int kvm_max_nested_state_length(void)
3370  {
3371      return kvm_state->max_nested_state_len;
3372  }
3373  
3374  int kvm_has_gsi_routing(void)
3375  {
3376  #ifdef KVM_CAP_IRQ_ROUTING
3377      return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
3378  #else
3379      return false;
3380  #endif
3381  }
3382  
3383  bool kvm_arm_supports_user_irq(void)
3384  {
3385      return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
3386  }
3387  
3388  #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
3389  struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, vaddr pc)
3390  {
3391      struct kvm_sw_breakpoint *bp;
3392  
3393      QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
3394          if (bp->pc == pc) {
3395              return bp;
3396          }
3397      }
3398      return NULL;
3399  }
3400  
3401  int kvm_sw_breakpoints_active(CPUState *cpu)
3402  {
3403      return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
3404  }
3405  
3406  struct kvm_set_guest_debug_data {
3407      struct kvm_guest_debug dbg;
3408      int err;
3409  };
3410  
3411  static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
3412  {
3413      struct kvm_set_guest_debug_data *dbg_data =
3414          (struct kvm_set_guest_debug_data *) data.host_ptr;
3415  
3416      dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
3417                                     &dbg_data->dbg);
3418  }
3419  
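/*
 * Rebuild the KVM_SET_GUEST_DEBUG control word for 'cpu' from 'reinject_trap',
 * the current single-step state and any architecture-specific flags, then
 * apply it on the vCPU's own thread via run_on_cpu().
 */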
3420  int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
3421  {
3422      struct kvm_set_guest_debug_data data;
3423  
3424      data.dbg.control = reinject_trap;
3425  
3426      if (cpu->singlestep_enabled) {
3427          data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
3428  
3429          if (cpu->singlestep_enabled & SSTEP_NOIRQ) {
3430              data.dbg.control |= KVM_GUESTDBG_BLOCKIRQ;
3431          }
3432      }
3433      kvm_arch_update_guest_debug(cpu, &data.dbg);
3434  
3435      run_on_cpu(cpu, kvm_invoke_set_guest_debug,
3436                 RUN_ON_CPU_HOST_PTR(&data));
3437      return data.err;
3438  }
3439  
3440  bool kvm_supports_guest_debug(void)
3441  {
3442      /* probed during kvm_init() */
3443      return kvm_has_guest_debug;
3444  }
3445  
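/*
 * Software breakpoints are reference-counted and tracked per VM in
 * kvm_sw_breakpoints; hardware breakpoints are handled entirely by the
 * architecture code.  Either way, the guest-debug state of every vCPU is
 * refreshed afterwards.
 */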
3446  int kvm_insert_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
3447  {
3448      struct kvm_sw_breakpoint *bp;
3449      int err;
3450  
3451      if (type == GDB_BREAKPOINT_SW) {
3452          bp = kvm_find_sw_breakpoint(cpu, addr);
3453          if (bp) {
3454              bp->use_count++;
3455              return 0;
3456          }
3457  
3458          bp = g_new(struct kvm_sw_breakpoint, 1);
3459          bp->pc = addr;
3460          bp->use_count = 1;
3461          err = kvm_arch_insert_sw_breakpoint(cpu, bp);
3462          if (err) {
3463              g_free(bp);
3464              return err;
3465          }
3466  
3467          QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
3468      } else {
3469          err = kvm_arch_insert_hw_breakpoint(addr, len, type);
3470          if (err) {
3471              return err;
3472          }
3473      }
3474  
3475      CPU_FOREACH(cpu) {
3476          err = kvm_update_guest_debug(cpu, 0);
3477          if (err) {
3478              return err;
3479          }
3480      }
3481      return 0;
3482  }
3483  
3484  int kvm_remove_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
3485  {
3486      struct kvm_sw_breakpoint *bp;
3487      int err;
3488  
3489      if (type == GDB_BREAKPOINT_SW) {
3490          bp = kvm_find_sw_breakpoint(cpu, addr);
3491          if (!bp) {
3492              return -ENOENT;
3493          }
3494  
3495          if (bp->use_count > 1) {
3496              bp->use_count--;
3497              return 0;
3498          }
3499  
3500          err = kvm_arch_remove_sw_breakpoint(cpu, bp);
3501          if (err) {
3502              return err;
3503          }
3504  
3505          QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
3506          g_free(bp);
3507      } else {
3508          err = kvm_arch_remove_hw_breakpoint(addr, len, type);
3509          if (err) {
3510              return err;
3511          }
3512      }
3513  
3514      CPU_FOREACH(cpu) {
3515          err = kvm_update_guest_debug(cpu, 0);
3516          if (err) {
3517              return err;
3518          }
3519      }
3520      return 0;
3521  }
3522  
3523  void kvm_remove_all_breakpoints(CPUState *cpu)
3524  {
3525      struct kvm_sw_breakpoint *bp, *next;
3526      KVMState *s = cpu->kvm_state;
3527      CPUState *tmpcpu;
3528  
3529      QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
3530          if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
3531              /* Try harder to find a CPU that currently sees the breakpoint. */
3532              CPU_FOREACH(tmpcpu) {
3533                  if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
3534                      break;
3535                  }
3536              }
3537          }
3538          QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
3539          g_free(bp);
3540      }
3541      kvm_arch_remove_all_hw_breakpoints();
3542  
3543      CPU_FOREACH(cpu) {
3544          kvm_update_guest_debug(cpu, 0);
3545      }
3546  }
3547  
3548  #endif /* TARGET_KVM_HAVE_GUEST_DEBUG */
3549  
3550  static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
3551  {
3552      KVMState *s = kvm_state;
3553      struct kvm_signal_mask *sigmask;
3554      int r;
3555  
3556      sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
3557  
3558      sigmask->len = s->sigmask_len;
3559      memcpy(sigmask->sigset, sigset, sizeof(*sigset));
3560      r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
3561      g_free(sigmask);
3562  
3563      return r;
3564  }
3565  
3566  static void kvm_ipi_signal(int sig)
3567  {
3568      if (current_cpu) {
3569          assert(kvm_immediate_exit);
3570          kvm_cpu_kick(current_cpu);
3571      }
3572  }
3573  
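/*
 * Set up SIG_IPI handling for a vCPU thread.  With KVM_CAP_IMMEDIATE_EXIT
 * the signal is left unblocked and the handler merely kicks the vCPU;
 * otherwise the unblocked mask is installed with KVM_SET_SIGNAL_MASK so the
 * signal is only delivered while the thread is inside KVM_RUN.
 */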
3574  void kvm_init_cpu_signals(CPUState *cpu)
3575  {
3576      int r;
3577      sigset_t set;
3578      struct sigaction sigact;
3579  
3580      memset(&sigact, 0, sizeof(sigact));
3581      sigact.sa_handler = kvm_ipi_signal;
3582      sigaction(SIG_IPI, &sigact, NULL);
3583  
3584      pthread_sigmask(SIG_BLOCK, NULL, &set);
3585  #if defined KVM_HAVE_MCE_INJECTION
3586      sigdelset(&set, SIGBUS);
3587      pthread_sigmask(SIG_SETMASK, &set, NULL);
3588  #endif
3589      sigdelset(&set, SIG_IPI);
3590      if (kvm_immediate_exit) {
3591          r = pthread_sigmask(SIG_SETMASK, &set, NULL);
3592      } else {
3593          r = kvm_set_signal_mask(cpu, &set);
3594      }
3595      if (r) {
3596          fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
3597          exit(1);
3598      }
3599  }
3600  
3601  /* Called asynchronously in VCPU thread.  */
3602  int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
3603  {
3604  #ifdef KVM_HAVE_MCE_INJECTION
3605      if (have_sigbus_pending) {
3606          return 1;
3607      }
3608      have_sigbus_pending = true;
3609      pending_sigbus_addr = addr;
3610      pending_sigbus_code = code;
3611      qatomic_set(&cpu->exit_request, 1);
3612      return 0;
3613  #else
3614      return 1;
3615  #endif
3616  }
3617  
3618  /* Called synchronously (via signalfd) in main thread.  */
3619  int kvm_on_sigbus(int code, void *addr)
3620  {
3621  #ifdef KVM_HAVE_MCE_INJECTION
3622      /* An action-required MCE kills the process when SIGBUS is blocked, and
3623       * SIGBUS is blocked in the I/O thread where we handle MCE via signalfd,
3624       * so only action-optional events can be seen here.
3625       */
3626      assert(code != BUS_MCEERR_AR);
3627      kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
3628      return 0;
3629  #else
3630      return 1;
3631  #endif
3632  }
3633  
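/*
 * Create an in-kernel device of 'type'.  When 'test' is set, only the
 * availability is probed via KVM_CREATE_DEVICE_TEST.  Returns the new device
 * fd (or 0 in test mode) on success, -ENOTSUP without KVM_CAP_DEVICE_CTRL,
 * or a negative errno otherwise.
 */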
3634  int kvm_create_device(KVMState *s, uint64_t type, bool test)
3635  {
3636      int ret;
3637      struct kvm_create_device create_dev;
3638  
3639      create_dev.type = type;
3640      create_dev.fd = -1;
3641      create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
3642  
3643      if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
3644          return -ENOTSUP;
3645      }
3646  
3647      ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
3648      if (ret) {
3649          return ret;
3650      }
3651  
3652      return test ? 0 : create_dev.fd;
3653  }
3654  
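/*
 * Probe whether an in-kernel device of 'type' can be created on 'vmfd',
 * using KVM_CREATE_DEVICE_TEST so that no device is actually instantiated.
 */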
3655  bool kvm_device_supported(int vmfd, uint64_t type)
3656  {
3657      struct kvm_create_device create_dev = {
3658          .type = type,
3659          .fd = -1,
3660          .flags = KVM_CREATE_DEVICE_TEST,
3661      };
3662  
3663      if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
3664          return false;
3665      }
3666  
3667      return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
3668  }
3669  
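/*
 * KVM_SET_ONE_REG/KVM_GET_ONE_REG wrappers: 'id' selects the register (and
 * encodes its size) while 'source'/'target' point at the value buffer.
 * Failures are traced and returned as negative errno values, e.g.:
 *
 *     uint64_t val;
 *     int r = kvm_get_one_reg(cs, reg_id, &val);   // reg_id: arch-specific
 */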
3670  int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
3671  {
3672      struct kvm_one_reg reg;
3673      int r;
3674  
3675      reg.id = id;
3676      reg.addr = (uintptr_t) source;
3677      r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
3678      if (r) {
3679          trace_kvm_failed_reg_set(id, strerror(-r));
3680      }
3681      return r;
3682  }
3683  
3684  int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
3685  {
3686      struct kvm_one_reg reg;
3687      int r;
3688  
3689      reg.id = id;
3690      reg.addr = (uintptr_t) target;
3691      r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
3692      if (r) {
3693          trace_kvm_failed_reg_get(id, strerror(-r));
3694      }
3695      return r;
3696  }
3697  
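/*
 * AccelClass::has_memory hook: report whether the range starting at
 * 'start_addr' is currently covered by a KVM memslot in address space 'as'
 * (the lookup is clamped to kvm_max_slot_size).
 */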
3698  static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
3699                                   hwaddr start_addr, hwaddr size)
3700  {
3701      KVMState *kvm = KVM_STATE(ms->accelerator);
3702      int i;
3703  
3704      for (i = 0; i < kvm->nr_as; ++i) {
3705          if (kvm->as[i].as == as && kvm->as[i].ml) {
3706              size = MIN(kvm_max_slot_size, size);
3707              return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
3708                                                      start_addr, size);
3709          }
3710      }
3711  
3712      return false;
3713  }
3714  
3715  static void kvm_get_kvm_shadow_mem(Object *obj, Visitor *v,
3716                                     const char *name, void *opaque,
3717                                     Error **errp)
3718  {
3719      KVMState *s = KVM_STATE(obj);
3720      int64_t value = s->kvm_shadow_mem;
3721  
3722      visit_type_int(v, name, &value, errp);
3723  }
3724  
3725  static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v,
3726                                     const char *name, void *opaque,
3727                                     Error **errp)
3728  {
3729      KVMState *s = KVM_STATE(obj);
3730      int64_t value;
3731  
3732      if (s->fd != -1) {
3733          error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3734          return;
3735      }
3736  
3737      if (!visit_type_int(v, name, &value, errp)) {
3738          return;
3739      }
3740  
3741      s->kvm_shadow_mem = value;
3742  }
3743  
3744  static void kvm_set_kernel_irqchip(Object *obj, Visitor *v,
3745                                     const char *name, void *opaque,
3746                                     Error **errp)
3747  {
3748      KVMState *s = KVM_STATE(obj);
3749      OnOffSplit mode;
3750  
3751      if (s->fd != -1) {
3752          error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3753          return;
3754      }
3755  
3756      if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
3757          return;
3758      }
3759      switch (mode) {
3760      case ON_OFF_SPLIT_ON:
3761          s->kernel_irqchip_allowed = true;
3762          s->kernel_irqchip_required = true;
3763          s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
3764          break;
3765      case ON_OFF_SPLIT_OFF:
3766          s->kernel_irqchip_allowed = false;
3767          s->kernel_irqchip_required = false;
3768          s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
3769          break;
3770      case ON_OFF_SPLIT_SPLIT:
3771          s->kernel_irqchip_allowed = true;
3772          s->kernel_irqchip_required = true;
3773          s->kernel_irqchip_split = ON_OFF_AUTO_ON;
3774          break;
3775      default:
3776          /* The value was checked in visit_type_OnOffSplit() above. If
3777           * we get here, then something is wrong in QEMU.
3778           */
3779          abort();
3780      }
3781  }
3782  
3783  bool kvm_kernel_irqchip_allowed(void)
3784  {
3785      return kvm_state->kernel_irqchip_allowed;
3786  }
3787  
3788  bool kvm_kernel_irqchip_required(void)
3789  {
3790      return kvm_state->kernel_irqchip_required;
3791  }
3792  
3793  bool kvm_kernel_irqchip_split(void)
3794  {
3795      return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON;
3796  }
3797  
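/*
 * "dirty-ring-size" accelerator property: number of dirty-ring entries per
 * vCPU.  The value must be a power of two; 0 (the default) keeps dirty
 * tracking on the bitmap path.
 */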
3798  static void kvm_get_dirty_ring_size(Object *obj, Visitor *v,
3799                                      const char *name, void *opaque,
3800                                      Error **errp)
3801  {
3802      KVMState *s = KVM_STATE(obj);
3803      uint32_t value = s->kvm_dirty_ring_size;
3804  
3805      visit_type_uint32(v, name, &value, errp);
3806  }
3807  
3808  static void kvm_set_dirty_ring_size(Object *obj, Visitor *v,
3809                                      const char *name, void *opaque,
3810                                      Error **errp)
3811  {
3812      KVMState *s = KVM_STATE(obj);
3813      uint32_t value;
3814  
3815      if (s->fd != -1) {
3816          error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3817          return;
3818      }
3819  
3820      if (!visit_type_uint32(v, name, &value, errp)) {
3821          return;
3822      }
3823      if (value & (value - 1)) {
3824          error_setg(errp, "dirty-ring-size must be a power of two.");
3825          return;
3826      }
3827  
3828      s->kvm_dirty_ring_size = value;
3829  }
3830  
3831  static char *kvm_get_device(Object *obj,
3832                              Error **errp G_GNUC_UNUSED)
3833  {
3834      KVMState *s = KVM_STATE(obj);
3835  
3836      return g_strdup(s->device);
3837  }
3838  
3839  static void kvm_set_device(Object *obj,
3840                             const char *value,
3841                             Error **errp G_GNUC_UNUSED)
3842  {
3843      KVMState *s = KVM_STATE(obj);
3844  
3845      g_free(s->device);
3846      s->device = g_strdup(value);
3847  }
3848  
3849  static void kvm_set_kvm_rapl(Object *obj, bool value, Error **errp)
3850  {
3851      KVMState *s = KVM_STATE(obj);
3852      s->msr_energy.enable = value;
3853  }
3854  
3855  static void kvm_set_kvm_rapl_socket_path(Object *obj,
3856                                           const char *str,
3857                                           Error **errp)
3858  {
3859      KVMState *s = KVM_STATE(obj);
3860      g_free(s->msr_energy.socket_path);
3861      s->msr_energy.socket_path = g_strdup(str);
3862  }
3863  
3864  static void kvm_accel_instance_init(Object *obj)
3865  {
3866      KVMState *s = KVM_STATE(obj);
3867  
3868      s->fd = -1;
3869      s->vmfd = -1;
3870      s->kvm_shadow_mem = -1;
3871      s->kernel_irqchip_allowed = true;
3872      s->kernel_irqchip_split = ON_OFF_AUTO_AUTO;
3873      /* The KVM dirty ring is off by default */
3874      s->kvm_dirty_ring_size = 0;
3875      s->kvm_dirty_ring_with_bitmap = false;
3876      s->kvm_eager_split_size = 0;
3877      s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN;
3878      s->notify_window = 0;
3879      s->xen_version = 0;
3880      s->xen_gnttab_max_frames = 64;
3881      s->xen_evtchn_max_pirq = 256;
3882      s->device = NULL;
3883      s->msr_energy.enable = false;
3884  }
3885  
3886  /**
3887   * kvm_gdbstub_sstep_flags():
3888   *
3889   * Returns: SSTEP_* flags that KVM supports for guest debug. The
3890   * support is probed during kvm_init()
3891   */
3892  static int kvm_gdbstub_sstep_flags(void)
3893  {
3894      return kvm_sstep_flags;
3895  }
3896  
3897  static void kvm_accel_class_init(ObjectClass *oc, void *data)
3898  {
3899      AccelClass *ac = ACCEL_CLASS(oc);
3900      ac->name = "KVM";
3901      ac->init_machine = kvm_init;
3902      ac->has_memory = kvm_accel_has_memory;
3903      ac->allowed = &kvm_allowed;
3904      ac->gdbstub_supported_sstep_flags = kvm_gdbstub_sstep_flags;
3905  
3906      object_class_property_add(oc, "kernel-irqchip", "on|off|split",
3907          NULL, kvm_set_kernel_irqchip,
3908          NULL, NULL);
3909      object_class_property_set_description(oc, "kernel-irqchip",
3910          "Configure KVM in-kernel irqchip");
3911  
3912      object_class_property_add(oc, "kvm-shadow-mem", "int",
3913          kvm_get_kvm_shadow_mem, kvm_set_kvm_shadow_mem,
3914          NULL, NULL);
3915      object_class_property_set_description(oc, "kvm-shadow-mem",
3916          "KVM shadow MMU size");
3917  
3918      object_class_property_add(oc, "dirty-ring-size", "uint32",
3919          kvm_get_dirty_ring_size, kvm_set_dirty_ring_size,
3920          NULL, NULL);
3921      object_class_property_set_description(oc, "dirty-ring-size",
3922          "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)");
3923  
3924      object_class_property_add_str(oc, "device", kvm_get_device, kvm_set_device);
3925      object_class_property_set_description(oc, "device",
3926          "Path to the device node to use (default: /dev/kvm)");
3927  
3928      object_class_property_add_bool(oc, "rapl",
3929                                     NULL,
3930                                     kvm_set_kvm_rapl);
3931      object_class_property_set_description(oc, "rapl",
3932          "Allow energy-related MSRs for the RAPL interface in the guest");
3933  
3934      object_class_property_add_str(oc, "rapl-helper-socket", NULL,
3935                                    kvm_set_kvm_rapl_socket_path);
3936      object_class_property_set_description(oc, "rapl-helper-socket",
3937          "Socket path for communicating with the Virtual MSR helper daemon");
3938  
3939      kvm_arch_accel_class_init(oc);
3940  }
3941  
3942  static const TypeInfo kvm_accel_type = {
3943      .name = TYPE_KVM_ACCEL,
3944      .parent = TYPE_ACCEL,
3945      .instance_init = kvm_accel_instance_init,
3946      .class_init = kvm_accel_class_init,
3947      .instance_size = sizeof(KVMState),
3948  };
3949  
3950  static void kvm_type_init(void)
3951  {
3952      type_register_static(&kvm_accel_type);
3953  }
3954  
3955  type_init(kvm_type_init);
3956  
3957  typedef struct StatsArgs {
3958      union StatsResultsType {
3959          StatsResultList **stats;
3960          StatsSchemaList **schema;
3961      } result;
3962      strList *names;
3963      Error **errp;
3964  } StatsArgs;
3965  
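/*
 * Convert one binary stats descriptor plus its data into a QAPI Stats entry
 * and prepend it to 'stats_list'.  Descriptors with a type, unit or base
 * that QEMU does not know about are silently skipped.
 */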
3966  static StatsList *add_kvmstat_entry(struct kvm_stats_desc *pdesc,
3967                                      uint64_t *stats_data,
3968                                      StatsList *stats_list,
3969                                      Error **errp)
3970  {
3971  
3972      Stats *stats;
3973      uint64List *val_list = NULL;
3974  
3975      /* Only add stats that we understand.  */
3976      switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
3977      case KVM_STATS_TYPE_CUMULATIVE:
3978      case KVM_STATS_TYPE_INSTANT:
3979      case KVM_STATS_TYPE_PEAK:
3980      case KVM_STATS_TYPE_LINEAR_HIST:
3981      case KVM_STATS_TYPE_LOG_HIST:
3982          break;
3983      default:
3984          return stats_list;
3985      }
3986  
3987      switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
3988      case KVM_STATS_UNIT_NONE:
3989      case KVM_STATS_UNIT_BYTES:
3990      case KVM_STATS_UNIT_CYCLES:
3991      case KVM_STATS_UNIT_SECONDS:
3992      case KVM_STATS_UNIT_BOOLEAN:
3993          break;
3994      default:
3995          return stats_list;
3996      }
3997  
3998      switch (pdesc->flags & KVM_STATS_BASE_MASK) {
3999      case KVM_STATS_BASE_POW10:
4000      case KVM_STATS_BASE_POW2:
4001          break;
4002      default:
4003          return stats_list;
4004      }
4005  
4006      /* Alloc and populate data list */
4007      stats = g_new0(Stats, 1);
4008      stats->name = g_strdup(pdesc->name);
4009      stats->value = g_new0(StatsValue, 1);
4010  
4011      if ((pdesc->flags & KVM_STATS_UNIT_MASK) == KVM_STATS_UNIT_BOOLEAN) {
4012          stats->value->u.boolean = *stats_data;
4013          stats->value->type = QTYPE_QBOOL;
4014      } else if (pdesc->size == 1) {
4015          stats->value->u.scalar = *stats_data;
4016          stats->value->type = QTYPE_QNUM;
4017      } else {
4018          int i;
4019          for (i = 0; i < pdesc->size; i++) {
4020              QAPI_LIST_PREPEND(val_list, stats_data[i]);
4021          }
4022          stats->value->u.list = val_list;
4023          stats->value->type = QTYPE_QLIST;
4024      }
4025  
4026      QAPI_LIST_PREPEND(stats_list, stats);
4027      return stats_list;
4028  }
4029  
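/*
 * Translate one stats descriptor into a StatsSchemaValue entry, mapping the
 * KVM type/unit/base encoding onto the QAPI enums; descriptors with unknown
 * encodings are dropped and the list is returned unchanged.
 */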
4030  static StatsSchemaValueList *add_kvmschema_entry(struct kvm_stats_desc *pdesc,
4031                                                   StatsSchemaValueList *list,
4032                                                   Error **errp)
4033  {
4034      StatsSchemaValueList *schema_entry = g_new0(StatsSchemaValueList, 1);
4035      schema_entry->value = g_new0(StatsSchemaValue, 1);
4036  
4037      switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
4038      case KVM_STATS_TYPE_CUMULATIVE:
4039          schema_entry->value->type = STATS_TYPE_CUMULATIVE;
4040          break;
4041      case KVM_STATS_TYPE_INSTANT:
4042          schema_entry->value->type = STATS_TYPE_INSTANT;
4043          break;
4044      case KVM_STATS_TYPE_PEAK:
4045          schema_entry->value->type = STATS_TYPE_PEAK;
4046          break;
4047      case KVM_STATS_TYPE_LINEAR_HIST:
4048          schema_entry->value->type = STATS_TYPE_LINEAR_HISTOGRAM;
4049          schema_entry->value->bucket_size = pdesc->bucket_size;
4050          schema_entry->value->has_bucket_size = true;
4051          break;
4052      case KVM_STATS_TYPE_LOG_HIST:
4053          schema_entry->value->type = STATS_TYPE_LOG2_HISTOGRAM;
4054          break;
4055      default:
4056          goto exit;
4057      }
4058  
4059      switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
4060      case KVM_STATS_UNIT_NONE:
4061          break;
4062      case KVM_STATS_UNIT_BOOLEAN:
4063          schema_entry->value->has_unit = true;
4064          schema_entry->value->unit = STATS_UNIT_BOOLEAN;
4065          break;
4066      case KVM_STATS_UNIT_BYTES:
4067          schema_entry->value->has_unit = true;
4068          schema_entry->value->unit = STATS_UNIT_BYTES;
4069          break;
4070      case KVM_STATS_UNIT_CYCLES:
4071          schema_entry->value->has_unit = true;
4072          schema_entry->value->unit = STATS_UNIT_CYCLES;
4073          break;
4074      case KVM_STATS_UNIT_SECONDS:
4075          schema_entry->value->has_unit = true;
4076          schema_entry->value->unit = STATS_UNIT_SECONDS;
4077          break;
4078      default:
4079          goto exit;
4080      }
4081  
4082      schema_entry->value->exponent = pdesc->exponent;
4083      if (pdesc->exponent) {
4084          switch (pdesc->flags & KVM_STATS_BASE_MASK) {
4085          case KVM_STATS_BASE_POW10:
4086              schema_entry->value->has_base = true;
4087              schema_entry->value->base = 10;
4088              break;
4089          case KVM_STATS_BASE_POW2:
4090              schema_entry->value->has_base = true;
4091              schema_entry->value->base = 2;
4092              break;
4093          default:
4094              goto exit;
4095          }
4096      }
4097  
4098      schema_entry->value->name = g_strdup(pdesc->name);
4099      schema_entry->next = list;
4100      return schema_entry;
4101  exit:
4102      g_free(schema_entry->value);
4103      g_free(schema_entry);
4104      return list;
4105  }
4106  
4107  /* Cached stats descriptors */
4108  typedef struct StatsDescriptors {
4109      const char *ident; /* cache key, currently the StatsTarget */
4110      struct kvm_stats_desc *kvm_stats_desc;
4111      struct kvm_stats_header kvm_stats_header;
4112      QTAILQ_ENTRY(StatsDescriptors) next;
4113  } StatsDescriptors;
4114  
4115  static QTAILQ_HEAD(, StatsDescriptors) stats_descriptors =
4116      QTAILQ_HEAD_INITIALIZER(stats_descriptors);
4117  
4118  /*
4119   * Return the descriptors for 'target': either ones already cached from a
4120   * previous read, or ones freshly retrieved from 'stats_fd'.
4121   */
4122  static StatsDescriptors *find_stats_descriptors(StatsTarget target, int stats_fd,
4123                                                  Error **errp)
4124  {
4125      StatsDescriptors *descriptors;
4126      const char *ident;
4127      struct kvm_stats_desc *kvm_stats_desc;
4128      struct kvm_stats_header *kvm_stats_header;
4129      size_t size_desc;
4130      ssize_t ret;
4131  
4132      ident = StatsTarget_str(target);
4133      QTAILQ_FOREACH(descriptors, &stats_descriptors, next) {
4134          if (g_str_equal(descriptors->ident, ident)) {
4135              return descriptors;
4136          }
4137      }
4138  
4139      descriptors = g_new0(StatsDescriptors, 1);
4140  
4141      /* Read stats header */
4142      kvm_stats_header = &descriptors->kvm_stats_header;
4143      ret = pread(stats_fd, kvm_stats_header, sizeof(*kvm_stats_header), 0);
4144      if (ret != sizeof(*kvm_stats_header)) {
4145          error_setg(errp, "KVM stats: failed to read stats header: "
4146                     "expected %zu actual %zu",
4147                     sizeof(*kvm_stats_header), ret);
4148          g_free(descriptors);
4149          return NULL;
4150      }
4151      size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4152  
4153      /* Read stats descriptors */
4154      kvm_stats_desc = g_malloc0_n(kvm_stats_header->num_desc, size_desc);
4155      ret = pread(stats_fd, kvm_stats_desc,
4156                  size_desc * kvm_stats_header->num_desc,
4157                  kvm_stats_header->desc_offset);
4158  
4159      if (ret != size_desc * kvm_stats_header->num_desc) {
4160          error_setg(errp, "KVM stats: failed to read stats descriptors: "
4161                     "expected %zu actual %zu",
4162                     size_desc * kvm_stats_header->num_desc, ret);
4163          g_free(descriptors);
4164          g_free(kvm_stats_desc);
4165          return NULL;
4166      }
4167      descriptors->kvm_stats_desc = kvm_stats_desc;
4168      descriptors->ident = ident;
4169      QTAILQ_INSERT_TAIL(&stats_descriptors, descriptors, next);
4170      return descriptors;
4171  }
4172  
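/*
 * Read the binary stats data from 'stats_fd' and convert it into a
 * StatsResultList entry for either the VM or the given vCPU, honouring the
 * optional name filter in 'names'.
 */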
4173  static void query_stats(StatsResultList **result, StatsTarget target,
4174                          strList *names, int stats_fd, CPUState *cpu,
4175                          Error **errp)
4176  {
4177      struct kvm_stats_desc *kvm_stats_desc;
4178      struct kvm_stats_header *kvm_stats_header;
4179      StatsDescriptors *descriptors;
4180      g_autofree uint64_t *stats_data = NULL;
4181      struct kvm_stats_desc *pdesc;
4182      StatsList *stats_list = NULL;
4183      size_t size_desc, size_data = 0;
4184      ssize_t ret;
4185      int i;
4186  
4187      descriptors = find_stats_descriptors(target, stats_fd, errp);
4188      if (!descriptors) {
4189          return;
4190      }
4191  
4192      kvm_stats_header = &descriptors->kvm_stats_header;
4193      kvm_stats_desc = descriptors->kvm_stats_desc;
4194      size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4195  
4196      /* Tally the total data size from the stats descriptors */
4197      for (i = 0; i < kvm_stats_header->num_desc; ++i) {
4198          pdesc = (void *)kvm_stats_desc + i * size_desc;
4199          size_data += pdesc->size * sizeof(*stats_data);
4200      }
4201  
4202      stats_data = g_malloc0(size_data);
4203      ret = pread(stats_fd, stats_data, size_data, kvm_stats_header->data_offset);
4204  
4205      if (ret != size_data) {
4206          error_setg(errp, "KVM stats: failed to read data: "
4207                     "expected %zu actual %zu", size_data, ret);
4208          return;
4209      }
4210  
4211      for (i = 0; i < kvm_stats_header->num_desc; ++i) {
4212          uint64_t *stats;
4213          pdesc = (void *)kvm_stats_desc + i * size_desc;
4214  
4215          /* Add entry to the list */
4216          stats = (void *)stats_data + pdesc->offset;
4217          if (!apply_str_list_filter(pdesc->name, names)) {
4218              continue;
4219          }
4220          stats_list = add_kvmstat_entry(pdesc, stats, stats_list, errp);
4221      }
4222  
4223      if (!stats_list) {
4224          return;
4225      }
4226  
4227      switch (target) {
4228      case STATS_TARGET_VM:
4229          add_stats_entry(result, STATS_PROVIDER_KVM, NULL, stats_list);
4230          break;
4231      case STATS_TARGET_VCPU:
4232          add_stats_entry(result, STATS_PROVIDER_KVM,
4233                          cpu->parent_obj.canonical_path,
4234                          stats_list);
4235          break;
4236      default:
4237          g_assert_not_reached();
4238      }
4239  }
4240  
4241  static void query_stats_schema(StatsSchemaList **result, StatsTarget target,
4242                                 int stats_fd, Error **errp)
4243  {
4244      struct kvm_stats_desc *kvm_stats_desc;
4245      struct kvm_stats_header *kvm_stats_header;
4246      StatsDescriptors *descriptors;
4247      struct kvm_stats_desc *pdesc;
4248      StatsSchemaValueList *stats_list = NULL;
4249      size_t size_desc;
4250      int i;
4251  
4252      descriptors = find_stats_descriptors(target, stats_fd, errp);
4253      if (!descriptors) {
4254          return;
4255      }
4256  
4257      kvm_stats_header = &descriptors->kvm_stats_header;
4258      kvm_stats_desc = descriptors->kvm_stats_desc;
4259      size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4260  
4261      /* Walk the descriptors and build the schema entries */
4262      for (i = 0; i < kvm_stats_header->num_desc; ++i) {
4263          pdesc = (void *)kvm_stats_desc + i * size_desc;
4264          stats_list = add_kvmschema_entry(pdesc, stats_list, errp);
4265      }
4266  
4267      add_stats_schema(result, STATS_PROVIDER_KVM, target, stats_list);
4268  }
4269  
4270  static void query_stats_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args)
4271  {
4272      int stats_fd = cpu->kvm_vcpu_stats_fd;
4273      Error *local_err = NULL;
4274  
4275      if (stats_fd == -1) {
4276          error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
4277          error_propagate(kvm_stats_args->errp, local_err);
4278          return;
4279      }
4280      query_stats(kvm_stats_args->result.stats, STATS_TARGET_VCPU,
4281                  kvm_stats_args->names, stats_fd, cpu,
4282                  kvm_stats_args->errp);
4283  }
4284  
4285  static void query_stats_schema_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args)
4286  {
4287      int stats_fd = cpu->kvm_vcpu_stats_fd;
4288      Error *local_err = NULL;
4289  
4290      if (stats_fd == -1) {
4291          error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
4292          error_propagate(kvm_stats_args->errp, local_err);
4293          return;
4294      }
4295      query_stats_schema(kvm_stats_args->result.schema, STATS_TARGET_VCPU, stats_fd,
4296                         kvm_stats_args->errp);
4297  }
4298  
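/*
 * Stats callback for the KVM provider: gather statistics for the VM and/or
 * the requested vCPUs.  The VM-level fd is obtained on demand with
 * KVM_GET_STATS_FD, while the per-vCPU fds have already been opened and
 * cached in cpu->kvm_vcpu_stats_fd.
 */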
4299  static void query_stats_cb(StatsResultList **result, StatsTarget target,
4300                             strList *names, strList *targets, Error **errp)
4301  {
4302      KVMState *s = kvm_state;
4303      CPUState *cpu;
4304      int stats_fd;
4305  
4306      switch (target) {
4307      case STATS_TARGET_VM:
4308      {
4309          stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
4310          if (stats_fd == -1) {
4311              error_setg_errno(errp, errno, "KVM stats: ioctl failed");
4312              return;
4313          }
4314          query_stats(result, target, names, stats_fd, NULL, errp);
4315          close(stats_fd);
4316          break;
4317      }
4318      case STATS_TARGET_VCPU:
4319      {
4320          StatsArgs stats_args;
4321          stats_args.result.stats = result;
4322          stats_args.names = names;
4323          stats_args.errp = errp;
4324          CPU_FOREACH(cpu) {
4325              if (!apply_str_list_filter(cpu->parent_obj.canonical_path, targets)) {
4326                  continue;
4327              }
4328              query_stats_vcpu(cpu, &stats_args);
4329          }
4330          break;
4331      }
4332      default:
4333          break;
4334      }
4335  }
4336  
4337  void query_stats_schemas_cb(StatsSchemaList **result, Error **errp)
4338  {
4339      StatsArgs stats_args;
4340      KVMState *s = kvm_state;
4341      int stats_fd;
4342  
4343      stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
4344      if (stats_fd == -1) {
4345          error_setg_errno(errp, errno, "KVM stats: ioctl failed");
4346          return;
4347      }
4348      query_stats_schema(result, STATS_TARGET_VM, stats_fd, errp);
4349      close(stats_fd);
4350  
4351      if (first_cpu) {
4352          stats_args.result.schema = result;
4353          stats_args.errp = errp;
4354          query_stats_schema_vcpu(first_cpu, &stats_args);
4355      }
4356  }
4357  
4358  void kvm_mark_guest_state_protected(void)
4359  {
4360      kvm_state->guest_state_protected = true;
4361  }
4362  
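/*
 * Ask KVM for a guest_memfd backing 'size' bytes of guest memory (typically
 * private memory for confidential guests).  Returns the new fd, or -1 with
 * 'errp' set if guest_memfd is unsupported or creation fails.
 */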
4363  int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp)
4364  {
4365      int fd;
4366      struct kvm_create_guest_memfd guest_memfd = {
4367          .size = size,
4368          .flags = flags,
4369      };
4370  
4371      if (!kvm_guest_memfd_supported) {
4372          error_setg(errp, "KVM does not support guest_memfd");
4373          return -1;
4374      }
4375  
4376      fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_GUEST_MEMFD, &guest_memfd);
4377      if (fd < 0) {
4378          error_setg_errno(errp, errno, "Error creating KVM guest_memfd");
4379          return -1;
4380      }
4381  
4382      return fd;
4383  }
4384