xref: /openbmc/qemu/hw/i386/xen/xen-hvm.c (revision 464e447a)
1 /*
2  * Copyright (C) 2010       Citrix Ltd.
3  *
4  * This work is licensed under the terms of the GNU GPL, version 2.  See
5  * the COPYING file in the top-level directory.
6  *
7  * Contributions after 2012-01-13 are licensed under the terms of the
8  * GNU GPL, version 2 or (at your option) any later version.
9  */
10 
11 #include "qemu/osdep.h"
12 
13 #include "cpu.h"
14 #include "hw/pci/pci.h"
15 #include "hw/pci/pci_host.h"
16 #include "hw/i386/pc.h"
17 #include "hw/i386/apic-msidef.h"
18 #include "hw/xen/xen_common.h"
19 #include "hw/xen/xen_backend.h"
20 #include "qapi/error.h"
21 #include "qapi/qapi-commands-misc.h"
22 #include "qemu/error-report.h"
23 #include "qemu/range.h"
24 #include "sysemu/xen-mapcache.h"
25 #include "trace.h"
26 #include "exec/address-spaces.h"
27 
28 #include <xen/hvm/ioreq.h>
29 #include <xen/hvm/params.h>
30 #include <xen/hvm/e820.h>
31 
32 //#define DEBUG_XEN_HVM
33 
34 #ifdef DEBUG_XEN_HVM
35 #define DPRINTF(fmt, ...) \
36     do { fprintf(stderr, "xen: " fmt, ## __VA_ARGS__); } while (0)
37 #else
38 #define DPRINTF(fmt, ...) \
39     do { } while (0)
40 #endif
41 
42 static MemoryRegion ram_memory, ram_640k, ram_lo, ram_hi;
43 static MemoryRegion *framebuffer;
44 static bool xen_in_migration;
45 
46 /* Compatibility with older versions of Xen */
47 
48 /* This allows QEMU to build on a system that has Xen 4.5 or earlier
49  * installed.  It lives here (not in hw/xen/xen_common.h) because
50  * xen/hvm/ioreq.h needs to be included before this block and
51  * hw/xen/xen_common.h needs to be included before xen/hvm/ioreq.h.
52  */
53 #ifndef IOREQ_TYPE_VMWARE_PORT
54 #define IOREQ_TYPE_VMWARE_PORT  3
55 struct vmware_regs {
56     uint32_t esi;
57     uint32_t edi;
58     uint32_t ebx;
59     uint32_t ecx;
60     uint32_t edx;
61 };
62 typedef struct vmware_regs vmware_regs_t;
63 
64 struct shared_vmport_iopage {
65     struct vmware_regs vcpu_vmport_regs[1];
66 };
67 typedef struct shared_vmport_iopage shared_vmport_iopage_t;
68 #endif
69 
70 static inline uint32_t xen_vcpu_eport(shared_iopage_t *shared_page, int i)
71 {
72     return shared_page->vcpu_ioreq[i].vp_eport;
73 }
74 static inline ioreq_t *xen_vcpu_ioreq(shared_iopage_t *shared_page, int vcpu)
75 {
76     return &shared_page->vcpu_ioreq[vcpu];
77 }
78 
79 #define BUFFER_IO_MAX_DELAY  100
80 
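/*
 * A XenPhysmap records a chunk of QEMU RAM (identified by its phys_offset
 * within the RAM block) that has been relocated into the guest physical
 * address space at start_addr, e.g. the VRAM backing the framebuffer.
 */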
81 typedef struct XenPhysmap {
82     hwaddr start_addr;
83     ram_addr_t size;
84     const char *name;
85     hwaddr phys_offset;
86 
87     QLIST_ENTRY(XenPhysmap) list;
88 } XenPhysmap;
89 
90 static QLIST_HEAD(, XenPhysmap) xen_physmap;
91 
92 typedef struct XenPciDevice {
93     PCIDevice *pci_dev;
94     uint32_t sbdf;
95     QLIST_ENTRY(XenPciDevice) entry;
96 } XenPciDevice;
97 
98 typedef struct XenIOState {
99     ioservid_t ioservid;
100     shared_iopage_t *shared_page;
101     shared_vmport_iopage_t *shared_vmport_page;
102     buffered_iopage_t *buffered_io_page;
103     QEMUTimer *buffered_io_timer;
104     CPUState **cpu_by_vcpu_id;
105     /* the local evtchn ports used for ioreq notifications, one per vcpu */
106     evtchn_port_t *ioreq_local_port;
107     /* evtchn remote and local ports for buffered io */
108     evtchn_port_t bufioreq_remote_port;
109     evtchn_port_t bufioreq_local_port;
110     /* the evtchn fd for polling */
111     xenevtchn_handle *xce_handle;
112     /* which vcpu we are serving */
113     int send_vcpu;
114 
115     struct xs_handle *xenstore;
116     MemoryListener memory_listener;
117     MemoryListener io_listener;
118     QLIST_HEAD(, XenPciDevice) dev_list;
119     DeviceListener device_listener;
120     hwaddr free_phys_offset;
121     const XenPhysmap *log_for_dirtybit;
122 
123     Notifier exit;
124     Notifier suspend;
125     Notifier wakeup;
126 } XenIOState;
127 
128 /* Xen-specific functions for the PIIX PCI bridge */
129 
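/*
 * Encode (device slot, INTx pin) into a single pirq number: the low two
 * bits carry the pin and the upper bits the slot.  xen_piix3_set_irq()
 * decodes the same encoding when forwarding the level change to Xen.
 */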
130 int xen_pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num)
131 {
132     return irq_num + ((pci_dev->devfn >> 3) << 2);
133 }
134 
135 void xen_piix3_set_irq(void *opaque, int irq_num, int level)
136 {
137     xen_set_pci_intx_level(xen_domid, 0, 0, irq_num >> 2,
138                            irq_num & 3, level);
139 }
140 
141 void xen_piix_pci_write_config_client(uint32_t address, uint32_t val, int len)
142 {
143     int i;
144 
145     /* Scan for updates to PCI link routes (0x60-0x63). */
146     for (i = 0; i < len; i++) {
147         uint8_t v = (val >> (8 * i)) & 0xff;
148         if (v & 0x80) {
149             v = 0;
150         }
151         v &= 0xf;
152         if (((address + i) >= 0x60) && ((address + i) <= 0x63)) {
153             xen_set_pci_link_route(xen_domid, address + i - 0x60, v);
154         }
155     }
156 }
157 
158 int xen_is_pirq_msi(uint32_t msi_data)
159 {
160     /* If the vector is 0, the MSI is remapped into a pirq, which is
161      * passed as dest_id.
162      */
163     return ((msi_data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT) == 0;
164 }
165 
166 void xen_hvm_inject_msi(uint64_t addr, uint32_t data)
167 {
168     xen_inject_msi(xen_domid, addr, data);
169 }
170 
171 static void xen_suspend_notifier(Notifier *notifier, void *data)
172 {
173     xc_set_hvm_param(xen_xc, xen_domid, HVM_PARAM_ACPI_S_STATE, 3);
174 }
175 
176 /* Xen Interrupt Controller */
177 
178 static void xen_set_irq(void *opaque, int irq, int level)
179 {
180     xen_set_isa_irq_level(xen_domid, irq, level);
181 }
182 
183 qemu_irq *xen_interrupt_controller_init(void)
184 {
185     return qemu_allocate_irqs(xen_set_irq, NULL, 16);
186 }
187 
188 /* Memory Ops */
189 
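/*
 * Set up the guest RAM layout for Xen: a single "xen.ram" block sized to
 * cover low memory (plus, when needed, the area above 4GiB), exposed to
 * the guest through aliases for 0-0xa0000, 0xc0000-below_4g and the
 * above-4GiB range.  The VGA hole at 0xa0000-0xc0000 is left unmapped here.
 */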
190 static void xen_ram_init(PCMachineState *pcms,
191                          ram_addr_t ram_size, MemoryRegion **ram_memory_p)
192 {
193     MemoryRegion *sysmem = get_system_memory();
194     ram_addr_t block_len;
195     uint64_t user_lowmem = object_property_get_uint(qdev_get_machine(),
196                                                     PC_MACHINE_MAX_RAM_BELOW_4G,
197                                                     &error_abort);
198 
199     /* Handle the machine option max-ram-below-4g: the effective limit is
200      * min(Xen limit, user limit).
201      */
202     if (!user_lowmem) {
203         user_lowmem = HVM_BELOW_4G_RAM_END; /* default */
204     }
205     if (HVM_BELOW_4G_RAM_END <= user_lowmem) {
206         user_lowmem = HVM_BELOW_4G_RAM_END;
207     }
208 
209     if (ram_size >= user_lowmem) {
210         pcms->above_4g_mem_size = ram_size - user_lowmem;
211         pcms->below_4g_mem_size = user_lowmem;
212     } else {
213         pcms->above_4g_mem_size = 0;
214         pcms->below_4g_mem_size = ram_size;
215     }
216     if (!pcms->above_4g_mem_size) {
217         block_len = ram_size;
218     } else {
219         /*
220          * Xen does not allocate the memory contiguously; it keeps a
221          * hole of the size computed above or passed in.
222          */
223         block_len = (1ULL << 32) + pcms->above_4g_mem_size;
224     }
225     memory_region_init_ram(&ram_memory, NULL, "xen.ram", block_len,
226                            &error_fatal);
227     *ram_memory_p = &ram_memory;
228 
229     memory_region_init_alias(&ram_640k, NULL, "xen.ram.640k",
230                              &ram_memory, 0, 0xa0000);
231     memory_region_add_subregion(sysmem, 0, &ram_640k);
232     /* Skip the VGA IO memory space; it will be registered later by the
233      * emulated VGA device.
234      *
235      * The area between 0xc0000 and 0x100000 will be used by SeaBIOS to load
236      * the option ROMs, so it is registered here as RAM.
237      */
238     memory_region_init_alias(&ram_lo, NULL, "xen.ram.lo",
239                              &ram_memory, 0xc0000,
240                              pcms->below_4g_mem_size - 0xc0000);
241     memory_region_add_subregion(sysmem, 0xc0000, &ram_lo);
242     if (pcms->above_4g_mem_size > 0) {
243         memory_region_init_alias(&ram_hi, NULL, "xen.ram.hi",
244                                  &ram_memory, 0x100000000ULL,
245                                  pcms->above_4g_mem_size);
246         memory_region_add_subregion(sysmem, 0x100000000ULL, &ram_hi);
247     }
248 }
249 
250 void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size, MemoryRegion *mr,
251                    Error **errp)
252 {
253     unsigned long nr_pfn;
254     xen_pfn_t *pfn_list;
255     int i;
256 
257     if (runstate_check(RUN_STATE_INMIGRATE)) {
258         /* RAM already populated in Xen */
259         fprintf(stderr, "%s: do not alloc "RAM_ADDR_FMT
260                 " bytes of ram at "RAM_ADDR_FMT" when runstate is INMIGRATE\n",
261                 __func__, size, ram_addr);
262         return;
263     }
264 
265     if (mr == &ram_memory) {
266         return;
267     }
268 
269     trace_xen_ram_alloc(ram_addr, size);
270 
271     nr_pfn = size >> TARGET_PAGE_BITS;
272     pfn_list = g_malloc(sizeof (*pfn_list) * nr_pfn);
273 
274     for (i = 0; i < nr_pfn; i++) {
275         pfn_list[i] = (ram_addr >> TARGET_PAGE_BITS) + i;
276     }
277 
278     if (xc_domain_populate_physmap_exact(xen_xc, xen_domid, nr_pfn, 0, 0, pfn_list)) {
279         error_setg(errp, "xen: failed to populate ram at " RAM_ADDR_FMT,
280                    ram_addr);
281     }
282 
283     g_free(pfn_list);
284 }
285 
286 static XenPhysmap *get_physmapping(hwaddr start_addr, ram_addr_t size)
287 {
288     XenPhysmap *physmap = NULL;
289 
290     start_addr &= TARGET_PAGE_MASK;
291 
292     QLIST_FOREACH(physmap, &xen_physmap, list) {
293         if (range_covers_byte(physmap->start_addr, physmap->size, start_addr)) {
294             return physmap;
295         }
296     }
297     return NULL;
298 }
299 
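/*
 * Translate a QEMU ram_addr_t (phys_offset) back to the guest physical
 * address it is mapped at, if it falls inside a physmap entry; otherwise
 * return the offset unchanged.
 */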
300 static hwaddr xen_phys_offset_to_gaddr(hwaddr phys_offset, ram_addr_t size)
301 {
302     hwaddr addr = phys_offset & TARGET_PAGE_MASK;
303     XenPhysmap *physmap = NULL;
304 
305     QLIST_FOREACH(physmap, &xen_physmap, list) {
306         if (range_covers_byte(physmap->phys_offset, physmap->size, addr)) {
307             return physmap->start_addr + (phys_offset - physmap->phys_offset);
308         }
309     }
310 
311     return phys_offset;
312 }
313 
314 #ifdef XEN_COMPAT_PHYSMAP
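/*
 * Persist a physmap entry in xenstore (start_addr, size and name keyed by
 * phys_offset) so that xen_read_physmap() can rebuild the list later, for
 * instance on the destination side of a migration.
 */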
315 static int xen_save_physmap(XenIOState *state, XenPhysmap *physmap)
316 {
317     char path[80], value[17];
318 
319     snprintf(path, sizeof(path),
320             "/local/domain/0/device-model/%d/physmap/%"PRIx64"/start_addr",
321             xen_domid, (uint64_t)physmap->phys_offset);
322     snprintf(value, sizeof(value), "%"PRIx64, (uint64_t)physmap->start_addr);
323     if (!xs_write(state->xenstore, 0, path, value, strlen(value))) {
324         return -1;
325     }
326     snprintf(path, sizeof(path),
327             "/local/domain/0/device-model/%d/physmap/%"PRIx64"/size",
328             xen_domid, (uint64_t)physmap->phys_offset);
329     snprintf(value, sizeof(value), "%"PRIx64, (uint64_t)physmap->size);
330     if (!xs_write(state->xenstore, 0, path, value, strlen(value))) {
331         return -1;
332     }
333     if (physmap->name) {
334         snprintf(path, sizeof(path),
335                 "/local/domain/0/device-model/%d/physmap/%"PRIx64"/name",
336                 xen_domid, (uint64_t)physmap->phys_offset);
337         if (!xs_write(state->xenstore, 0, path,
338                       physmap->name, strlen(physmap->name))) {
339             return -1;
340         }
341     }
342     return 0;
343 }
344 #else
345 static int xen_save_physmap(XenIOState *state, XenPhysmap *physmap)
346 {
347     return 0;
348 }
349 #endif
350 
351 static int xen_add_to_physmap(XenIOState *state,
352                               hwaddr start_addr,
353                               ram_addr_t size,
354                               MemoryRegion *mr,
355                               hwaddr offset_within_region)
356 {
357     unsigned long nr_pages;
358     int rc = 0;
359     XenPhysmap *physmap = NULL;
360     hwaddr pfn, start_gpfn;
361     hwaddr phys_offset = memory_region_get_ram_addr(mr);
362     const char *mr_name;
363 
364     if (get_physmapping(start_addr, size)) {
365         return 0;
366     }
367     if (size <= 0) {
368         return -1;
369     }
370 
371     /* Xen can only handle a single dirty log region for now and we want
372      * the linear framebuffer to be that region.
373      * Avoid tracking any region that is not videoram, and avoid tracking
374      * the legacy VGA region. */
375     if (mr == framebuffer && start_addr > 0xbffff) {
376         goto go_physmap;
377     }
378     return -1;
379 
380 go_physmap:
381     DPRINTF("mapping vram to %"HWADDR_PRIx" - %"HWADDR_PRIx"\n",
382             start_addr, start_addr + size);
383 
384     mr_name = memory_region_name(mr);
385 
386     physmap = g_malloc(sizeof(XenPhysmap));
387 
388     physmap->start_addr = start_addr;
389     physmap->size = size;
390     physmap->name = mr_name;
391     physmap->phys_offset = phys_offset;
392 
393     QLIST_INSERT_HEAD(&xen_physmap, physmap, list);
394 
395     if (runstate_check(RUN_STATE_INMIGRATE)) {
396         /* Now that we have a physmap entry we can replace the dummy mapping
397          * with a real mapping of foreign guest memory. */
398         uint8_t *p = xen_replace_cache_entry(phys_offset, start_addr, size);
399         assert(p && p == memory_region_get_ram_ptr(mr));
400 
401         return 0;
402     }
403 
404     pfn = phys_offset >> TARGET_PAGE_BITS;
405     start_gpfn = start_addr >> TARGET_PAGE_BITS;
406     nr_pages = size >> TARGET_PAGE_BITS;
407     rc = xendevicemodel_relocate_memory(xen_dmod, xen_domid, nr_pages, pfn,
408                                         start_gpfn);
409     if (rc) {
410         int saved_errno = errno;
411 
412         error_report("relocate_memory %lu pages from GFN %"HWADDR_PRIx
413                      " to GFN %"HWADDR_PRIx" failed: %s",
414                      nr_pages, pfn, start_gpfn, strerror(saved_errno));
415         errno = saved_errno;
416         return -1;
417     }
418 
419     rc = xendevicemodel_pin_memory_cacheattr(xen_dmod, xen_domid,
420                                    start_addr >> TARGET_PAGE_BITS,
421                                    (start_addr + size - 1) >> TARGET_PAGE_BITS,
422                                    XEN_DOMCTL_MEM_CACHEATTR_WB);
423     if (rc) {
424         error_report("pin_memory_cacheattr failed: %s", strerror(errno));
425     }
426     return xen_save_physmap(state, physmap);
427 }
428 
429 static int xen_remove_from_physmap(XenIOState *state,
430                                    hwaddr start_addr,
431                                    ram_addr_t size)
432 {
433     int rc = 0;
434     XenPhysmap *physmap = NULL;
435     hwaddr phys_offset = 0;
436 
437     physmap = get_physmapping(start_addr, size);
438     if (physmap == NULL) {
439         return -1;
440     }
441 
442     phys_offset = physmap->phys_offset;
443     size = physmap->size;
444 
445     DPRINTF("unmapping vram to %"HWADDR_PRIx" - %"HWADDR_PRIx", at "
446             "%"HWADDR_PRIx"\n", start_addr, start_addr + size, phys_offset);
447 
448     size >>= TARGET_PAGE_BITS;
449     start_addr >>= TARGET_PAGE_BITS;
450     phys_offset >>= TARGET_PAGE_BITS;
451     rc = xendevicemodel_relocate_memory(xen_dmod, xen_domid, size, start_addr,
452                                         phys_offset);
453     if (rc) {
454         int saved_errno = errno;
455 
456         error_report("relocate_memory "RAM_ADDR_FMT" pages"
457                      " from GFN %"HWADDR_PRIx
458                      " to GFN %"HWADDR_PRIx" failed: %s",
459                      size, start_addr, phys_offset, strerror(saved_errno));
460         errno = saved_errno;
461         return -1;
462     }
463 
464     QLIST_REMOVE(physmap, list);
465     if (state->log_for_dirtybit == physmap) {
466         state->log_for_dirtybit = NULL;
467     }
468     g_free(physmap);
469 
470     return 0;
471 }
472 
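/*
 * MemoryListener callback shared by xen_region_add()/xen_region_del():
 * MMIO sections are (un)mapped from the ioreq server, while RAM sections
 * are handled through the physmap (VRAM) or marked read-only (ROM) in Xen.
 */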
473 static void xen_set_memory(struct MemoryListener *listener,
474                            MemoryRegionSection *section,
475                            bool add)
476 {
477     XenIOState *state = container_of(listener, XenIOState, memory_listener);
478     hwaddr start_addr = section->offset_within_address_space;
479     ram_addr_t size = int128_get64(section->size);
480     bool log_dirty = memory_region_is_logging(section->mr, DIRTY_MEMORY_VGA);
481     hvmmem_type_t mem_type;
482 
483     if (section->mr == &ram_memory) {
484         return;
485     } else {
486         if (add) {
487             xen_map_memory_section(xen_domid, state->ioservid,
488                                    section);
489         } else {
490             xen_unmap_memory_section(xen_domid, state->ioservid,
491                                      section);
492         }
493     }
494 
495     if (!memory_region_is_ram(section->mr)) {
496         return;
497     }
498 
499     if (log_dirty != add) {
500         return;
501     }
502 
503     trace_xen_client_set_memory(start_addr, size, log_dirty);
504 
505     start_addr &= TARGET_PAGE_MASK;
506     size = TARGET_PAGE_ALIGN(size);
507 
508     if (add) {
509         if (!memory_region_is_rom(section->mr)) {
510             xen_add_to_physmap(state, start_addr, size,
511                                section->mr, section->offset_within_region);
512         } else {
513             mem_type = HVMMEM_ram_ro;
514             if (xen_set_mem_type(xen_domid, mem_type,
515                                  start_addr >> TARGET_PAGE_BITS,
516                                  size >> TARGET_PAGE_BITS)) {
517                 DPRINTF("xen_set_mem_type error, addr: "TARGET_FMT_plx"\n",
518                         start_addr);
519             }
520         }
521     } else {
522         if (xen_remove_from_physmap(state, start_addr, size) < 0) {
523             DPRINTF("physmapping does not exist at "TARGET_FMT_plx"\n", start_addr);
524         }
525     }
526 }
527 
528 static void xen_region_add(MemoryListener *listener,
529                            MemoryRegionSection *section)
530 {
531     memory_region_ref(section->mr);
532     xen_set_memory(listener, section, true);
533 }
534 
535 static void xen_region_del(MemoryListener *listener,
536                            MemoryRegionSection *section)
537 {
538     xen_set_memory(listener, section, false);
539     memory_region_unref(section->mr);
540 }
541 
542 static void xen_io_add(MemoryListener *listener,
543                        MemoryRegionSection *section)
544 {
545     XenIOState *state = container_of(listener, XenIOState, io_listener);
546     MemoryRegion *mr = section->mr;
547 
548     if (mr->ops == &unassigned_io_ops) {
549         return;
550     }
551 
552     memory_region_ref(mr);
553 
554     xen_map_io_section(xen_domid, state->ioservid, section);
555 }
556 
557 static void xen_io_del(MemoryListener *listener,
558                        MemoryRegionSection *section)
559 {
560     XenIOState *state = container_of(listener, XenIOState, io_listener);
561     MemoryRegion *mr = section->mr;
562 
563     if (mr->ops == &unassigned_io_ops) {
564         return;
565     }
566 
567     xen_unmap_io_section(xen_domid, state->ioservid, section);
568 
569     memory_region_unref(mr);
570 }
571 
572 static void xen_device_realize(DeviceListener *listener,
573                                DeviceState *dev)
574 {
575     XenIOState *state = container_of(listener, XenIOState, device_listener);
576 
577     if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
578         PCIDevice *pci_dev = PCI_DEVICE(dev);
579         XenPciDevice *xendev = g_new(XenPciDevice, 1);
580 
581         xendev->pci_dev = pci_dev;
582         xendev->sbdf = PCI_BUILD_BDF(pci_dev_bus_num(pci_dev),
583                                      pci_dev->devfn);
584         QLIST_INSERT_HEAD(&state->dev_list, xendev, entry);
585 
586         xen_map_pcidev(xen_domid, state->ioservid, pci_dev);
587     }
588 }
589 
590 static void xen_device_unrealize(DeviceListener *listener,
591                                  DeviceState *dev)
592 {
593     XenIOState *state = container_of(listener, XenIOState, device_listener);
594 
595     if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
596         PCIDevice *pci_dev = PCI_DEVICE(dev);
597         XenPciDevice *xendev, *next;
598 
599         xen_unmap_pcidev(xen_domid, state->ioservid, pci_dev);
600 
601         QLIST_FOREACH_SAFE(xendev, &state->dev_list, entry, next) {
602             if (xendev->pci_dev == pci_dev) {
603                 QLIST_REMOVE(xendev, entry);
604                 g_free(xendev);
605                 break;
606             }
607         }
608     }
609 }
610 
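/*
 * Query Xen's dirty-VRAM tracking for the physmap entry covering
 * [start_addr, start_addr + size) and mark the reported pages dirty in the
 * framebuffer's dirty bitmap.  Only one range can be tracked at a time.
 */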
611 static void xen_sync_dirty_bitmap(XenIOState *state,
612                                   hwaddr start_addr,
613                                   ram_addr_t size)
614 {
615     hwaddr npages = size >> TARGET_PAGE_BITS;
616     const int width = sizeof(unsigned long) * 8;
617     unsigned long bitmap[DIV_ROUND_UP(npages, width)];
618     int rc, i, j;
619     const XenPhysmap *physmap = NULL;
620 
621     physmap = get_physmapping(start_addr, size);
622     if (physmap == NULL) {
623         /* not handled */
624         return;
625     }
626 
627     if (state->log_for_dirtybit == NULL) {
628         state->log_for_dirtybit = physmap;
629     } else if (state->log_for_dirtybit != physmap) {
630         /* Only one range for dirty bitmap can be tracked. */
631         return;
632     }
633 
634     rc = xen_track_dirty_vram(xen_domid, start_addr >> TARGET_PAGE_BITS,
635                               npages, bitmap);
636     if (rc < 0) {
637 #ifndef ENODATA
638 #define ENODATA  ENOENT
639 #endif
640         if (errno == ENODATA) {
641             memory_region_set_dirty(framebuffer, 0, size);
642             DPRINTF("xen: track_dirty_vram failed (0x" TARGET_FMT_plx
643                     ", 0x" TARGET_FMT_plx "): %s\n",
644                     start_addr, start_addr + size, strerror(errno));
645         }
646         return;
647     }
648 
649     for (i = 0; i < ARRAY_SIZE(bitmap); i++) {
650         unsigned long map = bitmap[i];
651         while (map != 0) {
652             j = ctzl(map);
653             map &= ~(1ul << j);
654             memory_region_set_dirty(framebuffer,
655                                     (i * width + j) * TARGET_PAGE_SIZE,
656                                     TARGET_PAGE_SIZE);
657         }
658     }
659 }
660 
661 static void xen_log_start(MemoryListener *listener,
662                           MemoryRegionSection *section,
663                           int old, int new)
664 {
665     XenIOState *state = container_of(listener, XenIOState, memory_listener);
666 
667     if (new & ~old & (1 << DIRTY_MEMORY_VGA)) {
668         xen_sync_dirty_bitmap(state, section->offset_within_address_space,
669                               int128_get64(section->size));
670     }
671 }
672 
673 static void xen_log_stop(MemoryListener *listener, MemoryRegionSection *section,
674                          int old, int new)
675 {
676     XenIOState *state = container_of(listener, XenIOState, memory_listener);
677 
678     if (old & ~new & (1 << DIRTY_MEMORY_VGA)) {
679         state->log_for_dirtybit = NULL;
680         /* Disable dirty bit tracking */
681         xen_track_dirty_vram(xen_domid, 0, 0, NULL);
682     }
683 }
684 
685 static void xen_log_sync(MemoryListener *listener, MemoryRegionSection *section)
686 {
687     XenIOState *state = container_of(listener, XenIOState, memory_listener);
688 
689     xen_sync_dirty_bitmap(state, section->offset_within_address_space,
690                           int128_get64(section->size));
691 }
692 
693 static void xen_log_global_start(MemoryListener *listener)
694 {
695     if (xen_enabled()) {
696         xen_in_migration = true;
697     }
698 }
699 
700 static void xen_log_global_stop(MemoryListener *listener)
701 {
702     xen_in_migration = false;
703 }
704 
705 static MemoryListener xen_memory_listener = {
706     .region_add = xen_region_add,
707     .region_del = xen_region_del,
708     .log_start = xen_log_start,
709     .log_stop = xen_log_stop,
710     .log_sync = xen_log_sync,
711     .log_global_start = xen_log_global_start,
712     .log_global_stop = xen_log_global_stop,
713     .priority = 10,
714 };
715 
716 static MemoryListener xen_io_listener = {
717     .region_add = xen_io_add,
718     .region_del = xen_io_del,
719     .priority = 10,
720 };
721 
722 static DeviceListener xen_device_listener = {
723     .realize = xen_device_realize,
724     .unrealize = xen_device_unrealize,
725 };
726 
727 /* get the ioreq packet for a given vcpu from shared memory */
728 static ioreq_t *cpu_get_ioreq_from_shared_memory(XenIOState *state, int vcpu)
729 {
730     ioreq_t *req = xen_vcpu_ioreq(state->shared_page, vcpu);
731 
732     if (req->state != STATE_IOREQ_READY) {
733         DPRINTF("I/O request not ready: "
734                 "%x, ptr: %x, port: %"PRIx64", "
735                 "data: %"PRIx64", count: %u, size: %u\n",
736                 req->state, req->data_is_ptr, req->addr,
737                 req->data, req->count, req->size);
738         return NULL;
739     }
740 
741     xen_rmb(); /* see IOREQ_READY /then/ read contents of ioreq */
742 
743     req->state = STATE_IOREQ_INPROCESS;
744     return req;
745 }
746 
747 /* Poll the event channel for a pending port; if it belongs to a vcpu,
748  * unmask it, record which vcpu signalled us and return its ioreq from
749  * shared memory.  Returns NULL for the buffered-io port or when idle. */
750 static ioreq_t *cpu_get_ioreq(XenIOState *state)
751 {
752     int i;
753     evtchn_port_t port;
754 
755     port = xenevtchn_pending(state->xce_handle);
756     if (port == state->bufioreq_local_port) {
757         timer_mod(state->buffered_io_timer,
758                 BUFFER_IO_MAX_DELAY + qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
759         return NULL;
760     }
761 
762     if (port != -1) {
763         for (i = 0; i < max_cpus; i++) {
764             if (state->ioreq_local_port[i] == port) {
765                 break;
766             }
767         }
768 
769         if (i == max_cpus) {
770             hw_error("Fatal error while trying to get io event!\n");
771         }
772 
773         /* unmask the wanted port again */
774         xenevtchn_unmask(state->xce_handle, port);
775 
776         /* get the io packet from shared memory */
777         state->send_vcpu = i;
778         return cpu_get_ioreq_from_shared_memory(state, i);
779     }
780 
781     /* read error or read nothing */
782     return NULL;
783 }
784 
785 static uint32_t do_inp(uint32_t addr, unsigned long size)
786 {
787     switch (size) {
788         case 1:
789             return cpu_inb(addr);
790         case 2:
791             return cpu_inw(addr);
792         case 4:
793             return cpu_inl(addr);
794         default:
795             hw_error("inp: bad size: %04x %lx", addr, size);
796     }
797 }
798 
799 static void do_outp(uint32_t addr,
800         unsigned long size, uint32_t val)
801 {
802     switch (size) {
803         case 1:
804             return cpu_outb(addr, val);
805         case 2:
806             return cpu_outw(addr, val);
807         case 4:
808             return cpu_outl(addr, val);
809         default:
810             hw_error("outp: bad size: %04x %lx", addr, size);
811     }
812 }
813 
814 /*
815  * Helper functions which read/write an object from/to physical guest
816  * memory, as part of the implementation of an ioreq.
817  *
818  * Equivalent to
819  *   cpu_physical_memory_rw(addr + (req->df ? -1 : +1) * req->size * i,
820  *                          val, req->size, 0/1)
821  * except without the integer overflow problems.
822  */
823 static void rw_phys_req_item(hwaddr addr,
824                              ioreq_t *req, uint32_t i, void *val, int rw)
825 {
826     /* Do everything unsigned so overflow just results in a truncated result
827      * and accesses to undesired parts of guest memory, which is up
828      * to the guest */
829     hwaddr offset = (hwaddr)req->size * i;
830     if (req->df) {
831         addr -= offset;
832     } else {
833         addr += offset;
834     }
835     cpu_physical_memory_rw(addr, val, req->size, rw);
836 }
837 
838 static inline void read_phys_req_item(hwaddr addr,
839                                       ioreq_t *req, uint32_t i, void *val)
840 {
841     rw_phys_req_item(addr, req, i, val, 0);
842 }
843 static inline void write_phys_req_item(hwaddr addr,
844                                        ioreq_t *req, uint32_t i, void *val)
845 {
846     rw_phys_req_item(addr, req, i, val, 1);
847 }
848 
849 
850 static void cpu_ioreq_pio(ioreq_t *req)
851 {
852     uint32_t i;
853 
854     trace_cpu_ioreq_pio(req, req->dir, req->df, req->data_is_ptr, req->addr,
855                          req->data, req->count, req->size);
856 
857     if (req->size > sizeof(uint32_t)) {
858         hw_error("PIO: bad size (%u)", req->size);
859     }
860 
861     if (req->dir == IOREQ_READ) {
862         if (!req->data_is_ptr) {
863             req->data = do_inp(req->addr, req->size);
864             trace_cpu_ioreq_pio_read_reg(req, req->data, req->addr,
865                                          req->size);
866         } else {
867             uint32_t tmp;
868 
869             for (i = 0; i < req->count; i++) {
870                 tmp = do_inp(req->addr, req->size);
871                 write_phys_req_item(req->data, req, i, &tmp);
872             }
873         }
874     } else if (req->dir == IOREQ_WRITE) {
875         if (!req->data_is_ptr) {
876             trace_cpu_ioreq_pio_write_reg(req, req->data, req->addr,
877                                           req->size);
878             do_outp(req->addr, req->size, req->data);
879         } else {
880             for (i = 0; i < req->count; i++) {
881                 uint32_t tmp = 0;
882 
883                 read_phys_req_item(req->data, req, i, &tmp);
884                 do_outp(req->addr, req->size, tmp);
885             }
886         }
887     }
888 }
889 
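/*
 * IOREQ_TYPE_COPY: an MMIO access.  The payload is either held inline in
 * req->data or, when data_is_ptr is set, in a guest-physical buffer that
 * req->data points to and that is copied item by item.
 */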
890 static void cpu_ioreq_move(ioreq_t *req)
891 {
892     uint32_t i;
893 
894     trace_cpu_ioreq_move(req, req->dir, req->df, req->data_is_ptr, req->addr,
895                          req->data, req->count, req->size);
896 
897     if (req->size > sizeof(req->data)) {
898         hw_error("MMIO: bad size (%u)", req->size);
899     }
900 
901     if (!req->data_is_ptr) {
902         if (req->dir == IOREQ_READ) {
903             for (i = 0; i < req->count; i++) {
904                 read_phys_req_item(req->addr, req, i, &req->data);
905             }
906         } else if (req->dir == IOREQ_WRITE) {
907             for (i = 0; i < req->count; i++) {
908                 write_phys_req_item(req->addr, req, i, &req->data);
909             }
910         }
911     } else {
912         uint64_t tmp;
913 
914         if (req->dir == IOREQ_READ) {
915             for (i = 0; i < req->count; i++) {
916                 read_phys_req_item(req->addr, req, i, &tmp);
917                 write_phys_req_item(req->data, req, i, &tmp);
918             }
919         } else if (req->dir == IOREQ_WRITE) {
920             for (i = 0; i < req->count; i++) {
921                 read_phys_req_item(req->data, req, i, &tmp);
922                 write_phys_req_item(req->addr, req, i, &tmp);
923             }
924         }
925     }
926 }
927 
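/*
 * IOREQ_TYPE_PCI_CONFIG: req->addr carries the PCI SBDF in its upper 32
 * bits and the config-space offset in the lower bits.  Look the device up
 * in the list built by the device listener and forward the access to it.
 */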
928 static void cpu_ioreq_config(XenIOState *state, ioreq_t *req)
929 {
930     uint32_t sbdf = req->addr >> 32;
931     uint32_t reg = req->addr;
932     XenPciDevice *xendev;
933 
934     if (req->size != sizeof(uint8_t) && req->size != sizeof(uint16_t) &&
935         req->size != sizeof(uint32_t)) {
936         hw_error("PCI config access: bad size (%u)", req->size);
937     }
938 
939     if (req->count != 1) {
940         hw_error("PCI config access: bad count (%u)", req->count);
941     }
942 
943     QLIST_FOREACH(xendev, &state->dev_list, entry) {
944         if (xendev->sbdf != sbdf) {
945             continue;
946         }
947 
948         if (!req->data_is_ptr) {
949             if (req->dir == IOREQ_READ) {
950                 req->data = pci_host_config_read_common(
951                     xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE,
952                     req->size);
953                 trace_cpu_ioreq_config_read(req, xendev->sbdf, reg,
954                                             req->size, req->data);
955             } else if (req->dir == IOREQ_WRITE) {
956                 trace_cpu_ioreq_config_write(req, xendev->sbdf, reg,
957                                              req->size, req->data);
958                 pci_host_config_write_common(
959                     xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE,
960                     req->data, req->size);
961             }
962         } else {
963             uint32_t tmp;
964 
965             if (req->dir == IOREQ_READ) {
966                 tmp = pci_host_config_read_common(
967                     xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE,
968                     req->size);
969                 trace_cpu_ioreq_config_read(req, xendev->sbdf, reg,
970                                             req->size, tmp);
971                 write_phys_req_item(req->data, req, 0, &tmp);
972             } else if (req->dir == IOREQ_WRITE) {
973                 read_phys_req_item(req->data, req, 0, &tmp);
974                 trace_cpu_ioreq_config_write(req, xendev->sbdf, reg,
975                                              req->size, tmp);
976                 pci_host_config_write_common(
977                     xendev->pci_dev, reg, PCI_CONFIG_SPACE_SIZE,
978                     tmp, req->size);
979             }
980         }
981     }
982 }
983 
984 static void regs_to_cpu(vmware_regs_t *vmport_regs, ioreq_t *req)
985 {
986     X86CPU *cpu;
987     CPUX86State *env;
988 
989     cpu = X86_CPU(current_cpu);
990     env = &cpu->env;
991     env->regs[R_EAX] = req->data;
992     env->regs[R_EBX] = vmport_regs->ebx;
993     env->regs[R_ECX] = vmport_regs->ecx;
994     env->regs[R_EDX] = vmport_regs->edx;
995     env->regs[R_ESI] = vmport_regs->esi;
996     env->regs[R_EDI] = vmport_regs->edi;
997 }
998 
999 static void regs_from_cpu(vmware_regs_t *vmport_regs)
1000 {
1001     X86CPU *cpu = X86_CPU(current_cpu);
1002     CPUX86State *env = &cpu->env;
1003 
1004     vmport_regs->ebx = env->regs[R_EBX];
1005     vmport_regs->ecx = env->regs[R_ECX];
1006     vmport_regs->edx = env->regs[R_EDX];
1007     vmport_regs->esi = env->regs[R_ESI];
1008     vmport_regs->edi = env->regs[R_EDI];
1009 }
1010 
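/*
 * VMware backdoor port access: Xen supplies the vcpu's general purpose
 * registers in the shared vmport page.  Load them into the CPU state,
 * run the normal PIO path, then copy the (possibly updated) values back.
 */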
1011 static void handle_vmport_ioreq(XenIOState *state, ioreq_t *req)
1012 {
1013     vmware_regs_t *vmport_regs;
1014 
1015     assert(state->shared_vmport_page);
1016     vmport_regs =
1017         &state->shared_vmport_page->vcpu_vmport_regs[state->send_vcpu];
1018     QEMU_BUILD_BUG_ON(sizeof(*req) < sizeof(*vmport_regs));
1019 
1020     current_cpu = state->cpu_by_vcpu_id[state->send_vcpu];
1021     regs_to_cpu(vmport_regs, req);
1022     cpu_ioreq_pio(req);
1023     regs_from_cpu(vmport_regs);
1024     current_cpu = NULL;
1025 }
1026 
1027 static void handle_ioreq(XenIOState *state, ioreq_t *req)
1028 {
1029     trace_handle_ioreq(req, req->type, req->dir, req->df, req->data_is_ptr,
1030                        req->addr, req->data, req->count, req->size);
1031 
1032     if (!req->data_is_ptr && (req->dir == IOREQ_WRITE) &&
1033             (req->size < sizeof (target_ulong))) {
1034         req->data &= ((target_ulong) 1 << (8 * req->size)) - 1;
1035     }
1036 
1037     if (req->dir == IOREQ_WRITE)
1038         trace_handle_ioreq_write(req, req->type, req->df, req->data_is_ptr,
1039                                  req->addr, req->data, req->count, req->size);
1040 
1041     switch (req->type) {
1042         case IOREQ_TYPE_PIO:
1043             cpu_ioreq_pio(req);
1044             break;
1045         case IOREQ_TYPE_COPY:
1046             cpu_ioreq_move(req);
1047             break;
1048         case IOREQ_TYPE_VMWARE_PORT:
1049             handle_vmport_ioreq(state, req);
1050             break;
1051         case IOREQ_TYPE_TIMEOFFSET:
1052             break;
1053         case IOREQ_TYPE_INVALIDATE:
1054             xen_invalidate_map_cache();
1055             break;
1056         case IOREQ_TYPE_PCI_CONFIG:
1057             cpu_ioreq_config(state, req);
1058             break;
1059         default:
1060             hw_error("Invalid ioreq type 0x%x\n", req->type);
1061     }
1062     if (req->dir == IOREQ_READ) {
1063         trace_handle_ioreq_read(req, req->type, req->df, req->data_is_ptr,
1064                                 req->addr, req->data, req->count, req->size);
1065     }
1066 }
1067 
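/*
 * Drain the buffered ioreq ring.  Entries are write-only requests; a
 * 64-bit ("qw") request occupies two consecutive slots whose data halves
 * are combined before dispatch.  Returns 0 only when no buffered-io page
 * is mapped, so the caller keeps its polling timer armed otherwise.
 */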
1068 static int handle_buffered_iopage(XenIOState *state)
1069 {
1070     buffered_iopage_t *buf_page = state->buffered_io_page;
1071     buf_ioreq_t *buf_req = NULL;
1072     ioreq_t req;
1073     int qw;
1074 
1075     if (!buf_page) {
1076         return 0;
1077     }
1078 
1079     memset(&req, 0x00, sizeof(req));
1080     req.state = STATE_IOREQ_READY;
1081     req.count = 1;
1082     req.dir = IOREQ_WRITE;
1083 
1084     for (;;) {
1085         uint32_t rdptr = buf_page->read_pointer, wrptr;
1086 
1087         xen_rmb();
1088         wrptr = buf_page->write_pointer;
1089         xen_rmb();
1090         if (rdptr != buf_page->read_pointer) {
1091             continue;
1092         }
1093         if (rdptr == wrptr) {
1094             break;
1095         }
1096         buf_req = &buf_page->buf_ioreq[rdptr % IOREQ_BUFFER_SLOT_NUM];
1097         req.size = 1U << buf_req->size;
1098         req.addr = buf_req->addr;
1099         req.data = buf_req->data;
1100         req.type = buf_req->type;
1101         xen_rmb();
1102         qw = (req.size == 8);
1103         if (qw) {
1104             if (rdptr + 1 == wrptr) {
1105                 hw_error("Incomplete quad word buffered ioreq");
1106             }
1107             buf_req = &buf_page->buf_ioreq[(rdptr + 1) %
1108                                            IOREQ_BUFFER_SLOT_NUM];
1109             req.data |= ((uint64_t)buf_req->data) << 32;
1110             xen_rmb();
1111         }
1112 
1113         handle_ioreq(state, &req);
1114 
1115         /* Only req.data may get updated by handle_ioreq(), albeit even that
1116          * should not happen as such data would never make it to the guest (we
1117          * can only usefully see writes here after all).
1118          */
1119         assert(req.state == STATE_IOREQ_READY);
1120         assert(req.count == 1);
1121         assert(req.dir == IOREQ_WRITE);
1122         assert(!req.data_is_ptr);
1123 
1124         atomic_add(&buf_page->read_pointer, qw + 1);
1125     }
1126 
1127     return req.count;
1128 }
1129 
1130 static void handle_buffered_io(void *opaque)
1131 {
1132     XenIOState *state = opaque;
1133 
1134     if (handle_buffered_iopage(state)) {
1135         timer_mod(state->buffered_io_timer,
1136                 BUFFER_IO_MAX_DELAY + qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
1137     } else {
1138         timer_del(state->buffered_io_timer);
1139         xenevtchn_unmask(state->xce_handle, state->bufioreq_local_port);
1140     }
1141 }
1142 
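/*
 * Event-channel fd handler: service any buffered requests first, then the
 * synchronous ioreq of the signalling vcpu.  The request is processed on a
 * local copy, the result written back, and Xen notified via the vcpu's
 * local event channel port once the state moves to STATE_IORESP_READY.
 */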
1143 static void cpu_handle_ioreq(void *opaque)
1144 {
1145     XenIOState *state = opaque;
1146     ioreq_t *req = cpu_get_ioreq(state);
1147 
1148     handle_buffered_iopage(state);
1149     if (req) {
1150         ioreq_t copy = *req;
1151 
1152         xen_rmb();
1153         handle_ioreq(state, &copy);
1154         req->data = copy.data;
1155 
1156         if (req->state != STATE_IOREQ_INPROCESS) {
1157             fprintf(stderr, "Badness in I/O request ... not in service?!: "
1158                     "%x, ptr: %x, port: %"PRIx64", "
1159                     "data: %"PRIx64", count: %u, size: %u, type: %u\n",
1160                     req->state, req->data_is_ptr, req->addr,
1161                     req->data, req->count, req->size, req->type);
1162             destroy_hvm_domain(false);
1163             return;
1164         }
1165 
1166         xen_wmb(); /* Update ioreq contents /then/ update state. */
1167 
1168         /*
1169          * We do this before we send the response so that the tools
1170          * have the opportunity to pick up on the reset before the
1171          * guest resumes and does a hlt with interrupts disabled which
1172          * causes Xen to powerdown the domain.
1173          */
1174         if (runstate_is_running()) {
1175             ShutdownCause request;
1176 
1177             if (qemu_shutdown_requested_get()) {
1178                 destroy_hvm_domain(false);
1179             }
1180             request = qemu_reset_requested_get();
1181             if (request) {
1182                 qemu_system_reset(request);
1183                 destroy_hvm_domain(true);
1184             }
1185         }
1186 
1187         req->state = STATE_IORESP_READY;
1188         xenevtchn_notify(state->xce_handle,
1189                          state->ioreq_local_port[state->send_vcpu]);
1190     }
1191 }
1192 
1193 static void xen_main_loop_prepare(XenIOState *state)
1194 {
1195     int evtchn_fd = -1;
1196 
1197     if (state->xce_handle != NULL) {
1198         evtchn_fd = xenevtchn_fd(state->xce_handle);
1199     }
1200 
1201     state->buffered_io_timer = timer_new_ms(QEMU_CLOCK_REALTIME, handle_buffered_io,
1202                                                  state);
1203 
1204     if (evtchn_fd != -1) {
1205         CPUState *cpu_state;
1206 
1207         DPRINTF("%s: Init cpu_by_vcpu_id\n", __func__);
1208         CPU_FOREACH(cpu_state) {
1209             DPRINTF("%s: cpu_by_vcpu_id[%d]=%p\n",
1210                     __func__, cpu_state->cpu_index, cpu_state);
1211             state->cpu_by_vcpu_id[cpu_state->cpu_index] = cpu_state;
1212         }
1213         qemu_set_fd_handler(evtchn_fd, cpu_handle_ioreq, NULL, state);
1214     }
1215 }
1216 
1217 
1218 static void xen_hvm_change_state_handler(void *opaque, int running,
1219                                          RunState rstate)
1220 {
1221     XenIOState *state = opaque;
1222 
1223     if (running) {
1224         xen_main_loop_prepare(state);
1225     }
1226 
1227     xen_set_ioreq_server_state(xen_domid,
1228                                state->ioservid,
1229                                (rstate == RUN_STATE_RUNNING));
1230 }
1231 
1232 static void xen_exit_notifier(Notifier *n, void *data)
1233 {
1234     XenIOState *state = container_of(n, XenIOState, exit);
1235 
1236     xenevtchn_close(state->xce_handle);
1237     xs_daemon_close(state->xenstore);
1238 }
1239 
1240 #ifdef XEN_COMPAT_PHYSMAP
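/*
 * Rebuild the physmap list from the xenstore nodes written by
 * xen_save_physmap().
 */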
1241 static void xen_read_physmap(XenIOState *state)
1242 {
1243     XenPhysmap *physmap = NULL;
1244     unsigned int len, num, i;
1245     char path[80], *value = NULL;
1246     char **entries = NULL;
1247 
1248     snprintf(path, sizeof(path),
1249             "/local/domain/0/device-model/%d/physmap", xen_domid);
1250     entries = xs_directory(state->xenstore, 0, path, &num);
1251     if (entries == NULL)
1252         return;
1253 
1254     for (i = 0; i < num; i++) {
1255         physmap = g_malloc(sizeof (XenPhysmap));
1256         physmap->phys_offset = strtoull(entries[i], NULL, 16);
1257         snprintf(path, sizeof(path),
1258                 "/local/domain/0/device-model/%d/physmap/%s/start_addr",
1259                 xen_domid, entries[i]);
1260         value = xs_read(state->xenstore, 0, path, &len);
1261         if (value == NULL) {
1262             g_free(physmap);
1263             continue;
1264         }
1265         physmap->start_addr = strtoull(value, NULL, 16);
1266         free(value);
1267 
1268         snprintf(path, sizeof(path),
1269                 "/local/domain/0/device-model/%d/physmap/%s/size",
1270                 xen_domid, entries[i]);
1271         value = xs_read(state->xenstore, 0, path, &len);
1272         if (value == NULL) {
1273             g_free(physmap);
1274             continue;
1275         }
1276         physmap->size = strtoull(value, NULL, 16);
1277         free(value);
1278 
1279         snprintf(path, sizeof(path),
1280                 "/local/domain/0/device-model/%d/physmap/%s/name",
1281                 xen_domid, entries[i]);
1282         physmap->name = xs_read(state->xenstore, 0, path, &len);
1283 
1284         QLIST_INSERT_HEAD(&xen_physmap, physmap, list);
1285     }
1286     free(entries);
1287 }
1288 #else
1289 static void xen_read_physmap(XenIOState *state)
1290 {
1291 }
1292 #endif
1293 
1294 static void xen_wakeup_notifier(Notifier *notifier, void *data)
1295 {
1296     xc_set_hvm_param(xen_xc, xen_domid, HVM_PARAM_ACPI_S_STATE, 0);
1297 }
1298 
1299 static int xen_map_ioreq_server(XenIOState *state)
1300 {
1301     void *addr = NULL;
1302     xenforeignmemory_resource_handle *fres;
1303     xen_pfn_t ioreq_pfn;
1304     xen_pfn_t bufioreq_pfn;
1305     evtchn_port_t bufioreq_evtchn;
1306     int rc;
1307 
1308     /*
1309      * Attempt to map using the resource API and fall back to normal
1310      * foreign mapping if this is not supported.
1311      */
1312     QEMU_BUILD_BUG_ON(XENMEM_resource_ioreq_server_frame_bufioreq != 0);
1313     QEMU_BUILD_BUG_ON(XENMEM_resource_ioreq_server_frame_ioreq(0) != 1);
1314     fres = xenforeignmemory_map_resource(xen_fmem, xen_domid,
1315                                          XENMEM_resource_ioreq_server,
1316                                          state->ioservid, 0, 2,
1317                                          &addr,
1318                                          PROT_READ | PROT_WRITE, 0);
1319     if (fres != NULL) {
1320         trace_xen_map_resource_ioreq(state->ioservid, addr);
1321         state->buffered_io_page = addr;
1322         state->shared_page = addr + TARGET_PAGE_SIZE;
1323     } else if (errno != EOPNOTSUPP) {
1324         error_report("failed to map ioreq server resources: error %d handle=%p",
1325                      errno, xen_xc);
1326         return -1;
1327     }
1328 
1329     rc = xen_get_ioreq_server_info(xen_domid, state->ioservid,
1330                                    (state->shared_page == NULL) ?
1331                                    &ioreq_pfn : NULL,
1332                                    (state->buffered_io_page == NULL) ?
1333                                    &bufioreq_pfn : NULL,
1334                                    &bufioreq_evtchn);
1335     if (rc < 0) {
1336         error_report("failed to get ioreq server info: error %d handle=%p",
1337                      errno, xen_xc);
1338         return rc;
1339     }
1340 
1341     if (state->shared_page == NULL) {
1342         DPRINTF("shared page at pfn %lx\n", ioreq_pfn);
1343 
1344         state->shared_page = xenforeignmemory_map(xen_fmem, xen_domid,
1345                                                   PROT_READ | PROT_WRITE,
1346                                                   1, &ioreq_pfn, NULL);
1347         if (state->shared_page == NULL) {
1348             error_report("map shared IO page returned error %d handle=%p",
1349                          errno, xen_xc);
1350         }
1351     }
1352 
1353     if (state->buffered_io_page == NULL) {
1354         DPRINTF("buffered io page at pfn %lx\n", bufioreq_pfn);
1355 
1356         state->buffered_io_page = xenforeignmemory_map(xen_fmem, xen_domid,
1357                                                        PROT_READ | PROT_WRITE,
1358                                                        1, &bufioreq_pfn,
1359                                                        NULL);
1360         if (state->buffered_io_page == NULL) {
1361             error_report("map buffered IO page returned error %d", errno);
1362             return -1;
1363         }
1364     }
1365 
1366     if (state->shared_page == NULL || state->buffered_io_page == NULL) {
1367         return -1;
1368     }
1369 
1370     DPRINTF("buffered io evtchn is %x\n", bufioreq_evtchn);
1371 
1372     state->bufioreq_remote_port = bufioreq_evtchn;
1373 
1374     return 0;
1375 }
1376 
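/*
 * Main Xen HVM accelerator setup: open the event channel and xenstore
 * handles, create and map the ioreq server pages, bind the per-vcpu and
 * buffered-io event channels, initialise guest RAM and the map cache, and
 * register the memory, I/O and device listeners.
 */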
1377 void xen_hvm_init(PCMachineState *pcms, MemoryRegion **ram_memory)
1378 {
1379     int i, rc;
1380     xen_pfn_t ioreq_pfn;
1381     XenIOState *state;
1382 
1383     state = g_malloc0(sizeof (XenIOState));
1384 
1385     state->xce_handle = xenevtchn_open(NULL, 0);
1386     if (state->xce_handle == NULL) {
1387         perror("xen: event channel open");
1388         goto err;
1389     }
1390 
1391     state->xenstore = xs_daemon_open();
1392     if (state->xenstore == NULL) {
1393         perror("xen: xenstore open");
1394         goto err;
1395     }
1396 
1397     xen_create_ioreq_server(xen_domid, &state->ioservid);
1398 
1399     state->exit.notify = xen_exit_notifier;
1400     qemu_add_exit_notifier(&state->exit);
1401 
1402     state->suspend.notify = xen_suspend_notifier;
1403     qemu_register_suspend_notifier(&state->suspend);
1404 
1405     state->wakeup.notify = xen_wakeup_notifier;
1406     qemu_register_wakeup_notifier(&state->wakeup);
1407 
1408     /*
1409      * Register wake-up support in QMP query-current-machine API
1410      */
1411     qemu_register_wakeup_support();
1412 
1413     rc = xen_map_ioreq_server(state);
1414     if (rc < 0) {
1415         goto err;
1416     }
1417 
1418     rc = xen_get_vmport_regs_pfn(xen_xc, xen_domid, &ioreq_pfn);
1419     if (!rc) {
1420         DPRINTF("shared vmport page at pfn %lx\n", ioreq_pfn);
1421         state->shared_vmport_page =
1422             xenforeignmemory_map(xen_fmem, xen_domid, PROT_READ|PROT_WRITE,
1423                                  1, &ioreq_pfn, NULL);
1424         if (state->shared_vmport_page == NULL) {
1425             error_report("map shared vmport IO page returned error %d handle=%p",
1426                          errno, xen_xc);
1427             goto err;
1428         }
1429     } else if (rc != -ENOSYS) {
1430         error_report("get vmport regs pfn returned error %d, rc=%d",
1431                      errno, rc);
1432         goto err;
1433     }
1434 
1435     /* Note: cpus is empty at this point in init */
1436     state->cpu_by_vcpu_id = g_malloc0(max_cpus * sizeof(CPUState *));
1437 
1438     rc = xen_set_ioreq_server_state(xen_domid, state->ioservid, true);
1439     if (rc < 0) {
1440         error_report("failed to enable ioreq server info: error %d handle=%p",
1441                      errno, xen_xc);
1442         goto err;
1443     }
1444 
1445     state->ioreq_local_port = g_malloc0(max_cpus * sizeof (evtchn_port_t));
1446 
1447     /* FIXME: how about if we overflow the page here? */
1448     for (i = 0; i < max_cpus; i++) {
1449         rc = xenevtchn_bind_interdomain(state->xce_handle, xen_domid,
1450                                         xen_vcpu_eport(state->shared_page, i));
1451         if (rc == -1) {
1452             error_report("shared evtchn %d bind error %d", i, errno);
1453             goto err;
1454         }
1455         state->ioreq_local_port[i] = rc;
1456     }
1457 
1458     rc = xenevtchn_bind_interdomain(state->xce_handle, xen_domid,
1459                                     state->bufioreq_remote_port);
1460     if (rc == -1) {
1461         error_report("buffered evtchn bind error %d", errno);
1462         goto err;
1463     }
1464     state->bufioreq_local_port = rc;
1465 
1466     /* Init RAM management */
1467 #ifdef XEN_COMPAT_PHYSMAP
1468     xen_map_cache_init(xen_phys_offset_to_gaddr, state);
1469 #else
1470     xen_map_cache_init(NULL, state);
1471 #endif
1472     xen_ram_init(pcms, ram_size, ram_memory);
1473 
1474     qemu_add_vm_change_state_handler(xen_hvm_change_state_handler, state);
1475 
1476     state->memory_listener = xen_memory_listener;
1477     memory_listener_register(&state->memory_listener, &address_space_memory);
1478     state->log_for_dirtybit = NULL;
1479 
1480     state->io_listener = xen_io_listener;
1481     memory_listener_register(&state->io_listener, &address_space_io);
1482 
1483     state->device_listener = xen_device_listener;
1484     QLIST_INIT(&state->dev_list);
1485     device_listener_register(&state->device_listener);
1486 
1487     /* Initialize backend core & drivers */
1488     if (xen_be_init() != 0) {
1489         error_report("xen backend core setup failed");
1490         goto err;
1491     }
1492     xen_be_register_common();
1493 
1494     QLIST_INIT(&xen_physmap);
1495     xen_read_physmap(state);
1496 
1497     /* Disable ACPI build because Xen handles it */
1498     pcms->acpi_build_enabled = false;
1499 
1500     return;
1501 
1502 err:
1503     error_report("xen hardware virtual machine initialisation failed");
1504     exit(1);
1505 }
1506 
1507 void destroy_hvm_domain(bool reboot)
1508 {
1509     xc_interface *xc_handle;
1510     int sts;
1511     int rc;
1512 
1513     unsigned int reason = reboot ? SHUTDOWN_reboot : SHUTDOWN_poweroff;
1514 
1515     if (xen_dmod) {
1516         rc = xendevicemodel_shutdown(xen_dmod, xen_domid, reason);
1517         if (!rc) {
1518             return;
1519         }
1520         if (errno != ENOTTY /* old Xen */) {
1521             perror("xendevicemodel_shutdown failed");
1522         }
1523         /* fall back to the old xc_domain_shutdown() method below */
1524     }
1525 
1526     xc_handle = xc_interface_open(0, 0, 0);
1527     if (xc_handle == NULL) {
1528         fprintf(stderr, "Cannot acquire xenctrl handle\n");
1529     } else {
1530         sts = xc_domain_shutdown(xc_handle, xen_domid, reason);
1531         if (sts != 0) {
1532             fprintf(stderr, "xc_domain_shutdown failed to issue %s, "
1533                     "sts %d, %s\n", reboot ? "reboot" : "poweroff",
1534                     sts, strerror(errno));
1535         } else {
1536             fprintf(stderr, "Issued domain %d %s\n", xen_domid,
1537                     reboot ? "reboot" : "poweroff");
1538         }
1539         xc_interface_close(xc_handle);
1540     }
1541 }
1542 
1543 void xen_register_framebuffer(MemoryRegion *mr)
1544 {
1545     framebuffer = mr;
1546 }
1547 
1548 void xen_shutdown_fatal_error(const char *fmt, ...)
1549 {
1550     va_list ap;
1551 
1552     va_start(ap, fmt);
1553     vfprintf(stderr, fmt, ap);
1554     va_end(ap);
1555     fprintf(stderr, "Will destroy the domain.\n");
1556     /* destroy the domain */
1557     qemu_system_shutdown_request(SHUTDOWN_CAUSE_HOST_ERROR);
1558 }
1559 
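/*
 * While a migration is in progress, report ranges written by QEMU itself
 * to Xen so they are included in the guest's dirty page tracking.
 */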
1560 void xen_hvm_modified_memory(ram_addr_t start, ram_addr_t length)
1561 {
1562     if (unlikely(xen_in_migration)) {
1563         int rc;
1564         ram_addr_t start_pfn, nb_pages;
1565 
1566         start = xen_phys_offset_to_gaddr(start, length);
1567 
1568         if (length == 0) {
1569             length = TARGET_PAGE_SIZE;
1570         }
1571         start_pfn = start >> TARGET_PAGE_BITS;
1572         nb_pages = ((start + length + TARGET_PAGE_SIZE - 1) >> TARGET_PAGE_BITS)
1573             - start_pfn;
1574         rc = xen_modified_memory(xen_domid, start_pfn, nb_pages);
1575         if (rc) {
1576             fprintf(stderr,
1577                     "%s failed for "RAM_ADDR_FMT" ("RAM_ADDR_FMT"): %i, %s\n",
1578                     __func__, start, nb_pages, errno, strerror(errno));
1579         }
1580     }
1581 }
1582 
1583 void qmp_xen_set_global_dirty_log(bool enable, Error **errp)
1584 {
1585     if (enable) {
1586         memory_global_dirty_log_start();
1587     } else {
1588         memory_global_dirty_log_stop();
1589     }
1590 }
1591