1418026caSFam Zheng /* 2418026caSFam Zheng * VFIO utility 3418026caSFam Zheng * 4418026caSFam Zheng * Copyright 2016 - 2018 Red Hat, Inc. 5418026caSFam Zheng * 6418026caSFam Zheng * Authors: 7418026caSFam Zheng * Fam Zheng <famz@redhat.com> 8418026caSFam Zheng * 9418026caSFam Zheng * This work is licensed under the terms of the GNU GPL, version 2 or later. 10418026caSFam Zheng * See the COPYING file in the top-level directory. 11418026caSFam Zheng */ 12418026caSFam Zheng 13418026caSFam Zheng #include "qemu/osdep.h" 14418026caSFam Zheng #include <sys/ioctl.h> 15418026caSFam Zheng #include <linux/vfio.h> 16418026caSFam Zheng #include "qapi/error.h" 17418026caSFam Zheng #include "exec/ramlist.h" 18418026caSFam Zheng #include "exec/cpu-common.h" 19418026caSFam Zheng #include "trace.h" 20418026caSFam Zheng #include "qemu/error-report.h" 21418026caSFam Zheng #include "standard-headers/linux/pci_regs.h" 22418026caSFam Zheng #include "qemu/event_notifier.h" 23418026caSFam Zheng #include "qemu/vfio-helpers.h" 24*6e8a355dSDaniel Brodsky #include "qemu/lockable.h" 25418026caSFam Zheng #include "trace.h" 26418026caSFam Zheng 27418026caSFam Zheng #define QEMU_VFIO_DEBUG 0 28418026caSFam Zheng 29418026caSFam Zheng #define QEMU_VFIO_IOVA_MIN 0x10000ULL 30418026caSFam Zheng /* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface, 31418026caSFam Zheng * we can use a runtime limit; alternatively it's also possible to do platform 32418026caSFam Zheng * specific detection by reading sysfs entries. Until then, 39 is a safe bet. 33418026caSFam Zheng **/ 34418026caSFam Zheng #define QEMU_VFIO_IOVA_MAX (1ULL << 39) 35418026caSFam Zheng 36418026caSFam Zheng typedef struct { 37418026caSFam Zheng /* Page aligned addr. */ 38418026caSFam Zheng void *host; 39418026caSFam Zheng size_t size; 40418026caSFam Zheng uint64_t iova; 41418026caSFam Zheng } IOVAMapping; 42418026caSFam Zheng 43418026caSFam Zheng struct QEMUVFIOState { 44418026caSFam Zheng QemuMutex lock; 45418026caSFam Zheng 46418026caSFam Zheng /* These fields are protected by BQL */ 47418026caSFam Zheng int container; 48418026caSFam Zheng int group; 49418026caSFam Zheng int device; 50418026caSFam Zheng RAMBlockNotifier ram_notifier; 51418026caSFam Zheng struct vfio_region_info config_region_info, bar_region_info[6]; 52418026caSFam Zheng 53418026caSFam Zheng /* These fields are protected by @lock */ 54418026caSFam Zheng /* VFIO's IO virtual address space is managed by splitting into a few 55418026caSFam Zheng * sections: 56418026caSFam Zheng * 57418026caSFam Zheng * --------------- <= 0 58418026caSFam Zheng * |xxxxxxxxxxxxx| 59418026caSFam Zheng * |-------------| <= QEMU_VFIO_IOVA_MIN 60418026caSFam Zheng * | | 61418026caSFam Zheng * | Fixed | 62418026caSFam Zheng * | | 63418026caSFam Zheng * |-------------| <= low_water_mark 64418026caSFam Zheng * | | 65418026caSFam Zheng * | Free | 66418026caSFam Zheng * | | 67418026caSFam Zheng * |-------------| <= high_water_mark 68418026caSFam Zheng * | | 69418026caSFam Zheng * | Temp | 70418026caSFam Zheng * | | 71418026caSFam Zheng * |-------------| <= QEMU_VFIO_IOVA_MAX 72418026caSFam Zheng * |xxxxxxxxxxxxx| 73418026caSFam Zheng * |xxxxxxxxxxxxx| 74418026caSFam Zheng * --------------- 75418026caSFam Zheng * 76418026caSFam Zheng * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid; 77418026caSFam Zheng * 78418026caSFam Zheng * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of 79418026caSFam Zheng * [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be 80418026caSFam Zheng * reclaimed - low_water_mark never shrinks; 81418026caSFam Zheng * 82418026caSFam Zheng * - IOVAs in range [low_water_mark, high_water_mark) are free; 83418026caSFam Zheng * 84418026caSFam Zheng * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile 85418026caSFam Zheng * mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area 86418026caSFam Zheng * is recycled. The caller should make sure I/O's depending on these 87418026caSFam Zheng * mappings are completed before calling. 88418026caSFam Zheng **/ 89418026caSFam Zheng uint64_t low_water_mark; 90418026caSFam Zheng uint64_t high_water_mark; 91418026caSFam Zheng IOVAMapping *mappings; 92418026caSFam Zheng int nr_mappings; 93418026caSFam Zheng }; 94418026caSFam Zheng 95418026caSFam Zheng /** 96418026caSFam Zheng * Find group file by PCI device address as specified @device, and return the 97418026caSFam Zheng * path. The returned string is owned by caller and should be g_free'ed later. 98418026caSFam Zheng */ 99418026caSFam Zheng static char *sysfs_find_group_file(const char *device, Error **errp) 100418026caSFam Zheng { 101418026caSFam Zheng char *sysfs_link; 102418026caSFam Zheng char *sysfs_group; 103418026caSFam Zheng char *p; 104418026caSFam Zheng char *path = NULL; 105418026caSFam Zheng 106418026caSFam Zheng sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device); 10778d8c99eSPaolo Bonzini sysfs_group = g_malloc0(PATH_MAX); 108418026caSFam Zheng if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) { 109418026caSFam Zheng error_setg_errno(errp, errno, "Failed to find iommu group sysfs path"); 110418026caSFam Zheng goto out; 111418026caSFam Zheng } 112418026caSFam Zheng p = strrchr(sysfs_group, '/'); 113418026caSFam Zheng if (!p) { 114418026caSFam Zheng error_setg(errp, "Failed to find iommu group number"); 115418026caSFam Zheng goto out; 116418026caSFam Zheng } 117418026caSFam Zheng 118418026caSFam Zheng path = g_strdup_printf("/dev/vfio/%s", p + 1); 119418026caSFam Zheng out: 120418026caSFam Zheng g_free(sysfs_link); 121418026caSFam Zheng g_free(sysfs_group); 122418026caSFam Zheng return path; 123418026caSFam Zheng } 124418026caSFam Zheng 125418026caSFam Zheng static inline void assert_bar_index_valid(QEMUVFIOState *s, int index) 126418026caSFam Zheng { 127418026caSFam Zheng assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info)); 128418026caSFam Zheng } 129418026caSFam Zheng 130418026caSFam Zheng static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp) 131418026caSFam Zheng { 132418026caSFam Zheng assert_bar_index_valid(s, index); 133418026caSFam Zheng s->bar_region_info[index] = (struct vfio_region_info) { 134418026caSFam Zheng .index = VFIO_PCI_BAR0_REGION_INDEX + index, 135418026caSFam Zheng .argsz = sizeof(struct vfio_region_info), 136418026caSFam Zheng }; 137418026caSFam Zheng if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) { 138418026caSFam Zheng error_setg_errno(errp, errno, "Failed to get BAR region info"); 139418026caSFam Zheng return -errno; 140418026caSFam Zheng } 141418026caSFam Zheng 142418026caSFam Zheng return 0; 143418026caSFam Zheng } 144418026caSFam Zheng 145418026caSFam Zheng /** 146418026caSFam Zheng * Map a PCI bar area. 147418026caSFam Zheng */ 148418026caSFam Zheng void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, 149418026caSFam Zheng uint64_t offset, uint64_t size, 150418026caSFam Zheng Error **errp) 151418026caSFam Zheng { 152418026caSFam Zheng void *p; 153418026caSFam Zheng assert_bar_index_valid(s, index); 154418026caSFam Zheng p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset), 155418026caSFam Zheng PROT_READ | PROT_WRITE, MAP_SHARED, 156418026caSFam Zheng s->device, s->bar_region_info[index].offset + offset); 157418026caSFam Zheng if (p == MAP_FAILED) { 158418026caSFam Zheng error_setg_errno(errp, errno, "Failed to map BAR region"); 159418026caSFam Zheng p = NULL; 160418026caSFam Zheng } 161418026caSFam Zheng return p; 162418026caSFam Zheng } 163418026caSFam Zheng 164418026caSFam Zheng /** 165418026caSFam Zheng * Unmap a PCI bar area. 166418026caSFam Zheng */ 167418026caSFam Zheng void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar, 168418026caSFam Zheng uint64_t offset, uint64_t size) 169418026caSFam Zheng { 170418026caSFam Zheng if (bar) { 171418026caSFam Zheng munmap(bar, MIN(size, s->bar_region_info[index].size - offset)); 172418026caSFam Zheng } 173418026caSFam Zheng } 174418026caSFam Zheng 175418026caSFam Zheng /** 176418026caSFam Zheng * Initialize device IRQ with @irq_type and and register an event notifier. 177418026caSFam Zheng */ 178418026caSFam Zheng int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e, 179418026caSFam Zheng int irq_type, Error **errp) 180418026caSFam Zheng { 181418026caSFam Zheng int r; 182418026caSFam Zheng struct vfio_irq_set *irq_set; 183418026caSFam Zheng size_t irq_set_size; 184418026caSFam Zheng struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; 185418026caSFam Zheng 186418026caSFam Zheng irq_info.index = irq_type; 187418026caSFam Zheng if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) { 188418026caSFam Zheng error_setg_errno(errp, errno, "Failed to get device interrupt info"); 189418026caSFam Zheng return -errno; 190418026caSFam Zheng } 191418026caSFam Zheng if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) { 192418026caSFam Zheng error_setg(errp, "Device interrupt doesn't support eventfd"); 193418026caSFam Zheng return -EINVAL; 194418026caSFam Zheng } 195418026caSFam Zheng 196418026caSFam Zheng irq_set_size = sizeof(*irq_set) + sizeof(int); 197418026caSFam Zheng irq_set = g_malloc0(irq_set_size); 198418026caSFam Zheng 199418026caSFam Zheng /* Get to a known IRQ state */ 200418026caSFam Zheng *irq_set = (struct vfio_irq_set) { 201418026caSFam Zheng .argsz = irq_set_size, 202418026caSFam Zheng .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER, 203418026caSFam Zheng .index = irq_info.index, 204418026caSFam Zheng .start = 0, 205418026caSFam Zheng .count = 1, 206418026caSFam Zheng }; 207418026caSFam Zheng 208418026caSFam Zheng *(int *)&irq_set->data = event_notifier_get_fd(e); 209418026caSFam Zheng r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set); 210418026caSFam Zheng g_free(irq_set); 211418026caSFam Zheng if (r) { 212418026caSFam Zheng error_setg_errno(errp, errno, "Failed to setup device interrupt"); 213418026caSFam Zheng return -errno; 214418026caSFam Zheng } 215418026caSFam Zheng return 0; 216418026caSFam Zheng } 217418026caSFam Zheng 218418026caSFam Zheng static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf, 219418026caSFam Zheng int size, int ofs) 220418026caSFam Zheng { 221418026caSFam Zheng int ret; 222418026caSFam Zheng 223418026caSFam Zheng do { 224418026caSFam Zheng ret = pread(s->device, buf, size, s->config_region_info.offset + ofs); 225418026caSFam Zheng } while (ret == -1 && errno == EINTR); 226418026caSFam Zheng return ret == size ? 0 : -errno; 227418026caSFam Zheng } 228418026caSFam Zheng 229418026caSFam Zheng static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs) 230418026caSFam Zheng { 231418026caSFam Zheng int ret; 232418026caSFam Zheng 233418026caSFam Zheng do { 234418026caSFam Zheng ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs); 235418026caSFam Zheng } while (ret == -1 && errno == EINTR); 236418026caSFam Zheng return ret == size ? 0 : -errno; 237418026caSFam Zheng } 238418026caSFam Zheng 239418026caSFam Zheng static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device, 240418026caSFam Zheng Error **errp) 241418026caSFam Zheng { 242418026caSFam Zheng int ret; 243418026caSFam Zheng int i; 244418026caSFam Zheng uint16_t pci_cmd; 245418026caSFam Zheng struct vfio_group_status group_status = { .argsz = sizeof(group_status) }; 246418026caSFam Zheng struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) }; 247418026caSFam Zheng struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; 248418026caSFam Zheng char *group_file = NULL; 249418026caSFam Zheng 250418026caSFam Zheng /* Create a new container */ 251418026caSFam Zheng s->container = open("/dev/vfio/vfio", O_RDWR); 252418026caSFam Zheng 253418026caSFam Zheng if (s->container == -1) { 254418026caSFam Zheng error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio"); 255418026caSFam Zheng return -errno; 256418026caSFam Zheng } 257418026caSFam Zheng if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) { 258418026caSFam Zheng error_setg(errp, "Invalid VFIO version"); 259418026caSFam Zheng ret = -EINVAL; 260418026caSFam Zheng goto fail_container; 261418026caSFam Zheng } 262418026caSFam Zheng 263418026caSFam Zheng if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { 264418026caSFam Zheng error_setg_errno(errp, errno, "VFIO IOMMU check failed"); 265418026caSFam Zheng ret = -EINVAL; 266418026caSFam Zheng goto fail_container; 267418026caSFam Zheng } 268418026caSFam Zheng 269418026caSFam Zheng /* Open the group */ 270418026caSFam Zheng group_file = sysfs_find_group_file(device, errp); 271418026caSFam Zheng if (!group_file) { 272418026caSFam Zheng ret = -EINVAL; 273418026caSFam Zheng goto fail_container; 274418026caSFam Zheng } 275418026caSFam Zheng 276418026caSFam Zheng s->group = open(group_file, O_RDWR); 277418026caSFam Zheng if (s->group == -1) { 278418026caSFam Zheng error_setg_errno(errp, errno, "Failed to open VFIO group file: %s", 279418026caSFam Zheng group_file); 280418026caSFam Zheng g_free(group_file); 281418026caSFam Zheng ret = -errno; 282418026caSFam Zheng goto fail_container; 283418026caSFam Zheng } 284418026caSFam Zheng g_free(group_file); 285418026caSFam Zheng 286418026caSFam Zheng /* Test the group is viable and available */ 287418026caSFam Zheng if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) { 288418026caSFam Zheng error_setg_errno(errp, errno, "Failed to get VFIO group status"); 289418026caSFam Zheng ret = -errno; 290418026caSFam Zheng goto fail; 291418026caSFam Zheng } 292418026caSFam Zheng 293418026caSFam Zheng if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { 294418026caSFam Zheng error_setg(errp, "VFIO group is not viable"); 295418026caSFam Zheng ret = -EINVAL; 296418026caSFam Zheng goto fail; 297418026caSFam Zheng } 298418026caSFam Zheng 299418026caSFam Zheng /* Add the group to the container */ 300418026caSFam Zheng if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) { 301418026caSFam Zheng error_setg_errno(errp, errno, "Failed to add group to VFIO container"); 302418026caSFam Zheng ret = -errno; 303418026caSFam Zheng goto fail; 304418026caSFam Zheng } 305418026caSFam Zheng 306418026caSFam Zheng /* Enable the IOMMU model we want */ 307418026caSFam Zheng if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) { 308418026caSFam Zheng error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type"); 309418026caSFam Zheng ret = -errno; 310418026caSFam Zheng goto fail; 311418026caSFam Zheng } 312418026caSFam Zheng 313418026caSFam Zheng /* Get additional IOMMU info */ 314418026caSFam Zheng if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) { 315418026caSFam Zheng error_setg_errno(errp, errno, "Failed to get IOMMU info"); 316418026caSFam Zheng ret = -errno; 317418026caSFam Zheng goto fail; 318418026caSFam Zheng } 319418026caSFam Zheng 320418026caSFam Zheng s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device); 321418026caSFam Zheng 322418026caSFam Zheng if (s->device < 0) { 323418026caSFam Zheng error_setg_errno(errp, errno, "Failed to get device fd"); 324418026caSFam Zheng ret = -errno; 325418026caSFam Zheng goto fail; 326418026caSFam Zheng } 327418026caSFam Zheng 328418026caSFam Zheng /* Test and setup the device */ 329418026caSFam Zheng if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) { 330418026caSFam Zheng error_setg_errno(errp, errno, "Failed to get device info"); 331418026caSFam Zheng ret = -errno; 332418026caSFam Zheng goto fail; 333418026caSFam Zheng } 334418026caSFam Zheng 335418026caSFam Zheng if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) { 336418026caSFam Zheng error_setg(errp, "Invalid device regions"); 337418026caSFam Zheng ret = -EINVAL; 338418026caSFam Zheng goto fail; 339418026caSFam Zheng } 340418026caSFam Zheng 341418026caSFam Zheng s->config_region_info = (struct vfio_region_info) { 342418026caSFam Zheng .index = VFIO_PCI_CONFIG_REGION_INDEX, 343418026caSFam Zheng .argsz = sizeof(struct vfio_region_info), 344418026caSFam Zheng }; 345418026caSFam Zheng if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) { 346418026caSFam Zheng error_setg_errno(errp, errno, "Failed to get config region info"); 347418026caSFam Zheng ret = -errno; 348418026caSFam Zheng goto fail; 349418026caSFam Zheng } 350418026caSFam Zheng 3519e722ebcSLi Qiang for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) { 352418026caSFam Zheng ret = qemu_vfio_pci_init_bar(s, i, errp); 353418026caSFam Zheng if (ret) { 354418026caSFam Zheng goto fail; 355418026caSFam Zheng } 356418026caSFam Zheng } 357418026caSFam Zheng 358418026caSFam Zheng /* Enable bus master */ 359418026caSFam Zheng ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND); 360418026caSFam Zheng if (ret) { 361418026caSFam Zheng goto fail; 362418026caSFam Zheng } 363418026caSFam Zheng pci_cmd |= PCI_COMMAND_MASTER; 364418026caSFam Zheng ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND); 365418026caSFam Zheng if (ret) { 366418026caSFam Zheng goto fail; 367418026caSFam Zheng } 368418026caSFam Zheng return 0; 369418026caSFam Zheng fail: 370418026caSFam Zheng close(s->group); 371418026caSFam Zheng fail_container: 372418026caSFam Zheng close(s->container); 373418026caSFam Zheng return ret; 374418026caSFam Zheng } 375418026caSFam Zheng 376418026caSFam Zheng static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, 377418026caSFam Zheng void *host, size_t size) 378418026caSFam Zheng { 379418026caSFam Zheng QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); 380418026caSFam Zheng trace_qemu_vfio_ram_block_added(s, host, size); 381418026caSFam Zheng qemu_vfio_dma_map(s, host, size, false, NULL); 382418026caSFam Zheng } 383418026caSFam Zheng 384418026caSFam Zheng static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, 385418026caSFam Zheng void *host, size_t size) 386418026caSFam Zheng { 387418026caSFam Zheng QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); 388418026caSFam Zheng if (host) { 389418026caSFam Zheng trace_qemu_vfio_ram_block_removed(s, host, size); 390418026caSFam Zheng qemu_vfio_dma_unmap(s, host); 391418026caSFam Zheng } 392418026caSFam Zheng } 393418026caSFam Zheng 394754cb9c0SYury Kotov static int qemu_vfio_init_ramblock(RAMBlock *rb, void *opaque) 395418026caSFam Zheng { 396754cb9c0SYury Kotov void *host_addr = qemu_ram_get_host_addr(rb); 397754cb9c0SYury Kotov ram_addr_t length = qemu_ram_get_used_length(rb); 398418026caSFam Zheng int ret; 399418026caSFam Zheng QEMUVFIOState *s = opaque; 400418026caSFam Zheng 401418026caSFam Zheng if (!host_addr) { 402418026caSFam Zheng return 0; 403418026caSFam Zheng } 404418026caSFam Zheng ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL); 405418026caSFam Zheng if (ret) { 406418026caSFam Zheng fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRId64 "\n", 407418026caSFam Zheng host_addr, (uint64_t)length); 408418026caSFam Zheng } 409418026caSFam Zheng return 0; 410418026caSFam Zheng } 411418026caSFam Zheng 412418026caSFam Zheng static void qemu_vfio_open_common(QEMUVFIOState *s) 413418026caSFam Zheng { 414549b50a3SMarkus Armbruster qemu_mutex_init(&s->lock); 415418026caSFam Zheng s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added; 416418026caSFam Zheng s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed; 417418026caSFam Zheng ram_block_notifier_add(&s->ram_notifier); 418418026caSFam Zheng s->low_water_mark = QEMU_VFIO_IOVA_MIN; 419418026caSFam Zheng s->high_water_mark = QEMU_VFIO_IOVA_MAX; 420418026caSFam Zheng qemu_ram_foreach_block(qemu_vfio_init_ramblock, s); 421418026caSFam Zheng } 422418026caSFam Zheng 423418026caSFam Zheng /** 424418026caSFam Zheng * Open a PCI device, e.g. "0000:00:01.0". 425418026caSFam Zheng */ 426418026caSFam Zheng QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp) 427418026caSFam Zheng { 428418026caSFam Zheng int r; 429418026caSFam Zheng QEMUVFIOState *s = g_new0(QEMUVFIOState, 1); 430418026caSFam Zheng 431418026caSFam Zheng r = qemu_vfio_init_pci(s, device, errp); 432418026caSFam Zheng if (r) { 433418026caSFam Zheng g_free(s); 434418026caSFam Zheng return NULL; 435418026caSFam Zheng } 436418026caSFam Zheng qemu_vfio_open_common(s); 437418026caSFam Zheng return s; 438418026caSFam Zheng } 439418026caSFam Zheng 440418026caSFam Zheng static void qemu_vfio_dump_mapping(IOVAMapping *m) 441418026caSFam Zheng { 442418026caSFam Zheng if (QEMU_VFIO_DEBUG) { 443418026caSFam Zheng printf(" vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host, 444418026caSFam Zheng (uint64_t)m->size, (uint64_t)m->iova); 445418026caSFam Zheng } 446418026caSFam Zheng } 447418026caSFam Zheng 448418026caSFam Zheng static void qemu_vfio_dump_mappings(QEMUVFIOState *s) 449418026caSFam Zheng { 450418026caSFam Zheng int i; 451418026caSFam Zheng 452418026caSFam Zheng if (QEMU_VFIO_DEBUG) { 453418026caSFam Zheng printf("vfio mappings\n"); 454418026caSFam Zheng for (i = 0; i < s->nr_mappings; ++i) { 455418026caSFam Zheng qemu_vfio_dump_mapping(&s->mappings[i]); 456418026caSFam Zheng } 457418026caSFam Zheng } 458418026caSFam Zheng } 459418026caSFam Zheng 460418026caSFam Zheng /** 461418026caSFam Zheng * Find the mapping entry that contains [host, host + size) and set @index to 462418026caSFam Zheng * the position. If no entry contains it, @index is the position _after_ which 463418026caSFam Zheng * to insert the new mapping. IOW, it is the index of the largest element that 464418026caSFam Zheng * is smaller than @host, or -1 if no entry is. 465418026caSFam Zheng */ 466418026caSFam Zheng static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host, 467418026caSFam Zheng int *index) 468418026caSFam Zheng { 469418026caSFam Zheng IOVAMapping *p = s->mappings; 470418026caSFam Zheng IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL; 471418026caSFam Zheng IOVAMapping *mid; 472418026caSFam Zheng trace_qemu_vfio_find_mapping(s, host); 473418026caSFam Zheng if (!p) { 474418026caSFam Zheng *index = -1; 475418026caSFam Zheng return NULL; 476418026caSFam Zheng } 477418026caSFam Zheng while (true) { 478418026caSFam Zheng mid = p + (q - p) / 2; 479418026caSFam Zheng if (mid == p) { 480418026caSFam Zheng break; 481418026caSFam Zheng } 482418026caSFam Zheng if (mid->host > host) { 483418026caSFam Zheng q = mid; 484418026caSFam Zheng } else if (mid->host < host) { 485418026caSFam Zheng p = mid; 486418026caSFam Zheng } else { 487418026caSFam Zheng break; 488418026caSFam Zheng } 489418026caSFam Zheng } 490418026caSFam Zheng if (mid->host > host) { 491418026caSFam Zheng mid--; 492418026caSFam Zheng } else if (mid < &s->mappings[s->nr_mappings - 1] 493418026caSFam Zheng && (mid + 1)->host <= host) { 494418026caSFam Zheng mid++; 495418026caSFam Zheng } 496418026caSFam Zheng *index = mid - &s->mappings[0]; 497418026caSFam Zheng if (mid >= &s->mappings[0] && 498418026caSFam Zheng mid->host <= host && mid->host + mid->size > host) { 499418026caSFam Zheng assert(mid < &s->mappings[s->nr_mappings]); 500418026caSFam Zheng return mid; 501418026caSFam Zheng } 502418026caSFam Zheng /* At this point *index + 1 is the right position to insert the new 503418026caSFam Zheng * mapping.*/ 504418026caSFam Zheng return NULL; 505418026caSFam Zheng } 506418026caSFam Zheng 507418026caSFam Zheng /** 508418026caSFam Zheng * Allocate IOVA and and create a new mapping record and insert it in @s. 509418026caSFam Zheng */ 510418026caSFam Zheng static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s, 511418026caSFam Zheng void *host, size_t size, 512418026caSFam Zheng int index, uint64_t iova) 513418026caSFam Zheng { 514418026caSFam Zheng int shift; 515418026caSFam Zheng IOVAMapping m = {.host = host, .size = size, .iova = iova}; 516418026caSFam Zheng IOVAMapping *insert; 517418026caSFam Zheng 518038adc2fSWei Yang assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size)); 519038adc2fSWei Yang assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size)); 520038adc2fSWei Yang assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size)); 521418026caSFam Zheng trace_qemu_vfio_new_mapping(s, host, size, index, iova); 522418026caSFam Zheng 523418026caSFam Zheng assert(index >= 0); 524418026caSFam Zheng s->nr_mappings++; 525d29eb678SOlaf Hering s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings); 526418026caSFam Zheng insert = &s->mappings[index]; 527418026caSFam Zheng shift = s->nr_mappings - index - 1; 528418026caSFam Zheng if (shift) { 529418026caSFam Zheng memmove(insert + 1, insert, shift * sizeof(s->mappings[0])); 530418026caSFam Zheng } 531418026caSFam Zheng *insert = m; 532418026caSFam Zheng return insert; 533418026caSFam Zheng } 534418026caSFam Zheng 535418026caSFam Zheng /* Do the DMA mapping with VFIO. */ 536418026caSFam Zheng static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size, 537418026caSFam Zheng uint64_t iova) 538418026caSFam Zheng { 539418026caSFam Zheng struct vfio_iommu_type1_dma_map dma_map = { 540418026caSFam Zheng .argsz = sizeof(dma_map), 541418026caSFam Zheng .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, 542418026caSFam Zheng .iova = iova, 543418026caSFam Zheng .vaddr = (uintptr_t)host, 544418026caSFam Zheng .size = size, 545418026caSFam Zheng }; 546418026caSFam Zheng trace_qemu_vfio_do_mapping(s, host, size, iova); 547418026caSFam Zheng 548418026caSFam Zheng if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) { 549b09d51c9SMichal Privoznik error_report("VFIO_MAP_DMA failed: %s", strerror(errno)); 550418026caSFam Zheng return -errno; 551418026caSFam Zheng } 552418026caSFam Zheng return 0; 553418026caSFam Zheng } 554418026caSFam Zheng 555418026caSFam Zheng /** 556418026caSFam Zheng * Undo the DMA mapping from @s with VFIO, and remove from mapping list. 557418026caSFam Zheng */ 558418026caSFam Zheng static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping, 559418026caSFam Zheng Error **errp) 560418026caSFam Zheng { 561418026caSFam Zheng int index; 562418026caSFam Zheng struct vfio_iommu_type1_dma_unmap unmap = { 563418026caSFam Zheng .argsz = sizeof(unmap), 564418026caSFam Zheng .flags = 0, 565418026caSFam Zheng .iova = mapping->iova, 566418026caSFam Zheng .size = mapping->size, 567418026caSFam Zheng }; 568418026caSFam Zheng 569418026caSFam Zheng index = mapping - s->mappings; 570418026caSFam Zheng assert(mapping->size > 0); 571038adc2fSWei Yang assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size)); 572418026caSFam Zheng assert(index >= 0 && index < s->nr_mappings); 573418026caSFam Zheng if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) { 574b09d51c9SMichal Privoznik error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed"); 575418026caSFam Zheng } 576418026caSFam Zheng memmove(mapping, &s->mappings[index + 1], 577418026caSFam Zheng sizeof(s->mappings[0]) * (s->nr_mappings - index - 1)); 578418026caSFam Zheng s->nr_mappings--; 579d29eb678SOlaf Hering s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings); 580418026caSFam Zheng } 581418026caSFam Zheng 582418026caSFam Zheng /* Check if the mapping list is (ascending) ordered. */ 583418026caSFam Zheng static bool qemu_vfio_verify_mappings(QEMUVFIOState *s) 584418026caSFam Zheng { 585418026caSFam Zheng int i; 586418026caSFam Zheng if (QEMU_VFIO_DEBUG) { 587418026caSFam Zheng for (i = 0; i < s->nr_mappings - 1; ++i) { 588418026caSFam Zheng if (!(s->mappings[i].host < s->mappings[i + 1].host)) { 589418026caSFam Zheng fprintf(stderr, "item %d not sorted!\n", i); 590418026caSFam Zheng qemu_vfio_dump_mappings(s); 591418026caSFam Zheng return false; 592418026caSFam Zheng } 593418026caSFam Zheng if (!(s->mappings[i].host + s->mappings[i].size <= 594418026caSFam Zheng s->mappings[i + 1].host)) { 595418026caSFam Zheng fprintf(stderr, "item %d overlap with next!\n", i); 596418026caSFam Zheng qemu_vfio_dump_mappings(s); 597418026caSFam Zheng return false; 598418026caSFam Zheng } 599418026caSFam Zheng } 600418026caSFam Zheng } 601418026caSFam Zheng return true; 602418026caSFam Zheng } 603418026caSFam Zheng 604418026caSFam Zheng /* Map [host, host + size) area into a contiguous IOVA address space, and store 605418026caSFam Zheng * the result in @iova if not NULL. The caller need to make sure the area is 606418026caSFam Zheng * aligned to page size, and mustn't overlap with existing mapping areas (split 607418026caSFam Zheng * mapping status within this area is not allowed). 608418026caSFam Zheng */ 609418026caSFam Zheng int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size, 610418026caSFam Zheng bool temporary, uint64_t *iova) 611418026caSFam Zheng { 612418026caSFam Zheng int ret = 0; 613418026caSFam Zheng int index; 614418026caSFam Zheng IOVAMapping *mapping; 615418026caSFam Zheng uint64_t iova0; 616418026caSFam Zheng 617038adc2fSWei Yang assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size)); 618038adc2fSWei Yang assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size)); 619418026caSFam Zheng trace_qemu_vfio_dma_map(s, host, size, temporary, iova); 620418026caSFam Zheng qemu_mutex_lock(&s->lock); 621418026caSFam Zheng mapping = qemu_vfio_find_mapping(s, host, &index); 622418026caSFam Zheng if (mapping) { 623418026caSFam Zheng iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host); 624418026caSFam Zheng } else { 625418026caSFam Zheng if (s->high_water_mark - s->low_water_mark + 1 < size) { 626418026caSFam Zheng ret = -ENOMEM; 627418026caSFam Zheng goto out; 628418026caSFam Zheng } 629418026caSFam Zheng if (!temporary) { 630418026caSFam Zheng iova0 = s->low_water_mark; 631418026caSFam Zheng mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0); 632418026caSFam Zheng if (!mapping) { 633418026caSFam Zheng ret = -ENOMEM; 634418026caSFam Zheng goto out; 635418026caSFam Zheng } 636418026caSFam Zheng assert(qemu_vfio_verify_mappings(s)); 637418026caSFam Zheng ret = qemu_vfio_do_mapping(s, host, size, iova0); 638418026caSFam Zheng if (ret) { 639418026caSFam Zheng qemu_vfio_undo_mapping(s, mapping, NULL); 640418026caSFam Zheng goto out; 641418026caSFam Zheng } 642418026caSFam Zheng s->low_water_mark += size; 643418026caSFam Zheng qemu_vfio_dump_mappings(s); 644418026caSFam Zheng } else { 645418026caSFam Zheng iova0 = s->high_water_mark - size; 646418026caSFam Zheng ret = qemu_vfio_do_mapping(s, host, size, iova0); 647418026caSFam Zheng if (ret) { 648418026caSFam Zheng goto out; 649418026caSFam Zheng } 650418026caSFam Zheng s->high_water_mark -= size; 651418026caSFam Zheng } 652418026caSFam Zheng } 653418026caSFam Zheng if (iova) { 654418026caSFam Zheng *iova = iova0; 655418026caSFam Zheng } 656418026caSFam Zheng out: 657418026caSFam Zheng qemu_mutex_unlock(&s->lock); 658418026caSFam Zheng return ret; 659418026caSFam Zheng } 660418026caSFam Zheng 661418026caSFam Zheng /* Reset the high watermark and free all "temporary" mappings. */ 662418026caSFam Zheng int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s) 663418026caSFam Zheng { 664418026caSFam Zheng struct vfio_iommu_type1_dma_unmap unmap = { 665418026caSFam Zheng .argsz = sizeof(unmap), 666418026caSFam Zheng .flags = 0, 667418026caSFam Zheng .iova = s->high_water_mark, 668418026caSFam Zheng .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark, 669418026caSFam Zheng }; 670418026caSFam Zheng trace_qemu_vfio_dma_reset_temporary(s); 671*6e8a355dSDaniel Brodsky QEMU_LOCK_GUARD(&s->lock); 672418026caSFam Zheng if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) { 673b09d51c9SMichal Privoznik error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno)); 674418026caSFam Zheng return -errno; 675418026caSFam Zheng } 676418026caSFam Zheng s->high_water_mark = QEMU_VFIO_IOVA_MAX; 677418026caSFam Zheng return 0; 678418026caSFam Zheng } 679418026caSFam Zheng 680418026caSFam Zheng /* Unmapping the whole area that was previously mapped with 681418026caSFam Zheng * qemu_vfio_dma_map(). */ 682418026caSFam Zheng void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host) 683418026caSFam Zheng { 684418026caSFam Zheng int index = 0; 685418026caSFam Zheng IOVAMapping *m; 686418026caSFam Zheng 687418026caSFam Zheng if (!host) { 688418026caSFam Zheng return; 689418026caSFam Zheng } 690418026caSFam Zheng 691418026caSFam Zheng trace_qemu_vfio_dma_unmap(s, host); 692418026caSFam Zheng qemu_mutex_lock(&s->lock); 693418026caSFam Zheng m = qemu_vfio_find_mapping(s, host, &index); 694418026caSFam Zheng if (!m) { 695418026caSFam Zheng goto out; 696418026caSFam Zheng } 697418026caSFam Zheng qemu_vfio_undo_mapping(s, m, NULL); 698418026caSFam Zheng out: 699418026caSFam Zheng qemu_mutex_unlock(&s->lock); 700418026caSFam Zheng } 701418026caSFam Zheng 702418026caSFam Zheng static void qemu_vfio_reset(QEMUVFIOState *s) 703418026caSFam Zheng { 704418026caSFam Zheng ioctl(s->device, VFIO_DEVICE_RESET); 705418026caSFam Zheng } 706418026caSFam Zheng 707418026caSFam Zheng /* Close and free the VFIO resources. */ 708418026caSFam Zheng void qemu_vfio_close(QEMUVFIOState *s) 709418026caSFam Zheng { 710418026caSFam Zheng int i; 711418026caSFam Zheng 712418026caSFam Zheng if (!s) { 713418026caSFam Zheng return; 714418026caSFam Zheng } 715418026caSFam Zheng for (i = 0; i < s->nr_mappings; ++i) { 716418026caSFam Zheng qemu_vfio_undo_mapping(s, &s->mappings[i], NULL); 717418026caSFam Zheng } 718418026caSFam Zheng ram_block_notifier_remove(&s->ram_notifier); 719418026caSFam Zheng qemu_vfio_reset(s); 720418026caSFam Zheng close(s->device); 721418026caSFam Zheng close(s->group); 722418026caSFam Zheng close(s->container); 723418026caSFam Zheng } 724