1*418026caSFam Zheng /* 2*418026caSFam Zheng * VFIO utility 3*418026caSFam Zheng * 4*418026caSFam Zheng * Copyright 2016 - 2018 Red Hat, Inc. 5*418026caSFam Zheng * 6*418026caSFam Zheng * Authors: 7*418026caSFam Zheng * Fam Zheng <famz@redhat.com> 8*418026caSFam Zheng * 9*418026caSFam Zheng * This work is licensed under the terms of the GNU GPL, version 2 or later. 10*418026caSFam Zheng * See the COPYING file in the top-level directory. 11*418026caSFam Zheng */ 12*418026caSFam Zheng 13*418026caSFam Zheng #include "qemu/osdep.h" 14*418026caSFam Zheng #include <sys/ioctl.h> 15*418026caSFam Zheng #include <linux/vfio.h> 16*418026caSFam Zheng #include "qapi/error.h" 17*418026caSFam Zheng #include "exec/ramlist.h" 18*418026caSFam Zheng #include "exec/cpu-common.h" 19*418026caSFam Zheng #include "trace.h" 20*418026caSFam Zheng #include "qemu/queue.h" 21*418026caSFam Zheng #include "qemu/error-report.h" 22*418026caSFam Zheng #include "standard-headers/linux/pci_regs.h" 23*418026caSFam Zheng #include "qemu/event_notifier.h" 24*418026caSFam Zheng #include "qemu/vfio-helpers.h" 25*418026caSFam Zheng #include "trace.h" 26*418026caSFam Zheng 27*418026caSFam Zheng #define QEMU_VFIO_DEBUG 0 28*418026caSFam Zheng 29*418026caSFam Zheng #define QEMU_VFIO_IOVA_MIN 0x10000ULL 30*418026caSFam Zheng /* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface, 31*418026caSFam Zheng * we can use a runtime limit; alternatively it's also possible to do platform 32*418026caSFam Zheng * specific detection by reading sysfs entries. Until then, 39 is a safe bet. 33*418026caSFam Zheng **/ 34*418026caSFam Zheng #define QEMU_VFIO_IOVA_MAX (1ULL << 39) 35*418026caSFam Zheng 36*418026caSFam Zheng typedef struct { 37*418026caSFam Zheng /* Page aligned addr. */ 38*418026caSFam Zheng void *host; 39*418026caSFam Zheng size_t size; 40*418026caSFam Zheng uint64_t iova; 41*418026caSFam Zheng } IOVAMapping; 42*418026caSFam Zheng 43*418026caSFam Zheng struct QEMUVFIOState { 44*418026caSFam Zheng QemuMutex lock; 45*418026caSFam Zheng 46*418026caSFam Zheng /* These fields are protected by BQL */ 47*418026caSFam Zheng int container; 48*418026caSFam Zheng int group; 49*418026caSFam Zheng int device; 50*418026caSFam Zheng RAMBlockNotifier ram_notifier; 51*418026caSFam Zheng struct vfio_region_info config_region_info, bar_region_info[6]; 52*418026caSFam Zheng 53*418026caSFam Zheng /* These fields are protected by @lock */ 54*418026caSFam Zheng /* VFIO's IO virtual address space is managed by splitting into a few 55*418026caSFam Zheng * sections: 56*418026caSFam Zheng * 57*418026caSFam Zheng * --------------- <= 0 58*418026caSFam Zheng * |xxxxxxxxxxxxx| 59*418026caSFam Zheng * |-------------| <= QEMU_VFIO_IOVA_MIN 60*418026caSFam Zheng * | | 61*418026caSFam Zheng * | Fixed | 62*418026caSFam Zheng * | | 63*418026caSFam Zheng * |-------------| <= low_water_mark 64*418026caSFam Zheng * | | 65*418026caSFam Zheng * | Free | 66*418026caSFam Zheng * | | 67*418026caSFam Zheng * |-------------| <= high_water_mark 68*418026caSFam Zheng * | | 69*418026caSFam Zheng * | Temp | 70*418026caSFam Zheng * | | 71*418026caSFam Zheng * |-------------| <= QEMU_VFIO_IOVA_MAX 72*418026caSFam Zheng * |xxxxxxxxxxxxx| 73*418026caSFam Zheng * |xxxxxxxxxxxxx| 74*418026caSFam Zheng * --------------- 75*418026caSFam Zheng * 76*418026caSFam Zheng * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid; 77*418026caSFam Zheng * 78*418026caSFam Zheng * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of 79*418026caSFam Zheng * [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be 80*418026caSFam Zheng * reclaimed - low_water_mark never shrinks; 81*418026caSFam Zheng * 82*418026caSFam Zheng * - IOVAs in range [low_water_mark, high_water_mark) are free; 83*418026caSFam Zheng * 84*418026caSFam Zheng * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile 85*418026caSFam Zheng * mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area 86*418026caSFam Zheng * is recycled. The caller should make sure I/O's depending on these 87*418026caSFam Zheng * mappings are completed before calling. 88*418026caSFam Zheng **/ 89*418026caSFam Zheng uint64_t low_water_mark; 90*418026caSFam Zheng uint64_t high_water_mark; 91*418026caSFam Zheng IOVAMapping *mappings; 92*418026caSFam Zheng int nr_mappings; 93*418026caSFam Zheng }; 94*418026caSFam Zheng 95*418026caSFam Zheng /** 96*418026caSFam Zheng * Find group file by PCI device address as specified @device, and return the 97*418026caSFam Zheng * path. The returned string is owned by caller and should be g_free'ed later. 98*418026caSFam Zheng */ 99*418026caSFam Zheng static char *sysfs_find_group_file(const char *device, Error **errp) 100*418026caSFam Zheng { 101*418026caSFam Zheng char *sysfs_link; 102*418026caSFam Zheng char *sysfs_group; 103*418026caSFam Zheng char *p; 104*418026caSFam Zheng char *path = NULL; 105*418026caSFam Zheng 106*418026caSFam Zheng sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device); 107*418026caSFam Zheng sysfs_group = g_malloc(PATH_MAX); 108*418026caSFam Zheng if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) { 109*418026caSFam Zheng error_setg_errno(errp, errno, "Failed to find iommu group sysfs path"); 110*418026caSFam Zheng goto out; 111*418026caSFam Zheng } 112*418026caSFam Zheng p = strrchr(sysfs_group, '/'); 113*418026caSFam Zheng if (!p) { 114*418026caSFam Zheng error_setg(errp, "Failed to find iommu group number"); 115*418026caSFam Zheng goto out; 116*418026caSFam Zheng } 117*418026caSFam Zheng 118*418026caSFam Zheng path = g_strdup_printf("/dev/vfio/%s", p + 1); 119*418026caSFam Zheng out: 120*418026caSFam Zheng g_free(sysfs_link); 121*418026caSFam Zheng g_free(sysfs_group); 122*418026caSFam Zheng return path; 123*418026caSFam Zheng } 124*418026caSFam Zheng 125*418026caSFam Zheng static inline void assert_bar_index_valid(QEMUVFIOState *s, int index) 126*418026caSFam Zheng { 127*418026caSFam Zheng assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info)); 128*418026caSFam Zheng } 129*418026caSFam Zheng 130*418026caSFam Zheng static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp) 131*418026caSFam Zheng { 132*418026caSFam Zheng assert_bar_index_valid(s, index); 133*418026caSFam Zheng s->bar_region_info[index] = (struct vfio_region_info) { 134*418026caSFam Zheng .index = VFIO_PCI_BAR0_REGION_INDEX + index, 135*418026caSFam Zheng .argsz = sizeof(struct vfio_region_info), 136*418026caSFam Zheng }; 137*418026caSFam Zheng if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) { 138*418026caSFam Zheng error_setg_errno(errp, errno, "Failed to get BAR region info"); 139*418026caSFam Zheng return -errno; 140*418026caSFam Zheng } 141*418026caSFam Zheng 142*418026caSFam Zheng return 0; 143*418026caSFam Zheng } 144*418026caSFam Zheng 145*418026caSFam Zheng /** 146*418026caSFam Zheng * Map a PCI bar area. 147*418026caSFam Zheng */ 148*418026caSFam Zheng void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, 149*418026caSFam Zheng uint64_t offset, uint64_t size, 150*418026caSFam Zheng Error **errp) 151*418026caSFam Zheng { 152*418026caSFam Zheng void *p; 153*418026caSFam Zheng assert_bar_index_valid(s, index); 154*418026caSFam Zheng p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset), 155*418026caSFam Zheng PROT_READ | PROT_WRITE, MAP_SHARED, 156*418026caSFam Zheng s->device, s->bar_region_info[index].offset + offset); 157*418026caSFam Zheng if (p == MAP_FAILED) { 158*418026caSFam Zheng error_setg_errno(errp, errno, "Failed to map BAR region"); 159*418026caSFam Zheng p = NULL; 160*418026caSFam Zheng } 161*418026caSFam Zheng return p; 162*418026caSFam Zheng } 163*418026caSFam Zheng 164*418026caSFam Zheng /** 165*418026caSFam Zheng * Unmap a PCI bar area. 166*418026caSFam Zheng */ 167*418026caSFam Zheng void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar, 168*418026caSFam Zheng uint64_t offset, uint64_t size) 169*418026caSFam Zheng { 170*418026caSFam Zheng if (bar) { 171*418026caSFam Zheng munmap(bar, MIN(size, s->bar_region_info[index].size - offset)); 172*418026caSFam Zheng } 173*418026caSFam Zheng } 174*418026caSFam Zheng 175*418026caSFam Zheng /** 176*418026caSFam Zheng * Initialize device IRQ with @irq_type and and register an event notifier. 177*418026caSFam Zheng */ 178*418026caSFam Zheng int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e, 179*418026caSFam Zheng int irq_type, Error **errp) 180*418026caSFam Zheng { 181*418026caSFam Zheng int r; 182*418026caSFam Zheng struct vfio_irq_set *irq_set; 183*418026caSFam Zheng size_t irq_set_size; 184*418026caSFam Zheng struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; 185*418026caSFam Zheng 186*418026caSFam Zheng irq_info.index = irq_type; 187*418026caSFam Zheng if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) { 188*418026caSFam Zheng error_setg_errno(errp, errno, "Failed to get device interrupt info"); 189*418026caSFam Zheng return -errno; 190*418026caSFam Zheng } 191*418026caSFam Zheng if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) { 192*418026caSFam Zheng error_setg(errp, "Device interrupt doesn't support eventfd"); 193*418026caSFam Zheng return -EINVAL; 194*418026caSFam Zheng } 195*418026caSFam Zheng 196*418026caSFam Zheng irq_set_size = sizeof(*irq_set) + sizeof(int); 197*418026caSFam Zheng irq_set = g_malloc0(irq_set_size); 198*418026caSFam Zheng 199*418026caSFam Zheng /* Get to a known IRQ state */ 200*418026caSFam Zheng *irq_set = (struct vfio_irq_set) { 201*418026caSFam Zheng .argsz = irq_set_size, 202*418026caSFam Zheng .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER, 203*418026caSFam Zheng .index = irq_info.index, 204*418026caSFam Zheng .start = 0, 205*418026caSFam Zheng .count = 1, 206*418026caSFam Zheng }; 207*418026caSFam Zheng 208*418026caSFam Zheng *(int *)&irq_set->data = event_notifier_get_fd(e); 209*418026caSFam Zheng r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set); 210*418026caSFam Zheng g_free(irq_set); 211*418026caSFam Zheng if (r) { 212*418026caSFam Zheng error_setg_errno(errp, errno, "Failed to setup device interrupt"); 213*418026caSFam Zheng return -errno; 214*418026caSFam Zheng } 215*418026caSFam Zheng return 0; 216*418026caSFam Zheng } 217*418026caSFam Zheng 218*418026caSFam Zheng static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf, 219*418026caSFam Zheng int size, int ofs) 220*418026caSFam Zheng { 221*418026caSFam Zheng int ret; 222*418026caSFam Zheng 223*418026caSFam Zheng do { 224*418026caSFam Zheng ret = pread(s->device, buf, size, s->config_region_info.offset + ofs); 225*418026caSFam Zheng } while (ret == -1 && errno == EINTR); 226*418026caSFam Zheng return ret == size ? 0 : -errno; 227*418026caSFam Zheng } 228*418026caSFam Zheng 229*418026caSFam Zheng static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs) 230*418026caSFam Zheng { 231*418026caSFam Zheng int ret; 232*418026caSFam Zheng 233*418026caSFam Zheng do { 234*418026caSFam Zheng ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs); 235*418026caSFam Zheng } while (ret == -1 && errno == EINTR); 236*418026caSFam Zheng return ret == size ? 0 : -errno; 237*418026caSFam Zheng } 238*418026caSFam Zheng 239*418026caSFam Zheng static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device, 240*418026caSFam Zheng Error **errp) 241*418026caSFam Zheng { 242*418026caSFam Zheng int ret; 243*418026caSFam Zheng int i; 244*418026caSFam Zheng uint16_t pci_cmd; 245*418026caSFam Zheng struct vfio_group_status group_status = { .argsz = sizeof(group_status) }; 246*418026caSFam Zheng struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) }; 247*418026caSFam Zheng struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; 248*418026caSFam Zheng char *group_file = NULL; 249*418026caSFam Zheng 250*418026caSFam Zheng /* Create a new container */ 251*418026caSFam Zheng s->container = open("/dev/vfio/vfio", O_RDWR); 252*418026caSFam Zheng 253*418026caSFam Zheng if (s->container == -1) { 254*418026caSFam Zheng error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio"); 255*418026caSFam Zheng return -errno; 256*418026caSFam Zheng } 257*418026caSFam Zheng if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) { 258*418026caSFam Zheng error_setg(errp, "Invalid VFIO version"); 259*418026caSFam Zheng ret = -EINVAL; 260*418026caSFam Zheng goto fail_container; 261*418026caSFam Zheng } 262*418026caSFam Zheng 263*418026caSFam Zheng if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { 264*418026caSFam Zheng error_setg_errno(errp, errno, "VFIO IOMMU check failed"); 265*418026caSFam Zheng ret = -EINVAL; 266*418026caSFam Zheng goto fail_container; 267*418026caSFam Zheng } 268*418026caSFam Zheng 269*418026caSFam Zheng /* Open the group */ 270*418026caSFam Zheng group_file = sysfs_find_group_file(device, errp); 271*418026caSFam Zheng if (!group_file) { 272*418026caSFam Zheng ret = -EINVAL; 273*418026caSFam Zheng goto fail_container; 274*418026caSFam Zheng } 275*418026caSFam Zheng 276*418026caSFam Zheng s->group = open(group_file, O_RDWR); 277*418026caSFam Zheng if (s->group == -1) { 278*418026caSFam Zheng error_setg_errno(errp, errno, "Failed to open VFIO group file: %s", 279*418026caSFam Zheng group_file); 280*418026caSFam Zheng g_free(group_file); 281*418026caSFam Zheng ret = -errno; 282*418026caSFam Zheng goto fail_container; 283*418026caSFam Zheng } 284*418026caSFam Zheng g_free(group_file); 285*418026caSFam Zheng 286*418026caSFam Zheng /* Test the group is viable and available */ 287*418026caSFam Zheng if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) { 288*418026caSFam Zheng error_setg_errno(errp, errno, "Failed to get VFIO group status"); 289*418026caSFam Zheng ret = -errno; 290*418026caSFam Zheng goto fail; 291*418026caSFam Zheng } 292*418026caSFam Zheng 293*418026caSFam Zheng if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { 294*418026caSFam Zheng error_setg(errp, "VFIO group is not viable"); 295*418026caSFam Zheng ret = -EINVAL; 296*418026caSFam Zheng goto fail; 297*418026caSFam Zheng } 298*418026caSFam Zheng 299*418026caSFam Zheng /* Add the group to the container */ 300*418026caSFam Zheng if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) { 301*418026caSFam Zheng error_setg_errno(errp, errno, "Failed to add group to VFIO container"); 302*418026caSFam Zheng ret = -errno; 303*418026caSFam Zheng goto fail; 304*418026caSFam Zheng } 305*418026caSFam Zheng 306*418026caSFam Zheng /* Enable the IOMMU model we want */ 307*418026caSFam Zheng if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) { 308*418026caSFam Zheng error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type"); 309*418026caSFam Zheng ret = -errno; 310*418026caSFam Zheng goto fail; 311*418026caSFam Zheng } 312*418026caSFam Zheng 313*418026caSFam Zheng /* Get additional IOMMU info */ 314*418026caSFam Zheng if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) { 315*418026caSFam Zheng error_setg_errno(errp, errno, "Failed to get IOMMU info"); 316*418026caSFam Zheng ret = -errno; 317*418026caSFam Zheng goto fail; 318*418026caSFam Zheng } 319*418026caSFam Zheng 320*418026caSFam Zheng s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device); 321*418026caSFam Zheng 322*418026caSFam Zheng if (s->device < 0) { 323*418026caSFam Zheng error_setg_errno(errp, errno, "Failed to get device fd"); 324*418026caSFam Zheng ret = -errno; 325*418026caSFam Zheng goto fail; 326*418026caSFam Zheng } 327*418026caSFam Zheng 328*418026caSFam Zheng /* Test and setup the device */ 329*418026caSFam Zheng if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) { 330*418026caSFam Zheng error_setg_errno(errp, errno, "Failed to get device info"); 331*418026caSFam Zheng ret = -errno; 332*418026caSFam Zheng goto fail; 333*418026caSFam Zheng } 334*418026caSFam Zheng 335*418026caSFam Zheng if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) { 336*418026caSFam Zheng error_setg(errp, "Invalid device regions"); 337*418026caSFam Zheng ret = -EINVAL; 338*418026caSFam Zheng goto fail; 339*418026caSFam Zheng } 340*418026caSFam Zheng 341*418026caSFam Zheng s->config_region_info = (struct vfio_region_info) { 342*418026caSFam Zheng .index = VFIO_PCI_CONFIG_REGION_INDEX, 343*418026caSFam Zheng .argsz = sizeof(struct vfio_region_info), 344*418026caSFam Zheng }; 345*418026caSFam Zheng if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) { 346*418026caSFam Zheng error_setg_errno(errp, errno, "Failed to get config region info"); 347*418026caSFam Zheng ret = -errno; 348*418026caSFam Zheng goto fail; 349*418026caSFam Zheng } 350*418026caSFam Zheng 351*418026caSFam Zheng for (i = 0; i < 6; i++) { 352*418026caSFam Zheng ret = qemu_vfio_pci_init_bar(s, i, errp); 353*418026caSFam Zheng if (ret) { 354*418026caSFam Zheng goto fail; 355*418026caSFam Zheng } 356*418026caSFam Zheng } 357*418026caSFam Zheng 358*418026caSFam Zheng /* Enable bus master */ 359*418026caSFam Zheng ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND); 360*418026caSFam Zheng if (ret) { 361*418026caSFam Zheng goto fail; 362*418026caSFam Zheng } 363*418026caSFam Zheng pci_cmd |= PCI_COMMAND_MASTER; 364*418026caSFam Zheng ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND); 365*418026caSFam Zheng if (ret) { 366*418026caSFam Zheng goto fail; 367*418026caSFam Zheng } 368*418026caSFam Zheng return 0; 369*418026caSFam Zheng fail: 370*418026caSFam Zheng close(s->group); 371*418026caSFam Zheng fail_container: 372*418026caSFam Zheng close(s->container); 373*418026caSFam Zheng return ret; 374*418026caSFam Zheng } 375*418026caSFam Zheng 376*418026caSFam Zheng static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, 377*418026caSFam Zheng void *host, size_t size) 378*418026caSFam Zheng { 379*418026caSFam Zheng QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); 380*418026caSFam Zheng trace_qemu_vfio_ram_block_added(s, host, size); 381*418026caSFam Zheng qemu_vfio_dma_map(s, host, size, false, NULL); 382*418026caSFam Zheng } 383*418026caSFam Zheng 384*418026caSFam Zheng static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, 385*418026caSFam Zheng void *host, size_t size) 386*418026caSFam Zheng { 387*418026caSFam Zheng QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier); 388*418026caSFam Zheng if (host) { 389*418026caSFam Zheng trace_qemu_vfio_ram_block_removed(s, host, size); 390*418026caSFam Zheng qemu_vfio_dma_unmap(s, host); 391*418026caSFam Zheng } 392*418026caSFam Zheng } 393*418026caSFam Zheng 394*418026caSFam Zheng static int qemu_vfio_init_ramblock(const char *block_name, void *host_addr, 395*418026caSFam Zheng ram_addr_t offset, ram_addr_t length, 396*418026caSFam Zheng void *opaque) 397*418026caSFam Zheng { 398*418026caSFam Zheng int ret; 399*418026caSFam Zheng QEMUVFIOState *s = opaque; 400*418026caSFam Zheng 401*418026caSFam Zheng if (!host_addr) { 402*418026caSFam Zheng return 0; 403*418026caSFam Zheng } 404*418026caSFam Zheng ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL); 405*418026caSFam Zheng if (ret) { 406*418026caSFam Zheng fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRId64 "\n", 407*418026caSFam Zheng host_addr, (uint64_t)length); 408*418026caSFam Zheng } 409*418026caSFam Zheng return 0; 410*418026caSFam Zheng } 411*418026caSFam Zheng 412*418026caSFam Zheng static void qemu_vfio_open_common(QEMUVFIOState *s) 413*418026caSFam Zheng { 414*418026caSFam Zheng s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added; 415*418026caSFam Zheng s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed; 416*418026caSFam Zheng ram_block_notifier_add(&s->ram_notifier); 417*418026caSFam Zheng s->low_water_mark = QEMU_VFIO_IOVA_MIN; 418*418026caSFam Zheng s->high_water_mark = QEMU_VFIO_IOVA_MAX; 419*418026caSFam Zheng qemu_ram_foreach_block(qemu_vfio_init_ramblock, s); 420*418026caSFam Zheng qemu_mutex_init(&s->lock); 421*418026caSFam Zheng } 422*418026caSFam Zheng 423*418026caSFam Zheng /** 424*418026caSFam Zheng * Open a PCI device, e.g. "0000:00:01.0". 425*418026caSFam Zheng */ 426*418026caSFam Zheng QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp) 427*418026caSFam Zheng { 428*418026caSFam Zheng int r; 429*418026caSFam Zheng QEMUVFIOState *s = g_new0(QEMUVFIOState, 1); 430*418026caSFam Zheng 431*418026caSFam Zheng r = qemu_vfio_init_pci(s, device, errp); 432*418026caSFam Zheng if (r) { 433*418026caSFam Zheng g_free(s); 434*418026caSFam Zheng return NULL; 435*418026caSFam Zheng } 436*418026caSFam Zheng qemu_vfio_open_common(s); 437*418026caSFam Zheng return s; 438*418026caSFam Zheng } 439*418026caSFam Zheng 440*418026caSFam Zheng static void qemu_vfio_dump_mapping(IOVAMapping *m) 441*418026caSFam Zheng { 442*418026caSFam Zheng if (QEMU_VFIO_DEBUG) { 443*418026caSFam Zheng printf(" vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host, 444*418026caSFam Zheng (uint64_t)m->size, (uint64_t)m->iova); 445*418026caSFam Zheng } 446*418026caSFam Zheng } 447*418026caSFam Zheng 448*418026caSFam Zheng static void qemu_vfio_dump_mappings(QEMUVFIOState *s) 449*418026caSFam Zheng { 450*418026caSFam Zheng int i; 451*418026caSFam Zheng 452*418026caSFam Zheng if (QEMU_VFIO_DEBUG) { 453*418026caSFam Zheng printf("vfio mappings\n"); 454*418026caSFam Zheng for (i = 0; i < s->nr_mappings; ++i) { 455*418026caSFam Zheng qemu_vfio_dump_mapping(&s->mappings[i]); 456*418026caSFam Zheng } 457*418026caSFam Zheng } 458*418026caSFam Zheng } 459*418026caSFam Zheng 460*418026caSFam Zheng /** 461*418026caSFam Zheng * Find the mapping entry that contains [host, host + size) and set @index to 462*418026caSFam Zheng * the position. If no entry contains it, @index is the position _after_ which 463*418026caSFam Zheng * to insert the new mapping. IOW, it is the index of the largest element that 464*418026caSFam Zheng * is smaller than @host, or -1 if no entry is. 465*418026caSFam Zheng */ 466*418026caSFam Zheng static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host, 467*418026caSFam Zheng int *index) 468*418026caSFam Zheng { 469*418026caSFam Zheng IOVAMapping *p = s->mappings; 470*418026caSFam Zheng IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL; 471*418026caSFam Zheng IOVAMapping *mid; 472*418026caSFam Zheng trace_qemu_vfio_find_mapping(s, host); 473*418026caSFam Zheng if (!p) { 474*418026caSFam Zheng *index = -1; 475*418026caSFam Zheng return NULL; 476*418026caSFam Zheng } 477*418026caSFam Zheng while (true) { 478*418026caSFam Zheng mid = p + (q - p) / 2; 479*418026caSFam Zheng if (mid == p) { 480*418026caSFam Zheng break; 481*418026caSFam Zheng } 482*418026caSFam Zheng if (mid->host > host) { 483*418026caSFam Zheng q = mid; 484*418026caSFam Zheng } else if (mid->host < host) { 485*418026caSFam Zheng p = mid; 486*418026caSFam Zheng } else { 487*418026caSFam Zheng break; 488*418026caSFam Zheng } 489*418026caSFam Zheng } 490*418026caSFam Zheng if (mid->host > host) { 491*418026caSFam Zheng mid--; 492*418026caSFam Zheng } else if (mid < &s->mappings[s->nr_mappings - 1] 493*418026caSFam Zheng && (mid + 1)->host <= host) { 494*418026caSFam Zheng mid++; 495*418026caSFam Zheng } 496*418026caSFam Zheng *index = mid - &s->mappings[0]; 497*418026caSFam Zheng if (mid >= &s->mappings[0] && 498*418026caSFam Zheng mid->host <= host && mid->host + mid->size > host) { 499*418026caSFam Zheng assert(mid < &s->mappings[s->nr_mappings]); 500*418026caSFam Zheng return mid; 501*418026caSFam Zheng } 502*418026caSFam Zheng /* At this point *index + 1 is the right position to insert the new 503*418026caSFam Zheng * mapping.*/ 504*418026caSFam Zheng return NULL; 505*418026caSFam Zheng } 506*418026caSFam Zheng 507*418026caSFam Zheng /** 508*418026caSFam Zheng * Allocate IOVA and and create a new mapping record and insert it in @s. 509*418026caSFam Zheng */ 510*418026caSFam Zheng static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s, 511*418026caSFam Zheng void *host, size_t size, 512*418026caSFam Zheng int index, uint64_t iova) 513*418026caSFam Zheng { 514*418026caSFam Zheng int shift; 515*418026caSFam Zheng IOVAMapping m = {.host = host, .size = size, .iova = iova}; 516*418026caSFam Zheng IOVAMapping *insert; 517*418026caSFam Zheng 518*418026caSFam Zheng assert(QEMU_IS_ALIGNED(size, getpagesize())); 519*418026caSFam Zheng assert(QEMU_IS_ALIGNED(s->low_water_mark, getpagesize())); 520*418026caSFam Zheng assert(QEMU_IS_ALIGNED(s->high_water_mark, getpagesize())); 521*418026caSFam Zheng trace_qemu_vfio_new_mapping(s, host, size, index, iova); 522*418026caSFam Zheng 523*418026caSFam Zheng assert(index >= 0); 524*418026caSFam Zheng s->nr_mappings++; 525*418026caSFam Zheng s->mappings = g_realloc_n(s->mappings, sizeof(s->mappings[0]), 526*418026caSFam Zheng s->nr_mappings); 527*418026caSFam Zheng insert = &s->mappings[index]; 528*418026caSFam Zheng shift = s->nr_mappings - index - 1; 529*418026caSFam Zheng if (shift) { 530*418026caSFam Zheng memmove(insert + 1, insert, shift * sizeof(s->mappings[0])); 531*418026caSFam Zheng } 532*418026caSFam Zheng *insert = m; 533*418026caSFam Zheng return insert; 534*418026caSFam Zheng } 535*418026caSFam Zheng 536*418026caSFam Zheng /* Do the DMA mapping with VFIO. */ 537*418026caSFam Zheng static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size, 538*418026caSFam Zheng uint64_t iova) 539*418026caSFam Zheng { 540*418026caSFam Zheng struct vfio_iommu_type1_dma_map dma_map = { 541*418026caSFam Zheng .argsz = sizeof(dma_map), 542*418026caSFam Zheng .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, 543*418026caSFam Zheng .iova = iova, 544*418026caSFam Zheng .vaddr = (uintptr_t)host, 545*418026caSFam Zheng .size = size, 546*418026caSFam Zheng }; 547*418026caSFam Zheng trace_qemu_vfio_do_mapping(s, host, size, iova); 548*418026caSFam Zheng 549*418026caSFam Zheng if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) { 550*418026caSFam Zheng error_report("VFIO_MAP_DMA: %d", -errno); 551*418026caSFam Zheng return -errno; 552*418026caSFam Zheng } 553*418026caSFam Zheng return 0; 554*418026caSFam Zheng } 555*418026caSFam Zheng 556*418026caSFam Zheng /** 557*418026caSFam Zheng * Undo the DMA mapping from @s with VFIO, and remove from mapping list. 558*418026caSFam Zheng */ 559*418026caSFam Zheng static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping, 560*418026caSFam Zheng Error **errp) 561*418026caSFam Zheng { 562*418026caSFam Zheng int index; 563*418026caSFam Zheng struct vfio_iommu_type1_dma_unmap unmap = { 564*418026caSFam Zheng .argsz = sizeof(unmap), 565*418026caSFam Zheng .flags = 0, 566*418026caSFam Zheng .iova = mapping->iova, 567*418026caSFam Zheng .size = mapping->size, 568*418026caSFam Zheng }; 569*418026caSFam Zheng 570*418026caSFam Zheng index = mapping - s->mappings; 571*418026caSFam Zheng assert(mapping->size > 0); 572*418026caSFam Zheng assert(QEMU_IS_ALIGNED(mapping->size, getpagesize())); 573*418026caSFam Zheng assert(index >= 0 && index < s->nr_mappings); 574*418026caSFam Zheng if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) { 575*418026caSFam Zheng error_setg(errp, "VFIO_UNMAP_DMA failed: %d", -errno); 576*418026caSFam Zheng } 577*418026caSFam Zheng memmove(mapping, &s->mappings[index + 1], 578*418026caSFam Zheng sizeof(s->mappings[0]) * (s->nr_mappings - index - 1)); 579*418026caSFam Zheng s->nr_mappings--; 580*418026caSFam Zheng s->mappings = g_realloc_n(s->mappings, sizeof(s->mappings[0]), 581*418026caSFam Zheng s->nr_mappings); 582*418026caSFam Zheng } 583*418026caSFam Zheng 584*418026caSFam Zheng /* Check if the mapping list is (ascending) ordered. */ 585*418026caSFam Zheng static bool qemu_vfio_verify_mappings(QEMUVFIOState *s) 586*418026caSFam Zheng { 587*418026caSFam Zheng int i; 588*418026caSFam Zheng if (QEMU_VFIO_DEBUG) { 589*418026caSFam Zheng for (i = 0; i < s->nr_mappings - 1; ++i) { 590*418026caSFam Zheng if (!(s->mappings[i].host < s->mappings[i + 1].host)) { 591*418026caSFam Zheng fprintf(stderr, "item %d not sorted!\n", i); 592*418026caSFam Zheng qemu_vfio_dump_mappings(s); 593*418026caSFam Zheng return false; 594*418026caSFam Zheng } 595*418026caSFam Zheng if (!(s->mappings[i].host + s->mappings[i].size <= 596*418026caSFam Zheng s->mappings[i + 1].host)) { 597*418026caSFam Zheng fprintf(stderr, "item %d overlap with next!\n", i); 598*418026caSFam Zheng qemu_vfio_dump_mappings(s); 599*418026caSFam Zheng return false; 600*418026caSFam Zheng } 601*418026caSFam Zheng } 602*418026caSFam Zheng } 603*418026caSFam Zheng return true; 604*418026caSFam Zheng } 605*418026caSFam Zheng 606*418026caSFam Zheng /* Map [host, host + size) area into a contiguous IOVA address space, and store 607*418026caSFam Zheng * the result in @iova if not NULL. The caller need to make sure the area is 608*418026caSFam Zheng * aligned to page size, and mustn't overlap with existing mapping areas (split 609*418026caSFam Zheng * mapping status within this area is not allowed). 610*418026caSFam Zheng */ 611*418026caSFam Zheng int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size, 612*418026caSFam Zheng bool temporary, uint64_t *iova) 613*418026caSFam Zheng { 614*418026caSFam Zheng int ret = 0; 615*418026caSFam Zheng int index; 616*418026caSFam Zheng IOVAMapping *mapping; 617*418026caSFam Zheng uint64_t iova0; 618*418026caSFam Zheng 619*418026caSFam Zheng assert(QEMU_PTR_IS_ALIGNED(host, getpagesize())); 620*418026caSFam Zheng assert(QEMU_IS_ALIGNED(size, getpagesize())); 621*418026caSFam Zheng trace_qemu_vfio_dma_map(s, host, size, temporary, iova); 622*418026caSFam Zheng qemu_mutex_lock(&s->lock); 623*418026caSFam Zheng mapping = qemu_vfio_find_mapping(s, host, &index); 624*418026caSFam Zheng if (mapping) { 625*418026caSFam Zheng iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host); 626*418026caSFam Zheng } else { 627*418026caSFam Zheng if (s->high_water_mark - s->low_water_mark + 1 < size) { 628*418026caSFam Zheng ret = -ENOMEM; 629*418026caSFam Zheng goto out; 630*418026caSFam Zheng } 631*418026caSFam Zheng if (!temporary) { 632*418026caSFam Zheng iova0 = s->low_water_mark; 633*418026caSFam Zheng mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0); 634*418026caSFam Zheng if (!mapping) { 635*418026caSFam Zheng ret = -ENOMEM; 636*418026caSFam Zheng goto out; 637*418026caSFam Zheng } 638*418026caSFam Zheng assert(qemu_vfio_verify_mappings(s)); 639*418026caSFam Zheng ret = qemu_vfio_do_mapping(s, host, size, iova0); 640*418026caSFam Zheng if (ret) { 641*418026caSFam Zheng qemu_vfio_undo_mapping(s, mapping, NULL); 642*418026caSFam Zheng goto out; 643*418026caSFam Zheng } 644*418026caSFam Zheng s->low_water_mark += size; 645*418026caSFam Zheng qemu_vfio_dump_mappings(s); 646*418026caSFam Zheng } else { 647*418026caSFam Zheng iova0 = s->high_water_mark - size; 648*418026caSFam Zheng ret = qemu_vfio_do_mapping(s, host, size, iova0); 649*418026caSFam Zheng if (ret) { 650*418026caSFam Zheng goto out; 651*418026caSFam Zheng } 652*418026caSFam Zheng s->high_water_mark -= size; 653*418026caSFam Zheng } 654*418026caSFam Zheng } 655*418026caSFam Zheng if (iova) { 656*418026caSFam Zheng *iova = iova0; 657*418026caSFam Zheng } 658*418026caSFam Zheng out: 659*418026caSFam Zheng qemu_mutex_unlock(&s->lock); 660*418026caSFam Zheng return ret; 661*418026caSFam Zheng } 662*418026caSFam Zheng 663*418026caSFam Zheng /* Reset the high watermark and free all "temporary" mappings. */ 664*418026caSFam Zheng int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s) 665*418026caSFam Zheng { 666*418026caSFam Zheng struct vfio_iommu_type1_dma_unmap unmap = { 667*418026caSFam Zheng .argsz = sizeof(unmap), 668*418026caSFam Zheng .flags = 0, 669*418026caSFam Zheng .iova = s->high_water_mark, 670*418026caSFam Zheng .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark, 671*418026caSFam Zheng }; 672*418026caSFam Zheng trace_qemu_vfio_dma_reset_temporary(s); 673*418026caSFam Zheng qemu_mutex_lock(&s->lock); 674*418026caSFam Zheng if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) { 675*418026caSFam Zheng error_report("VFIO_UNMAP_DMA: %d", -errno); 676*418026caSFam Zheng qemu_mutex_unlock(&s->lock); 677*418026caSFam Zheng return -errno; 678*418026caSFam Zheng } 679*418026caSFam Zheng s->high_water_mark = QEMU_VFIO_IOVA_MAX; 680*418026caSFam Zheng qemu_mutex_unlock(&s->lock); 681*418026caSFam Zheng return 0; 682*418026caSFam Zheng } 683*418026caSFam Zheng 684*418026caSFam Zheng /* Unmapping the whole area that was previously mapped with 685*418026caSFam Zheng * qemu_vfio_dma_map(). */ 686*418026caSFam Zheng void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host) 687*418026caSFam Zheng { 688*418026caSFam Zheng int index = 0; 689*418026caSFam Zheng IOVAMapping *m; 690*418026caSFam Zheng 691*418026caSFam Zheng if (!host) { 692*418026caSFam Zheng return; 693*418026caSFam Zheng } 694*418026caSFam Zheng 695*418026caSFam Zheng trace_qemu_vfio_dma_unmap(s, host); 696*418026caSFam Zheng qemu_mutex_lock(&s->lock); 697*418026caSFam Zheng m = qemu_vfio_find_mapping(s, host, &index); 698*418026caSFam Zheng if (!m) { 699*418026caSFam Zheng goto out; 700*418026caSFam Zheng } 701*418026caSFam Zheng qemu_vfio_undo_mapping(s, m, NULL); 702*418026caSFam Zheng out: 703*418026caSFam Zheng qemu_mutex_unlock(&s->lock); 704*418026caSFam Zheng } 705*418026caSFam Zheng 706*418026caSFam Zheng static void qemu_vfio_reset(QEMUVFIOState *s) 707*418026caSFam Zheng { 708*418026caSFam Zheng ioctl(s->device, VFIO_DEVICE_RESET); 709*418026caSFam Zheng } 710*418026caSFam Zheng 711*418026caSFam Zheng /* Close and free the VFIO resources. */ 712*418026caSFam Zheng void qemu_vfio_close(QEMUVFIOState *s) 713*418026caSFam Zheng { 714*418026caSFam Zheng int i; 715*418026caSFam Zheng 716*418026caSFam Zheng if (!s) { 717*418026caSFam Zheng return; 718*418026caSFam Zheng } 719*418026caSFam Zheng for (i = 0; i < s->nr_mappings; ++i) { 720*418026caSFam Zheng qemu_vfio_undo_mapping(s, &s->mappings[i], NULL); 721*418026caSFam Zheng } 722*418026caSFam Zheng ram_block_notifier_remove(&s->ram_notifier); 723*418026caSFam Zheng qemu_vfio_reset(s); 724*418026caSFam Zheng close(s->device); 725*418026caSFam Zheng close(s->group); 726*418026caSFam Zheng close(s->container); 727*418026caSFam Zheng } 728