/*
 * VFIO utility
 *
 * Copyright 2016 - 2018 Red Hat, Inc.
 *
 * Authors:
 *   Fam Zheng <famz@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "qapi/error.h"
#include "exec/ramlist.h"
#include "exec/cpu-common.h"
#include "trace.h"
#include "qemu/error-report.h"
#include "standard-headers/linux/pci_regs.h"
#include "qemu/event_notifier.h"
#include "qemu/vfio-helpers.h"
#include "qemu/lockable.h"

#define QEMU_VFIO_DEBUG 0

#define QEMU_VFIO_IOVA_MIN 0x10000ULL
/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
 * we can use a runtime limit; alternatively it's also possible to do platform
 * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
 */
#define QEMU_VFIO_IOVA_MAX (1ULL << 39)

typedef struct {
    /* Page aligned addr. */
    void *host;
    size_t size;
    uint64_t iova;
} IOVAMapping;

struct IOVARange {
    uint64_t start;
    uint64_t end;
};

struct QEMUVFIOState {
    QemuMutex lock;

    /* These fields are protected by BQL */
    int container;
    int group;
    int device;
    RAMBlockNotifier ram_notifier;
    struct vfio_region_info config_region_info, bar_region_info[6];
    struct IOVARange *usable_iova_ranges;
    uint8_t nb_iova_ranges;

    /* These fields are protected by @lock */
    /* VFIO's IO virtual address space is managed by splitting into a few
     * sections:
     *
     * ---------------       <= 0
     * |xxxxxxxxxxxxx|
     * |-------------|       <= QEMU_VFIO_IOVA_MIN
     * |             |
     * |    Fixed    |
     * |             |
     * |-------------|       <= low_water_mark
     * |             |
     * |    Free     |
     * |             |
     * |-------------|       <= high_water_mark
     * |             |
     * |    Temp     |
     * |             |
     * |-------------|       <= QEMU_VFIO_IOVA_MAX
     * |xxxxxxxxxxxxx|
     * |xxxxxxxxxxxxx|
     * ---------------
     *
     * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
     *
     * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
     *   [QEMU_VFIO_IOVA_MIN, low_water_mark).  Once allocated they will not
     *   be reclaimed - low_water_mark never shrinks;
     *
     * - IOVAs in range [low_water_mark, high_water_mark) are free;
     *
     * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
     *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole
     *   area is recycled. The caller should make sure I/O's depending on
     *   these mappings are completed before calling.
     */
    uint64_t low_water_mark;
    uint64_t high_water_mark;
    IOVAMapping *mappings;
    int nr_mappings;
};

/**
 * Find the group file for the PCI device at address @device, and return the
 * path. The returned string is owned by the caller and should be g_free'd
 * later.
 */
static char *sysfs_find_group_file(const char *device, Error **errp)
{
    char *sysfs_link;
    char *sysfs_group;
    char *p;
    char *path = NULL;

    sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group",
                                 device);
    sysfs_group = g_malloc0(PATH_MAX);
    if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) {
        error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
        goto out;
    }
    p = strrchr(sysfs_group, '/');
    if (!p) {
        error_setg(errp, "Failed to find iommu group number");
        goto out;
    }

    path = g_strdup_printf("/dev/vfio/%s", p + 1);
out:
    g_free(sysfs_link);
    g_free(sysfs_group);
    return path;
}

static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
{
    assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info));
}

static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
{
    assert_bar_index_valid(s, index);
    s->bar_region_info[index] = (struct vfio_region_info) {
        .index = VFIO_PCI_BAR0_REGION_INDEX + index,
        .argsz = sizeof(struct vfio_region_info),
    };
    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
        error_setg_errno(errp, errno, "Failed to get BAR region info");
        return -errno;
    }

    return 0;
}

/**
 * Map a PCI BAR area.
 */
void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
                            uint64_t offset, uint64_t size, int prot,
                            Error **errp)
{
    void *p;
    assert_bar_index_valid(s, index);
    p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
             prot, MAP_SHARED,
             s->device, s->bar_region_info[index].offset + offset);
    if (p == MAP_FAILED) {
        error_setg_errno(errp, errno, "Failed to map BAR region");
        p = NULL;
    }
    return p;
}

/**
 * Unmap a PCI BAR area.
 */
void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
                             uint64_t offset, uint64_t size)
{
    if (bar) {
        munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
    }
}

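/*
 * Example (illustrative sketch only; "s", the BAR index and the length are
 * hypothetical): a caller that wants the first 4 KiB of BAR 0 mapped
 * read/write could do:
 *
 *     Error *local_err = NULL;
 *     void *regs = qemu_vfio_pci_map_bar(s, 0, 0, 4096,
 *                                        PROT_READ | PROT_WRITE, &local_err);
 *     if (!regs) {
 *         error_report_err(local_err);
 *     } else {
 *         ... access device registers through "regs" ...
 *         qemu_vfio_pci_unmap_bar(s, 0, regs, 0, 4096);
 *     }
 *
 * Note that @offset and @size are clamped against the kernel-reported BAR
 * size, so a sub-range of a BAR can be mapped as well.
 */
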
/**
 * Initialize device IRQ with @irq_type and register an event notifier.
 */
int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
                           int irq_type, Error **errp)
{
    int r;
    struct vfio_irq_set *irq_set;
    size_t irq_set_size;
    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };

    irq_info.index = irq_type;
    if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
        error_setg_errno(errp, errno, "Failed to get device interrupt info");
        return -errno;
    }
    if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
        error_setg(errp, "Device interrupt doesn't support eventfd");
        return -EINVAL;
    }

    irq_set_size = sizeof(*irq_set) + sizeof(int);
    irq_set = g_malloc0(irq_set_size);

    /* Get to a known IRQ state */
    *irq_set = (struct vfio_irq_set) {
        .argsz = irq_set_size,
        .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = irq_info.index,
        .start = 0,
        .count = 1,
    };

    *(int *)&irq_set->data = event_notifier_get_fd(e);
    r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (r) {
        error_setg_errno(errp, errno, "Failed to setup device interrupt");
        return -errno;
    }
    return 0;
}

static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
                                     int size, int ofs)
{
    int ret;

    do {
        ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
    } while (ret == -1 && errno == EINTR);
    return ret == size ? 0 : -errno;
}

static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
{
    int ret;

    do {
        ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
    } while (ret == -1 && errno == EINTR);
    return ret == size ? 0 : -errno;
}

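/*
 * Example (illustrative sketch; error handling is trimmed and the notifier
 * is a hypothetical caller-owned object): wiring an MSI-X vector of the
 * device to an EventNotifier via qemu_vfio_pci_init_irq() above, so the
 * caller can poll or read the eventfd:
 *
 *     EventNotifier irq_notifier;
 *     Error *local_err = NULL;
 *
 *     event_notifier_init(&irq_notifier, 0);
 *     if (qemu_vfio_pci_init_irq(s, &irq_notifier,
 *                                VFIO_PCI_MSIX_IRQ_INDEX, &local_err)) {
 *         error_report_err(local_err);
 *     }
 *     ... the device now signals interrupts through irq_notifier ...
 */
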
static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf)
{
    struct vfio_iommu_type1_info *info = (struct vfio_iommu_type1_info *)buf;
    struct vfio_info_cap_header *cap = (void *)buf + info->cap_offset;
    struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
    int i;

    while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
        if (!cap->next) {
            return;
        }
        cap = (struct vfio_info_cap_header *)(buf + cap->next);
    }

    cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;

    s->nb_iova_ranges = cap_iova_range->nr_iovas;
    if (s->nb_iova_ranges > 1) {
        s->usable_iova_ranges =
            g_realloc(s->usable_iova_ranges,
                      s->nb_iova_ranges * sizeof(struct IOVARange));
    }

    for (i = 0; i < s->nb_iova_ranges; i++) {
        s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
        s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
    }
}

static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
                              Error **errp)
{
    int ret;
    int i;
    uint16_t pci_cmd;
    struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
    struct vfio_iommu_type1_info *iommu_info = NULL;
    size_t iommu_info_size = sizeof(*iommu_info);
    struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
    char *group_file = NULL;

    s->usable_iova_ranges = NULL;

    /* Create a new container */
    s->container = open("/dev/vfio/vfio", O_RDWR);

    if (s->container == -1) {
        error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
        return -errno;
    }
    if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
        error_setg(errp, "Invalid VFIO version");
        ret = -EINVAL;
        goto fail_container;
    }

    if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
        error_setg_errno(errp, errno, "VFIO IOMMU check failed");
        ret = -EINVAL;
        goto fail_container;
    }

    /* Open the group */
    group_file = sysfs_find_group_file(device, errp);
    if (!group_file) {
        ret = -EINVAL;
        goto fail_container;
    }

    s->group = open(group_file, O_RDWR);
    if (s->group == -1) {
        error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
                         group_file);
        g_free(group_file);
        ret = -errno;
        goto fail_container;
    }
    g_free(group_file);

    /* Test the group is viable and available */
    if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
        error_setg_errno(errp, errno, "Failed to get VFIO group status");
        ret = -errno;
        goto fail;
    }

    if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "VFIO group is not viable");
        ret = -EINVAL;
        goto fail;
    }

    /* Add the group to the container */
    if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
        error_setg_errno(errp, errno, "Failed to add group to VFIO container");
        ret = -errno;
        goto fail;
    }

    /* Enable the IOMMU model we want */
    if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
        error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
        ret = -errno;
        goto fail;
    }

    iommu_info = g_malloc0(iommu_info_size);
    iommu_info->argsz = iommu_info_size;

    /* Get additional IOMMU info */
    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
        error_setg_errno(errp, errno, "Failed to get IOMMU info");
        ret = -errno;
        goto fail;
    }

    /*
     * If the kernel does not report usable IOVA regions, choose
     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX - 1] region
     */
    s->nb_iova_ranges = 1;
    s->usable_iova_ranges = g_new0(struct IOVARange, 1);
    s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
    s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;

    if (iommu_info->argsz > iommu_info_size) {
        iommu_info_size = iommu_info->argsz;
        iommu_info = g_realloc(iommu_info, iommu_info_size);
        if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
            ret = -errno;
            goto fail;
        }
        collect_usable_iova_ranges(s, iommu_info);
    }

    s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);

    if (s->device < 0) {
        error_setg_errno(errp, errno, "Failed to get device fd");
        ret = -errno;
        goto fail;
    }

    /* Test and setup the device */
    if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
        error_setg_errno(errp, errno, "Failed to get device info");
        ret = -errno;
        goto fail;
    }

    if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
        error_setg(errp, "Invalid device regions");
        ret = -EINVAL;
        goto fail;
    }

    s->config_region_info = (struct vfio_region_info) {
        .index = VFIO_PCI_CONFIG_REGION_INDEX,
        .argsz = sizeof(struct vfio_region_info),
    };
    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
        error_setg_errno(errp, errno, "Failed to get config region info");
        ret = -errno;
        goto fail;
    }

    for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
        ret = qemu_vfio_pci_init_bar(s, i, errp);
        if (ret) {
            goto fail;
        }
    }

    /* Enable bus master */
    ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
    if (ret) {
        goto fail;
    }
    pci_cmd |= PCI_COMMAND_MASTER;
    ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
    if (ret) {
        goto fail;
    }
    g_free(iommu_info);
    return 0;
fail:
    g_free(s->usable_iova_ranges);
    s->usable_iova_ranges = NULL;
    s->nb_iova_ranges = 0;
    g_free(iommu_info);
    close(s->group);
fail_container:
    close(s->container);
    return ret;
}

static void qemu_vfio_ram_block_added(RAMBlockNotifier *n,
                                      void *host, size_t size)
{
    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    trace_qemu_vfio_ram_block_added(s, host, size);
    qemu_vfio_dma_map(s, host, size, false, NULL);
}

static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n,
                                        void *host, size_t size)
{
    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    if (host) {
        trace_qemu_vfio_ram_block_removed(s, host, size);
        qemu_vfio_dma_unmap(s, host);
    }
}

static int qemu_vfio_init_ramblock(RAMBlock *rb, void *opaque)
{
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    int ret;
    QEMUVFIOState *s = opaque;

    if (!host_addr) {
        return 0;
    }
    ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL);
    if (ret) {
        fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRId64 "\n",
                host_addr, (uint64_t)length);
    }
    return 0;
}

static void qemu_vfio_open_common(QEMUVFIOState *s)
{
    qemu_mutex_init(&s->lock);
    s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
    s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
    ram_block_notifier_add(&s->ram_notifier);
    s->low_water_mark = QEMU_VFIO_IOVA_MIN;
    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    qemu_ram_foreach_block(qemu_vfio_init_ramblock, s);
}

/**
 * Open a PCI device, e.g. "0000:00:01.0".
 */
QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
{
    int r;
    QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);

    r = qemu_vfio_init_pci(s, device, errp);
    if (r) {
        g_free(s);
        return NULL;
    }
    qemu_vfio_open_common(s);
    return s;
}

static void qemu_vfio_dump_mapping(IOVAMapping *m)
{
    if (QEMU_VFIO_DEBUG) {
        printf("  vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
               (uint64_t)m->size, (uint64_t)m->iova);
    }
}

static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
{
    int i;

    if (QEMU_VFIO_DEBUG) {
        printf("vfio mappings\n");
        for (i = 0; i < s->nr_mappings; ++i) {
            qemu_vfio_dump_mapping(&s->mappings[i]);
        }
    }
}

/**
 * Find the mapping entry that contains [host, host + size) and set @index to
 * the position. If no entry contains it, @index is the position _after_ which
 * to insert the new mapping. IOW, it is the index of the largest element that
 * is smaller than @host, or -1 if there is no such entry.
 */
static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
                                           int *index)
{
    IOVAMapping *p = s->mappings;
    IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
    IOVAMapping *mid;
    trace_qemu_vfio_find_mapping(s, host);
    if (!p) {
        *index = -1;
        return NULL;
    }
    while (true) {
        mid = p + (q - p) / 2;
        if (mid == p) {
            break;
        }
        if (mid->host > host) {
            q = mid;
        } else if (mid->host < host) {
            p = mid;
        } else {
            break;
        }
    }
    if (mid->host > host) {
        mid--;
    } else if (mid < &s->mappings[s->nr_mappings - 1]
               && (mid + 1)->host <= host) {
        mid++;
    }
    *index = mid - &s->mappings[0];
    if (mid >= &s->mappings[0] &&
        mid->host <= host && mid->host + mid->size > host) {
        assert(mid < &s->mappings[s->nr_mappings]);
        return mid;
    }
    /* At this point *index + 1 is the right position to insert the new
     * mapping. */
    return NULL;
}

/**
 * Allocate IOVA and create a new mapping record and insert it in @s.
 */
static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
                                          void *host, size_t size,
                                          int index, uint64_t iova)
{
    int shift;
    IOVAMapping m = {.host = host, .size = size, .iova = iova};
    IOVAMapping *insert;

    assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
    assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size));
    assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size));
    trace_qemu_vfio_new_mapping(s, host, size, index, iova);

    assert(index >= 0);
    s->nr_mappings++;
    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
    insert = &s->mappings[index];
    shift = s->nr_mappings - index - 1;
    if (shift) {
        memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
    }
    *insert = m;
    return insert;
}

/* Do the DMA mapping with VFIO. */
static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
                                uint64_t iova)
{
    struct vfio_iommu_type1_dma_map dma_map = {
        .argsz = sizeof(dma_map),
        .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
        .iova = iova,
        .vaddr = (uintptr_t)host,
        .size = size,
    };
    trace_qemu_vfio_do_mapping(s, host, size, iova);

    if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
        error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
        return -errno;
    }
    return 0;
}

/**
 * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
 */
static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
                                   Error **errp)
{
    int index;
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = mapping->iova,
        .size = mapping->size,
    };

    index = mapping - s->mappings;
    assert(mapping->size > 0);
    assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size));
    assert(index >= 0 && index < s->nr_mappings);
    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
    }
    memmove(mapping, &s->mappings[index + 1],
            sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
    s->nr_mappings--;
    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
}

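/*
 * Example (illustrative; the addresses are made up): with two existing
 * mappings at host addresses 0x10000 (size 0x4000) and 0x30000 (size 0x2000),
 * the list is kept sorted and non-overlapping, so:
 *
 *   - qemu_vfio_find_mapping(s, (void *)0x11000, &i) returns the first entry
 *     (it contains the address) and sets i = 0;
 *   - qemu_vfio_find_mapping(s, (void *)0x20000, &i) returns NULL and sets
 *     i = 0, meaning a new mapping would be inserted at position i + 1 = 1.
 *
 * qemu_vfio_verify_mappings() below checks exactly this ordering invariant
 * when QEMU_VFIO_DEBUG is enabled.
 */
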
/* Check if the mapping list is (ascending) ordered. */
static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
{
    int i;
    if (QEMU_VFIO_DEBUG) {
        for (i = 0; i < s->nr_mappings - 1; ++i) {
            if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
                fprintf(stderr, "item %d not sorted!\n", i);
                qemu_vfio_dump_mappings(s);
                return false;
            }
            if (!(s->mappings[i].host + s->mappings[i].size <=
                  s->mappings[i + 1].host)) {
                fprintf(stderr, "item %d overlaps with next!\n", i);
                qemu_vfio_dump_mappings(s);
                return false;
            }
        }
    }
    return true;
}

static int
qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
{
    int i;

    for (i = 0; i < s->nb_iova_ranges; i++) {
        if (s->usable_iova_ranges[i].end < s->low_water_mark) {
            continue;
        }
        s->low_water_mark =
            MAX(s->low_water_mark, s->usable_iova_ranges[i].start);

        if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
            s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
            *iova = s->low_water_mark;
            s->low_water_mark += size;
            return 0;
        }
    }
    return -ENOMEM;
}

static int
qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
{
    int i;

    for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
        if (s->usable_iova_ranges[i].start > s->high_water_mark) {
            continue;
        }
        s->high_water_mark =
            MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);

        if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
            s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
            *iova = s->high_water_mark - size;
            s->high_water_mark = *iova;
            return 0;
        }
    }
    return -ENOMEM;
}

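/*
 * Example (illustrative only; the usable range below is made up): assume the
 * IOMMU reports a single usable range [0x8000000, 0xfffffff] and the water
 * marks are still at their initial values.  For a fixed allocation of 0x10000
 * bytes, qemu_vfio_find_fixed_iova() first raises low_water_mark from
 * QEMU_VFIO_IOVA_MIN to 0x8000000 (the start of the usable range), returns
 * iova = 0x8000000, and leaves low_water_mark at 0x8010000.  For a temporary
 * allocation of the same size, qemu_vfio_find_temp_iova() first lowers
 * high_water_mark from QEMU_VFIO_IOVA_MAX to 0x10000000 (one past the end of
 * the usable range), then returns iova = 0xfff0000 and leaves high_water_mark
 * at 0xfff0000.  Reserved IOVA regions are thus skipped in both directions.
 */
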
/* Map [host, host + size) area into a contiguous IOVA address space, and
 * store the result in @iova if not NULL. The caller needs to make sure the
 * area is aligned to page size, and mustn't overlap with existing mapping
 * areas (split mapping status within this area is not allowed).
 */
int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
                      bool temporary, uint64_t *iova)
{
    int ret = 0;
    int index;
    IOVAMapping *mapping;
    uint64_t iova0;

    assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size));
    assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
    trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
    qemu_mutex_lock(&s->lock);
    mapping = qemu_vfio_find_mapping(s, host, &index);
    if (mapping) {
        iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
    } else {
        if (s->high_water_mark - s->low_water_mark + 1 < size) {
            ret = -ENOMEM;
            goto out;
        }
        if (!temporary) {
            if (qemu_vfio_find_fixed_iova(s, size, &iova0)) {
                ret = -ENOMEM;
                goto out;
            }

            mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
            if (!mapping) {
                ret = -ENOMEM;
                goto out;
            }
            assert(qemu_vfio_verify_mappings(s));
            ret = qemu_vfio_do_mapping(s, host, size, iova0);
            if (ret) {
                qemu_vfio_undo_mapping(s, mapping, NULL);
                goto out;
            }
            qemu_vfio_dump_mappings(s);
        } else {
            if (qemu_vfio_find_temp_iova(s, size, &iova0)) {
                ret = -ENOMEM;
                goto out;
            }
            ret = qemu_vfio_do_mapping(s, host, size, iova0);
            if (ret) {
                goto out;
            }
        }
    }
    if (iova) {
        *iova = iova0;
    }
out:
    qemu_mutex_unlock(&s->lock);
    return ret;
}

/* Reset the high watermark and free all "temporary" mappings. */
int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = s->high_water_mark,
        .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
    };
    trace_qemu_vfio_dma_reset_temporary(s);
    QEMU_LOCK_GUARD(&s->lock);
    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
        return -errno;
    }
    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    return 0;
}

/* Unmap the whole area that was previously mapped with
 * qemu_vfio_dma_map(). */
void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
{
    int index = 0;
    IOVAMapping *m;

    if (!host) {
        return;
    }

    trace_qemu_vfio_dma_unmap(s, host);
    qemu_mutex_lock(&s->lock);
    m = qemu_vfio_find_mapping(s, host, &index);
    if (!m) {
        goto out;
    }
    qemu_vfio_undo_mapping(s, m, NULL);
out:
    qemu_mutex_unlock(&s->lock);
}

static void qemu_vfio_reset(QEMUVFIOState *s)
{
    ioctl(s->device, VFIO_DEVICE_RESET);
}

/* Close and free the VFIO resources. */
void qemu_vfio_close(QEMUVFIOState *s)
{
    if (!s) {
        return;
    }
    /*
     * qemu_vfio_undo_mapping() removes the entry it is given and shifts the
     * remaining ones down, so keep taking the first element until the list
     * is empty instead of walking an index forward (which would skip every
     * other mapping).
     */
    while (s->nr_mappings) {
        qemu_vfio_undo_mapping(s, &s->mappings[0], NULL);
    }
    ram_block_notifier_remove(&s->ram_notifier);
    g_free(s->usable_iova_ranges);
    s->nb_iova_ranges = 0;
    qemu_vfio_reset(s);
    close(s->device);
    close(s->group);
    close(s->container);
}
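
/*
 * Example (illustrative sketch; the device address, buffer and sizes are
 * hypothetical, and error handling is omitted): typical lifetime of the
 * helpers in this file, as used e.g. by the NVMe block driver:
 *
 *     Error *local_err = NULL;
 *     QEMUVFIOState *s = qemu_vfio_open_pci("0000:01:00.0", &local_err);
 *     void *regs = qemu_vfio_pci_map_bar(s, 0, 0, 0x1000,
 *                                        PROT_READ | PROT_WRITE, &local_err);
 *
 *     void *buf = qemu_memalign(qemu_real_host_page_size,
 *                               qemu_real_host_page_size);
 *     uint64_t iova;
 *     qemu_vfio_dma_map(s, buf, qemu_real_host_page_size, false, &iova);
 *     ... program the device with "iova" and perform I/O ...
 *     qemu_vfio_dma_unmap(s, buf);
 *
 *     qemu_vfio_pci_unmap_bar(s, 0, regs, 0, 0x1000);
 *     qemu_vfio_close(s);
 */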