/*
 * VFIO utility
 *
 * Copyright 2016 - 2018 Red Hat, Inc.
 *
 * Authors:
 *   Fam Zheng <famz@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "qapi/error.h"
#include "exec/ramlist.h"
#include "exec/cpu-common.h"
#include "exec/memory.h"
#include "qemu/error-report.h"
#include "standard-headers/linux/pci_regs.h"
#include "qemu/event_notifier.h"
#include "qemu/vfio-helpers.h"
#include "qemu/lockable.h"
#include "trace.h"

#define QEMU_VFIO_DEBUG 0

#define QEMU_VFIO_IOVA_MIN 0x10000ULL
/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
 * we can use a runtime limit; alternatively it's also possible to do platform
 * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
 **/
#define QEMU_VFIO_IOVA_MAX (1ULL << 39)

typedef struct {
    /* Page aligned addr. */
    void *host;
    size_t size;
    uint64_t iova;
} IOVAMapping;

struct IOVARange {
    uint64_t start;
    uint64_t end;
};

struct QEMUVFIOState {
    QemuMutex lock;

    /* These fields are protected by BQL */
    int container;
    int group;
    int device;
    RAMBlockNotifier ram_notifier;
    struct vfio_region_info config_region_info, bar_region_info[6];
    struct IOVARange *usable_iova_ranges;
    uint8_t nb_iova_ranges;

    /* These fields are protected by @lock */
    /* VFIO's IO virtual address space is managed by splitting it into a few
     * sections:
     *
     * ---------------       <= 0
     * |xxxxxxxxxxxxx|
     * |-------------|       <= QEMU_VFIO_IOVA_MIN
     * |             |
     * |    Fixed    |
     * |             |
     * |-------------|       <= low_water_mark
     * |             |
     * |    Free     |
     * |             |
     * |-------------|       <= high_water_mark
     * |             |
     * |    Temp     |
     * |             |
     * |-------------|       <= QEMU_VFIO_IOVA_MAX
     * |xxxxxxxxxxxxx|
     * |xxxxxxxxxxxxx|
     * ---------------
     *
     * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
     *
     * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
     *   [QEMU_VFIO_IOVA_MIN, low_water_mark).  Once allocated they will not be
     *   reclaimed - low_water_mark never shrinks;
     *
     * - IOVAs in range [low_water_mark, high_water_mark) are free;
     *
     * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
     *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
     *   is recycled. The caller should make sure I/O's depending on these
     *   mappings are completed before calling.  (See the worked example
     *   following this struct.)
     **/
    uint64_t low_water_mark;
    uint64_t high_water_mark;
    IOVAMapping *mappings;
    int nr_mappings;
};
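
/*
 * Illustrative example of the water-mark scheme above (assuming the kernel
 * reports a single usable IOVA range covering the whole
 * [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX) window; the sizes are made up):
 *
 *   - a fixed mapping of a 1 MiB RAM block gets IOVAs [0x10000, 0x110000)
 *     and bumps low_water_mark from 0x10000 to 0x110000;
 *   - a temporary 8 KiB bounce buffer is carved from the top instead, moving
 *     high_water_mark from 1 << 39 down to (1 << 39) - 0x2000;
 *   - qemu_vfio_dma_reset_temporary() later restores high_water_mark to
 *     QEMU_VFIO_IOVA_MAX, recycling the whole temporary area at once.
 */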

/**
 * Find the group file corresponding to the PCI device address given in
 * @device and return its path.  The returned string is owned by the caller
 * and should be freed with g_free() later.
 */
static char *sysfs_find_group_file(const char *device, Error **errp)
{
    char *sysfs_link;
    char *sysfs_group;
    char *p;
    char *path = NULL;

    sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
    sysfs_group = g_malloc0(PATH_MAX);
    if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) {
        error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
        goto out;
    }
    p = strrchr(sysfs_group, '/');
    if (!p) {
        error_setg(errp, "Failed to find iommu group number");
        goto out;
    }

    path = g_strdup_printf("/dev/vfio/%s", p + 1);
out:
    g_free(sysfs_link);
    g_free(sysfs_group);
    return path;
}

static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
{
    assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info));
}

static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
{
    assert_bar_index_valid(s, index);
    s->bar_region_info[index] = (struct vfio_region_info) {
        .index = VFIO_PCI_BAR0_REGION_INDEX + index,
        .argsz = sizeof(struct vfio_region_info),
    };
    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
        error_setg_errno(errp, errno, "Failed to get BAR region info");
        return -errno;
    }

    return 0;
}

/**
 * Map a PCI BAR area.
 */
void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
                            uint64_t offset, uint64_t size, int prot,
                            Error **errp)
{
    void *p;
    assert_bar_index_valid(s, index);
    p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
             prot, MAP_SHARED,
             s->device, s->bar_region_info[index].offset + offset);
    if (p == MAP_FAILED) {
        error_setg_errno(errp, errno, "Failed to map BAR region");
        p = NULL;
    }
    return p;
}

/**
 * Unmap a PCI BAR area.
 */
void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
                             uint64_t offset, uint64_t size)
{
    if (bar) {
        munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
    }
}

/**
 * Initialize device IRQ with @irq_type and register an event notifier.
 */
int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
                           int irq_type, Error **errp)
{
    int r;
    struct vfio_irq_set *irq_set;
    size_t irq_set_size;
    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };

    irq_info.index = irq_type;
    if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
        error_setg_errno(errp, errno, "Failed to get device interrupt info");
        return -errno;
    }
    if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
        error_setg(errp, "Device interrupt doesn't support eventfd");
        return -EINVAL;
    }

    irq_set_size = sizeof(*irq_set) + sizeof(int);
    irq_set = g_malloc0(irq_set_size);

    /* Get to a known IRQ state */
    *irq_set = (struct vfio_irq_set) {
        .argsz = irq_set_size,
        .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = irq_info.index,
        .start = 0,
        .count = 1,
    };

    *(int *)&irq_set->data = event_notifier_get_fd(e);
    r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (r) {
        error_setg_errno(errp, errno, "Failed to setup device interrupt");
        return -errno;
    }
    return 0;
}

static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
                                     int size, int ofs)
{
    int ret;

    do {
        ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
    } while (ret == -1 && errno == EINTR);
    return ret == size ? 0 : -errno;
}

static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
{
    int ret;

    do {
        ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
    } while (ret == -1 && errno == EINTR);
    return ret == size ? 0 : -errno;
}

/*
 * Walk the capability chain appended to the VFIO_IOMMU_GET_INFO result and,
 * if the kernel reports them, record the usable IOVA ranges in @s.
 */
static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf)
{
    struct vfio_iommu_type1_info *info = (struct vfio_iommu_type1_info *)buf;
    struct vfio_info_cap_header *cap = (void *)buf + info->cap_offset;
    struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
    int i;

    while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
        if (!cap->next) {
            return;
        }
        cap = (struct vfio_info_cap_header *)(buf + cap->next);
    }

    cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;

    s->nb_iova_ranges = cap_iova_range->nr_iovas;
    if (s->nb_iova_ranges > 1) {
        s->usable_iova_ranges =
            g_realloc(s->usable_iova_ranges,
                      s->nb_iova_ranges * sizeof(struct IOVARange));
    }

    for (i = 0; i < s->nb_iova_ranges; i++) {
        s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
        s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
    }
}

static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
                              Error **errp)
{
    int ret;
    int i;
    uint16_t pci_cmd;
    struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
    struct vfio_iommu_type1_info *iommu_info = NULL;
    size_t iommu_info_size = sizeof(*iommu_info);
    struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
    char *group_file = NULL;

    s->usable_iova_ranges = NULL;

    /* Create a new container */
    s->container = open("/dev/vfio/vfio", O_RDWR);

    if (s->container == -1) {
        error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
        return -errno;
    }
    if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
        error_setg(errp, "Invalid VFIO version");
        ret = -EINVAL;
        goto fail_container;
    }

    if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
        error_setg_errno(errp, errno, "VFIO IOMMU check failed");
        ret = -EINVAL;
        goto fail_container;
    }

    /* Open the group */
    group_file = sysfs_find_group_file(device, errp);
    if (!group_file) {
        ret = -EINVAL;
        goto fail_container;
    }

    s->group = open(group_file, O_RDWR);
    if (s->group == -1) {
        error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
                         group_file);
        g_free(group_file);
        ret = -errno;
        goto fail_container;
    }
    g_free(group_file);

    /* Test the group is viable and available */
    if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
        error_setg_errno(errp, errno, "Failed to get VFIO group status");
        ret = -errno;
        goto fail;
    }

    if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "VFIO group is not viable");
        ret = -EINVAL;
        goto fail;
    }

    /* Add the group to the container */
    if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
        error_setg_errno(errp, errno, "Failed to add group to VFIO container");
        ret = -errno;
        goto fail;
    }

    /* Enable the IOMMU model we want */
    if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
        error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
        ret = -errno;
        goto fail;
    }

    iommu_info = g_malloc0(iommu_info_size);
    iommu_info->argsz = iommu_info_size;

    /* Get additional IOMMU info */
    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
        error_setg_errno(errp, errno, "Failed to get IOMMU info");
        ret = -errno;
        goto fail;
    }

    /*
     * If the kernel does not report usable IOVA regions, choose
     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX - 1] region.
     */
    s->nb_iova_ranges = 1;
    s->usable_iova_ranges = g_new0(struct IOVARange, 1);
    s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
    s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;

    if (iommu_info->argsz > iommu_info_size) {
        iommu_info_size = iommu_info->argsz;
        iommu_info = g_realloc(iommu_info, iommu_info_size);
        if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
            ret = -errno;
            goto fail;
        }
        collect_usable_iova_ranges(s, iommu_info);
    }

    s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);

    if (s->device < 0) {
        error_setg_errno(errp, errno, "Failed to get device fd");
        ret = -errno;
        goto fail;
    }

    /* Test and setup the device */
    if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
        error_setg_errno(errp, errno, "Failed to get device info");
        ret = -errno;
        goto fail;
    }

    if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
        error_setg(errp, "Invalid device regions");
        ret = -EINVAL;
        goto fail;
    }

    s->config_region_info = (struct vfio_region_info) {
        .index = VFIO_PCI_CONFIG_REGION_INDEX,
        .argsz = sizeof(struct vfio_region_info),
    };
    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
        error_setg_errno(errp, errno, "Failed to get config region info");
        ret = -errno;
        goto fail;
    }

    for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
        ret = qemu_vfio_pci_init_bar(s, i, errp);
        if (ret) {
            goto fail;
        }
    }

    /* Enable bus master */
    ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
    if (ret) {
        goto fail;
    }
    pci_cmd |= PCI_COMMAND_MASTER;
    ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
    if (ret) {
        goto fail;
    }
    g_free(iommu_info);
    return 0;
fail:
    g_free(s->usable_iova_ranges);
    s->usable_iova_ranges = NULL;
    s->nb_iova_ranges = 0;
    g_free(iommu_info);
    close(s->group);
fail_container:
    close(s->container);
    return ret;
}

static void qemu_vfio_ram_block_added(RAMBlockNotifier *n,
                                      void *host, size_t size)
{
    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    trace_qemu_vfio_ram_block_added(s, host, size);
    qemu_vfio_dma_map(s, host, size, false, NULL);
}

static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n,
                                        void *host, size_t size)
{
    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    if (host) {
        trace_qemu_vfio_ram_block_removed(s, host, size);
        qemu_vfio_dma_unmap(s, host);
    }
}

static int qemu_vfio_init_ramblock(RAMBlock *rb, void *opaque)
{
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    int ret;
    QEMUVFIOState *s = opaque;

    if (!host_addr) {
        return 0;
    }
    ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL);
    if (ret) {
        fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRIu64 "\n",
                host_addr, (uint64_t)length);
    }
    return 0;
}

static void qemu_vfio_open_common(QEMUVFIOState *s)
{
    qemu_mutex_init(&s->lock);
    s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
    s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
    ram_block_notifier_add(&s->ram_notifier);
    s->low_water_mark = QEMU_VFIO_IOVA_MIN;
    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    qemu_ram_foreach_block(qemu_vfio_init_ramblock, s);
}

/**
 * Open a PCI device, e.g. "0000:00:01.0".
 */
QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
{
    int r;
    QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);

    /*
     * VFIO may pin all memory inside mappings, resulting in pinning
     * all memory inside RAM blocks unconditionally.
     */
    r = ram_block_discard_disable(true);
    if (r) {
        error_setg_errno(errp, -r, "Cannot set discarding of RAM broken");
        g_free(s);
        return NULL;
    }

    r = qemu_vfio_init_pci(s, device, errp);
    if (r) {
        ram_block_discard_disable(false);
        g_free(s);
        return NULL;
    }
    qemu_vfio_open_common(s);
    return s;
}
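
/*
 * Illustrative usage sketch for the public helpers above (not taken from an
 * actual caller; the device address, BAR index, size and IRQ index are made
 * up, and error handling is trimmed):
 *
 *     Error *local_err = NULL;
 *     QEMUVFIOState *vfio = qemu_vfio_open_pci("0000:00:01.0", &local_err);
 *     if (!vfio) {
 *         error_report_err(local_err);
 *         return;
 *     }
 *     void *regs = qemu_vfio_pci_map_bar(vfio, 0, 0, 0x1000,
 *                                        PROT_READ | PROT_WRITE, &local_err);
 *     EventNotifier irq;
 *     event_notifier_init(&irq, 0);
 *     qemu_vfio_pci_init_irq(vfio, &irq, VFIO_PCI_MSIX_IRQ_INDEX, &local_err);
 *     ...
 *     qemu_vfio_pci_unmap_bar(vfio, 0, regs, 0, 0x1000);
 *     qemu_vfio_close(vfio);
 */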

static void qemu_vfio_dump_mapping(IOVAMapping *m)
{
    if (QEMU_VFIO_DEBUG) {
        printf(" vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
               (uint64_t)m->size, (uint64_t)m->iova);
    }
}

static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
{
    int i;

    if (QEMU_VFIO_DEBUG) {
        printf("vfio mappings\n");
        for (i = 0; i < s->nr_mappings; ++i) {
            qemu_vfio_dump_mapping(&s->mappings[i]);
        }
    }
}

/**
 * Find the mapping entry that contains [host, host + size) and set @index to
 * the position. If no entry contains it, @index is the position _after_ which
 * to insert the new mapping. IOW, it is the index of the largest element that
 * is smaller than @host, or -1 if no entry is.
 */
static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
                                           int *index)
{
    IOVAMapping *p = s->mappings;
    IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
    IOVAMapping *mid;
    trace_qemu_vfio_find_mapping(s, host);
    if (!p) {
        *index = -1;
        return NULL;
    }
    while (true) {
        mid = p + (q - p) / 2;
        if (mid == p) {
            break;
        }
        if (mid->host > host) {
            q = mid;
        } else if (mid->host < host) {
            p = mid;
        } else {
            break;
        }
    }
    if (mid->host > host) {
        mid--;
    } else if (mid < &s->mappings[s->nr_mappings - 1]
               && (mid + 1)->host <= host) {
        mid++;
    }
    *index = mid - &s->mappings[0];
    if (mid >= &s->mappings[0] &&
        mid->host <= host && mid->host + mid->size > host) {
        assert(mid < &s->mappings[s->nr_mappings]);
        return mid;
    }
    /* At this point *index + 1 is the right position to insert the new
     * mapping. */
    return NULL;
}

/**
 * Allocate IOVA and create a new mapping record and insert it in @s.
 */
static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
                                          void *host, size_t size,
                                          int index, uint64_t iova)
{
    int shift;
    IOVAMapping m = {.host = host, .size = size, .iova = iova};
    IOVAMapping *insert;

    assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
    assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size));
    assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size));
    trace_qemu_vfio_new_mapping(s, host, size, index, iova);

    assert(index >= 0);
    s->nr_mappings++;
    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
    insert = &s->mappings[index];
    shift = s->nr_mappings - index - 1;
    if (shift) {
        memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
    }
    *insert = m;
    return insert;
}

/* Do the DMA mapping with VFIO. */
static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
                                uint64_t iova)
{
    struct vfio_iommu_type1_dma_map dma_map = {
        .argsz = sizeof(dma_map),
        .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
        .iova = iova,
        .vaddr = (uintptr_t)host,
        .size = size,
    };
    trace_qemu_vfio_do_mapping(s, host, size, iova);

    if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
        error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
        return -errno;
    }
    return 0;
}

/**
 * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
 */
static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
                                   Error **errp)
{
    int index;
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = mapping->iova,
        .size = mapping->size,
    };

    index = mapping - s->mappings;
    assert(mapping->size > 0);
    assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size));
    assert(index >= 0 && index < s->nr_mappings);
    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
    }
    memmove(mapping, &s->mappings[index + 1],
            sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
    s->nr_mappings--;
    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
}

/* Check if the mapping list is (ascending) ordered. */
static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
{
    int i;
    if (QEMU_VFIO_DEBUG) {
        for (i = 0; i < s->nr_mappings - 1; ++i) {
            if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
                fprintf(stderr, "item %d not sorted!\n", i);
                qemu_vfio_dump_mappings(s);
                return false;
            }
            if (!(s->mappings[i].host + s->mappings[i].size <=
                  s->mappings[i + 1].host)) {
                fprintf(stderr, "item %d overlap with next!\n", i);
                qemu_vfio_dump_mappings(s);
                return false;
            }
        }
    }
    return true;
}

/* Allocate a persistent ("fixed") IOVA window of @size bytes, raising the low
 * water mark past any unusable holes reported by the kernel. */
static int
qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
{
    int i;

    for (i = 0; i < s->nb_iova_ranges; i++) {
        if (s->usable_iova_ranges[i].end < s->low_water_mark) {
            continue;
        }
        s->low_water_mark =
            MAX(s->low_water_mark, s->usable_iova_ranges[i].start);

        if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
            s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
            *iova = s->low_water_mark;
            s->low_water_mark += size;
            return 0;
        }
    }
    return -ENOMEM;
}

/* Allocate a temporary IOVA window of @size bytes from the top of the space,
 * lowering the high water mark past any unusable holes. */
static int
qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size, uint64_t *iova)
{
    int i;

    for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
        if (s->usable_iova_ranges[i].start > s->high_water_mark) {
            continue;
        }
        s->high_water_mark =
            MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);

        if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
            s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
            *iova = s->high_water_mark - size;
            s->high_water_mark = *iova;
            return 0;
        }
    }
    return -ENOMEM;
}

/* Map [host, host + size) area into a contiguous IOVA address space, and store
 * the result in @iova if not NULL. The caller needs to make sure the area is
 * aligned to page size, and mustn't overlap with existing mapping areas (split
 * mapping status within this area is not allowed).
 */
int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
                      bool temporary, uint64_t *iova)
{
    int ret = 0;
    int index;
    IOVAMapping *mapping;
    uint64_t iova0;

    assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size));
    assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
    trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
    qemu_mutex_lock(&s->lock);
    mapping = qemu_vfio_find_mapping(s, host, &index);
    if (mapping) {
        iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
    } else {
        if (s->high_water_mark - s->low_water_mark + 1 < size) {
            ret = -ENOMEM;
            goto out;
        }
        if (!temporary) {
            if (qemu_vfio_find_fixed_iova(s, size, &iova0)) {
                ret = -ENOMEM;
                goto out;
            }

            mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
            if (!mapping) {
                ret = -ENOMEM;
                goto out;
            }
            assert(qemu_vfio_verify_mappings(s));
            ret = qemu_vfio_do_mapping(s, host, size, iova0);
            if (ret) {
                qemu_vfio_undo_mapping(s, mapping, NULL);
                goto out;
            }
            qemu_vfio_dump_mappings(s);
        } else {
            if (qemu_vfio_find_temp_iova(s, size, &iova0)) {
                ret = -ENOMEM;
                goto out;
            }
            ret = qemu_vfio_do_mapping(s, host, size, iova0);
            if (ret) {
                goto out;
            }
        }
    }
    if (iova) {
        *iova = iova0;
    }
out:
    qemu_mutex_unlock(&s->lock);
    return ret;
}
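
/*
 * Illustrative example (hypothetical buffer, not from an actual caller):
 * map a page-aligned bounce buffer persistently, retrieve its IOVA, then
 * drop the mapping once the buffer is no longer used for DMA:
 *
 *     uint64_t iova;
 *     void *buf = qemu_memalign(qemu_real_host_page_size,
 *                               qemu_real_host_page_size);
 *     if (qemu_vfio_dma_map(s, buf, qemu_real_host_page_size, false, &iova)) {
 *         ... handle the error ...
 *     }
 *     ... program the device with iova ...
 *     qemu_vfio_dma_unmap(s, buf);
 *
 * Passing temporary=true instead allocates from the high water mark; such
 * mappings are only released wholesale by qemu_vfio_dma_reset_temporary().
 */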

/* Reset the high watermark and free all "temporary" mappings. */
int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = s->high_water_mark,
        .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
    };
    trace_qemu_vfio_dma_reset_temporary(s);
    QEMU_LOCK_GUARD(&s->lock);
    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
        return -errno;
    }
    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    return 0;
}

/* Unmap the whole area that was previously mapped with
 * qemu_vfio_dma_map(). */
void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
{
    int index = 0;
    IOVAMapping *m;

    if (!host) {
        return;
    }

    trace_qemu_vfio_dma_unmap(s, host);
    qemu_mutex_lock(&s->lock);
    m = qemu_vfio_find_mapping(s, host, &index);
    if (!m) {
        goto out;
    }
    qemu_vfio_undo_mapping(s, m, NULL);
out:
    qemu_mutex_unlock(&s->lock);
}

static void qemu_vfio_reset(QEMUVFIOState *s)
{
    ioctl(s->device, VFIO_DEVICE_RESET);
}

/* Close and free the VFIO resources. */
void qemu_vfio_close(QEMUVFIOState *s)
{
    if (!s) {
        return;
    }
    /*
     * qemu_vfio_undo_mapping() removes the entry and compacts the array, so
     * always drop the last remaining mapping instead of indexing forward
     * (which would skip every other entry).
     */
    while (s->nr_mappings) {
        qemu_vfio_undo_mapping(s, &s->mappings[s->nr_mappings - 1], NULL);
    }
    ram_block_notifier_remove(&s->ram_notifier);
    g_free(s->usable_iova_ranges);
    s->nb_iova_ranges = 0;
    qemu_vfio_reset(s);
    close(s->device);
    close(s->group);
    close(s->container);
    ram_block_discard_disable(false);
}