/*
 * VFIO utility
 *
 * Copyright 2016 - 2018 Red Hat, Inc.
 *
 * Authors:
 *   Fam Zheng <famz@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include "qapi/error.h"
#include "exec/ramlist.h"
#include "exec/cpu-common.h"
#include "exec/memory.h"
#include "trace.h"
#include "qemu/error-report.h"
#include "standard-headers/linux/pci_regs.h"
#include "qemu/event_notifier.h"
#include "qemu/vfio-helpers.h"
#include "qemu/lockable.h"

#define QEMU_VFIO_DEBUG 0

#define QEMU_VFIO_IOVA_MIN 0x10000ULL
/*
 * XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
 * we can use a runtime limit; alternatively it's also possible to do platform
 * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
 */
#define QEMU_VFIO_IOVA_MAX (1ULL << 39)

typedef struct {
    /* Page aligned addr. */
    void *host;
    size_t size;
    uint64_t iova;
} IOVAMapping;

struct IOVARange {
    uint64_t start;
    uint64_t end;
};

struct QEMUVFIOState {
    QemuMutex lock;

    /* These fields are protected by BQL */
    int container;
    int group;
    int device;
    RAMBlockNotifier ram_notifier;
    struct vfio_region_info config_region_info, bar_region_info[6];
    struct IOVARange *usable_iova_ranges;
    uint8_t nb_iova_ranges;

    /* These fields are protected by @lock */
    /*
     * VFIO's IO virtual address space is managed by splitting it into a few
     * sections:
     *
     *  ---------------       <= 0
     *  |xxxxxxxxxxxxx|
     *  |-------------|       <= QEMU_VFIO_IOVA_MIN
     *  |             |
     *  |    Fixed    |
     *  |             |
     *  |-------------|       <= low_water_mark
     *  |             |
     *  |    Free     |
     *  |             |
     *  |-------------|       <= high_water_mark
     *  |             |
     *  |    Temp     |
     *  |             |
     *  |-------------|       <= QEMU_VFIO_IOVA_MAX
     *  |xxxxxxxxxxxxx|
     *  |xxxxxxxxxxxxx|
     *  ---------------
     *
     * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
     *
     * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
     *   [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be
     *   reclaimed - low_water_mark never shrinks;
     *
     * - IOVAs in range [low_water_mark, high_water_mark) are free;
     *
     * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
     *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
     *   is recycled. The caller should make sure I/O's depending on these
     *   mappings are completed before calling.
     */
    uint64_t low_water_mark;
    uint64_t high_water_mark;
    IOVAMapping *mappings;
    int nr_mappings;
};
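
/*
 * Illustrative example of the scheme above (numbers are made up, assuming the
 * kernel reports no reserved IOVA ranges): starting from a fresh state,
 * low_water_mark == QEMU_VFIO_IOVA_MIN == 0x10000 and high_water_mark ==
 * QEMU_VFIO_IOVA_MAX. A fixed (permanent) mapping of 0x4000 bytes would be
 * assigned IOVA 0x10000 and move low_water_mark up to 0x14000; a temporary
 * mapping of 0x2000 bytes would be assigned IOVA QEMU_VFIO_IOVA_MAX - 0x2000
 * and move high_water_mark down by the same amount. The "Free" section in the
 * middle shrinks from both ends until qemu_vfio_dma_reset_temporary() restores
 * high_water_mark to QEMU_VFIO_IOVA_MAX.
 */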

/**
 * Find the group file by the PCI device address specified in @device, and
 * return the path. The returned string is owned by the caller and should be
 * freed with g_free() later.
 */
static char *sysfs_find_group_file(const char *device, Error **errp)
{
    g_autoptr(GError) gerr = NULL;
    char *sysfs_link;
    char *sysfs_group;
    char *p;
    char *path = NULL;

    sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
    sysfs_group = g_file_read_link(sysfs_link, &gerr);
    if (gerr) {
        error_setg(errp, "Failed to find iommu group sysfs path: %s",
                   gerr->message);
        goto out;
    }
    p = strrchr(sysfs_group, '/');
    if (!p) {
        error_setg(errp, "Failed to find iommu group number");
        goto out;
    }

    path = g_strdup_printf("/dev/vfio/%s", p + 1);
out:
    g_free(sysfs_link);
    g_free(sysfs_group);
    return path;
}

static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
{
    assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info));
}

static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
{
    g_autofree char *barname = NULL;
    assert_bar_index_valid(s, index);
    s->bar_region_info[index] = (struct vfio_region_info) {
        .index = VFIO_PCI_BAR0_REGION_INDEX + index,
        .argsz = sizeof(struct vfio_region_info),
    };
    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
        error_setg_errno(errp, errno, "Failed to get BAR region info");
        return -errno;
    }
    barname = g_strdup_printf("bar[%d]", index);
    trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset,
                                s->bar_region_info[index].size,
                                s->bar_region_info[index].cap_offset);

    return 0;
}

/**
 * Map a PCI bar area.
 */
void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
                            uint64_t offset, uint64_t size, int prot,
                            Error **errp)
{
    void *p;
    assert(QEMU_IS_ALIGNED(offset, qemu_real_host_page_size()));
    assert_bar_index_valid(s, index);
    p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
             prot, MAP_SHARED,
             s->device, s->bar_region_info[index].offset + offset);
    trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset,
                                size, offset, p);
    if (p == MAP_FAILED) {
        error_setg_errno(errp, errno, "Failed to map BAR region");
        p = NULL;
    }
    return p;
}

/**
 * Unmap a PCI bar area.
 */
void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
                             uint64_t offset, uint64_t size)
{
    if (bar) {
        munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
    }
}
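
/*
 * Hypothetical usage sketch for the two helpers above, e.g. a driver mapping
 * the first two pages of BAR 0 for register access (BAR index, offset and
 * size are illustrative only):
 *
 *     void *regs = qemu_vfio_pci_map_bar(s, 0, 0,
 *                                        2 * qemu_real_host_page_size(),
 *                                        PROT_READ | PROT_WRITE, errp);
 *     if (regs) {
 *         ... access device registers through @regs ...
 *         qemu_vfio_pci_unmap_bar(s, 0, regs, 0,
 *                                 2 * qemu_real_host_page_size());
 *     }
 */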

/**
 * Initialize device IRQ with @irq_type and register an event notifier.
 */
int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
                           int irq_type, Error **errp)
{
    int r;
    struct vfio_irq_set *irq_set;
    size_t irq_set_size;
    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };

    irq_info.index = irq_type;
    if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
        error_setg_errno(errp, errno, "Failed to get device interrupt info");
        return -errno;
    }
    if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
        error_setg(errp, "Device interrupt doesn't support eventfd");
        return -EINVAL;
    }

    /* The eventfd payload is a single int appended after the header. */
    irq_set_size = sizeof(*irq_set) + sizeof(int);
    irq_set = g_malloc0(irq_set_size);

    /* Get to a known IRQ state */
    *irq_set = (struct vfio_irq_set) {
        .argsz = irq_set_size,
        .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = irq_info.index,
        .start = 0,
        .count = 1,
    };

    *(int *)&irq_set->data = event_notifier_get_fd(e);
    r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
    g_free(irq_set);
    if (r) {
        error_setg_errno(errp, errno, "Failed to setup device interrupt");
        return -errno;
    }
    return 0;
}
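
/*
 * A caller would typically pair this with an EventNotifier it polls or
 * attaches to the event loop. Hypothetical sketch (IRQ index and error
 * handling are illustrative only):
 *
 *     EventNotifier e;
 *     event_notifier_init(&e, 0);
 *     if (qemu_vfio_pci_init_irq(s, &e, VFIO_PCI_MSIX_IRQ_INDEX, errp) == 0) {
 *         ... read event_notifier_get_fd(&e) or register an fd handler ...
 *     }
 */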

static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
                                     int size, int ofs)
{
    int ret;

    trace_qemu_vfio_pci_read_config(buf, ofs, size,
                                    s->config_region_info.offset,
                                    s->config_region_info.size);
    assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
    ret = RETRY_ON_EINTR(
        pread(s->device, buf, size, s->config_region_info.offset + ofs)
    );
    return ret == size ? 0 : -errno;
}

static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
{
    int ret;

    trace_qemu_vfio_pci_write_config(buf, ofs, size,
                                     s->config_region_info.offset,
                                     s->config_region_info.size);
    assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
    ret = RETRY_ON_EINTR(
        pwrite(s->device, buf, size, s->config_region_info.offset + ofs)
    );
    return ret == size ? 0 : -errno;
}

static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf)
{
    struct vfio_iommu_type1_info *info = (struct vfio_iommu_type1_info *)buf;
    struct vfio_info_cap_header *cap = (void *)buf + info->cap_offset;
    struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
    int i;

    while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
        if (!cap->next) {
            return;
        }
        cap = buf + cap->next;
    }

    cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;

    s->nb_iova_ranges = cap_iova_range->nr_iovas;
    if (s->nb_iova_ranges > 1) {
        s->usable_iova_ranges =
            g_renew(struct IOVARange, s->usable_iova_ranges,
                    s->nb_iova_ranges);
    }

    for (i = 0; i < s->nb_iova_ranges; i++) {
        s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
        s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
    }
}
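
/*
 * Bring-up sequence implemented by qemu_vfio_init_pci() below, following the
 * usual VFIO type1 flow: open a /dev/vfio/vfio container and check the API
 * version, resolve and open the device's IOMMU group, check that the group is
 * viable, attach the group to the container, select the type1 IOMMU, query
 * the IOMMU info (re-issuing the ioctl with a larger buffer when the kernel
 * reports a bigger argsz, so that capabilities such as the usable IOVA ranges
 * parsed above can be collected), obtain the device fd, read the config/BAR
 * region info, and finally set the PCI bus master bit.
 */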

static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
                              Error **errp)
{
    int ret;
    int i;
    uint16_t pci_cmd;
    struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
    struct vfio_iommu_type1_info *iommu_info = NULL;
    size_t iommu_info_size = sizeof(*iommu_info);
    struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
    char *group_file = NULL;

    s->usable_iova_ranges = NULL;

    /* Create a new container */
    s->container = open("/dev/vfio/vfio", O_RDWR);

    if (s->container == -1) {
        error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
        return -errno;
    }
    if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
        error_setg(errp, "Invalid VFIO version");
        ret = -EINVAL;
        goto fail_container;
    }

    if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
        error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported");
        ret = -EINVAL;
        goto fail_container;
    }

    /* Open the group */
    group_file = sysfs_find_group_file(device, errp);
    if (!group_file) {
        ret = -EINVAL;
        goto fail_container;
    }

    s->group = open(group_file, O_RDWR);
    if (s->group == -1) {
        error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
                         group_file);
        g_free(group_file);
        ret = -errno;
        goto fail_container;
    }
    g_free(group_file);

    /* Test the group is viable and available */
    if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
        error_setg_errno(errp, errno, "Failed to get VFIO group status");
        ret = -errno;
        goto fail;
    }

    if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "VFIO group is not viable");
        ret = -EINVAL;
        goto fail;
    }

    /* Add the group to the container */
    if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
        error_setg_errno(errp, errno, "Failed to add group to VFIO container");
        ret = -errno;
        goto fail;
    }

    /* Enable the IOMMU model we want */
    if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
        error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
        ret = -errno;
        goto fail;
    }

    iommu_info = g_malloc0(iommu_info_size);
    iommu_info->argsz = iommu_info_size;

    /* Get additional IOMMU info */
    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
        error_setg_errno(errp, errno, "Failed to get IOMMU info");
        ret = -errno;
        goto fail;
    }

    /*
     * If the kernel does not report usable IOVA regions, choose
     * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX - 1] region
     */
    s->nb_iova_ranges = 1;
    s->usable_iova_ranges = g_new0(struct IOVARange, 1);
    s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
    s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;

    if (iommu_info->argsz > iommu_info_size) {
        iommu_info_size = iommu_info->argsz;
        iommu_info = g_realloc(iommu_info, iommu_info_size);
        if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
            ret = -errno;
            goto fail;
        }
        collect_usable_iova_ranges(s, iommu_info);
    }

    s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);

    if (s->device < 0) {
        error_setg_errno(errp, errno, "Failed to get device fd");
        ret = -errno;
        goto fail;
    }

    /* Test and setup the device */
    if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
        error_setg_errno(errp, errno, "Failed to get device info");
        ret = -errno;
        goto fail;
    }

    if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
        error_setg(errp, "Invalid device regions");
        ret = -EINVAL;
        goto fail;
    }

    s->config_region_info = (struct vfio_region_info) {
        .index = VFIO_PCI_CONFIG_REGION_INDEX,
        .argsz = sizeof(struct vfio_region_info),
    };
    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
        error_setg_errno(errp, errno, "Failed to get config region info");
        ret = -errno;
        goto fail;
    }
    trace_qemu_vfio_region_info("config", s->config_region_info.offset,
                                s->config_region_info.size,
                                s->config_region_info.cap_offset);

    for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
        ret = qemu_vfio_pci_init_bar(s, i, errp);
        if (ret) {
            goto fail;
        }
    }

    /* Enable bus master */
    ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
    if (ret) {
        goto fail;
    }
    pci_cmd |= PCI_COMMAND_MASTER;
    ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
    if (ret) {
        goto fail;
    }
    g_free(iommu_info);
    return 0;
fail:
    g_free(s->usable_iova_ranges);
    s->usable_iova_ranges = NULL;
    s->nb_iova_ranges = 0;
    g_free(iommu_info);
    close(s->group);
fail_container:
    close(s->container);
    return ret;
}

static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, void *host,
                                      size_t size, size_t max_size)
{
    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    Error *local_err = NULL;
    int ret;

    trace_qemu_vfio_ram_block_added(s, host, max_size);
    ret = qemu_vfio_dma_map(s, host, max_size, false, NULL, &local_err);
    if (ret) {
        error_reportf_err(local_err,
                          "qemu_vfio_dma_map(%p, %zu) failed: ",
                          host, max_size);
    }
}

static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, void *host,
                                        size_t size, size_t max_size)
{
    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
    if (host) {
        trace_qemu_vfio_ram_block_removed(s, host, max_size);
        qemu_vfio_dma_unmap(s, host);
    }
}
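
/*
 * The notifier callbacks above keep guest RAM permanently DMA-mapped: every
 * RAM block gets a "fixed" IOVA mapping when it is added and is unmapped
 * again when it goes away, so I/O to guest memory never needs a per-request
 * map/unmap. qemu_vfio_open_common() below registers these callbacks;
 * registration is also expected to announce RAM blocks that already exist.
 */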

static void qemu_vfio_open_common(QEMUVFIOState *s)
{
    qemu_mutex_init(&s->lock);
    s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
    s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
    s->low_water_mark = QEMU_VFIO_IOVA_MIN;
    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    ram_block_notifier_add(&s->ram_notifier);
}

/**
 * Open a PCI device, e.g. "0000:00:01.0".
 */
QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
{
    int r;
    QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);

    /*
     * VFIO may pin all memory inside mappings, which results in all memory
     * inside RAM blocks being pinned unconditionally.
     */
    r = ram_block_discard_disable(true);
    if (r) {
        error_setg_errno(errp, -r, "Cannot disable discarding of RAM");
        g_free(s);
        return NULL;
    }

    r = qemu_vfio_init_pci(s, device, errp);
    if (r) {
        ram_block_discard_disable(false);
        g_free(s);
        return NULL;
    }
    qemu_vfio_open_common(s);
    return s;
}

static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
{
    for (int i = 0; i < s->nr_mappings; ++i) {
        trace_qemu_vfio_dump_mapping(s->mappings[i].host,
                                     s->mappings[i].iova,
                                     s->mappings[i].size);
    }
}

/**
 * Find the mapping entry that contains @host and set @index to its position.
 * If no entry contains it, @index is the position _after_ which to insert the
 * new mapping. IOW, it is the index of the largest element that is smaller
 * than @host, or -1 if there is none.
 */
static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
                                           int *index)
{
    IOVAMapping *p = s->mappings;
    IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
    IOVAMapping *mid;
    trace_qemu_vfio_find_mapping(s, host);
    if (!p) {
        *index = -1;
        return NULL;
    }
    while (true) {
        mid = p + (q - p) / 2;
        if (mid == p) {
            break;
        }
        if (mid->host > host) {
            q = mid;
        } else if (mid->host < host) {
            p = mid;
        } else {
            break;
        }
    }
    if (mid->host > host) {
        mid--;
    } else if (mid < &s->mappings[s->nr_mappings - 1]
               && (mid + 1)->host <= host) {
        mid++;
    }
    *index = mid - &s->mappings[0];
    if (mid >= &s->mappings[0] &&
        mid->host <= host && mid->host + mid->size > host) {
        assert(mid < &s->mappings[s->nr_mappings]);
        return mid;
    }
    /*
     * At this point *index + 1 is the right position to insert the new
     * mapping.
     */
    return NULL;
}

/**
 * Allocate IOVA and create a new mapping record and insert it in @s.
 */
static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
                                          void *host, size_t size,
                                          int index, uint64_t iova)
{
    int shift;
    IOVAMapping m = {.host = host, .size = size, .iova = iova};
    IOVAMapping *insert;

    assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size()));
    assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size()));
    assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size()));
    trace_qemu_vfio_new_mapping(s, host, size, index, iova);

    assert(index >= 0);
    s->nr_mappings++;
    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
    insert = &s->mappings[index];
    shift = s->nr_mappings - index - 1;
    if (shift) {
        memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
    }
    *insert = m;
    return insert;
}
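
/*
 * The two helpers above keep s->mappings sorted by host address with no
 * overlaps, which is what makes the binary search in qemu_vfio_find_mapping()
 * valid. For illustration (addresses made up): with two mappings covering
 * [0x1000, 0x2000) and [0x3000, 0x4000), looking up host 0x3800 returns the
 * second entry, while looking up 0x2800 returns NULL with *index == 0, i.e.
 * the caller would insert a new mapping at position 1.
 */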

/* Do the DMA mapping with VFIO. */
static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
                                uint64_t iova, Error **errp)
{
    struct vfio_iommu_type1_dma_map dma_map = {
        .argsz = sizeof(dma_map),
        .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
        .iova = iova,
        .vaddr = (uintptr_t)host,
        .size = size,
    };
    trace_qemu_vfio_do_mapping(s, host, iova, size);

    if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
        error_setg_errno(errp, errno, "VFIO_MAP_DMA failed");
        return -errno;
    }
    return 0;
}

/**
 * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
 */
static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
                                   Error **errp)
{
    int index;
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = mapping->iova,
        .size = mapping->size,
    };

    index = mapping - s->mappings;
    assert(mapping->size > 0);
    assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size()));
    assert(index >= 0 && index < s->nr_mappings);
    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
    }
    memmove(mapping, &s->mappings[index + 1],
            sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
    s->nr_mappings--;
    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
}

/* Check if the mapping list is (ascending) ordered. */
static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
{
    int i;
    if (QEMU_VFIO_DEBUG) {
        for (i = 0; i < s->nr_mappings - 1; ++i) {
            if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
                error_report("item %d not sorted!", i);
                qemu_vfio_dump_mappings(s);
                return false;
            }
            if (!(s->mappings[i].host + s->mappings[i].size <=
                  s->mappings[i + 1].host)) {
                error_report("item %d overlap with next!", i);
                qemu_vfio_dump_mappings(s);
                return false;
            }
        }
    }
    return true;
}

static bool qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size,
                                      uint64_t *iova, Error **errp)
{
    int i;

    for (i = 0; i < s->nb_iova_ranges; i++) {
        if (s->usable_iova_ranges[i].end < s->low_water_mark) {
            continue;
        }
        s->low_water_mark =
            MAX(s->low_water_mark, s->usable_iova_ranges[i].start);

        /*
         * The "== 0" test catches the wrap-around case where the span
         * [low_water_mark, end] covers the entire 64-bit space, so the
         * length computation overflows to 0 and any size fits.
         */
        if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
            s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
            *iova = s->low_water_mark;
            s->low_water_mark += size;
            return true;
        }
    }
    error_setg(errp, "fixed iova range not found");

    return false;
}

static bool qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size,
                                     uint64_t *iova, Error **errp)
{
    int i;

    for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
        if (s->usable_iova_ranges[i].start > s->high_water_mark) {
            continue;
        }
        s->high_water_mark =
            MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);

        /* As above, "== 0" handles a span that wraps the full 64-bit range. */
        if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
            s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
            *iova = s->high_water_mark - size;
            s->high_water_mark = *iova;
            return true;
        }
    }
    error_setg(errp, "temporary iova range not found");

    return false;
}

/**
 * qemu_vfio_water_mark_reached:
 *
 * Returns %true if high watermark has been reached, %false otherwise.
 */
static bool qemu_vfio_water_mark_reached(QEMUVFIOState *s, size_t size,
                                         Error **errp)
{
    if (s->high_water_mark - s->low_water_mark + 1 < size) {
        error_setg(errp, "iova exhausted (water mark reached)");
        return true;
    }
    return false;
}

/*
 * Map the [host, host + size) area into a contiguous IOVA address space, and
 * store the result in @iova if not NULL. The caller needs to make sure the
 * area is aligned to page size, and mustn't overlap with existing mapping
 * areas (split mapping status within this area is not allowed).
 */
int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
                      bool temporary, uint64_t *iova, Error **errp)
{
    int index;
    IOVAMapping *mapping;
    uint64_t iova0;

    assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size()));
    assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size()));
    trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
    QEMU_LOCK_GUARD(&s->lock);
    mapping = qemu_vfio_find_mapping(s, host, &index);
    if (mapping) {
        iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
    } else {
        int ret;

        if (qemu_vfio_water_mark_reached(s, size, errp)) {
            return -ENOMEM;
        }
        if (!temporary) {
            if (!qemu_vfio_find_fixed_iova(s, size, &iova0, errp)) {
                return -ENOMEM;
            }

            mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
            assert(qemu_vfio_verify_mappings(s));
            ret = qemu_vfio_do_mapping(s, host, size, iova0, errp);
            if (ret < 0) {
                qemu_vfio_undo_mapping(s, mapping, NULL);
                return ret;
            }
            qemu_vfio_dump_mappings(s);
        } else {
            if (!qemu_vfio_find_temp_iova(s, size, &iova0, errp)) {
                return -ENOMEM;
            }
            ret = qemu_vfio_do_mapping(s, host, size, iova0, errp);
            if (ret < 0) {
                return ret;
            }
        }
    }
    trace_qemu_vfio_dma_mapped(s, host, iova0, size);
    if (iova) {
        *iova = iova0;
    }
    return 0;
}
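
/*
 * Hypothetical usage sketch (buffer name and length are illustrative; @len is
 * assumed to be a multiple of the host page size): create a permanent mapping
 * for a page-aligned buffer, use the returned IOVA for device DMA, and drop
 * the mapping once the buffer is no longer used for DMA:
 *
 *     uint64_t iova;
 *     void *buf = qemu_memalign(qemu_real_host_page_size(), len);
 *     if (qemu_vfio_dma_map(s, buf, len, false, &iova, errp) == 0) {
 *         ... program the device with @iova and wait for the I/O ...
 *         qemu_vfio_dma_unmap(s, buf);
 *     }
 *     qemu_vfree(buf);
 */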

/* Reset the high watermark and free all "temporary" mappings. */
int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = s->high_water_mark,
        .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
    };
    trace_qemu_vfio_dma_reset_temporary(s);
    QEMU_LOCK_GUARD(&s->lock);
    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
        return -errno;
    }
    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
    return 0;
}

/*
 * Unmap the whole area that was previously mapped with qemu_vfio_dma_map().
 */
void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
{
    int index = 0;
    IOVAMapping *m;

    if (!host) {
        return;
    }

    trace_qemu_vfio_dma_unmap(s, host);
    QEMU_LOCK_GUARD(&s->lock);
    m = qemu_vfio_find_mapping(s, host, &index);
    if (!m) {
        return;
    }
    qemu_vfio_undo_mapping(s, m, NULL);
}

static void qemu_vfio_reset(QEMUVFIOState *s)
{
    ioctl(s->device, VFIO_DEVICE_RESET);
}

/* Close and free the VFIO resources. */
void qemu_vfio_close(QEMUVFIOState *s)
{
    if (!s) {
        return;
    }

    ram_block_notifier_remove(&s->ram_notifier);

    /*
     * qemu_vfio_undo_mapping() removes the entry from s->mappings and
     * compacts the array, so always undo the first remaining mapping instead
     * of walking the array by index (which would skip entries).
     */
    while (s->nr_mappings) {
        qemu_vfio_undo_mapping(s, &s->mappings[0], NULL);
    }

    g_free(s->usable_iova_ranges);
    s->nb_iova_ranges = 0;
    qemu_vfio_reset(s);
    close(s->device);
    close(s->group);
    close(s->container);
    ram_block_discard_disable(false);
}