/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

#include "hw/vfio/vfio-common.h"
#include "hw/vfio/vfio.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "hw/hw.h"
#include "qemu/error-report.h"
#include "sysemu/kvm.h"
#include "trace.h"

struct vfio_group_head vfio_group_list =
    QLIST_HEAD_INITIALIZER(vfio_group_list);
struct vfio_as_head vfio_address_spaces =
    QLIST_HEAD_INITIALIZER(vfio_address_spaces);

#ifdef CONFIG_KVM
/*
 * We have a single VFIO pseudo device per KVM VM. Once created it lives
 * for the life of the VM. Closing the file descriptor only drops our
 * reference to it and the device's reference to kvm. Therefore once
 * initialized, this file descriptor is only released on QEMU exit and
 * we'll re-use it should another vfio device be attached before then.
 */
static int vfio_kvm_device_fd = -1;
#endif

/*
 * Common VFIO interrupt disable
 */
void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = index,
        .start = 0,
        .count = 0,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
        .index = index,
        .start = 0,
        .count = 1,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
        .index = index,
        .start = 0,
        .count = 1,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

/*
 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
 */
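/*
 * For example, a 4-byte guest write of 0x12345678 is converted with
 * cpu_to_le32() before being handed to pwrite() below, so the bytes
 * crossing the region file descriptor are always little endian no matter
 * the host byte order; vfio_region_read() undoes this with le*_to_cpu().
 */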
void vfio_region_write(void *opaque, hwaddr addr,
                       uint64_t data, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %d bytes", size);
        break;
    }

    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
                     ",%d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, data, size);
    }

    trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);

    /*
     * A read or write to a BAR always signals an INTx EOI. This will
     * do nothing if not pending (including not in INTx mode). We assume
     * that a BAR access is in response to an interrupt and that BAR
     * accesses will service the interrupt. Unfortunately, we don't know
     * which access will service the interrupt, so we're potentially
     * getting quite a few host interrupts per guest interrupt.
     */
    vbasedev->ops->vfio_eoi(vbasedev);
}

uint64_t vfio_region_read(void *opaque,
                          hwaddr addr, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;

    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, size);
        return (uint64_t)-1;
    }
    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes", size);
        break;
    }

    trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);

    /* Same as write above */
    vbasedev->ops->vfio_eoi(vbasedev);

    return data;
}

const MemoryRegionOps vfio_region_ops = {
    .read = vfio_region_read,
    .write = vfio_region_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

/*
 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
 */
static int vfio_dma_unmap(VFIOContainer *container,
                          hwaddr iova, ram_addr_t size)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = iova,
        .size = size,
    };

    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_report("VFIO_UNMAP_DMA: %d", -errno);
        return -errno;
    }

    return 0;
}

static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
                        ram_addr_t size, void *vaddr, bool readonly)
{
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ,
        .vaddr = (__u64)(uintptr_t)vaddr,
        .iova = iova,
        .size = size,
    };

    if (!readonly) {
        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
    }

    /*
     * Try the mapping, if it fails with EBUSY, unmap the region and try
     * again. This shouldn't be necessary, but we sometimes see it in
     * the VGA ROM space.
     */
    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
        return 0;
    }

    error_report("VFIO_MAP_DMA: %d", -errno);
    return -errno;
}

static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
    return (!memory_region_is_ram(section->mr) &&
            !memory_region_is_iommu(section->mr)) ||
           /*
            * Sizing an enabled 64-bit BAR can cause spurious mappings to
            * addresses in the upper part of the 64-bit address space. These
            * are never accessed by the CPU and beyond the address width of
            * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width.
            */
           section->offset_within_address_space & (1ULL << 63);
}

static void vfio_iommu_map_notify(Notifier *n, void *data)
{
    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
    VFIOContainer *container = giommu->container;
    IOMMUTLBEntry *iotlb = data;
    MemoryRegion *mr;
    hwaddr xlat;
    hwaddr len = iotlb->addr_mask + 1;
    void *vaddr;
    int ret;

    trace_vfio_iommu_map_notify(iotlb->iova,
                                iotlb->iova + iotlb->addr_mask);

    /*
     * The IOMMU TLB entry we have just covers translation through
     * this IOMMU to its immediate target. We need to translate
     * it the rest of the way through to memory.
     */
    rcu_read_lock();
    mr = address_space_translate(&address_space_memory,
                                 iotlb->translated_addr,
                                 &xlat, &len, iotlb->perm & IOMMU_WO);
    if (!memory_region_is_ram(mr)) {
        error_report("iommu map to non memory area %"HWADDR_PRIx"",
                     xlat);
        goto out;
    }
    /*
     * Translation truncates length to the IOMMU page size,
     * check that it did not truncate too much.
     */
    if (len & iotlb->addr_mask) {
        error_report("iommu has granularity incompatible with target AS");
        goto out;
    }

    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
        vaddr = memory_region_get_ram_ptr(mr) + xlat;
        ret = vfio_dma_map(container, iotlb->iova,
                           iotlb->addr_mask + 1, vaddr,
                           !(iotlb->perm & IOMMU_WO) || mr->readonly);
        if (ret) {
            error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
                         container, iotlb->iova,
                         iotlb->addr_mask + 1, vaddr, ret);
        }
    } else {
        ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1);
        if (ret) {
            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%m)",
                         container, iotlb->iova,
                         iotlb->addr_mask + 1, ret);
        }
    }
out:
    rcu_read_unlock();
}
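
/*
 * The replay granularity is the smallest IOVA page size the container
 * supports. For example, with a (hypothetical) iova_pgsizes bitmap of
 * 0x201000, i.e. 4KiB and 2MiB pages, ctz64() returns 12 and the guest
 * IOMMU mappings are replayed in 4KiB steps.
 */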
static hwaddr vfio_container_granularity(VFIOContainer *container)
{
    return (hwaddr)1 << ctz64(container->iova_pgsizes);
}

static void vfio_listener_region_add(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    hwaddr iova, end;
    Int128 llend;
    void *vaddr;
    int ret;

    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_add_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));

    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }
    end = int128_get64(llend);

    if ((iova < container->min_iova) || ((end - 1) > container->max_iova)) {
        error_report("vfio: IOMMU container %p can't map guest IOVA region"
                     " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx,
                     container, iova, end - 1);
        ret = -EFAULT;
        goto fail;
    }

    memory_region_ref(section->mr);

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        trace_vfio_listener_region_add_iommu(iova, end - 1);
        /*
         * FIXME: We should do some checking to see if the
         * capabilities of the host VFIO IOMMU are adequate to model
         * the guest IOMMU
         *
         * FIXME: For VFIO iommu types which have KVM acceleration to
         * avoid bouncing all map/unmaps through qemu this way, this
         * would be the right place to wire that up (tell the KVM
         * device emulation the VFIO iommu handles to use).
         */
        giommu = g_malloc0(sizeof(*giommu));
        giommu->iommu = section->mr;
        giommu->container = container;
        giommu->n.notify = vfio_iommu_map_notify;
        QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);

        memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
        memory_region_iommu_replay(giommu->iommu, &giommu->n,
                                   vfio_container_granularity(container),
                                   false);

        return;
    }

    /* Here we assume that memory_region_is_ram(section->mr)==true */

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vfio_listener_region_add_ram(iova, end - 1, vaddr);

    ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
    if (ret) {
        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                     "0x%"HWADDR_PRIx", %p) = %d (%m)",
                     container, iova, end - iova, vaddr, ret);
        goto fail;
    }

    return;

fail:
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail. Runtime, there's not much we can do other
     * than throw a hardware error.
     */
    if (!container->initialized) {
        if (!container->error) {
            container->error = ret;
        }
    } else {
        hw_error("vfio: DMA mapping failed, unable to continue");
    }
}

static void vfio_listener_region_del(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    hwaddr iova, end;
    int ret;

    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_del_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
            if (giommu->iommu == section->mr) {
                memory_region_unregister_iommu_notifier(&giommu->n);
                QLIST_REMOVE(giommu, giommu_next);
                g_free(giommu);
                break;
            }
        }

        /*
         * FIXME: We assume the one big unmap below is adequate to
         * remove any individual page mappings in the IOMMU which
         * might have been copied into VFIO. This works for a page table
         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
         * That may not be true for all IOMMU types.
         */
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    end = (section->offset_within_address_space + int128_get64(section->size)) &
          TARGET_PAGE_MASK;

    if (iova >= end) {
        return;
    }

    trace_vfio_listener_region_del(iova, end - 1);

    ret = vfio_dma_unmap(container, iova, end - iova);
    memory_region_unref(section->mr);
    if (ret) {
        error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                     "0x%"HWADDR_PRIx") = %d (%m)",
                     container, iova, end - iova, ret);
    }
}

static const MemoryListener vfio_memory_listener = {
    .region_add = vfio_listener_region_add,
    .region_del = vfio_listener_region_del,
};

static void vfio_listener_release(VFIOContainer *container)
{
    memory_listener_unregister(&container->listener);
}

int vfio_mmap_region(Object *obj, VFIORegion *region,
                     MemoryRegion *mem, MemoryRegion *submem,
                     void **map, size_t size, off_t offset,
                     const char *name)
{
    int ret = 0;
    VFIODevice *vbasedev = region->vbasedev;

    if (!vbasedev->no_mmap && size && region->flags &
        VFIO_REGION_INFO_FLAG_MMAP) {
        int prot = 0;

        if (region->flags & VFIO_REGION_INFO_FLAG_READ) {
            prot |= PROT_READ;
        }

        if (region->flags & VFIO_REGION_INFO_FLAG_WRITE) {
            prot |= PROT_WRITE;
        }

        *map = mmap(NULL, size, prot, MAP_SHARED,
                    vbasedev->fd,
                    region->fd_offset + offset);
        if (*map == MAP_FAILED) {
            *map = NULL;
            ret = -errno;
            goto empty_region;
        }

        memory_region_init_ram_ptr(submem, obj, name, size, *map);
        memory_region_set_skip_dump(submem);
    } else {
empty_region:
        /* Create a zero sized sub-region to make cleanup easy. */
        memory_region_init(submem, obj, name, 0);
    }

    memory_region_add_subregion(mem, offset, submem);

    return ret;
}

void vfio_reset_handler(void *opaque)
{
    VFIOGroup *group;
    VFIODevice *vbasedev;

    QLIST_FOREACH(group, &vfio_group_list, next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            vbasedev->ops->vfio_compute_needs_reset(vbasedev);
        }
    }

    QLIST_FOREACH(group, &vfio_group_list, next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            if (vbasedev->needs_reset) {
                vbasedev->ops->vfio_hot_reset_multi(vbasedev);
            }
        }
    }
}
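
/*
 * Note that KVM_SET_DEVICE_ATTR takes a userspace pointer in
 * kvm_device_attr.addr; for KVM_DEV_VFIO_GROUP_ADD/DEL that pointer
 * refers to an int32_t holding the VFIO group file descriptor, which is
 * why &group->fd (rather than the fd value itself) is stored below.
 */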
static void vfio_kvm_device_add_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_ADD,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (!kvm_enabled()) {
        return;
    }

    if (vfio_kvm_device_fd < 0) {
        struct kvm_create_device cd = {
            .type = KVM_DEV_TYPE_VFIO,
        };

        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
            error_report("Failed to create KVM VFIO device: %m");
            return;
        }

        vfio_kvm_device_fd = cd.fd;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_report("Failed to add group %d to KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}

static void vfio_kvm_device_del_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_DEL,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (vfio_kvm_device_fd < 0) {
        return;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_report("Failed to remove group %d from KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}

static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
{
    VFIOAddressSpace *space;

    QLIST_FOREACH(space, &vfio_address_spaces, list) {
        if (space->as == as) {
            return space;
        }
    }

    /* No suitable VFIOAddressSpace, create a new one */
    space = g_malloc0(sizeof(*space));
    space->as = as;
    QLIST_INIT(&space->containers);

    QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);

    return space;
}

static void vfio_put_address_space(VFIOAddressSpace *space)
{
    if (QLIST_EMPTY(&space->containers)) {
        QLIST_REMOVE(space, list);
        g_free(space);
    }
}
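
/*
 * Attach a group to a container within the given address space: reuse an
 * existing container if the group can simply be set into it, otherwise
 * open a new /dev/vfio/vfio instance, check VFIO_GET_API_VERSION, bind
 * the group with VFIO_GROUP_SET_CONTAINER and select an IOMMU model
 * (Type1/Type1v2 or sPAPR TCE) with VFIO_SET_IOMMU before registering
 * the memory listener that populates the container's DMA mappings.
 */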
static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
{
    VFIOContainer *container;
    int ret, fd;
    VFIOAddressSpace *space;

    space = vfio_get_address_space(as);

    QLIST_FOREACH(container, &space->containers, next) {
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            group->container = container;
            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
            return 0;
        }
    }

    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
    if (fd < 0) {
        error_report("vfio: failed to open /dev/vfio/vfio: %m");
        ret = -errno;
        goto put_space_exit;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_report("vfio: supported vfio version: %d, "
                     "reported version: %d", VFIO_API_VERSION, ret);
        ret = -EINVAL;
        goto close_fd_exit;
    }

    container = g_malloc0(sizeof(*container));
    container->space = space;
    container->fd = fd;
    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) ||
        ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) {
        bool v2 = !!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU);
        struct vfio_iommu_type1_info info;

        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
        if (ret) {
            error_report("vfio: failed to set group container: %m");
            ret = -errno;
            goto free_container_exit;
        }

        ret = ioctl(fd, VFIO_SET_IOMMU,
                    v2 ? VFIO_TYPE1v2_IOMMU : VFIO_TYPE1_IOMMU);
        if (ret) {
            error_report("vfio: failed to set iommu for container: %m");
            ret = -errno;
            goto free_container_exit;
        }

        /*
         * FIXME: This assumes that a Type1 IOMMU can map any 64-bit
         * IOVA whatsoever. That's not actually true, but the current
         * kernel interface doesn't tell us what it can map, and the
         * existing Type1 IOMMUs generally support any IOVA we're
         * going to actually try in practice.
         */
        container->min_iova = 0;
        container->max_iova = (hwaddr)-1;

        /* Assume just 4K IOVA page size */
        container->iova_pgsizes = 0x1000;
        info.argsz = sizeof(info);
        ret = ioctl(fd, VFIO_IOMMU_GET_INFO, &info);
        /* Ignore errors */
        if ((ret == 0) && (info.flags & VFIO_IOMMU_INFO_PGSIZES)) {
            container->iova_pgsizes = info.iova_pgsizes;
        }
    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
        struct vfio_iommu_spapr_tce_info info;

        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
        if (ret) {
            error_report("vfio: failed to set group container: %m");
            ret = -errno;
            goto free_container_exit;
        }
        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
        if (ret) {
            error_report("vfio: failed to set iommu for container: %m");
            ret = -errno;
            goto free_container_exit;
        }

        /*
         * The host kernel code implementing VFIO_IOMMU_DISABLE is called
         * when container fd is closed so we do not call it explicitly
         * in this file.
         */
        ret = ioctl(fd, VFIO_IOMMU_ENABLE);
        if (ret) {
            error_report("vfio: failed to enable container: %m");
            ret = -errno;
            goto free_container_exit;
        }

        /*
         * This only considers the host IOMMU's 32-bit window. At
         * some point we need to add support for the optional 64-bit
         * window and dynamic windows
         */
        info.argsz = sizeof(info);
        ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
        if (ret) {
            error_report("vfio: VFIO_IOMMU_SPAPR_TCE_GET_INFO failed: %m");
            ret = -errno;
            goto free_container_exit;
        }
        container->min_iova = info.dma32_window_start;
        container->max_iova = container->min_iova + info.dma32_window_size - 1;

        /* Assume just 4K IOVA pages for now */
        container->iova_pgsizes = 0x1000;
    } else {
        error_report("vfio: No available IOMMU models");
        ret = -EINVAL;
        goto free_container_exit;
    }

    container->listener = vfio_memory_listener;

    memory_listener_register(&container->listener, container->space->as);

    if (container->error) {
        ret = container->error;
        error_report("vfio: memory listener initialization failed for container");
        goto listener_release_exit;
    }

    container->initialized = true;

    QLIST_INIT(&container->group_list);
    QLIST_INSERT_HEAD(&space->containers, container, next);

    group->container = container;
    QLIST_INSERT_HEAD(&container->group_list, group, container_next);

    return 0;
listener_release_exit:
    vfio_listener_release(container);

free_container_exit:
    g_free(container);

close_fd_exit:
    close(fd);

put_space_exit:
    vfio_put_address_space(space);

    return ret;
}

static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    if (QLIST_EMPTY(&container->group_list)) {
        VFIOAddressSpace *space = container->space;
        VFIOGuestIOMMU *giommu, *tmp;

        vfio_listener_release(container);
        QLIST_REMOVE(container, next);

        QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
            memory_region_unregister_iommu_notifier(&giommu->n);
            QLIST_REMOVE(giommu, giommu_next);
            g_free(giommu);
        }

        trace_vfio_disconnect_container(container->fd);
        close(container->fd);
        g_free(container);

        vfio_put_address_space(space);
    }
}
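
/*
 * Typical usage by a device backend (e.g. vfio-pci): vfio_get_group()
 * followed by vfio_get_device() at realize time, with the mirror-image
 * vfio_put_base_device() and then vfio_put_group() at teardown. Groups
 * are reference counted implicitly through their device_list, so
 * vfio_put_group() is a no-op while any device in the group is in use.
 */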
VFIOGroup *vfio_get_group(int groupid, AddressSpace *as)
{
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &vfio_group_list, next) {
        if (group->groupid == groupid) {
            /* Found it. Now is it already in the right context? */
            if (group->container->space->as == as) {
                return group;
            } else {
                error_report("vfio: group %d used in multiple address spaces",
                             group->groupid);
                return NULL;
            }
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open(path, O_RDWR);
    if (group->fd < 0) {
        error_report("vfio: error opening %s: %m", path);
        goto free_group_exit;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_report("vfio: error getting group status: %m");
        goto close_fd_exit;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_report("vfio: error, group %d is not viable, please ensure "
                     "all devices within the iommu_group are bound to their "
                     "vfio bus driver.", groupid);
        goto close_fd_exit;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (vfio_connect_container(group, as)) {
        error_report("vfio: failed to setup container for group %d", groupid);
        goto close_fd_exit;
    }

    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_register_reset(vfio_reset_handler, NULL);
    }

    QLIST_INSERT_HEAD(&vfio_group_list, group, next);

    vfio_kvm_device_add_group(group);

    return group;

close_fd_exit:
    close(group->fd);

free_group_exit:
    g_free(group);

    return NULL;
}

void vfio_put_group(VFIOGroup *group)
{
    if (!group || !QLIST_EMPTY(&group->device_list)) {
        return;
    }

    vfio_kvm_device_del_group(group);
    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);
    trace_vfio_put_group(group->fd);
    close(group->fd);
    g_free(group);

    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_unregister_reset(vfio_reset_handler, NULL);
    }
}

int vfio_get_device(VFIOGroup *group, const char *name,
                    VFIODevice *vbasedev)
{
    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
    int ret, fd;

    fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (fd < 0) {
        error_report("vfio: error getting device %s from group %d: %m",
                     name, group->groupid);
        error_printf("Verify all devices in group %d are bound to vfio-<bus> "
                     "or pci-stub and not already in use\n", group->groupid);
        return fd;
    }

    ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
    if (ret) {
        error_report("vfio: error getting device info: %m");
        close(fd);
        return ret;
    }

    vbasedev->fd = fd;
    vbasedev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);

    vbasedev->num_irqs = dev_info.num_irqs;
    vbasedev->num_regions = dev_info.num_regions;
    vbasedev->flags = dev_info.flags;

    trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
                          dev_info.num_irqs);

    vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
    return 0;
}

void vfio_put_base_device(VFIODevice *vbasedev)
{
    if (!vbasedev->group) {
        return;
    }
    QLIST_REMOVE(vbasedev, next);
    vbasedev->group = NULL;
    trace_vfio_put_base_device(vbasedev->fd);
    close(vbasedev->fd);
}

static int vfio_container_do_ioctl(AddressSpace *as, int32_t groupid,
                                   int req, void *param)
{
    VFIOGroup *group;
    VFIOContainer *container;
    int ret = -1;

    group = vfio_get_group(groupid, as);
    if (!group) {
        error_report("vfio: group %d not registered", groupid);
        return ret;
    }

    container = group->container;
    if (group->container) {
        ret = ioctl(container->fd, req, param);
        if (ret < 0) {
            error_report("vfio: failed to ioctl %d to container: ret=%d, %s",
                         _IOC_NR(req) - VFIO_BASE, ret, strerror(errno));
        }
    }

    vfio_put_group(group);

    return ret;
}

int vfio_container_ioctl(AddressSpace *as, int32_t groupid,
                         int req, void *param)
{
    /* We allow only certain ioctls to the container */
    switch (req) {
    case VFIO_CHECK_EXTENSION:
    case VFIO_IOMMU_SPAPR_TCE_GET_INFO:
    case VFIO_EEH_PE_OP:
        break;
    default:
        /* Return an error on unknown requests */
        error_report("vfio: unsupported ioctl %X", req);
        return -1;
    }

    return vfio_container_do_ioctl(as, groupid, req, param);
}