/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

#include "hw/vfio/vfio-common.h"
#include "hw/vfio/vfio.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "hw/hw.h"
#include "qemu/error-report.h"
#include "sysemu/kvm.h"
#include "trace.h"

struct vfio_group_head vfio_group_list =
    QLIST_HEAD_INITIALIZER(vfio_group_list);
struct vfio_as_head vfio_address_spaces =
    QLIST_HEAD_INITIALIZER(vfio_address_spaces);

#ifdef CONFIG_KVM
/*
 * We have a single VFIO pseudo device per KVM VM.  Once created it lives
 * for the life of the VM.  Closing the file descriptor only drops our
 * reference to it and the device's reference to kvm.  Therefore once
 * initialized, this file descriptor is only released on QEMU exit and
 * we'll re-use it should another vfio device be attached before then.
 */
static int vfio_kvm_device_fd = -1;
#endif

/*
 * Common VFIO interrupt disable
 */
void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = index,
        .start = 0,
        .count = 0,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
        .index = index,
        .start = 0,
        .count = 1,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
        .index = index,
        .start = 0,
        .count = 1,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

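/*
 * The three helpers above are thin wrappers around VFIO_DEVICE_SET_IRQS with
 * DATA_NONE actions; the ioctl return value is deliberately not checked.
 * As an illustrative (hypothetical) example, a PCI device would disable its
 * INTx index with something like:
 *
 *     vfio_disable_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 */
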
/*
 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
 */
void vfio_region_write(void *opaque, hwaddr addr,
                       uint64_t data, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %d bytes", size);
        break;
    }

    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
                     ",%d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, data, size);
    }

    trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);

    /*
     * A read or write to a BAR always signals an INTx EOI.  This will
     * do nothing if not pending (including not in INTx mode).  We assume
     * that a BAR access is in response to an interrupt and that BAR
     * accesses will service the interrupt.  Unfortunately, we don't know
     * which access will service the interrupt, so we're potentially
     * getting quite a few host interrupts per guest interrupt.
     */
    vbasedev->ops->vfio_eoi(vbasedev);
}

uint64_t vfio_region_read(void *opaque,
                          hwaddr addr, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;

    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, size);
        return (uint64_t)-1;
    }
    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    default:
        hw_error("vfio: unsupported read size, %d bytes", size);
        break;
    }

    trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);

    /* Same as write above */
    vbasedev->ops->vfio_eoi(vbasedev);

    return data;
}

const MemoryRegionOps vfio_region_ops = {
    .read = vfio_region_read,
    .write = vfio_region_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};

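/*
 * Device-specific code exposes a region to the guest by backing a
 * MemoryRegion with these ops and the VFIORegion as the opaque pointer.
 * A minimal sketch (the 'vdev', 'name' and 'size' identifiers are
 * illustrative, not part of this file):
 *
 *     memory_region_init_io(&region->mem, OBJECT(vdev), &vfio_region_ops,
 *                           region, name, size);
 */
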
/*
 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
 */
static int vfio_dma_unmap(VFIOContainer *container,
                          hwaddr iova, ram_addr_t size)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = iova,
        .size = size,
    };

    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        error_report("VFIO_UNMAP_DMA: %d", -errno);
        return -errno;
    }

    return 0;
}

static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
                        ram_addr_t size, void *vaddr, bool readonly)
{
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ,
        .vaddr = (__u64)(uintptr_t)vaddr,
        .iova = iova,
        .size = size,
    };

    if (!readonly) {
        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
    }

    /*
     * Try the mapping, if it fails with EBUSY, unmap the region and try
     * again.  This shouldn't be necessary, but we sometimes see it in
     * the VGA ROM space.
     */
    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
        return 0;
    }

    error_report("VFIO_MAP_DMA: %d", -errno);
    return -errno;
}

static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
    return (!memory_region_is_ram(section->mr) &&
            !memory_region_is_iommu(section->mr)) ||
           /*
            * Sizing an enabled 64-bit BAR can cause spurious mappings to
            * addresses in the upper part of the 64-bit address space.  These
            * are never accessed by the CPU and beyond the address width of
            * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
            */
           section->offset_within_address_space & (1ULL << 63);
}

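/*
 * Called from the guest IOMMU's notifier list when a guest IOMMU mapping
 * changes: translate the target of the IOTLB entry to a host virtual address
 * and mirror the change into the VFIO container via vfio_dma_map() or
 * vfio_dma_unmap().
 */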
static void vfio_iommu_map_notify(Notifier *n, void *data)
{
    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
    VFIOContainer *container = giommu->container;
    IOMMUTLBEntry *iotlb = data;
    MemoryRegion *mr;
    hwaddr xlat;
    hwaddr len = iotlb->addr_mask + 1;
    void *vaddr;
    int ret;

    trace_vfio_iommu_map_notify(iotlb->iova,
                                iotlb->iova + iotlb->addr_mask);

    /*
     * The IOMMU TLB entry we have just covers translation through
     * this IOMMU to its immediate target.  We need to translate
     * it the rest of the way through to memory.
     */
    rcu_read_lock();
    mr = address_space_translate(&address_space_memory,
                                 iotlb->translated_addr,
                                 &xlat, &len, iotlb->perm & IOMMU_WO);
    if (!memory_region_is_ram(mr)) {
        error_report("iommu map to non memory area %"HWADDR_PRIx"",
                     xlat);
        goto out;
    }
    /*
     * Translation truncates length to the IOMMU page size;
     * check that it did not truncate too much.
     */
    if (len & iotlb->addr_mask) {
        error_report("iommu has granularity incompatible with target AS");
        goto out;
    }

    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
        vaddr = memory_region_get_ram_ptr(mr) + xlat;
        ret = vfio_dma_map(container, iotlb->iova,
                           iotlb->addr_mask + 1, vaddr,
                           !(iotlb->perm & IOMMU_WO) || mr->readonly);
        if (ret) {
            error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
                         container, iotlb->iova,
                         iotlb->addr_mask + 1, vaddr, ret);
        }
    } else {
        ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1);
        if (ret) {
            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%m)",
                         container, iotlb->iova,
                         iotlb->addr_mask + 1, ret);
        }
    }
out:
    rcu_read_unlock();
}

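/*
 * MemoryListener callback: a section was added to the container's address
 * space.  RAM sections are mapped through the IOMMU immediately; guest IOMMU
 * regions instead get a map notifier registered so that individual
 * translations are propagated as the guest creates them.
 */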
static void vfio_listener_region_add(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            iommu_data.type1.listener);
    hwaddr iova, end;
    Int128 llend;
    void *vaddr;
    int ret;

    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_add_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));

    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }

    memory_region_ref(section->mr);

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        trace_vfio_listener_region_add_iommu(iova,
                    int128_get64(int128_sub(llend, int128_one())));
        /*
         * FIXME: We should do some checking to see if the
         * capabilities of the host VFIO IOMMU are adequate to model
         * the guest IOMMU
         *
         * FIXME: For VFIO iommu types which have KVM acceleration to
         * avoid bouncing all map/unmaps through qemu this way, this
         * would be the right place to wire that up (tell the KVM
         * device emulation the VFIO iommu handles to use).
         */
        /*
         * This assumes that the guest IOMMU is empty of
         * mappings at this point.
         *
         * One way of doing this is:
         * 1. Avoid sharing IOMMUs between emulated devices or different
         * IOMMU groups.
         * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if
         * there are some mappings in IOMMU.
         *
         * VFIO on SPAPR does that.  Other IOMMU models may handle this
         * differently; they must either make sure there are no existing
         * mappings or loop through existing mappings to map them into VFIO.
         */
        giommu = g_malloc0(sizeof(*giommu));
        giommu->iommu = section->mr;
        giommu->container = container;
        giommu->n.notify = vfio_iommu_map_notify;
        QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
        memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);

        return;
    }

    /* Here we assume that memory_region_is_ram(section->mr) == true */

    end = int128_get64(llend);
    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vfio_listener_region_add_ram(iova, end - 1, vaddr);

    ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
    if (ret) {
        error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                     "0x%"HWADDR_PRIx", %p) = %d (%m)",
                     container, iova, end - iova, vaddr, ret);

        /*
         * On the initfn path, store the first error in the container so we
         * can gracefully fail.  Runtime, there's not much we can do other
         * than throw a hardware error.
         */
        if (!container->iommu_data.type1.initialized) {
            if (!container->iommu_data.type1.error) {
                container->iommu_data.type1.error = ret;
            }
        } else {
            hw_error("vfio: DMA mapping failed, unable to continue");
        }
    }
}

static void vfio_listener_region_del(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            iommu_data.type1.listener);
    hwaddr iova, end;
    int ret;

    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_del_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
            if (giommu->iommu == section->mr) {
                memory_region_unregister_iommu_notifier(&giommu->n);
                QLIST_REMOVE(giommu, giommu_next);
                g_free(giommu);
                break;
            }
        }

        /*
         * FIXME: We assume the one big unmap below is adequate to
         * remove any individual page mappings in the IOMMU which
         * might have been copied into VFIO.  This works for a page table
         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
         * That may not be true for all IOMMU types.
         */
    }

    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
    end = (section->offset_within_address_space + int128_get64(section->size)) &
          TARGET_PAGE_MASK;

    if (iova >= end) {
        return;
    }

    trace_vfio_listener_region_del(iova, end - 1);

    ret = vfio_dma_unmap(container, iova, end - iova);
    memory_region_unref(section->mr);
    if (ret) {
        error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                     "0x%"HWADDR_PRIx") = %d (%m)",
                     container, iova, end - iova, ret);
    }
}

static const MemoryListener vfio_memory_listener = {
    .region_add = vfio_listener_region_add,
    .region_del = vfio_listener_region_del,
};

static void vfio_listener_release(VFIOContainer *container)
{
    memory_listener_unregister(&container->iommu_data.type1.listener);
}

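/*
 * Map a device region (or part of one) into QEMU's address space with mmap
 * when the kernel advertises VFIO_REGION_INFO_FLAG_MMAP, falling back to a
 * zero-sized sub-region otherwise so that callers can clean up uniformly.
 */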
int vfio_mmap_region(Object *obj, VFIORegion *region,
                     MemoryRegion *mem, MemoryRegion *submem,
                     void **map, size_t size, off_t offset,
                     const char *name)
{
    int ret = 0;
    VFIODevice *vbasedev = region->vbasedev;

    if (vbasedev->allow_mmap && size && region->flags &
        VFIO_REGION_INFO_FLAG_MMAP) {
        int prot = 0;

        if (region->flags & VFIO_REGION_INFO_FLAG_READ) {
            prot |= PROT_READ;
        }

        if (region->flags & VFIO_REGION_INFO_FLAG_WRITE) {
            prot |= PROT_WRITE;
        }

        *map = mmap(NULL, size, prot, MAP_SHARED,
                    vbasedev->fd,
                    region->fd_offset + offset);
        if (*map == MAP_FAILED) {
            *map = NULL;
            ret = -errno;
            goto empty_region;
        }

        memory_region_init_ram_ptr(submem, obj, name, size, *map);
        memory_region_set_skip_dump(submem);
    } else {
    empty_region:
        /* Create a zero sized sub-region to make cleanup easy. */
        memory_region_init(submem, obj, name, 0);
    }

    memory_region_add_subregion(mem, offset, submem);

    return ret;
}

void vfio_reset_handler(void *opaque)
{
    VFIOGroup *group;
    VFIODevice *vbasedev;

    QLIST_FOREACH(group, &vfio_group_list, next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            vbasedev->ops->vfio_compute_needs_reset(vbasedev);
        }
    }

    QLIST_FOREACH(group, &vfio_group_list, next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            if (vbasedev->needs_reset) {
                vbasedev->ops->vfio_hot_reset_multi(vbasedev);
            }
        }
    }
}

static void vfio_kvm_device_add_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_ADD,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (!kvm_enabled()) {
        return;
    }

    if (vfio_kvm_device_fd < 0) {
        struct kvm_create_device cd = {
            .type = KVM_DEV_TYPE_VFIO,
        };

        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
            error_report("Failed to create KVM VFIO device: %m");
            return;
        }

        vfio_kvm_device_fd = cd.fd;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_report("Failed to add group %d to KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}

static void vfio_kvm_device_del_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_DEL,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (vfio_kvm_device_fd < 0) {
        return;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_report("Failed to remove group %d from KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}

static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
{
    VFIOAddressSpace *space;

    QLIST_FOREACH(space, &vfio_address_spaces, list) {
        if (space->as == as) {
            return space;
        }
    }

    /* No suitable VFIOAddressSpace, create a new one */
    space = g_malloc0(sizeof(*space));
    space->as = as;
    QLIST_INIT(&space->containers);

    QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);

    return space;
}

static void vfio_put_address_space(VFIOAddressSpace *space)
{
    if (QLIST_EMPTY(&space->containers)) {
        QLIST_REMOVE(space, list);
        g_free(space);
    }
}

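/*
 * Attach a group to a container in the given address space.  An existing
 * container in that space is reused if the group can be attached to it;
 * otherwise a new container is opened on /dev/vfio/vfio and an IOMMU model
 * (type1, type1v2 or SPAPR TCE) is negotiated and configured for it.
 */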
static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
{
    VFIOContainer *container;
    int ret, fd;
    VFIOAddressSpace *space;

    space = vfio_get_address_space(as);

    QLIST_FOREACH(container, &space->containers, next) {
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            group->container = container;
            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
            return 0;
        }
    }

    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
    if (fd < 0) {
        error_report("vfio: failed to open /dev/vfio/vfio: %m");
        ret = -errno;
        goto put_space_exit;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_report("vfio: supported vfio version: %d, "
                     "reported version: %d", VFIO_API_VERSION, ret);
        ret = -EINVAL;
        goto close_fd_exit;
    }

    container = g_malloc0(sizeof(*container));
    container->space = space;
    container->fd = fd;
    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) ||
        ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) {
        bool v2 = !!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU);

        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
        if (ret) {
            error_report("vfio: failed to set group container: %m");
            ret = -errno;
            goto free_container_exit;
        }

        ret = ioctl(fd, VFIO_SET_IOMMU,
                    v2 ? VFIO_TYPE1v2_IOMMU : VFIO_TYPE1_IOMMU);
        if (ret) {
            error_report("vfio: failed to set iommu for container: %m");
            ret = -errno;
            goto free_container_exit;
        }

        container->iommu_data.type1.listener = vfio_memory_listener;
        container->iommu_data.release = vfio_listener_release;

        memory_listener_register(&container->iommu_data.type1.listener,
                                 container->space->as);

        if (container->iommu_data.type1.error) {
            ret = container->iommu_data.type1.error;
            error_report("vfio: memory listener initialization failed for container");
            goto listener_release_exit;
        }

        container->iommu_data.type1.initialized = true;

    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
        if (ret) {
            error_report("vfio: failed to set group container: %m");
            ret = -errno;
            goto free_container_exit;
        }
        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
        if (ret) {
            error_report("vfio: failed to set iommu for container: %m");
            ret = -errno;
            goto free_container_exit;
        }

        /*
         * The host kernel code implementing VFIO_IOMMU_DISABLE is called
         * when the container fd is closed, so we do not call it explicitly
         * in this file.
         */
        ret = ioctl(fd, VFIO_IOMMU_ENABLE);
        if (ret) {
            error_report("vfio: failed to enable container: %m");
            ret = -errno;
            goto free_container_exit;
        }

        container->iommu_data.type1.listener = vfio_memory_listener;
        container->iommu_data.release = vfio_listener_release;

        memory_listener_register(&container->iommu_data.type1.listener,
                                 container->space->as);

    } else {
        error_report("vfio: No available IOMMU models");
        ret = -EINVAL;
        goto free_container_exit;
    }

    QLIST_INIT(&container->group_list);
    QLIST_INSERT_HEAD(&space->containers, container, next);

    group->container = container;
    QLIST_INSERT_HEAD(&container->group_list, group, container_next);

    return 0;

listener_release_exit:
    vfio_listener_release(container);

free_container_exit:
    g_free(container);

close_fd_exit:
    close(fd);

put_space_exit:
    vfio_put_address_space(space);

    return ret;
}

static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    if (QLIST_EMPTY(&container->group_list)) {
        VFIOAddressSpace *space = container->space;
        VFIOGuestIOMMU *giommu, *tmp;

        if (container->iommu_data.release) {
            container->iommu_data.release(container);
        }
        QLIST_REMOVE(container, next);

        QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
            memory_region_unregister_iommu_notifier(&giommu->n);
            QLIST_REMOVE(giommu, giommu_next);
            g_free(giommu);
        }

        trace_vfio_disconnect_container(container->fd);
        close(container->fd);
        g_free(container);

        vfio_put_address_space(space);
    }
}

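/*
 * Look up (or create) the VFIOGroup for an IOMMU group number and bind it to
 * a container in the requested address space.  A group already in use by a
 * different address space cannot be shared and results in an error.
 */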
VFIOGroup *vfio_get_group(int groupid, AddressSpace *as)
{
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &vfio_group_list, next) {
        if (group->groupid == groupid) {
            /* Found it.  Now is it already in the right context? */
            if (group->container->space->as == as) {
                return group;
            } else {
                error_report("vfio: group %d used in multiple address spaces",
                             group->groupid);
                return NULL;
            }
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open(path, O_RDWR);
    if (group->fd < 0) {
        error_report("vfio: error opening %s: %m", path);
        goto free_group_exit;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_report("vfio: error getting group status: %m");
        goto close_fd_exit;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_report("vfio: error, group %d is not viable, please ensure "
                     "all devices within the iommu_group are bound to their "
                     "vfio bus driver.", groupid);
        goto close_fd_exit;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (vfio_connect_container(group, as)) {
        error_report("vfio: failed to setup container for group %d", groupid);
        goto close_fd_exit;
    }

    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_register_reset(vfio_reset_handler, NULL);
    }

    QLIST_INSERT_HEAD(&vfio_group_list, group, next);

    vfio_kvm_device_add_group(group);

    return group;

close_fd_exit:
    close(group->fd);

free_group_exit:
    g_free(group);

    return NULL;
}

void vfio_put_group(VFIOGroup *group)
{
    if (!group || !QLIST_EMPTY(&group->device_list)) {
        return;
    }

    vfio_kvm_device_del_group(group);
    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);
    trace_vfio_put_group(group->fd);
    close(group->fd);
    g_free(group);

    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_unregister_reset(vfio_reset_handler, NULL);
    }
}

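/*
 * Fetch the device fd for a named device in the group and fill in the
 * generic VFIODevice fields (fd, region/irq counts, flags) from
 * VFIO_DEVICE_GET_INFO.
 */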
int vfio_get_device(VFIOGroup *group, const char *name,
                    VFIODevice *vbasedev)
{
    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
    int ret, fd;

    fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (fd < 0) {
        error_report("vfio: error getting device %s from group %d: %m",
                     name, group->groupid);
        error_printf("Verify all devices in group %d are bound to vfio-<bus> "
                     "or pci-stub and not already in use\n", group->groupid);
        return fd;
    }

    ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
    if (ret) {
        error_report("vfio: error getting device info: %m");
        close(fd);
        return ret;
    }

    vbasedev->fd = fd;
    vbasedev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);

    vbasedev->num_irqs = dev_info.num_irqs;
    vbasedev->num_regions = dev_info.num_regions;
    vbasedev->flags = dev_info.flags;

    trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
                          dev_info.num_irqs);

    vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
    return 0;
}

void vfio_put_base_device(VFIODevice *vbasedev)
{
    if (!vbasedev->group) {
        return;
    }
    QLIST_REMOVE(vbasedev, next);
    vbasedev->group = NULL;
    trace_vfio_put_base_device(vbasedev->fd);
    close(vbasedev->fd);
}

static int vfio_container_do_ioctl(AddressSpace *as, int32_t groupid,
                                   int req, void *param)
{
    VFIOGroup *group;
    VFIOContainer *container;
    int ret = -1;

    group = vfio_get_group(groupid, as);
    if (!group) {
        error_report("vfio: group %d not registered", groupid);
        return ret;
    }

    container = group->container;
    if (group->container) {
        ret = ioctl(container->fd, req, param);
        if (ret < 0) {
            error_report("vfio: failed to ioctl %d to container: ret=%d, %s",
                         _IOC_NR(req) - VFIO_BASE, ret, strerror(errno));
        }
    }

    vfio_put_group(group);

    return ret;
}

int vfio_container_ioctl(AddressSpace *as, int32_t groupid,
                         int req, void *param)
{
    /* We allow only certain ioctls to the container */
    switch (req) {
    case VFIO_CHECK_EXTENSION:
    case VFIO_IOMMU_SPAPR_TCE_GET_INFO:
    case VFIO_EEH_PE_OP:
        break;
    default:
        /* Return an error on unknown requests */
        error_report("vfio: unsupported ioctl %X", req);
        return -1;
    }

    return vfio_container_do_ioctl(as, groupid, req, param);
}