/*
 * low level and IOMMU backend agnostic helpers used by VFIO devices,
 * related to regions, interrupts, capabilities
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>

#include "hw/vfio/vfio-common.h"
#include "hw/hw.h"
#include "trace.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/units.h"
#include "monitor/monitor.h"

/*
 * Common VFIO interrupt disable
 */
void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = index,
        .start = 0,
        .count = 0,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
        .index = index,
        .start = 0,
        .count = 1,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
        .index = index,
        .start = 0,
        .count = 1,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

static inline const char *action_to_str(int action)
{
    switch (action) {
    case VFIO_IRQ_SET_ACTION_MASK:
        return "MASK";
    case VFIO_IRQ_SET_ACTION_UNMASK:
        return "UNMASK";
    case VFIO_IRQ_SET_ACTION_TRIGGER:
        return "TRIGGER";
    default:
        return "UNKNOWN ACTION";
    }
}

static const char *index_to_str(VFIODevice *vbasedev, int index)
{
    if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
        return NULL;
    }

    switch (index) {
    case VFIO_PCI_INTX_IRQ_INDEX:
        return "INTX";
    case VFIO_PCI_MSI_IRQ_INDEX:
        return "MSI";
    case VFIO_PCI_MSIX_IRQ_INDEX:
        return "MSIX";
    case VFIO_PCI_ERR_IRQ_INDEX:
        return "ERR";
    case VFIO_PCI_REQ_IRQ_INDEX:
        return "REQ";
    default:
        return NULL;
    }
}

bool vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
                            int action, int fd, Error **errp)
{
    ERRP_GUARD();
    g_autofree struct vfio_irq_set *irq_set = NULL;
    int argsz;
    const char *name;
    int32_t *pfd;

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
    irq_set->index = index;
    irq_set->start = subindex;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;
    *pfd = fd;

    if (!ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
        return true;
    }

    error_setg_errno(errp, errno, "VFIO_DEVICE_SET_IRQS failure");

    name = index_to_str(vbasedev, index);
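    /* PCI IRQ indexes get symbolic names; others are reported by number */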
    if (name) {
        error_prepend(errp, "%s-%d: ", name, subindex);
    } else {
        error_prepend(errp, "index %d-%d: ", index, subindex);
    }
    error_prepend(errp,
                  "Failed to %s %s eventfd signaling for interrupt ",
                  fd < 0 ? "tear down" : "set up", action_to_str(action));
    return false;
}

/*
 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
 */
void vfio_region_write(void *opaque, hwaddr addr,
                       uint64_t data, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    case 8:
        buf.qword = cpu_to_le64(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %u bytes", size);
        break;
    }

    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
                     ",%d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, data, size);
    }

    trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);

    /*
     * A read or write to a BAR always signals an INTx EOI.  This will
     * do nothing if not pending (including not in INTx mode).  We assume
     * that a BAR access is in response to an interrupt and that BAR
     * accesses will service the interrupt.  Unfortunately, we don't know
     * which access will service the interrupt, so we're potentially
     * getting quite a few host interrupts per guest interrupt.
     */
    vbasedev->ops->vfio_eoi(vbasedev);
}

uint64_t vfio_region_read(void *opaque,
                          hwaddr addr, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;

    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, size);
        return (uint64_t)-1;
    }
    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    case 8:
        data = le64_to_cpu(buf.qword);
        break;
    default:
        hw_error("vfio: unsupported read size, %u bytes", size);
        break;
    }

    trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);

    /* Same as write above */
    vbasedev->ops->vfio_eoi(vbasedev);

    return data;
}

const MemoryRegionOps vfio_region_ops = {
    .read = vfio_region_read,
    .write = vfio_region_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .valid = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
};

int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size)
{
    vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
    vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) /
                                         BITS_PER_BYTE;
    vbmap->bitmap = g_try_malloc0(vbmap->size);
    if (!vbmap->bitmap) {
        return -ENOMEM;
    }

    return 0;
}

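/*
 * Walk the capability chain rooted at cap_offset.  Each header's 'next'
 * field is an offset from the start of the info structure; a next of 0
 * brings hdr back to ptr, which terminates the loop.
 */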
struct vfio_info_cap_header *
vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id)
{
    struct vfio_info_cap_header *hdr;

    for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
        if (hdr->id == id) {
            return hdr;
        }
    }

    return NULL;
}

struct vfio_info_cap_header *
vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
{
    if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
        return NULL;
    }

    return vfio_get_cap((void *)info, info->cap_offset, id);
}

struct vfio_info_cap_header *
vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
{
    if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) {
        return NULL;
    }

    return vfio_get_cap((void *)info, info->cap_offset, id);
}

static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
                                          struct vfio_region_info *info)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_region_info_cap_sparse_mmap *sparse;
    int i, j;

    hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
    if (!hdr) {
        return -ENODEV;
    }

    sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);

    trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
                                         region->nr, sparse->nr_areas);

    region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);

    for (i = 0, j = 0; i < sparse->nr_areas; i++) {
        if (sparse->areas[i].size) {
            trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
                                                sparse->areas[i].offset +
                                                sparse->areas[i].size - 1);
            region->mmaps[j].offset = sparse->areas[i].offset;
            region->mmaps[j].size = sparse->areas[i].size;
            j++;
        }
    }

    region->nr_mmaps = j;
    region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));

    return 0;
}

int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
                      int index, const char *name)
{
    g_autofree struct vfio_region_info *info = NULL;
    int ret;

    ret = vfio_get_region_info(vbasedev, index, &info);
    if (ret) {
        return ret;
    }

    region->vbasedev = vbasedev;
    region->flags = info->flags;
    region->size = info->size;
    region->fd_offset = info->offset;
    region->nr = index;

    if (region->size) {
        region->mem = g_new0(MemoryRegion, 1);
        memory_region_init_io(region->mem, obj, &vfio_region_ops,
                              region, name, region->size);

        if (!vbasedev->no_mmap &&
            region->flags & VFIO_REGION_INFO_FLAG_MMAP) {

            ret = vfio_setup_region_sparse_mmaps(region, info);

            if (ret) {
                region->nr_mmaps = 1;
                region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
                region->mmaps[0].offset = 0;
                region->mmaps[0].size = region->size;
            }
        }
    }

    trace_vfio_region_setup(vbasedev->name, index, name,
                            region->flags, region->fd_offset, region->size);
    return 0;
}

static void vfio_subregion_unmap(VFIORegion *region, int index)
{
    trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
                            region->mmaps[index].offset,
                            region->mmaps[index].offset +
                            region->mmaps[index].size - 1);
    memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
    munmap(region->mmaps[index].mmap, region->mmaps[index].size);
    object_unparent(OBJECT(&region->mmaps[index].mem));
    region->mmaps[index].mmap = NULL;
}

int vfio_region_mmap(VFIORegion *region)
{
    int i, ret, prot = 0;
    char *name;

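    /* Regions without a MemoryRegion (zero-sized) have nothing to mmap */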
    if (!region->mem) {
        return 0;
    }

    prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
    prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;

    for (i = 0; i < region->nr_mmaps; i++) {
        size_t align = MIN(1ULL << ctz64(region->mmaps[i].size), 1 * GiB);
        void *map_base, *map_align;

        /*
         * Align the mmap for more efficient mapping in the kernel.  Ideally
         * we'd know the PMD and PUD mapping sizes to use as discrete alignment
         * intervals, but we don't.  As of Linux v6.12, the largest PUD size
         * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set
         * on x86_64).  Align by power-of-two size, capped at 1GiB.
         *
         * NB. qemu_memalign() and friends actually allocate memory, whereas
         * the region size here can exceed host memory, therefore we manually
         * create an oversized anonymous mapping and clean it up for alignment.
         */
        map_base = mmap(0, region->mmaps[i].size + align, PROT_NONE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (map_base == MAP_FAILED) {
            ret = -errno;
            goto no_mmap;
        }

        map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align);
        munmap(map_base, map_align - map_base);
        munmap(map_align + region->mmaps[i].size,
               align - (map_align - map_base));

        region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot,
                                     MAP_SHARED | MAP_FIXED,
                                     region->vbasedev->fd,
                                     region->fd_offset +
                                     region->mmaps[i].offset);
        if (region->mmaps[i].mmap == MAP_FAILED) {
            ret = -errno;
            goto no_mmap;
        }

        name = g_strdup_printf("%s mmaps[%d]",
                               memory_region_name(region->mem), i);
        memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
                                          memory_region_owner(region->mem),
                                          name, region->mmaps[i].size,
                                          region->mmaps[i].mmap);
        g_free(name);
        memory_region_add_subregion(region->mem, region->mmaps[i].offset,
                                    &region->mmaps[i].mem);

        trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
                               region->mmaps[i].offset,
                               region->mmaps[i].offset +
                               region->mmaps[i].size - 1);
    }

    return 0;

no_mmap:
    trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
                                 region->fd_offset + region->mmaps[i].offset,
                                 region->fd_offset + region->mmaps[i].offset +
                                 region->mmaps[i].size - 1, ret);

    region->mmaps[i].mmap = NULL;

    for (i--; i >= 0; i--) {
        vfio_subregion_unmap(region, i);
    }

    return ret;
}

void vfio_region_unmap(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            vfio_subregion_unmap(region, i);
        }
    }
}

void vfio_region_exit(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
        }
    }

    trace_vfio_region_exit(region->vbasedev->name, region->nr);
}

void vfio_region_finalize(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            munmap(region->mmaps[i].mmap, region->mmaps[i].size);
            object_unparent(OBJECT(&region->mmaps[i].mem));
        }
    }

    object_unparent(OBJECT(region->mem));

    g_free(region->mem);
    g_free(region->mmaps);

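    /* Trace before the bookkeeping fields below are cleared */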
    trace_vfio_region_finalize(region->vbasedev->name, region->nr);

    region->mem = NULL;
    region->mmaps = NULL;
    region->nr_mmaps = 0;
    region->size = 0;
    region->flags = 0;
    region->nr = 0;
}

void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            memory_region_set_enabled(&region->mmaps[i].mem, enabled);
        }
    }

    trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
                                        enabled);
}

int vfio_get_region_info(VFIODevice *vbasedev, int index,
                         struct vfio_region_info **info)
{
    size_t argsz = sizeof(struct vfio_region_info);

    *info = g_malloc0(argsz);

    (*info)->index = index;
retry:
    (*info)->argsz = argsz;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
        g_free(*info);
        *info = NULL;
        return -errno;
    }

    if ((*info)->argsz > argsz) {
        argsz = (*info)->argsz;
        *info = g_realloc(*info, argsz);

        goto retry;
    }

    return 0;
}

int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
                             uint32_t subtype, struct vfio_region_info **info)
{
    int i;

    for (i = 0; i < vbasedev->num_regions; i++) {
        struct vfio_info_cap_header *hdr;
        struct vfio_region_info_cap_type *cap_type;

        if (vfio_get_region_info(vbasedev, i, info)) {
            continue;
        }

        hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
        if (!hdr) {
            g_free(*info);
            continue;
        }

        cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);

        trace_vfio_get_dev_region(vbasedev->name, i,
                                  cap_type->type, cap_type->subtype);

        if (cap_type->type == type && cap_type->subtype == subtype) {
            return 0;
        }

        g_free(*info);
    }

    *info = NULL;
    return -ENODEV;
}

bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
{
    g_autofree struct vfio_region_info *info = NULL;
    bool ret = false;

    if (!vfio_get_region_info(vbasedev, region, &info)) {
        if (vfio_get_region_info_cap(info, cap_type)) {
            ret = true;
        }
    }

    return ret;
}

bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp)
{
    ERRP_GUARD();
    struct stat st;

    if (vbasedev->fd < 0) {
        if (stat(vbasedev->sysfsdev, &st) < 0) {
            error_setg_errno(errp, errno, "no such host device");
            error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev);
            return false;
        }
        /* User may specify a name, e.g: VFIO platform device */
        if (!vbasedev->name) {
            vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
        }
    } else {
        if (!vbasedev->iommufd) {
            error_setg(errp, "Use FD passing only with iommufd backend");
            return false;
        }
        /*
         * Give a name with fd so any function printing out vbasedev->name
         * will not break.
         */
        if (!vbasedev->name) {
            vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd);
        }
    }

    return true;
}

void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
{
    ERRP_GUARD();
    int fd = monitor_fd_param(monitor_cur(), str, errp);

    if (fd < 0) {
        error_prepend(errp, "Could not parse remote object fd %s:", str);
        return;
    }
    vbasedev->fd = fd;
}

void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
                      DeviceState *dev, bool ram_discard)
{
    vbasedev->type = type;
    vbasedev->ops = ops;
    vbasedev->dev = dev;
    vbasedev->fd = -1;

    vbasedev->ram_block_discard_allowed = ram_discard;
}

int vfio_device_get_aw_bits(VFIODevice *vdev)
{
    /*
     * iova_ranges is a sorted list.  For old kernels that support
     * VFIO but not support query of iova ranges, iova_ranges is NULL,
     * in this case HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX(64) is returned.
     */
    GList *l = g_list_last(vdev->bcontainer->iova_ranges);

    if (l) {
        Range *range = l->data;
        return range_get_last_bit(range) + 1;
    }

    return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX;
}

bool vfio_device_is_mdev(VFIODevice *vbasedev)
{
    g_autofree char *subsys = NULL;
    g_autofree char *tmp = NULL;

    if (!vbasedev->sysfsdev) {
        return false;
    }

    tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev);
    subsys = realpath(tmp, NULL);
    return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0);
}

bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp)
{
    HostIOMMUDevice *hiod = vbasedev->hiod;

    if (!hiod) {
        return true;
    }

    return HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp);
}