1 /* 2 * DMA memory preregistration 3 * 4 * Authors: 5 * Alexey Kardashevskiy <aik@ozlabs.ru> 6 * 7 * This work is licensed under the terms of the GNU GPL, version 2. See 8 * the COPYING file in the top-level directory. 9 */ 10 11 #include "qemu/osdep.h" 12 #include <sys/ioctl.h> 13 #include <linux/vfio.h> 14 #include "system/kvm.h" 15 #include "system/hostmem.h" 16 #include "system/address-spaces.h" 17 18 #include "hw/vfio/vfio-common.h" 19 #include "hw/hw.h" 20 #include "system/ram_addr.h" 21 #include "qemu/error-report.h" 22 #include "qapi/error.h" 23 #include "trace.h" 24 25 typedef struct VFIOHostDMAWindow { 26 hwaddr min_iova; 27 hwaddr max_iova; 28 uint64_t iova_pgsizes; 29 QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next; 30 } VFIOHostDMAWindow; 31 32 typedef struct VFIOSpaprContainer { 33 VFIOContainer container; 34 MemoryListener prereg_listener; 35 QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; 36 unsigned int levels; 37 } VFIOSpaprContainer; 38 39 OBJECT_DECLARE_SIMPLE_TYPE(VFIOSpaprContainer, VFIO_IOMMU_SPAPR); 40 41 static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) 42 { 43 if (memory_region_is_iommu(section->mr)) { 44 hw_error("Cannot possibly preregister IOMMU memory"); 45 } 46 47 return !memory_region_is_ram(section->mr) || 48 memory_region_is_ram_device(section->mr); 49 } 50 51 static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa) 52 { 53 return memory_region_get_ram_ptr(section->mr) + 54 section->offset_within_region + 55 (gpa - section->offset_within_address_space); 56 } 57 58 static void vfio_prereg_listener_region_add(MemoryListener *listener, 59 MemoryRegionSection *section) 60 { 61 VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer, 62 prereg_listener); 63 VFIOContainer *container = &scontainer->container; 64 VFIOContainerBase *bcontainer = &container->bcontainer; 65 const hwaddr gpa = section->offset_within_address_space; 66 hwaddr end; 67 int ret; 68 hwaddr page_mask = qemu_real_host_page_mask(); 69 struct vfio_iommu_spapr_register_memory reg = { 70 .argsz = sizeof(reg), 71 .flags = 0, 72 }; 73 74 if (vfio_prereg_listener_skipped_section(section)) { 75 trace_vfio_prereg_listener_region_add_skip( 76 section->offset_within_address_space, 77 section->offset_within_address_space + 78 int128_get64(int128_sub(section->size, int128_one()))); 79 return; 80 } 81 82 if (unlikely((section->offset_within_address_space & ~page_mask) || 83 (section->offset_within_region & ~page_mask) || 84 (int128_get64(section->size) & ~page_mask))) { 85 error_report("%s received unaligned region", __func__); 86 return; 87 } 88 89 end = section->offset_within_address_space + int128_get64(section->size); 90 if (gpa >= end) { 91 return; 92 } 93 94 memory_region_ref(section->mr); 95 96 reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa); 97 reg.size = end - gpa; 98 99 ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®); 100 trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0); 101 if (ret) { 102 /* 103 * On the initfn path, store the first error in the container so we 104 * can gracefully fail. Runtime, there's not much we can do other 105 * than throw a hardware error. 106 */ 107 if (!bcontainer->initialized) { 108 if (!bcontainer->error) { 109 error_setg_errno(&bcontainer->error, -ret, 110 "Memory registering failed"); 111 } 112 } else { 113 hw_error("vfio: Memory registering failed, unable to continue"); 114 } 115 } 116 } 117 118 static void vfio_prereg_listener_region_del(MemoryListener *listener, 119 MemoryRegionSection *section) 120 { 121 VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer, 122 prereg_listener); 123 VFIOContainer *container = &scontainer->container; 124 const hwaddr gpa = section->offset_within_address_space; 125 hwaddr end; 126 int ret; 127 hwaddr page_mask = qemu_real_host_page_mask(); 128 struct vfio_iommu_spapr_register_memory reg = { 129 .argsz = sizeof(reg), 130 .flags = 0, 131 }; 132 133 if (vfio_prereg_listener_skipped_section(section)) { 134 trace_vfio_prereg_listener_region_del_skip( 135 section->offset_within_address_space, 136 section->offset_within_address_space + 137 int128_get64(int128_sub(section->size, int128_one()))); 138 return; 139 } 140 141 if (unlikely((section->offset_within_address_space & ~page_mask) || 142 (section->offset_within_region & ~page_mask) || 143 (int128_get64(section->size) & ~page_mask))) { 144 error_report("%s received unaligned region", __func__); 145 return; 146 } 147 148 end = section->offset_within_address_space + int128_get64(section->size); 149 if (gpa >= end) { 150 return; 151 } 152 153 reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa); 154 reg.size = end - gpa; 155 156 ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®); 157 trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0); 158 } 159 160 static const MemoryListener vfio_prereg_listener = { 161 .name = "vfio-pre-reg", 162 .region_add = vfio_prereg_listener_region_add, 163 .region_del = vfio_prereg_listener_region_del, 164 }; 165 166 static void vfio_host_win_add(VFIOSpaprContainer *scontainer, hwaddr min_iova, 167 hwaddr max_iova, uint64_t iova_pgsizes) 168 { 169 VFIOHostDMAWindow *hostwin; 170 171 QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { 172 if (ranges_overlap(hostwin->min_iova, 173 hostwin->max_iova - hostwin->min_iova + 1, 174 min_iova, 175 max_iova - min_iova + 1)) { 176 hw_error("%s: Overlapped IOMMU are not enabled", __func__); 177 } 178 } 179 180 hostwin = g_malloc0(sizeof(*hostwin)); 181 182 hostwin->min_iova = min_iova; 183 hostwin->max_iova = max_iova; 184 hostwin->iova_pgsizes = iova_pgsizes; 185 QLIST_INSERT_HEAD(&scontainer->hostwin_list, hostwin, hostwin_next); 186 } 187 188 static int vfio_host_win_del(VFIOSpaprContainer *scontainer, 189 hwaddr min_iova, hwaddr max_iova) 190 { 191 VFIOHostDMAWindow *hostwin; 192 193 QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { 194 if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) { 195 QLIST_REMOVE(hostwin, hostwin_next); 196 g_free(hostwin); 197 return 0; 198 } 199 } 200 201 return -1; 202 } 203 204 static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container, 205 hwaddr iova, hwaddr end) 206 { 207 VFIOHostDMAWindow *hostwin; 208 bool hostwin_found = false; 209 210 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) { 211 if (hostwin->min_iova <= iova && end <= hostwin->max_iova) { 212 hostwin_found = true; 213 break; 214 } 215 } 216 217 return hostwin_found ? hostwin : NULL; 218 } 219 220 static int vfio_spapr_remove_window(VFIOContainer *container, 221 hwaddr offset_within_address_space) 222 { 223 struct vfio_iommu_spapr_tce_remove remove = { 224 .argsz = sizeof(remove), 225 .start_addr = offset_within_address_space, 226 }; 227 int ret; 228 229 ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); 230 if (ret) { 231 error_report("Failed to remove window at %"PRIx64, 232 (uint64_t)remove.start_addr); 233 return -errno; 234 } 235 236 trace_vfio_spapr_remove_window(offset_within_address_space); 237 238 return 0; 239 } 240 241 static bool vfio_spapr_create_window(VFIOContainer *container, 242 MemoryRegionSection *section, 243 hwaddr *pgsize, Error **errp) 244 { 245 int ret = 0; 246 VFIOContainerBase *bcontainer = &container->bcontainer; 247 VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, 248 container); 249 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); 250 uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask; 251 unsigned entries, bits_total, bits_per_level, max_levels, ddw_levels; 252 struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) }; 253 long rampagesize = qemu_minrampagesize(); 254 255 /* 256 * The host might not support the guest supported IOMMU page size, 257 * so we will use smaller physical IOMMU pages to back them. 258 */ 259 if (pagesize > rampagesize) { 260 pagesize = rampagesize; 261 } 262 pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1)); 263 pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0; 264 if (!pagesize) { 265 error_setg_errno(errp, EINVAL, "Host doesn't support page size 0x%"PRIx64 266 ", the supported mask is 0x%lx", 267 memory_region_iommu_get_min_page_size(iommu_mr), 268 bcontainer->pgsizes); 269 return false; 270 } 271 272 /* 273 * FIXME: For VFIO iommu types which have KVM acceleration to 274 * avoid bouncing all map/unmaps through qemu this way, this 275 * would be the right place to wire that up (tell the KVM 276 * device emulation the VFIO iommu handles to use). 277 */ 278 create.window_size = int128_get64(section->size); 279 create.page_shift = ctz64(pagesize); 280 /* 281 * SPAPR host supports multilevel TCE tables. We try to guess optimal 282 * levels number and if this fails (for example due to the host memory 283 * fragmentation), we increase levels. The DMA address structure is: 284 * rrrrrrrr rxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx iiiiiiii 285 * where: 286 * r = reserved (bits >= 55 are reserved in the existing hardware) 287 * i = IOMMU page offset (64K in this example) 288 * x = bits to index a TCE which can be split to equal chunks to index 289 * within the level. 290 * The aim is to split "x" to smaller possible number of levels. 291 */ 292 entries = create.window_size >> create.page_shift; 293 /* bits_total is number of "x" needed */ 294 bits_total = ctz64(entries * sizeof(uint64_t)); 295 /* 296 * bits_per_level is a safe guess of how much we can allocate per level: 297 * 8 is the current minimum for CONFIG_FORCE_MAX_ZONEORDER and MAX_ORDER 298 * is usually bigger than that. 299 * Below we look at qemu_real_host_page_size as TCEs are allocated from 300 * system pages. 301 */ 302 bits_per_level = ctz64(qemu_real_host_page_size()) + 8; 303 create.levels = bits_total / bits_per_level; 304 305 ddw_levels = scontainer->levels; 306 if (ddw_levels > 1) { 307 if (bits_total % bits_per_level) { 308 ++create.levels; 309 } 310 max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size()); 311 for ( ; create.levels <= max_levels; ++create.levels) { 312 ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); 313 if (!ret) { 314 break; 315 } 316 } 317 } else { /* ddw_levels == 1 */ 318 if (create.levels > ddw_levels) { 319 error_setg_errno(errp, EINVAL, "Host doesn't support multi-level TCE tables" 320 ". Use larger IO page size. Supported mask is 0x%lx", 321 bcontainer->pgsizes); 322 return false; 323 } 324 ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create); 325 } 326 327 if (ret) { 328 error_setg_errno(errp, errno, "Failed to create a window, ret = %d", ret); 329 return false; 330 } 331 332 if (create.start_addr != section->offset_within_address_space) { 333 vfio_spapr_remove_window(container, create.start_addr); 334 335 error_setg_errno(errp, EINVAL, "Host doesn't support DMA window at %"HWADDR_PRIx 336 ", must be %"PRIx64, section->offset_within_address_space, 337 (uint64_t)create.start_addr); 338 return false; 339 } 340 trace_vfio_spapr_create_window(create.page_shift, 341 create.levels, 342 create.window_size, 343 create.start_addr); 344 *pgsize = pagesize; 345 346 return true; 347 } 348 349 static bool 350 vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer, 351 MemoryRegionSection *section, 352 Error **errp) 353 { 354 VFIOContainer *container = container_of(bcontainer, VFIOContainer, 355 bcontainer); 356 VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, 357 container); 358 VFIOHostDMAWindow *hostwin; 359 hwaddr pgsize = 0; 360 int ret; 361 362 /* 363 * VFIO_SPAPR_TCE_IOMMU supports a single host window between 364 * [dma32_window_start, dma32_window_size), we need to ensure 365 * the section fall in this range. 366 */ 367 if (container->iommu_type == VFIO_SPAPR_TCE_IOMMU) { 368 hwaddr iova, end; 369 370 iova = section->offset_within_address_space; 371 end = iova + int128_get64(section->size) - 1; 372 373 if (!vfio_find_hostwin(scontainer, iova, end)) { 374 error_setg(errp, "Container %p can't map guest IOVA region" 375 " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, 376 iova, end); 377 return false; 378 } 379 return true; 380 } 381 382 if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { 383 return true; 384 } 385 386 /* For now intersections are not allowed, we may relax this later */ 387 QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) { 388 if (ranges_overlap(hostwin->min_iova, 389 hostwin->max_iova - hostwin->min_iova + 1, 390 section->offset_within_address_space, 391 int128_get64(section->size))) { 392 error_setg(errp, 393 "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing" 394 "host DMA window [0x%"PRIx64",0x%"PRIx64"]", 395 section->offset_within_address_space, 396 section->offset_within_address_space + 397 int128_get64(section->size) - 1, 398 hostwin->min_iova, hostwin->max_iova); 399 return false; 400 } 401 } 402 403 ret = vfio_spapr_create_window(container, section, &pgsize, errp); 404 if (!ret) { 405 return false; 406 } 407 408 vfio_host_win_add(scontainer, section->offset_within_address_space, 409 section->offset_within_address_space + 410 int128_get64(section->size) - 1, pgsize); 411 #ifdef CONFIG_KVM 412 if (kvm_enabled()) { 413 VFIOGroup *group; 414 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); 415 struct kvm_vfio_spapr_tce param; 416 struct kvm_device_attr attr = { 417 .group = KVM_DEV_VFIO_GROUP, 418 .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE, 419 .addr = (uint64_t)(unsigned long)¶m, 420 }; 421 422 if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD, 423 ¶m.tablefd)) { 424 QLIST_FOREACH(group, &container->group_list, container_next) { 425 param.groupfd = group->fd; 426 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) { 427 error_setg_errno(errp, errno, 428 "vfio: failed GROUP_SET_SPAPR_TCE for " 429 "KVM VFIO device %d and group fd %d", 430 param.tablefd, param.groupfd); 431 return false; 432 } 433 trace_vfio_spapr_group_attach(param.groupfd, param.tablefd); 434 } 435 } 436 } 437 #endif 438 return true; 439 } 440 441 static void 442 vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer, 443 MemoryRegionSection *section) 444 { 445 VFIOContainer *container = container_of(bcontainer, VFIOContainer, 446 bcontainer); 447 VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, 448 container); 449 450 if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) { 451 return; 452 } 453 454 vfio_spapr_remove_window(container, 455 section->offset_within_address_space); 456 if (vfio_host_win_del(scontainer, 457 section->offset_within_address_space, 458 section->offset_within_address_space + 459 int128_get64(section->size) - 1) < 0) { 460 hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx, 461 __func__, section->offset_within_address_space); 462 } 463 } 464 465 static void vfio_spapr_container_release(VFIOContainerBase *bcontainer) 466 { 467 VFIOContainer *container = container_of(bcontainer, VFIOContainer, 468 bcontainer); 469 VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, 470 container); 471 VFIOHostDMAWindow *hostwin, *next; 472 473 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) { 474 memory_listener_unregister(&scontainer->prereg_listener); 475 } 476 QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next, 477 next) { 478 QLIST_REMOVE(hostwin, hostwin_next); 479 g_free(hostwin); 480 } 481 } 482 483 static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer, 484 Error **errp) 485 { 486 VFIOContainer *container = container_of(bcontainer, VFIOContainer, 487 bcontainer); 488 VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer, 489 container); 490 struct vfio_iommu_spapr_tce_info info; 491 bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU; 492 int ret, fd = container->fd; 493 494 QLIST_INIT(&scontainer->hostwin_list); 495 496 /* 497 * The host kernel code implementing VFIO_IOMMU_DISABLE is called 498 * when container fd is closed so we do not call it explicitly 499 * in this file. 500 */ 501 if (!v2) { 502 ret = ioctl(fd, VFIO_IOMMU_ENABLE); 503 if (ret) { 504 error_setg_errno(errp, errno, "failed to enable container"); 505 return false; 506 } 507 } else { 508 scontainer->prereg_listener = vfio_prereg_listener; 509 510 memory_listener_register(&scontainer->prereg_listener, 511 &address_space_memory); 512 if (bcontainer->error) { 513 error_propagate_prepend(errp, bcontainer->error, 514 "RAM memory listener initialization failed: "); 515 goto listener_unregister_exit; 516 } 517 } 518 519 info.argsz = sizeof(info); 520 ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); 521 if (ret) { 522 error_setg_errno(errp, errno, 523 "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed"); 524 goto listener_unregister_exit; 525 } 526 527 scontainer->levels = info.ddw.levels; 528 529 if (v2) { 530 bcontainer->pgsizes = info.ddw.pgsizes; 531 /* 532 * There is a default window in just created container. 533 * To make region_add/del simpler, we better remove this 534 * window now and let those iommu_listener callbacks 535 * create/remove them when needed. 536 */ 537 ret = vfio_spapr_remove_window(container, info.dma32_window_start); 538 if (ret) { 539 error_setg_errno(errp, -ret, 540 "failed to remove existing window"); 541 goto listener_unregister_exit; 542 } 543 } else { 544 /* The default table uses 4K pages */ 545 bcontainer->pgsizes = 0x1000; 546 vfio_host_win_add(scontainer, info.dma32_window_start, 547 info.dma32_window_start + 548 info.dma32_window_size - 1, 549 0x1000); 550 } 551 552 return true; 553 554 listener_unregister_exit: 555 if (v2) { 556 memory_listener_unregister(&scontainer->prereg_listener); 557 } 558 return false; 559 } 560 561 static void vfio_iommu_spapr_class_init(ObjectClass *klass, void *data) 562 { 563 VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass); 564 565 vioc->add_window = vfio_spapr_container_add_section_window; 566 vioc->del_window = vfio_spapr_container_del_section_window; 567 vioc->release = vfio_spapr_container_release; 568 vioc->setup = vfio_spapr_container_setup; 569 }; 570 571 static const TypeInfo types[] = { 572 { 573 .name = TYPE_VFIO_IOMMU_SPAPR, 574 .parent = TYPE_VFIO_IOMMU_LEGACY, 575 .instance_size = sizeof(VFIOSpaprContainer), 576 .class_init = vfio_iommu_spapr_class_init, 577 }, 578 }; 579 580 DEFINE_TYPES(types) 581