/*
 * DMA memory preregistration
 *
 * Authors:
 *  Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#ifdef CONFIG_KVM
#include <linux/kvm.h>
#endif
#include "sysemu/kvm.h"
#include "exec/address-spaces.h"

#include "hw/vfio/vfio-common.h"
#include "hw/hw.h"
#include "exec/ram_addr.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "trace.h"

typedef struct VFIOSpaprContainer {
    VFIOContainer container;
    MemoryListener prereg_listener;
    QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
} VFIOSpaprContainer;

OBJECT_DECLARE_SIMPLE_TYPE(VFIOSpaprContainer, VFIO_IOMMU_SPAPR);

static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
{
    if (memory_region_is_iommu(section->mr)) {
        hw_error("Cannot possibly preregister IOMMU memory");
    }

    return !memory_region_is_ram(section->mr) ||
           memory_region_is_ram_device(section->mr);
}

static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
{
    return memory_region_get_ram_ptr(section->mr) +
        section->offset_within_region +
        (gpa - section->offset_within_address_space);
}
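/*
 * Worked example of the translation above, with illustrative values only:
 * for a RAM section mapped at guest physical address 0x80000000
 * (offset_within_address_space), starting 0x10000 into its memory region
 * (offset_within_region), whose RAM block is mapped in QEMU at host
 * virtual address 0x7f0000000000, gpa 0x80042000 resolves to
 *   0x7f0000000000 + 0x10000 + (0x80042000 - 0x80000000) = 0x7f0000052000.
 */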
static void vfio_prereg_listener_region_add(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
                                                  prereg_listener);
    VFIOContainer *container = &scontainer->container;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask();
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_add_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    memory_region_ref(section->mr);

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
    trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0);
    if (ret) {
        /*
         * On the initfn path, store the first error in the container so we
         * can gracefully fail. At runtime, there's not much we can do other
         * than throw a hardware error.
         */
        if (!bcontainer->initialized) {
            if (!bcontainer->error) {
                error_setg_errno(&bcontainer->error, -ret,
                                 "Memory registering failed");
            }
        } else {
            hw_error("vfio: Memory registering failed, unable to continue");
        }
    }
}

static void vfio_prereg_listener_region_del(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
                                                  prereg_listener);
    VFIOContainer *container = &scontainer->container;
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask();
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_del_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
    trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
}

static const MemoryListener vfio_prereg_listener = {
    .name = "vfio-pre-reg",
    .region_add = vfio_prereg_listener_region_add,
    .region_del = vfio_prereg_listener_region_del,
};

static void vfio_host_win_add(VFIOSpaprContainer *scontainer, hwaddr min_iova,
                              hwaddr max_iova, uint64_t iova_pgsizes)
{
    VFIOHostDMAWindow *hostwin;

    QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           min_iova,
                           max_iova - min_iova + 1)) {
            hw_error("%s: Overlapping host DMA windows are not supported",
                     __func__);
        }
    }

    hostwin = g_malloc0(sizeof(*hostwin));

    hostwin->min_iova = min_iova;
    hostwin->max_iova = max_iova;
    hostwin->iova_pgsizes = iova_pgsizes;
    QLIST_INSERT_HEAD(&scontainer->hostwin_list, hostwin, hostwin_next);
}

static int vfio_host_win_del(VFIOSpaprContainer *scontainer,
                             hwaddr min_iova, hwaddr max_iova)
{
    VFIOHostDMAWindow *hostwin;

    QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
        if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
            QLIST_REMOVE(hostwin, hostwin_next);
            g_free(hostwin);
            return 0;
        }
    }

    return -1;
}

static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container,
                                            hwaddr iova, hwaddr end)
{
    VFIOHostDMAWindow *hostwin;
    bool hostwin_found = false;

    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
            hostwin_found = true;
            break;
        }
    }

    return hostwin_found ? hostwin : NULL;
}
static int vfio_spapr_remove_window(VFIOContainer *container,
                                    hwaddr offset_within_address_space)
{
    struct vfio_iommu_spapr_tce_remove remove = {
        .argsz = sizeof(remove),
        .start_addr = offset_within_address_space,
    };
    int ret;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
    if (ret) {
        error_report("Failed to remove window at %"PRIx64,
                     (uint64_t)remove.start_addr);
        return -errno;
    }

    trace_vfio_spapr_remove_window(offset_within_address_space);

    return 0;
}

static int vfio_spapr_create_window(VFIOContainer *container,
                                    MemoryRegionSection *section,
                                    hwaddr *pgsize)
{
    int ret = 0;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
    uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
    unsigned entries, bits_total, bits_per_level, max_levels;
    struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
    long rampagesize = qemu_minrampagesize();

    /*
     * The host might not support the IOMMU page size the guest wants to
     * use, so fall back to smaller physical IOMMU pages to back the
     * guest mappings.
     */
    if (pagesize > rampagesize) {
        pagesize = rampagesize;
    }
    pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1));
    pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0;
    if (!pagesize) {
        error_report("Host doesn't support page size 0x%"PRIx64
                     ", the supported mask is 0x%lx",
                     memory_region_iommu_get_min_page_size(iommu_mr),
                     bcontainer->pgsizes);
        return -EINVAL;
    }

    /*
     * FIXME: For VFIO iommu types which have KVM acceleration to
     * avoid bouncing all map/unmaps through qemu this way, this
     * would be the right place to wire that up (tell the KVM
     * device emulation the VFIO iommu handles to use).
     */
    create.window_size = int128_get64(section->size);
    create.page_shift = ctz64(pagesize);
    /*
     * The SPAPR host supports multilevel TCE tables. We try to guess the
     * optimal number of levels and, if that fails (for example due to
     * host memory fragmentation), we increase the number of levels.
     * The DMA address structure is:
     * rrrrrrrr rxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx iiiiiiii
     * where:
     *   r = reserved (bits >= 55 are reserved in the existing hardware)
     *   i = IOMMU page offset (64K in this example)
     *   x = bits to index a TCE which can be split into equal chunks to
     *       index within a level.
     * The aim is to split "x" into the smallest possible number of levels.
     */
    entries = create.window_size >> create.page_shift;
    /* bits_total is the number of "x" bits needed */
    bits_total = ctz64(entries * sizeof(uint64_t));
    /*
     * bits_per_level is a safe guess of how much we can allocate per level:
     * 8 is the current minimum for CONFIG_FORCE_MAX_ZONEORDER and MAX_ORDER
     * is usually bigger than that.
     * Below we look at qemu_real_host_page_size() as TCEs are allocated from
     * system pages.
     */
    bits_per_level = ctz64(qemu_real_host_page_size()) + 8;
    create.levels = bits_total / bits_per_level;
    if (bits_total % bits_per_level) {
        ++create.levels;
    }
    max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
    for ( ; create.levels <= max_levels; ++create.levels) {
        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
        if (!ret) {
            break;
        }
    }
    if (ret) {
        error_report("Failed to create a window, ret = %d (%m)", ret);
        return -errno;
    }

    if (create.start_addr != section->offset_within_address_space) {
        vfio_spapr_remove_window(container, create.start_addr);

        error_report("Host doesn't support DMA window at %"HWADDR_PRIx
                     ", must be %"PRIx64,
                     section->offset_within_address_space,
                     (uint64_t)create.start_addr);
        return -EINVAL;
    }
    trace_vfio_spapr_create_window(create.page_shift,
                                   create.levels,
                                   create.window_size,
                                   create.start_addr);
    *pgsize = pagesize;

    return 0;
}
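/*
 * Worked example of the page size and levels maths above (illustrative
 * values only). With 64K guest IOMMU pages and host-supported pgsizes of
 * 4K | 64K | 16M (0x1011000), the mask pagesize | (pagesize - 1) = 0x1ffff
 * keeps only sizes up to 64K, so pgmask = 0x11000 and its highest set bit
 * gives pagesize = 64K (page_shift = 16). For a 1TB (2^40 byte) window,
 * entries = 2^40 >> 16 = 2^24 TCEs, i.e. a 2^27 byte table, so
 * bits_total = 27. On a 64K-page host, bits_per_level = 16 + 8 = 24,
 * giving an initial guess of ceil(27 / 24) = 2 levels, retried up to
 * max_levels = (64 - 16) / 16 = 3 if the kernel allocation fails.
 */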
static bool
vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
                                        MemoryRegionSection *section,
                                        Error **errp)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    VFIOHostDMAWindow *hostwin;
    hwaddr pgsize = 0;
    int ret;

    /*
     * VFIO_SPAPR_TCE_IOMMU supports a single host window between
     * [dma32_window_start, dma32_window_start + dma32_window_size),
     * so we need to ensure the section falls within this range.
     */
    if (container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
        hwaddr iova, end;

        iova = section->offset_within_address_space;
        end = iova + int128_get64(section->size) - 1;

        if (!vfio_find_hostwin(scontainer, iova, end)) {
            error_setg(errp, "Container %p can't map guest IOVA region"
                       " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
                       iova, end);
            return false;
        }
        return true;
    }

    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return true;
    }

    /* For now intersections are not allowed, we may relax this later */
    QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           section->offset_within_address_space,
                           int128_get64(section->size))) {
            error_setg(errp,
                "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing "
                "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
                section->offset_within_address_space,
                section->offset_within_address_space +
                    int128_get64(section->size) - 1,
                hostwin->min_iova, hostwin->max_iova);
            return false;
        }
    }

    ret = vfio_spapr_create_window(container, section, &pgsize);
    if (ret) {
        error_setg_errno(errp, -ret, "Failed to create SPAPR window");
        return false;
    }

    vfio_host_win_add(scontainer, section->offset_within_address_space,
                      section->offset_within_address_space +
                      int128_get64(section->size) - 1, pgsize);
#ifdef CONFIG_KVM
    if (kvm_enabled()) {
        VFIOGroup *group;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        struct kvm_vfio_spapr_tce param;
        struct kvm_device_attr attr = {
            .group = KVM_DEV_VFIO_GROUP,
            .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
            .addr = (uint64_t)(unsigned long)&param,
        };

        if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
                                          &param.tablefd)) {
            QLIST_FOREACH(group, &container->group_list, container_next) {
                param.groupfd = group->fd;
                if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
                    error_setg_errno(errp, errno,
                                     "vfio: failed GROUP_SET_SPAPR_TCE for "
                                     "KVM VFIO device %d and group fd %d",
                                     param.tablefd, param.groupfd);
                    return false;
                }
                trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
            }
        }
    }
#endif
    return true;
}
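/*
 * Note on the KVM wiring above: attaching the TCE table fd to each VFIO
 * group fd via KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE lets the kernel service
 * the guest's H_PUT_TCE family of hypercalls directly, so individual DMA
 * map/unmap requests no longer bounce through QEMU (see the FIXME in
 * vfio_spapr_create_window()).
 */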
static void
vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer,
                                        MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);

    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return;
    }

    vfio_spapr_remove_window(container,
                             section->offset_within_address_space);
    if (vfio_host_win_del(scontainer,
                          section->offset_within_address_space,
                          section->offset_within_address_space +
                          int128_get64(section->size) - 1) < 0) {
        hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
                 __func__, section->offset_within_address_space);
    }
}

static void vfio_spapr_container_release(VFIOContainerBase *bcontainer)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    VFIOHostDMAWindow *hostwin, *next;

    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        memory_listener_unregister(&scontainer->prereg_listener);
    }
    QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next,
                       next) {
        QLIST_REMOVE(hostwin, hostwin_next);
        g_free(hostwin);
    }
}
static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
                                       Error **errp)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    struct vfio_iommu_spapr_tce_info info;
    bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
    int ret, fd = container->fd;

    QLIST_INIT(&scontainer->hostwin_list);

    /*
     * The host kernel code implementing VFIO_IOMMU_DISABLE is called
     * when the container fd is closed, so we do not call it explicitly
     * in this file.
     */
    if (!v2) {
        ret = ioctl(fd, VFIO_IOMMU_ENABLE);
        if (ret) {
            error_setg_errno(errp, errno, "failed to enable container");
            return false;
        }
    } else {
        scontainer->prereg_listener = vfio_prereg_listener;

        memory_listener_register(&scontainer->prereg_listener,
                                 &address_space_memory);
        if (bcontainer->error) {
            error_propagate_prepend(errp, bcontainer->error,
                    "RAM memory listener initialization failed: ");
            goto listener_unregister_exit;
        }
    }

    info.argsz = sizeof(info);
    ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
    if (ret) {
        error_setg_errno(errp, errno,
                         "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
        goto listener_unregister_exit;
    }

    if (v2) {
        bcontainer->pgsizes = info.ddw.pgsizes;
        /*
         * A newly created container comes with a default DMA window.
         * To keep region_add/del simple, remove that window now and
         * let the iommu_listener callbacks create/remove windows as
         * needed.
         */
        ret = vfio_spapr_remove_window(container, info.dma32_window_start);
        if (ret) {
            error_setg_errno(errp, -ret,
                             "failed to remove existing window");
            goto listener_unregister_exit;
        }
    } else {
        /* The default table uses 4K pages */
        bcontainer->pgsizes = 0x1000;
        vfio_host_win_add(scontainer, info.dma32_window_start,
                          info.dma32_window_start +
                          info.dma32_window_size - 1,
                          0x1000);
    }

    return true;

listener_unregister_exit:
    if (v2) {
        memory_listener_unregister(&scontainer->prereg_listener);
    }
    return false;
}

static void vfio_iommu_spapr_class_init(ObjectClass *klass, void *data)
{
    VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);

    vioc->add_window = vfio_spapr_container_add_section_window;
    vioc->del_window = vfio_spapr_container_del_section_window;
    vioc->release = vfio_spapr_container_release;
    vioc->setup = vfio_spapr_container_setup;
}

static const TypeInfo types[] = {
    {
        .name = TYPE_VFIO_IOMMU_SPAPR,
        .parent = TYPE_VFIO_IOMMU_LEGACY,
        .instance_size = sizeof(VFIOSpaprContainer),
        .class_init = vfio_iommu_spapr_class_init,
    },
};

DEFINE_TYPES(types)