/*
 * DMA memory preregistration
 *
 * Authors:
 *  Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#ifdef CONFIG_KVM
#include <linux/kvm.h>
#endif
#include "sysemu/kvm.h"
#include "exec/address-spaces.h"

#include "hw/vfio/vfio-common.h"
#include "hw/hw.h"
#include "exec/ram_addr.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "trace.h"

static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
{
    if (memory_region_is_iommu(section->mr)) {
        hw_error("Cannot possibly preregister IOMMU memory");
    }

    return !memory_region_is_ram(section->mr) ||
           memory_region_is_ram_device(section->mr);
}

static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
{
    return memory_region_get_ram_ptr(section->mr) +
        section->offset_within_region +
        (gpa - section->offset_within_address_space);
}
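
/*
 * Worked example for the translation above (hypothetical numbers): for a
 * section with offset_within_address_space = 0x80000000 and
 * offset_within_region = 0, backed by a RAM block whose host mapping is at
 * address H, gpa = 0x80010000 resolves to H + 0 + (0x80010000 - 0x80000000)
 * = H + 0x10000.
 */

/*
 * The two listener callbacks below mirror each other: region_add
 * preregisters a RAM section with the sPAPR TCE IOMMU via
 * VFIO_IOMMU_SPAPR_REGISTER_MEMORY, so the host can account for and pin
 * the pages up front, and region_del undoes this via
 * VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY.
 */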

static void vfio_prereg_listener_region_add(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            prereg_listener);
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask();
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_add_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    memory_region_ref(section->mr);

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
    trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0);
    if (ret) {
        /*
         * On the initfn path, store the first error in the container so we
         * can fail gracefully.  At runtime, there is not much we can do
         * other than throw a hardware error.
         */
        if (!container->initialized) {
            if (!container->error) {
                error_setg_errno(&container->error, -ret,
                                 "Memory registering failed");
            }
        } else {
            hw_error("vfio: Memory registering failed, unable to continue");
        }
    }
}

static void vfio_prereg_listener_region_del(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer,
                                            prereg_listener);
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask();
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_del_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
    trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
}

static const MemoryListener vfio_prereg_listener = {
    .name = "vfio-pre-reg",
    .region_add = vfio_prereg_listener_region_add,
    .region_del = vfio_prereg_listener_region_del,
};

static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova,
                              hwaddr max_iova, uint64_t iova_pgsizes)
{
    VFIOHostDMAWindow *hostwin;

    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           min_iova,
                           max_iova - min_iova + 1)) {
            hw_error("%s: Overlapping host DMA windows are not supported",
                     __func__);
        }
    }

    hostwin = g_malloc0(sizeof(*hostwin));

    hostwin->min_iova = min_iova;
    hostwin->max_iova = max_iova;
    hostwin->iova_pgsizes = iova_pgsizes;
    QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
}

static int vfio_host_win_del(VFIOContainer *container,
                             hwaddr min_iova, hwaddr max_iova)
{
    VFIOHostDMAWindow *hostwin;

    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
            QLIST_REMOVE(hostwin, hostwin_next);
            g_free(hostwin);
            return 0;
        }
    }

    return -1;
}

static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container,
                                            hwaddr iova, hwaddr end)
{
    VFIOHostDMAWindow *hostwin;
    bool hostwin_found = false;

    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
            hostwin_found = true;
            break;
        }
    }

    return hostwin_found ? hostwin : NULL;
}
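
/*
 * Host DMA window bookkeeping: a VFIOHostDMAWindow describes one IOVA
 * range that the host IOMMU can translate.  The v1 sPAPR TCE IOMMU
 * exposes a single fixed 32-bit window, while the v2 IOMMU lets userspace
 * create and remove windows with VFIO_IOMMU_SPAPR_TCE_CREATE/REMOVE,
 * which the helpers below wrap.
 */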

static int vfio_spapr_remove_window(VFIOContainer *container,
                                    hwaddr offset_within_address_space)
{
    struct vfio_iommu_spapr_tce_remove remove = {
        .argsz = sizeof(remove),
        .start_addr = offset_within_address_space,
    };
    int ret;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
    if (ret) {
        error_report("Failed to remove window at %"PRIx64,
                     (uint64_t)remove.start_addr);
        return -errno;
    }

    trace_vfio_spapr_remove_window(offset_within_address_space);

    return 0;
}

static int vfio_spapr_create_window(VFIOContainer *container,
                                    MemoryRegionSection *section,
                                    hwaddr *pgsize)
{
    int ret = 0;
    IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
    uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
    unsigned entries, bits_total, bits_per_level, max_levels;
    struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
    long rampagesize = qemu_minrampagesize();

    /*
     * The host might not support the guest-supported IOMMU page size,
     * so we will use smaller physical IOMMU pages to back them.
     */
    if (pagesize > rampagesize) {
        pagesize = rampagesize;
    }
    /*
     * Pick the biggest host-supported IOMMU page size that does not
     * exceed the chosen guest page size.
     */
    pgmask = container->pgsizes & (pagesize | (pagesize - 1));
    pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0;
    if (!pagesize) {
        error_report("Host doesn't support page size 0x%"PRIx64
                     ", the supported mask is 0x%lx",
                     memory_region_iommu_get_min_page_size(iommu_mr),
                     container->pgsizes);
        return -EINVAL;
    }

    /*
     * FIXME: For VFIO iommu types which have KVM acceleration to
     * avoid bouncing all map/unmaps through qemu this way, this
     * would be the right place to wire that up (tell the KVM
     * device emulation the VFIO iommu handles to use).
     */
    create.window_size = int128_get64(section->size);
    create.page_shift = ctz64(pagesize);
    /*
     * The SPAPR host supports multilevel TCE tables.  We try to guess the
     * optimal number of levels and, if this fails (for example due to host
     * memory fragmentation), we increase the number of levels.  The DMA
     * address structure is:
     * rrrrrrrr rxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx iiiiiiii
     * where:
     *   r = reserved (bits >= 55 are reserved in the existing hardware)
     *   i = IOMMU page offset (64K in this example)
     *   x = bits to index a TCE which can be split into equal chunks to
     *       index within each level.
     * The aim is to split "x" into the smallest possible number of levels.
     */
    entries = create.window_size >> create.page_shift;
    /* bits_total is the number of "x" bits needed */
    bits_total = ctz64(entries * sizeof(uint64_t));
    /*
     * bits_per_level is a safe guess of how much we can allocate per level:
     * 8 is the current minimum for CONFIG_FORCE_MAX_ZONEORDER and MAX_ORDER
     * is usually bigger than that.
     * Below we look at qemu_real_host_page_size as TCEs are allocated from
     * system pages.
     */
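    /*
     * Worked example (hypothetical sizes): a 64GiB window with 64KiB IOMMU
     * pages gives entries = 2^20 and an 8MiB (2^23 byte) table, so
     * bits_total = 23.  With 64KiB system pages, bits_per_level = 16 + 8 =
     * 24 and a single level suffices; with 4KiB system pages,
     * bits_per_level = 12 + 8 = 20 and we start with two levels.
     */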
    bits_per_level = ctz64(qemu_real_host_page_size()) + 8;
    create.levels = bits_total / bits_per_level;
    if (bits_total % bits_per_level) {
        ++create.levels;
    }
    max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
    for ( ; create.levels <= max_levels; ++create.levels) {
        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
        if (!ret) {
            break;
        }
    }
    if (ret) {
        error_report("Failed to create a window, ret = %d (%m)", ret);
        return -errno;
    }

    if (create.start_addr != section->offset_within_address_space) {
        vfio_spapr_remove_window(container, create.start_addr);

        error_report("Host doesn't support DMA window at %"HWADDR_PRIx
                     ", must be %"PRIx64,
                     section->offset_within_address_space,
                     (uint64_t)create.start_addr);
        return -EINVAL;
    }
    trace_vfio_spapr_create_window(create.page_shift,
                                   create.levels,
                                   create.window_size,
                                   create.start_addr);
    *pgsize = pagesize;

    return 0;
}

int vfio_container_add_section_window(VFIOContainer *container,
                                      MemoryRegionSection *section,
                                      Error **errp)
{
    VFIOHostDMAWindow *hostwin;
    hwaddr pgsize = 0;
    int ret;

    /*
     * VFIO_SPAPR_TCE_IOMMU supports a single host window between
     * [dma32_window_start, dma32_window_start + dma32_window_size), so we
     * need to ensure the section falls within this range.
     */
    if (container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
        hwaddr iova, end;

        iova = section->offset_within_address_space;
        end = iova + int128_get64(section->size) - 1;

        if (!vfio_find_hostwin(container, iova, end)) {
            error_setg(errp, "Container %p can't map guest IOVA region"
                       " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
                       iova, end);
            return -EINVAL;
        }
        return 0;
    }

    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return 0;
    }

    /* For now intersections are not allowed; we may relax this later */
    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           section->offset_within_address_space,
                           int128_get64(section->size))) {
            error_setg(errp,
                "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing "
                "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(section->size) - 1,
                hostwin->min_iova, hostwin->max_iova);
            return -EINVAL;
        }
    }

    ret = vfio_spapr_create_window(container, section, &pgsize);
    if (ret) {
        error_setg_errno(errp, -ret, "Failed to create SPAPR window");
        return ret;
    }

    vfio_host_win_add(container, section->offset_within_address_space,
                      section->offset_within_address_space +
                      int128_get64(section->size) - 1, pgsize);
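
    /*
     * With KVM enabled, attach the new TCE table to each VFIO group via
     * the KVM VFIO pseudo-device so the kernel can handle the guest's
     * H_PUT_TCE/H_STUFF_TCE hypercalls against the table in-kernel,
     * instead of bouncing every map/unmap through QEMU.
     */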
#ifdef CONFIG_KVM
    if (kvm_enabled()) {
        VFIOGroup *group;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        struct kvm_vfio_spapr_tce param;
        struct kvm_device_attr attr = {
            .group = KVM_DEV_VFIO_GROUP,
            .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
            .addr = (uint64_t)(unsigned long)&param,
        };

        if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
                                          &param.tablefd)) {
            QLIST_FOREACH(group, &container->group_list, container_next) {
                param.groupfd = group->fd;
                if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
                    error_setg_errno(errp, errno,
                                     "vfio: failed GROUP_SET_SPAPR_TCE for "
                                     "KVM VFIO device %d and group fd %d",
                                     param.tablefd, param.groupfd);
                    return -errno;
                }
                trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
            }
        }
    }
#endif
    return 0;
}

void vfio_container_del_section_window(VFIOContainer *container,
                                       MemoryRegionSection *section)
{
    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return;
    }

    vfio_spapr_remove_window(container,
                             section->offset_within_address_space);
    if (vfio_host_win_del(container,
                          section->offset_within_address_space,
                          section->offset_within_address_space +
                          int128_get64(section->size) - 1) < 0) {
        hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
                 __func__, section->offset_within_address_space);
    }
}

int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
{
    struct vfio_iommu_spapr_tce_info info;
    bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
    int ret, fd = container->fd;

    QLIST_INIT(&container->hostwin_list);

    /*
     * The host kernel code implementing VFIO_IOMMU_DISABLE is called
     * when the container fd is closed, so we do not call it explicitly
     * in this file.
     */
    if (!v2) {
        ret = ioctl(fd, VFIO_IOMMU_ENABLE);
        if (ret) {
            error_setg_errno(errp, errno, "failed to enable container");
            return -errno;
        }
    } else {
        container->prereg_listener = vfio_prereg_listener;

        memory_listener_register(&container->prereg_listener,
                                 &address_space_memory);
        if (container->error) {
            ret = -1;
            error_propagate_prepend(errp, container->error,
                "RAM memory listener initialization failed: ");
            goto listener_unregister_exit;
        }
    }

    info.argsz = sizeof(info);
    ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
    if (ret) {
        error_setg_errno(errp, errno,
                         "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
        ret = -errno;
        goto listener_unregister_exit;
    }

    if (v2) {
        container->pgsizes = info.ddw.pgsizes;
        /*
         * A freshly created container comes with a default window, so to
         * keep region_add/del simple, remove it now and let the
         * iommu_listener callbacks create/remove windows as needed.
         */
        ret = vfio_spapr_remove_window(container, info.dma32_window_start);
        if (ret) {
            error_setg_errno(errp, -ret,
                             "failed to remove existing window");
            goto listener_unregister_exit;
        }
    } else {
        /* The default table uses 4K pages */
        container->pgsizes = 0x1000;
        vfio_host_win_add(container, info.dma32_window_start,
                          info.dma32_window_start +
                          info.dma32_window_size - 1,
                          0x1000);
    }

    return 0;

listener_unregister_exit:
    if (v2) {
        memory_listener_unregister(&container->prereg_listener);
    }
    return ret;
}

void vfio_spapr_container_deinit(VFIOContainer *container)
{
    VFIOHostDMAWindow *hostwin, *next;

    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        memory_listener_unregister(&container->prereg_listener);
    }
    QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
                       next) {
        QLIST_REMOVE(hostwin, hostwin_next);
        g_free(hostwin);
    }
}