1 /*
2 * DMA memory preregistration
3 *
4 * Authors:
5 * Alexey Kardashevskiy <aik@ozlabs.ru>
6 *
7 * This work is licensed under the terms of the GNU GPL, version 2. See
8 * the COPYING file in the top-level directory.
9 */
10
11 #include "qemu/osdep.h"
12 #include <sys/ioctl.h>
13 #include <linux/vfio.h>
14 #ifdef CONFIG_KVM
15 #include <linux/kvm.h>
16 #endif
17 #include "sysemu/kvm.h"
18 #include "exec/address-spaces.h"
19
20 #include "hw/vfio/vfio-common.h"
21 #include "hw/hw.h"
22 #include "exec/ram_addr.h"
23 #include "qemu/error-report.h"
24 #include "qapi/error.h"
25 #include "trace.h"
26
27 typedef struct VFIOSpaprContainer {
28 VFIOContainer container;
29 MemoryListener prereg_listener;
30 QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
31 } VFIOSpaprContainer;
32
33 OBJECT_DECLARE_SIMPLE_TYPE(VFIOSpaprContainer, VFIO_IOMMU_SPAPR);
34
/*
 * Tell whether @section must be ignored by the preregistration listener.
 *
 * IOMMU regions are never expected here and abort via hw_error().
 * Returns true for anything that is not RAM, and for RAM device regions;
 * false for ordinary RAM that should be preregistered.
 */
static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (memory_region_is_iommu(mr)) {
        hw_error("Cannot possibly preregister IOMMU memory");
    }

    if (!memory_region_is_ram(mr)) {
        return true;
    }

    return memory_region_is_ram_device(mr);
}
44
/*
 * Translate the guest physical address @gpa, which belongs to @section,
 * into the host virtual address backing it.
 *
 * The result is the RAM pointer of the section's memory region, advanced by
 * the section's offset within that region plus the distance of @gpa from
 * the start of the section in the address space.
 */
static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
{
    hwaddr offset_in_section = gpa - section->offset_within_address_space;
    hwaddr offset_in_region = section->offset_within_region +
                              offset_in_section;

    return memory_region_get_ram_ptr(section->mr) + offset_in_region;
}
51
/*
 * MemoryListener region_add hook: preregister a RAM section with the host
 * using VFIO_IOMMU_SPAPR_REGISTER_MEMORY.
 *
 * Sections rejected by vfio_prereg_listener_skipped_section() are only
 * traced.  Sections whose address, region offset or size are not aligned to
 * the real host page size are reported and ignored.  A reference on the
 * memory region is taken before the ioctl is issued.
 *
 * If the ioctl fails while the container is not yet initialized, the first
 * error is stored in bcontainer->error; once the container is initialized
 * the failure is fatal.
 */
static void vfio_prereg_listener_region_add(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
                                                  prereg_listener);
    VFIOContainer *container = &scontainer->container;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret, saved_errno;
    hwaddr page_mask = qemu_real_host_page_mask();
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_add_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        /* Empty section (or the end address wrapped around). */
        return;
    }

    memory_region_ref(section->mr);

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
    /*
     * ioctl() reports failure as -1 with the cause in errno.  Capture errno
     * right away so that neither the trace call nor anything else can
     * overwrite it before it is stored in the Error object.
     */
    saved_errno = ret ? errno : 0;
    trace_vfio_prereg_register(reg.vaddr, reg.size, -saved_errno);
    if (ret) {
        /*
         * On the initfn path, store the first error in the container so we
         * can gracefully fail.  Runtime, there's not much we can do other
         * than throw a hardware error.
         */
        if (!bcontainer->initialized) {
            if (!bcontainer->error) {
                error_setg_errno(&bcontainer->error, saved_errno,
                                 "Memory registering failed");
            }
        } else {
            hw_error("vfio: Memory registering failed, unable to continue");
        }
    }
}
111
/*
 * MemoryListener region_del hook: undo the preregistration of a RAM section
 * using VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY.
 *
 * The same skip and alignment checks as in the region_add hook are applied,
 * so only sections that could have been registered are unregistered.
 * A failing ioctl is not treated as an error here; its result is only
 * passed to the trace point.
 */
static void vfio_prereg_listener_region_del(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
                                                  prereg_listener);
    VFIOContainer *container = &scontainer->container;
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask();
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_del_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        /* Empty section (or the end address wrapped around). */
        return;
    }

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
    trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
}
153
154 static const MemoryListener vfio_prereg_listener = {
155 .name = "vfio-pre-reg",
156 .region_add = vfio_prereg_listener_region_add,
157 .region_del = vfio_prereg_listener_region_del,
158 };
159
/*
 * Record a new host DMA window covering [@min_iova, @max_iova] (inclusive)
 * with the supported IOVA page sizes @iova_pgsizes.
 *
 * The new window must not overlap any window already on the list; an
 * overlap is fatal (hw_error()).  The entry is allocated here and inserted
 * at the head of scontainer->hostwin_list.
 */
static void vfio_host_win_add(VFIOSpaprContainer *scontainer, hwaddr min_iova,
                              hwaddr max_iova, uint64_t iova_pgsizes)
{
    const hwaddr new_len = max_iova - min_iova + 1;
    VFIOHostDMAWindow *win;

    QLIST_FOREACH(win, &scontainer->hostwin_list, hostwin_next) {
        hwaddr win_len = win->max_iova - win->min_iova + 1;

        if (ranges_overlap(win->min_iova, win_len, min_iova, new_len)) {
            hw_error("%s: Overlapped IOMMU are not enabled", __func__);
        }
    }

    win = g_malloc0(sizeof(*win));
    win->min_iova = min_iova;
    win->max_iova = max_iova;
    win->iova_pgsizes = iova_pgsizes;

    QLIST_INSERT_HEAD(&scontainer->hostwin_list, win, hostwin_next);
}
181
/*
 * Remove and free the host DMA window whose bounds are exactly
 * [@min_iova, @max_iova].
 *
 * Returns 0 if such a window was found and deleted, -1 if no window on the
 * list matches both bounds.
 */
static int vfio_host_win_del(VFIOSpaprContainer *scontainer,
                             hwaddr min_iova, hwaddr max_iova)
{
    VFIOHostDMAWindow *win;

    QLIST_FOREACH(win, &scontainer->hostwin_list, hostwin_next) {
        if (win->min_iova != min_iova || win->max_iova != max_iova) {
            continue;
        }

        /* Safe inside the loop because we stop iterating immediately. */
        QLIST_REMOVE(win, hostwin_next);
        g_free(win);
        return 0;
    }

    return -1;
}
197
vfio_find_hostwin(VFIOSpaprContainer * container,hwaddr iova,hwaddr end)198 static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container,
199 hwaddr iova, hwaddr end)
200 {
201 VFIOHostDMAWindow *hostwin;
202 bool hostwin_found = false;
203
204 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
205 if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
206 hostwin_found = true;
207 break;
208 }
209 }
210
211 return hostwin_found ? hostwin : NULL;
212 }
213
/*
 * Ask the host to remove the DMA window that starts at
 * @offset_within_address_space (VFIO_IOMMU_SPAPR_TCE_REMOVE).
 *
 * Returns 0 on success, or the negated errno of the failed ioctl after
 * reporting the failure.
 */
static int vfio_spapr_remove_window(VFIOContainer *container,
                                    hwaddr offset_within_address_space)
{
    struct vfio_iommu_spapr_tce_remove remove = {
        .argsz = sizeof(remove),
        .start_addr = offset_within_address_space,
    };
    int ret;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
    if (ret) {
        /* error_report() may change errno, so capture it before reporting. */
        int saved_errno = errno;

        error_report("Failed to remove window at %"PRIx64,
                     (uint64_t)remove.start_addr);
        return -saved_errno;
    }

    trace_vfio_spapr_remove_window(offset_within_address_space);

    return 0;
}
234
/*
 * Create a host DMA window for the IOMMU memory region behind @section
 * (VFIO_IOMMU_SPAPR_TCE_CREATE).
 *
 * The IOMMU page size is the largest size supported by the host
 * (bcontainer->pgsizes) that does not exceed both the guest IOMMU minimum
 * page size and the minimum RAM page size.  The number of TCE table levels
 * starts from an estimate and is increased until the ioctl succeeds or the
 * maximum is reached.
 *
 * On success the chosen page size is stored in *@pgsize and 0 is returned.
 * Returns -EINVAL if no usable page size exists or if the host placed the
 * window at an address other than the section start (the window is removed
 * again in that case), or the negated errno of the failed ioctl.
 */
static int vfio_spapr_create_window(VFIOContainer *container,
                                    MemoryRegionSection *section,
                                    hwaddr *pgsize)
{
    int ret = 0;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
    uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
    /* 64-bit: window_size >> page_shift does not have to fit in 32 bits. */
    uint64_t entries;
    unsigned bits_total, bits_per_level, max_levels;
    struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
    long rampagesize = qemu_minrampagesize();

    /*
     * The host might not support the guest supported IOMMU page size,
     * so we will use smaller physical IOMMU pages to back them.
     */
    if (pagesize > rampagesize) {
        pagesize = rampagesize;
    }
    /* Keep only host page sizes that are <= pagesize, then pick the largest. */
    pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1));
    pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0;
    if (!pagesize) {
        error_report("Host doesn't support page size 0x%"PRIx64
                     ", the supported mask is 0x%"PRIx64,
                     memory_region_iommu_get_min_page_size(iommu_mr),
                     (uint64_t)bcontainer->pgsizes);
        return -EINVAL;
    }

    /*
     * FIXME: For VFIO iommu types which have KVM acceleration to
     * avoid bouncing all map/unmaps through qemu this way, this
     * would be the right place to wire that up (tell the KVM
     * device emulation the VFIO iommu handles to use).
     */
    create.window_size = int128_get64(section->size);
    create.page_shift = ctz64(pagesize);
    /*
     * SPAPR host supports multilevel TCE tables. We try to guess optimal
     * levels number and if this fails (for example due to the host memory
     * fragmentation), we increase levels. The DMA address structure is:
     * rrrrrrrr rxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx iiiiiiii
     * where:
     *   r = reserved (bits >= 55 are reserved in the existing hardware)
     *   i = IOMMU page offset (64K in this example)
     *   x = bits to index a TCE which can be split to equal chunks to index
     *      within the level.
     * The aim is to split "x" into the smallest possible number of levels.
     */
    entries = create.window_size >> create.page_shift;
    /* bits_total is number of "x" needed */
    bits_total = ctz64(entries * sizeof(uint64_t));
    /*
     * bits_per_level is a safe guess of how much we can allocate per level:
     * 8 is the current minimum for CONFIG_FORCE_MAX_ZONEORDER and MAX_ORDER
     * is usually bigger than that.
     * Below we look at qemu_real_host_page_size as TCEs are allocated from
     * system pages.
     */
    bits_per_level = ctz64(qemu_real_host_page_size()) + 8;
    create.levels = bits_total / bits_per_level;
    if (bits_total % bits_per_level) {
        ++create.levels;
    }
    max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
    for ( ; create.levels <= max_levels; ++create.levels) {
        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
        if (!ret) {
            break;
        }
    }
    if (ret) {
        /* error_report() may change errno, so capture it before reporting. */
        int saved_errno = errno;

        error_report("Failed to create a window, ret = %d (%m)", ret);
        return -saved_errno;
    }

    if (create.start_addr != section->offset_within_address_space) {
        /* The host chose a different address: drop the window again. */
        vfio_spapr_remove_window(container, create.start_addr);

        error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64,
                     section->offset_within_address_space,
                     (uint64_t)create.start_addr);
        return -EINVAL;
    }
    trace_vfio_spapr_create_window(create.page_shift,
                                   create.levels,
                                   create.window_size,
                                   create.start_addr);
    *pgsize = pagesize;

    return 0;
}
327
328 static bool
vfio_spapr_container_add_section_window(VFIOContainerBase * bcontainer,MemoryRegionSection * section,Error ** errp)329 vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
330 MemoryRegionSection *section,
331 Error **errp)
332 {
333 VFIOContainer *container = container_of(bcontainer, VFIOContainer,
334 bcontainer);
335 VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
336 container);
337 VFIOHostDMAWindow *hostwin;
338 hwaddr pgsize = 0;
339 int ret;
340
341 /*
342 * VFIO_SPAPR_TCE_IOMMU supports a single host window between
343 * [dma32_window_start, dma32_window_size), we need to ensure
344 * the section fall in this range.
345 */
346 if (container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
347 hwaddr iova, end;
348
349 iova = section->offset_within_address_space;
350 end = iova + int128_get64(section->size) - 1;
351
352 if (!vfio_find_hostwin(scontainer, iova, end)) {
353 error_setg(errp, "Container %p can't map guest IOVA region"
354 " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
355 iova, end);
356 return false;
357 }
358 return true;
359 }
360
361 if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
362 return true;
363 }
364
365 /* For now intersections are not allowed, we may relax this later */
366 QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
367 if (ranges_overlap(hostwin->min_iova,
368 hostwin->max_iova - hostwin->min_iova + 1,
369 section->offset_within_address_space,
370 int128_get64(section->size))) {
371 error_setg(errp,
372 "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
373 "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
374 section->offset_within_address_space,
375 section->offset_within_address_space +
376 int128_get64(section->size) - 1,
377 hostwin->min_iova, hostwin->max_iova);
378 return false;
379 }
380 }
381
382 ret = vfio_spapr_create_window(container, section, &pgsize);
383 if (ret) {
384 error_setg_errno(errp, -ret, "Failed to create SPAPR window");
385 return false;
386 }
387
388 vfio_host_win_add(scontainer, section->offset_within_address_space,
389 section->offset_within_address_space +
390 int128_get64(section->size) - 1, pgsize);
391 #ifdef CONFIG_KVM
392 if (kvm_enabled()) {
393 VFIOGroup *group;
394 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
395 struct kvm_vfio_spapr_tce param;
396 struct kvm_device_attr attr = {
397 .group = KVM_DEV_VFIO_GROUP,
398 .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
399 .addr = (uint64_t)(unsigned long)¶m,
400 };
401
402 if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
403 ¶m.tablefd)) {
404 QLIST_FOREACH(group, &container->group_list, container_next) {
405 param.groupfd = group->fd;
406 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
407 error_setg_errno(errp, errno,
408 "vfio: failed GROUP_SET_SPAPR_TCE for "
409 "KVM VFIO device %d and group fd %d",
410 param.tablefd, param.groupfd);
411 return false;
412 }
413 trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
414 }
415 }
416 }
417 #endif
418 return true;
419 }
420
421 static void
vfio_spapr_container_del_section_window(VFIOContainerBase * bcontainer,MemoryRegionSection * section)422 vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer,
423 MemoryRegionSection *section)
424 {
425 VFIOContainer *container = container_of(bcontainer, VFIOContainer,
426 bcontainer);
427 VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
428 container);
429
430 if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
431 return;
432 }
433
434 vfio_spapr_remove_window(container,
435 section->offset_within_address_space);
436 if (vfio_host_win_del(scontainer,
437 section->offset_within_address_space,
438 section->offset_within_address_space +
439 int128_get64(section->size) - 1) < 0) {
440 hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
441 __func__, section->offset_within_address_space);
442 }
443 }
444
/*
 * release hook: undo what vfio_spapr_container_setup() and the window
 * callbacks left behind.
 *
 * Unregisters the preregistration listener (v2 containers only) and frees
 * every entry of the host DMA window list.
 */
static void vfio_spapr_container_release(VFIOContainerBase *bcontainer)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);

    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        memory_listener_unregister(&scontainer->prereg_listener);
    }

    /* Drain the list from the head until nothing is left. */
    while (!QLIST_EMPTY(&scontainer->hostwin_list)) {
        VFIOHostDMAWindow *win = QLIST_FIRST(&scontainer->hostwin_list);

        QLIST_REMOVE(win, hostwin_next);
        g_free(win);
    }
}
462
vfio_spapr_container_setup(VFIOContainerBase * bcontainer,Error ** errp)463 static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
464 Error **errp)
465 {
466 VFIOContainer *container = container_of(bcontainer, VFIOContainer,
467 bcontainer);
468 VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
469 container);
470 struct vfio_iommu_spapr_tce_info info;
471 bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
472 int ret, fd = container->fd;
473
474 QLIST_INIT(&scontainer->hostwin_list);
475
476 /*
477 * The host kernel code implementing VFIO_IOMMU_DISABLE is called
478 * when container fd is closed so we do not call it explicitly
479 * in this file.
480 */
481 if (!v2) {
482 ret = ioctl(fd, VFIO_IOMMU_ENABLE);
483 if (ret) {
484 error_setg_errno(errp, errno, "failed to enable container");
485 return false;
486 }
487 } else {
488 scontainer->prereg_listener = vfio_prereg_listener;
489
490 memory_listener_register(&scontainer->prereg_listener,
491 &address_space_memory);
492 if (bcontainer->error) {
493 error_propagate_prepend(errp, bcontainer->error,
494 "RAM memory listener initialization failed: ");
495 goto listener_unregister_exit;
496 }
497 }
498
499 info.argsz = sizeof(info);
500 ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
501 if (ret) {
502 error_setg_errno(errp, errno,
503 "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
504 goto listener_unregister_exit;
505 }
506
507 if (v2) {
508 bcontainer->pgsizes = info.ddw.pgsizes;
509 /*
510 * There is a default window in just created container.
511 * To make region_add/del simpler, we better remove this
512 * window now and let those iommu_listener callbacks
513 * create/remove them when needed.
514 */
515 ret = vfio_spapr_remove_window(container, info.dma32_window_start);
516 if (ret) {
517 error_setg_errno(errp, -ret,
518 "failed to remove existing window");
519 goto listener_unregister_exit;
520 }
521 } else {
522 /* The default table uses 4K pages */
523 bcontainer->pgsizes = 0x1000;
524 vfio_host_win_add(scontainer, info.dma32_window_start,
525 info.dma32_window_start +
526 info.dma32_window_size - 1,
527 0x1000);
528 }
529
530 return true;
531
532 listener_unregister_exit:
533 if (v2) {
534 memory_listener_unregister(&scontainer->prereg_listener);
535 }
536 return false;
537 }
538
/*
 * Class initializer: install the sPAPR implementations of the VFIO IOMMU
 * class hooks (window add/delete, release and setup).
 */
static void vfio_iommu_spapr_class_init(ObjectClass *klass, void *data)
{
    VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);

    vioc->add_window = vfio_spapr_container_add_section_window;
    vioc->del_window = vfio_spapr_container_del_section_window;
    vioc->release = vfio_spapr_container_release;
    vioc->setup = vfio_spapr_container_setup;
}
548
549 static const TypeInfo types[] = {
550 {
551 .name = TYPE_VFIO_IOMMU_SPAPR,
552 .parent = TYPE_VFIO_IOMMU_LEGACY,
553 .instance_size = sizeof(VFIOSpaprContainer),
554 .class_init = vfio_iommu_spapr_class_init,
555 },
556 };
557
558 DEFINE_TYPES(types)
559