xref: /openbmc/qemu/hw/vfio/spapr.c (revision 499e53cce9445d23ee1bf54562de558562fc8d22)
1 /*
2  * DMA memory preregistration
3  *
4  * Authors:
5  *  Alexey Kardashevskiy <aik@ozlabs.ru>
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2.  See
8  * the COPYING file in the top-level directory.
9  */
10 
11 #include "qemu/osdep.h"
12 #include <sys/ioctl.h>
13 #include <linux/vfio.h>
14 #include "system/kvm.h"
15 #include "system/hostmem.h"
16 #include "system/address-spaces.h"
17 
18 #include "hw/vfio/vfio-common.h"
19 #include "hw/hw.h"
20 #include "system/ram_addr.h"
21 #include "qemu/error-report.h"
22 #include "qapi/error.h"
23 #include "trace.h"
24 
/* One contiguous IOVA range the host IOMMU can map, plus its page sizes. */
typedef struct VFIOHostDMAWindow {
    hwaddr min_iova;        /* lowest mappable IOVA (inclusive) */
    hwaddr max_iova;        /* highest mappable IOVA (inclusive) */
    uint64_t iova_pgsizes;  /* bitmask of IOMMU page sizes for this window */
    QLIST_ENTRY(VFIOHostDMAWindow) hostwin_next;
} VFIOHostDMAWindow;

/* sPAPR-specific container state wrapped around the generic VFIOContainer. */
typedef struct VFIOSpaprContainer {
    VFIOContainer container;          /* must be first: container_of() is used */
    MemoryListener prereg_listener;   /* preregisters guest RAM (v2 IOMMU only) */
    QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;  /* currently active windows */
    unsigned int levels;              /* TCE table levels from info.ddw.levels */
} VFIOSpaprContainer;

OBJECT_DECLARE_SIMPLE_TYPE(VFIOSpaprContainer, VFIO_IOMMU_SPAPR);
40 
41 static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
42 {
43     if (memory_region_is_iommu(section->mr)) {
44         hw_error("Cannot possibly preregister IOMMU memory");
45     }
46 
47     return !memory_region_is_ram(section->mr) ||
48             memory_region_is_ram_device(section->mr);
49 }
50 
51 static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
52 {
53     return memory_region_get_ram_ptr(section->mr) +
54         section->offset_within_region +
55         (gpa - section->offset_within_address_space);
56 }
57 
/*
 * MemoryListener::region_add hook: preregister the host memory backing
 * @section with the kernel (VFIO_IOMMU_SPAPR_REGISTER_MEMORY) so it can
 * be pinned/accounted for later DMA mappings.
 */
static void vfio_prereg_listener_region_add(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
                                                  prereg_listener);
    VFIOContainer *container = &scontainer->container;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask();
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    /* Only plain RAM sections are preregistered; everything else is skipped. */
    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_add_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    /* The ioctl needs host-page-aligned address, offset and size. */
    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        /* Empty range: nothing to register. */
        return;
    }

    /* Keep the RAM block alive while it is registered with the kernel. */
    memory_region_ref(section->mr);

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
    trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0);
    if (ret) {
        /*
         * On the initfn path, store the first error in the container so we
         * can gracefully fail.  Runtime, there's not much we can do other
         * than throw a hardware error.
         */
        /*
         * NOTE(review): ioctl() returns -1 on failure, so -ret below is
         * always 1 (EPERM) regardless of the real errno -- confirm whether
         * errno was intended here.
         */
        if (!bcontainer->initialized) {
            if (!bcontainer->error) {
                error_setg_errno(&bcontainer->error, -ret,
                                 "Memory registering failed");
            }
        } else {
            hw_error("vfio: Memory registering failed, unable to continue");
        }
    }
}
117 
/*
 * MemoryListener::region_del hook: undo the preregistration performed by
 * vfio_prereg_listener_region_add() via VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY.
 */
static void vfio_prereg_listener_region_del(MemoryListener *listener,
                                            MemoryRegionSection *section)
{
    VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
                                                  prereg_listener);
    VFIOContainer *container = &scontainer->container;
    const hwaddr gpa = section->offset_within_address_space;
    hwaddr end;
    int ret;
    hwaddr page_mask = qemu_real_host_page_mask();
    struct vfio_iommu_spapr_register_memory reg = {
        .argsz = sizeof(reg),
        .flags = 0,
    };

    /* Mirror region_add: ignore sections that were never preregistered. */
    if (vfio_prereg_listener_skipped_section(section)) {
        trace_vfio_prereg_listener_region_del_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    /* The ioctl needs host-page-aligned address, offset and size. */
    if (unlikely((section->offset_within_address_space & ~page_mask) ||
                 (section->offset_within_region & ~page_mask) ||
                 (int128_get64(section->size) & ~page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    end = section->offset_within_address_space + int128_get64(section->size);
    if (gpa >= end) {
        return;
    }

    reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
    reg.size = end - gpa;

    /*
     * Failures are only traced, not reported.
     * NOTE(review): there is no memory_region_unref() pairing the ref taken
     * in region_add -- confirm the reference is dropped elsewhere.
     */
    ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
    trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
}
159 
/*
 * Listener registered on address_space_memory (v2 IOMMU only, see
 * vfio_spapr_container_setup) to preregister all guest RAM.
 */
static const MemoryListener vfio_prereg_listener = {
    .name = "vfio-pre-reg",
    .region_add = vfio_prereg_listener_region_add,
    .region_del = vfio_prereg_listener_region_del,
};
165 
166 static void vfio_host_win_add(VFIOSpaprContainer *scontainer, hwaddr min_iova,
167                               hwaddr max_iova, uint64_t iova_pgsizes)
168 {
169     VFIOHostDMAWindow *hostwin;
170 
171     QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
172         if (ranges_overlap(hostwin->min_iova,
173                            hostwin->max_iova - hostwin->min_iova + 1,
174                            min_iova,
175                            max_iova - min_iova + 1)) {
176             hw_error("%s: Overlapped IOMMU are not enabled", __func__);
177         }
178     }
179 
180     hostwin = g_malloc0(sizeof(*hostwin));
181 
182     hostwin->min_iova = min_iova;
183     hostwin->max_iova = max_iova;
184     hostwin->iova_pgsizes = iova_pgsizes;
185     QLIST_INSERT_HEAD(&scontainer->hostwin_list, hostwin, hostwin_next);
186 }
187 
188 static int vfio_host_win_del(VFIOSpaprContainer *scontainer,
189                              hwaddr min_iova, hwaddr max_iova)
190 {
191     VFIOHostDMAWindow *hostwin;
192 
193     QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
194         if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
195             QLIST_REMOVE(hostwin, hostwin_next);
196             g_free(hostwin);
197             return 0;
198         }
199     }
200 
201     return -1;
202 }
203 
204 static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container,
205                                             hwaddr iova, hwaddr end)
206 {
207     VFIOHostDMAWindow *hostwin;
208     bool hostwin_found = false;
209 
210     QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
211         if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
212             hostwin_found = true;
213             break;
214         }
215     }
216 
217     return hostwin_found ? hostwin : NULL;
218 }
219 
220 static int vfio_spapr_remove_window(VFIOContainer *container,
221                                     hwaddr offset_within_address_space)
222 {
223     struct vfio_iommu_spapr_tce_remove remove = {
224         .argsz = sizeof(remove),
225         .start_addr = offset_within_address_space,
226     };
227     int ret;
228 
229     ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
230     if (ret) {
231         error_report("Failed to remove window at %"PRIx64,
232                      (uint64_t)remove.start_addr);
233         return -errno;
234     }
235 
236     trace_vfio_spapr_remove_window(offset_within_address_space);
237 
238     return 0;
239 }
240 
/*
 * Create a TCE table (DMA window) in the host kernel covering @section,
 * negotiating the IOMMU page size and number of table levels with the
 * kernel.  On success, stores the chosen page size in @pgsize and returns
 * true; on failure sets @errp and returns false.
 */
static bool vfio_spapr_create_window(VFIOContainer *container,
                                    MemoryRegionSection *section,
                                    hwaddr *pgsize, Error **errp)
{
    int ret = 0;
    VFIOContainerBase *bcontainer = &container->bcontainer;
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
    uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
    unsigned entries, bits_total, bits_per_level, max_levels, ddw_levels;
    struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
    long rampagesize = qemu_minrampagesize();

    /*
     * The host might not support the guest supported IOMMU page size,
     * so we will use smaller physical IOMMU pages to back them.
     */
    if (pagesize > rampagesize) {
        pagesize = rampagesize;
    }
    /* Pick the largest host-supported page size not exceeding @pagesize. */
    pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1));
    pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0;
    if (!pagesize) {
        error_setg_errno(errp, EINVAL, "Host doesn't support page size 0x%"PRIx64
                         ", the supported mask is 0x%lx",
                         memory_region_iommu_get_min_page_size(iommu_mr),
                         bcontainer->pgsizes);
        return false;
    }

    /*
     * FIXME: For VFIO iommu types which have KVM acceleration to
     * avoid bouncing all map/unmaps through qemu this way, this
     * would be the right place to wire that up (tell the KVM
     * device emulation the VFIO iommu handles to use).
     */
    create.window_size = int128_get64(section->size);
    create.page_shift = ctz64(pagesize);
    /*
     * SPAPR host supports multilevel TCE tables. We try to guess optimal
     * levels number and if this fails (for example due to the host memory
     * fragmentation), we increase levels. The DMA address structure is:
     * rrrrrrrr rxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx iiiiiiii
     * where:
     *   r = reserved (bits >= 55 are reserved in the existing hardware)
     *   i = IOMMU page offset (64K in this example)
     *   x = bits to index a TCE which can be split to equal chunks to index
     *      within the level.
     * The aim is to split "x" to smaller possible number of levels.
     */
    entries = create.window_size >> create.page_shift;
    /* bits_total is number of "x" needed */
    bits_total = ctz64(entries * sizeof(uint64_t));
    /*
     * bits_per_level is a safe guess of how much we can allocate per level:
     * 8 is the current minimum for CONFIG_FORCE_MAX_ZONEORDER and MAX_ORDER
     * is usually bigger than that.
     * Below we look at qemu_real_host_page_size as TCEs are allocated from
     * system pages.
     */
    bits_per_level = ctz64(qemu_real_host_page_size()) + 8;
    create.levels = bits_total / bits_per_level;

    ddw_levels = scontainer->levels;
    if (ddw_levels > 1) {
        /* Round up, then retry with more levels until the kernel accepts. */
        if (bits_total % bits_per_level) {
            ++create.levels;
        }
        max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
        for ( ; create.levels <= max_levels; ++create.levels) {
            ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
            if (!ret) {
                break;
            }
        }
    } else { /* ddw_levels == 1 */
        /* Single-level host: the guess must already fit in one level. */
        if (create.levels > ddw_levels) {
            error_setg_errno(errp, EINVAL, "Host doesn't support multi-level TCE tables"
                             ". Use larger IO page size. Supported mask is 0x%lx",
                             bcontainer->pgsizes);
            return false;
        }
        ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
    }

    if (ret) {
        error_setg_errno(errp, errno, "Failed to create a window, ret = %d", ret);
        return false;
    }

    /* The kernel picks the start address; we require an exact match. */
    if (create.start_addr != section->offset_within_address_space) {
        vfio_spapr_remove_window(container, create.start_addr);

        error_setg_errno(errp, EINVAL, "Host doesn't support DMA window at %"HWADDR_PRIx
                         ", must be %"PRIx64, section->offset_within_address_space,
                         (uint64_t)create.start_addr);
        return false;
    }
    trace_vfio_spapr_create_window(create.page_shift,
                                   create.levels,
                                   create.window_size,
                                   create.start_addr);
    *pgsize = pagesize;

    return true;
}
348 
/*
 * VFIOIOMMUClass::add_window: validate @section against the host DMA
 * windows and, for the v2 IOMMU, create a new window for it.  With KVM
 * enabled, also attaches the new TCE table fd to every group in the
 * container for in-kernel TCE acceleration.
 *
 * Returns true on success (or when nothing needs doing), false with
 * @errp set on failure.
 */
static bool
vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
                                        MemoryRegionSection *section,
                                        Error **errp)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    VFIOHostDMAWindow *hostwin;
    hwaddr pgsize = 0;
    int ret;

    /*
     * VFIO_SPAPR_TCE_IOMMU supports a single host window between
     * [dma32_window_start, dma32_window_size), we need to ensure
     * the section fall in this range.
     */
    if (container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
        hwaddr iova, end;

        iova = section->offset_within_address_space;
        end = iova + int128_get64(section->size) - 1;

        if (!vfio_find_hostwin(scontainer, iova, end)) {
            error_setg(errp, "Container %p can't map guest IOVA region"
                       " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
                       iova, end);
            return false;
        }
        return true;
    }

    /* Anything other than the v2 sPAPR IOMMU needs no window handling. */
    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return true;
    }

    /* For now intersections are not allowed, we may relax this later */
    QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           section->offset_within_address_space,
                           int128_get64(section->size))) {
            error_setg(errp,
                "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
                "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
                section->offset_within_address_space,
                section->offset_within_address_space +
                    int128_get64(section->size) - 1,
                hostwin->min_iova, hostwin->max_iova);
            return false;
        }
    }

    /* Create the window in the kernel and record it locally. */
    ret = vfio_spapr_create_window(container, section, &pgsize, errp);
    if (!ret) {
        return false;
    }

    vfio_host_win_add(scontainer, section->offset_within_address_space,
                      section->offset_within_address_space +
                      int128_get64(section->size) - 1, pgsize);
#ifdef CONFIG_KVM
    if (kvm_enabled()) {
        VFIOGroup *group;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        struct kvm_vfio_spapr_tce param;
        struct kvm_device_attr attr = {
            .group = KVM_DEV_VFIO_GROUP,
            .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
            .addr = (uint64_t)(unsigned long)&param,
        };

        /* Best effort: skip acceleration if no TCE fd can be obtained. */
        if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
                                          &param.tablefd)) {
            QLIST_FOREACH(group, &container->group_list, container_next) {
                param.groupfd = group->fd;
                if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
                    error_setg_errno(errp, errno,
                                     "vfio: failed GROUP_SET_SPAPR_TCE for "
                                     "KVM VFIO device %d and group fd %d",
                                     param.tablefd, param.groupfd);
                    return false;
                }
                trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
            }
        }
    }
#endif
    return true;
}
440 
441 static void
442 vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer,
443                                         MemoryRegionSection *section)
444 {
445     VFIOContainer *container = container_of(bcontainer, VFIOContainer,
446                                             bcontainer);
447     VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
448                                                   container);
449 
450     if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
451         return;
452     }
453 
454     vfio_spapr_remove_window(container,
455                              section->offset_within_address_space);
456     if (vfio_host_win_del(scontainer,
457                           section->offset_within_address_space,
458                           section->offset_within_address_space +
459                           int128_get64(section->size) - 1) < 0) {
460         hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
461                  __func__, section->offset_within_address_space);
462     }
463 }
464 
465 static void vfio_spapr_container_release(VFIOContainerBase *bcontainer)
466 {
467     VFIOContainer *container = container_of(bcontainer, VFIOContainer,
468                                             bcontainer);
469     VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
470                                                   container);
471     VFIOHostDMAWindow *hostwin, *next;
472 
473     if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
474         memory_listener_unregister(&scontainer->prereg_listener);
475     }
476     QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next,
477                        next) {
478         QLIST_REMOVE(hostwin, hostwin_next);
479         g_free(hostwin);
480     }
481 }
482 
/*
 * VFIOIOMMUClass::setup: prepare a freshly opened sPAPR container.
 *
 * v1 IOMMU: enable the container and record its fixed 32-bit window.
 * v2 IOMMU: register the RAM preregistration listener and remove the
 * default window so add/del_section_window can manage windows on demand.
 *
 * Returns true on success, false with @errp set otherwise.
 */
static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
                                       Error **errp)
{
    VFIOContainer *container = container_of(bcontainer, VFIOContainer,
                                            bcontainer);
    VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
                                                  container);
    struct vfio_iommu_spapr_tce_info info;
    bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
    int ret, fd = container->fd;

    QLIST_INIT(&scontainer->hostwin_list);

    /*
     * The host kernel code implementing VFIO_IOMMU_DISABLE is called
     * when container fd is closed so we do not call it explicitly
     * in this file.
     */
    if (!v2) {
        ret = ioctl(fd, VFIO_IOMMU_ENABLE);
        if (ret) {
            error_setg_errno(errp, errno, "failed to enable container");
            return false;
        }
    } else {
        scontainer->prereg_listener = vfio_prereg_listener;

        /* Registering replays existing RAM regions through region_add. */
        memory_listener_register(&scontainer->prereg_listener,
                                 &address_space_memory);
        if (bcontainer->error) {
            error_propagate_prepend(errp, bcontainer->error,
                    "RAM memory listener initialization failed: ");
            goto listener_unregister_exit;
        }
    }

    info.argsz = sizeof(info);
    ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
    if (ret) {
        error_setg_errno(errp, errno,
                         "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
        goto listener_unregister_exit;
    }

    /*
     * NOTE(review): info.ddw is read even for the v1 IOMMU -- confirm the
     * kernel fills the ddw fields in that case.
     */
    scontainer->levels = info.ddw.levels;

    if (v2) {
        bcontainer->pgsizes = info.ddw.pgsizes;
        /*
         * There is a default window in just created container.
         * To make region_add/del simpler, we better remove this
         * window now and let those iommu_listener callbacks
         * create/remove them when needed.
         */
        ret = vfio_spapr_remove_window(container, info.dma32_window_start);
        if (ret) {
            error_setg_errno(errp, -ret,
                             "failed to remove existing window");
            goto listener_unregister_exit;
        }
    } else {
        /* The default table uses 4K pages */
        bcontainer->pgsizes = 0x1000;
        vfio_host_win_add(scontainer, info.dma32_window_start,
                          info.dma32_window_start +
                          info.dma32_window_size - 1,
                          0x1000);
    }

    return true;

listener_unregister_exit:
    /* Only the v2 path registered the listener above. */
    if (v2) {
        memory_listener_unregister(&scontainer->prereg_listener);
    }
    return false;
}
560 
561 static void vfio_iommu_spapr_class_init(ObjectClass *klass, void *data)
562 {
563     VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
564 
565     vioc->add_window = vfio_spapr_container_add_section_window;
566     vioc->del_window = vfio_spapr_container_del_section_window;
567     vioc->release = vfio_spapr_container_release;
568     vioc->setup = vfio_spapr_container_setup;
569 };
570 
/* QOM registration: sPAPR container type derived from the legacy IOMMU. */
static const TypeInfo types[] = {
    {
        .name = TYPE_VFIO_IOMMU_SPAPR,
        .parent = TYPE_VFIO_IOMMU_LEGACY,
        .instance_size = sizeof(VFIOSpaprContainer),
        .class_init = vfio_iommu_spapr_class_init,
    },
};

DEFINE_TYPES(types)
581