xref: /openbmc/qemu/hw/vfio/helpers.c (revision 05caa062)
1 /*
2  * low level and IOMMU backend agnostic helpers used by VFIO devices,
3  * related to regions, interrupts, capabilities
4  *
5  * Copyright Red Hat, Inc. 2012
6  *
7  * Authors:
8  *  Alex Williamson <alex.williamson@redhat.com>
9  *
10  * This work is licensed under the terms of the GNU GPL, version 2.  See
11  * the COPYING file in the top-level directory.
12  *
13  * Based on qemu-kvm device-assignment:
14  *  Adapted for KVM by Qumranet.
15  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
16  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
17  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
18  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
19  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
20  */
21 
22 #include "qemu/osdep.h"
23 #include <sys/ioctl.h>
24 
25 #include "hw/vfio/vfio-common.h"
26 #include "hw/hw.h"
27 #include "trace.h"
28 #include "qapi/error.h"
29 #include "qemu/error-report.h"
30 #include "monitor/monitor.h"
31 
32 /*
33  * Common VFIO interrupt disable
34  */
35 void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
36 {
37     struct vfio_irq_set irq_set = {
38         .argsz = sizeof(irq_set),
39         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
40         .index = index,
41         .start = 0,
42         .count = 0,
43     };
44 
45     ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
46 }
47 
48 void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
49 {
50     struct vfio_irq_set irq_set = {
51         .argsz = sizeof(irq_set),
52         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
53         .index = index,
54         .start = 0,
55         .count = 1,
56     };
57 
58     ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
59 }
60 
61 void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
62 {
63     struct vfio_irq_set irq_set = {
64         .argsz = sizeof(irq_set),
65         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
66         .index = index,
67         .start = 0,
68         .count = 1,
69     };
70 
71     ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
72 }
73 
74 static inline const char *action_to_str(int action)
75 {
76     switch (action) {
77     case VFIO_IRQ_SET_ACTION_MASK:
78         return "MASK";
79     case VFIO_IRQ_SET_ACTION_UNMASK:
80         return "UNMASK";
81     case VFIO_IRQ_SET_ACTION_TRIGGER:
82         return "TRIGGER";
83     default:
84         return "UNKNOWN ACTION";
85     }
86 }
87 
88 static const char *index_to_str(VFIODevice *vbasedev, int index)
89 {
90     if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
91         return NULL;
92     }
93 
94     switch (index) {
95     case VFIO_PCI_INTX_IRQ_INDEX:
96         return "INTX";
97     case VFIO_PCI_MSI_IRQ_INDEX:
98         return "MSI";
99     case VFIO_PCI_MSIX_IRQ_INDEX:
100         return "MSIX";
101     case VFIO_PCI_ERR_IRQ_INDEX:
102         return "ERR";
103     case VFIO_PCI_REQ_IRQ_INDEX:
104         return "REQ";
105     default:
106         return NULL;
107     }
108 }
109 
110 bool vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
111                             int action, int fd, Error **errp)
112 {
113     ERRP_GUARD();
114     g_autofree struct vfio_irq_set *irq_set = NULL;
115     int argsz;
116     const char *name;
117     int32_t *pfd;
118 
119     argsz = sizeof(*irq_set) + sizeof(*pfd);
120 
121     irq_set = g_malloc0(argsz);
122     irq_set->argsz = argsz;
123     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
124     irq_set->index = index;
125     irq_set->start = subindex;
126     irq_set->count = 1;
127     pfd = (int32_t *)&irq_set->data;
128     *pfd = fd;
129 
130     if (!ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
131         return true;
132     }
133 
134     error_setg_errno(errp, errno, "VFIO_DEVICE_SET_IRQS failure");
135 
136     name = index_to_str(vbasedev, index);
137     if (name) {
138         error_prepend(errp, "%s-%d: ", name, subindex);
139     } else {
140         error_prepend(errp, "index %d-%d: ", index, subindex);
141     }
142     error_prepend(errp,
143                   "Failed to %s %s eventfd signaling for interrupt ",
144                   fd < 0 ? "tear down" : "set up", action_to_str(action));
145     return false;
146 }
147 
148 /*
149  * IO Port/MMIO - Beware of the endians, VFIO is always little endian
150  */
151 void vfio_region_write(void *opaque, hwaddr addr,
152                        uint64_t data, unsigned size)
153 {
154     VFIORegion *region = opaque;
155     VFIODevice *vbasedev = region->vbasedev;
156     union {
157         uint8_t byte;
158         uint16_t word;
159         uint32_t dword;
160         uint64_t qword;
161     } buf;
162 
163     switch (size) {
164     case 1:
165         buf.byte = data;
166         break;
167     case 2:
168         buf.word = cpu_to_le16(data);
169         break;
170     case 4:
171         buf.dword = cpu_to_le32(data);
172         break;
173     case 8:
174         buf.qword = cpu_to_le64(data);
175         break;
176     default:
177         hw_error("vfio: unsupported write size, %u bytes", size);
178         break;
179     }
180 
181     if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
182         error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
183                      ",%d) failed: %m",
184                      __func__, vbasedev->name, region->nr,
185                      addr, data, size);
186     }
187 
188     trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
189 
190     /*
191      * A read or write to a BAR always signals an INTx EOI.  This will
192      * do nothing if not pending (including not in INTx mode).  We assume
193      * that a BAR access is in response to an interrupt and that BAR
194      * accesses will service the interrupt.  Unfortunately, we don't know
195      * which access will service the interrupt, so we're potentially
196      * getting quite a few host interrupts per guest interrupt.
197      */
198     vbasedev->ops->vfio_eoi(vbasedev);
199 }
200 
201 uint64_t vfio_region_read(void *opaque,
202                           hwaddr addr, unsigned size)
203 {
204     VFIORegion *region = opaque;
205     VFIODevice *vbasedev = region->vbasedev;
206     union {
207         uint8_t byte;
208         uint16_t word;
209         uint32_t dword;
210         uint64_t qword;
211     } buf;
212     uint64_t data = 0;
213 
214     if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
215         error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
216                      __func__, vbasedev->name, region->nr,
217                      addr, size);
218         return (uint64_t)-1;
219     }
220     switch (size) {
221     case 1:
222         data = buf.byte;
223         break;
224     case 2:
225         data = le16_to_cpu(buf.word);
226         break;
227     case 4:
228         data = le32_to_cpu(buf.dword);
229         break;
230     case 8:
231         data = le64_to_cpu(buf.qword);
232         break;
233     default:
234         hw_error("vfio: unsupported read size, %u bytes", size);
235         break;
236     }
237 
238     trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);
239 
240     /* Same as write above */
241     vbasedev->ops->vfio_eoi(vbasedev);
242 
243     return data;
244 }
245 
246 const MemoryRegionOps vfio_region_ops = {
247     .read = vfio_region_read,
248     .write = vfio_region_write,
249     .endianness = DEVICE_LITTLE_ENDIAN,
250     .valid = {
251         .min_access_size = 1,
252         .max_access_size = 8,
253     },
254     .impl = {
255         .min_access_size = 1,
256         .max_access_size = 8,
257     },
258 };
259 
260 int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size)
261 {
262     vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
263     vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) /
264                                          BITS_PER_BYTE;
265     vbmap->bitmap = g_try_malloc0(vbmap->size);
266     if (!vbmap->bitmap) {
267         return -ENOMEM;
268     }
269 
270     return 0;
271 }
272 
273 struct vfio_info_cap_header *
274 vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id)
275 {
276     struct vfio_info_cap_header *hdr;
277 
278     for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
279         if (hdr->id == id) {
280             return hdr;
281         }
282     }
283 
284     return NULL;
285 }
286 
287 struct vfio_info_cap_header *
288 vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
289 {
290     if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
291         return NULL;
292     }
293 
294     return vfio_get_cap((void *)info, info->cap_offset, id);
295 }
296 
297 struct vfio_info_cap_header *
298 vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
299 {
300     if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) {
301         return NULL;
302     }
303 
304     return vfio_get_cap((void *)info, info->cap_offset, id);
305 }
306 
307 static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
308                                           struct vfio_region_info *info)
309 {
310     struct vfio_info_cap_header *hdr;
311     struct vfio_region_info_cap_sparse_mmap *sparse;
312     int i, j;
313 
314     hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
315     if (!hdr) {
316         return -ENODEV;
317     }
318 
319     sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);
320 
321     trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
322                                          region->nr, sparse->nr_areas);
323 
324     region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);
325 
326     for (i = 0, j = 0; i < sparse->nr_areas; i++) {
327         if (sparse->areas[i].size) {
328             trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
329                                             sparse->areas[i].offset +
330                                             sparse->areas[i].size - 1);
331             region->mmaps[j].offset = sparse->areas[i].offset;
332             region->mmaps[j].size = sparse->areas[i].size;
333             j++;
334         }
335     }
336 
337     region->nr_mmaps = j;
338     region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));
339 
340     return 0;
341 }
342 
343 int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
344                       int index, const char *name)
345 {
346     g_autofree struct vfio_region_info *info = NULL;
347     int ret;
348 
349     ret = vfio_get_region_info(vbasedev, index, &info);
350     if (ret) {
351         return ret;
352     }
353 
354     region->vbasedev = vbasedev;
355     region->flags = info->flags;
356     region->size = info->size;
357     region->fd_offset = info->offset;
358     region->nr = index;
359 
360     if (region->size) {
361         region->mem = g_new0(MemoryRegion, 1);
362         memory_region_init_io(region->mem, obj, &vfio_region_ops,
363                               region, name, region->size);
364 
365         if (!vbasedev->no_mmap &&
366             region->flags & VFIO_REGION_INFO_FLAG_MMAP) {
367 
368             ret = vfio_setup_region_sparse_mmaps(region, info);
369 
370             if (ret) {
371                 region->nr_mmaps = 1;
372                 region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
373                 region->mmaps[0].offset = 0;
374                 region->mmaps[0].size = region->size;
375             }
376         }
377     }
378 
379     trace_vfio_region_setup(vbasedev->name, index, name,
380                             region->flags, region->fd_offset, region->size);
381     return 0;
382 }
383 
384 static void vfio_subregion_unmap(VFIORegion *region, int index)
385 {
386     trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
387                             region->mmaps[index].offset,
388                             region->mmaps[index].offset +
389                             region->mmaps[index].size - 1);
390     memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
391     munmap(region->mmaps[index].mmap, region->mmaps[index].size);
392     object_unparent(OBJECT(&region->mmaps[index].mem));
393     region->mmaps[index].mmap = NULL;
394 }
395 
396 int vfio_region_mmap(VFIORegion *region)
397 {
398     int i, prot = 0;
399     char *name;
400 
401     if (!region->mem) {
402         return 0;
403     }
404 
405     prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
406     prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
407 
408     for (i = 0; i < region->nr_mmaps; i++) {
409         region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
410                                      MAP_SHARED, region->vbasedev->fd,
411                                      region->fd_offset +
412                                      region->mmaps[i].offset);
413         if (region->mmaps[i].mmap == MAP_FAILED) {
414             int ret = -errno;
415 
416             trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
417                                          region->fd_offset +
418                                          region->mmaps[i].offset,
419                                          region->fd_offset +
420                                          region->mmaps[i].offset +
421                                          region->mmaps[i].size - 1, ret);
422 
423             region->mmaps[i].mmap = NULL;
424 
425             for (i--; i >= 0; i--) {
426                 vfio_subregion_unmap(region, i);
427             }
428 
429             return ret;
430         }
431 
432         name = g_strdup_printf("%s mmaps[%d]",
433                                memory_region_name(region->mem), i);
434         memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
435                                           memory_region_owner(region->mem),
436                                           name, region->mmaps[i].size,
437                                           region->mmaps[i].mmap);
438         g_free(name);
439         memory_region_add_subregion(region->mem, region->mmaps[i].offset,
440                                     &region->mmaps[i].mem);
441 
442         trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
443                                region->mmaps[i].offset,
444                                region->mmaps[i].offset +
445                                region->mmaps[i].size - 1);
446     }
447 
448     return 0;
449 }
450 
451 void vfio_region_unmap(VFIORegion *region)
452 {
453     int i;
454 
455     if (!region->mem) {
456         return;
457     }
458 
459     for (i = 0; i < region->nr_mmaps; i++) {
460         if (region->mmaps[i].mmap) {
461             vfio_subregion_unmap(region, i);
462         }
463     }
464 }
465 
466 void vfio_region_exit(VFIORegion *region)
467 {
468     int i;
469 
470     if (!region->mem) {
471         return;
472     }
473 
474     for (i = 0; i < region->nr_mmaps; i++) {
475         if (region->mmaps[i].mmap) {
476             memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
477         }
478     }
479 
480     trace_vfio_region_exit(region->vbasedev->name, region->nr);
481 }
482 
483 void vfio_region_finalize(VFIORegion *region)
484 {
485     int i;
486 
487     if (!region->mem) {
488         return;
489     }
490 
491     for (i = 0; i < region->nr_mmaps; i++) {
492         if (region->mmaps[i].mmap) {
493             munmap(region->mmaps[i].mmap, region->mmaps[i].size);
494             object_unparent(OBJECT(&region->mmaps[i].mem));
495         }
496     }
497 
498     object_unparent(OBJECT(region->mem));
499 
500     g_free(region->mem);
501     g_free(region->mmaps);
502 
503     trace_vfio_region_finalize(region->vbasedev->name, region->nr);
504 
505     region->mem = NULL;
506     region->mmaps = NULL;
507     region->nr_mmaps = 0;
508     region->size = 0;
509     region->flags = 0;
510     region->nr = 0;
511 }
512 
513 void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
514 {
515     int i;
516 
517     if (!region->mem) {
518         return;
519     }
520 
521     for (i = 0; i < region->nr_mmaps; i++) {
522         if (region->mmaps[i].mmap) {
523             memory_region_set_enabled(&region->mmaps[i].mem, enabled);
524         }
525     }
526 
527     trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
528                                         enabled);
529 }
530 
531 int vfio_get_region_info(VFIODevice *vbasedev, int index,
532                          struct vfio_region_info **info)
533 {
534     size_t argsz = sizeof(struct vfio_region_info);
535 
536     *info = g_malloc0(argsz);
537 
538     (*info)->index = index;
539 retry:
540     (*info)->argsz = argsz;
541 
542     if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
543         g_free(*info);
544         *info = NULL;
545         return -errno;
546     }
547 
548     if ((*info)->argsz > argsz) {
549         argsz = (*info)->argsz;
550         *info = g_realloc(*info, argsz);
551 
552         goto retry;
553     }
554 
555     return 0;
556 }
557 
558 int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
559                              uint32_t subtype, struct vfio_region_info **info)
560 {
561     int i;
562 
563     for (i = 0; i < vbasedev->num_regions; i++) {
564         struct vfio_info_cap_header *hdr;
565         struct vfio_region_info_cap_type *cap_type;
566 
567         if (vfio_get_region_info(vbasedev, i, info)) {
568             continue;
569         }
570 
571         hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
572         if (!hdr) {
573             g_free(*info);
574             continue;
575         }
576 
577         cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);
578 
579         trace_vfio_get_dev_region(vbasedev->name, i,
580                                   cap_type->type, cap_type->subtype);
581 
582         if (cap_type->type == type && cap_type->subtype == subtype) {
583             return 0;
584         }
585 
586         g_free(*info);
587     }
588 
589     *info = NULL;
590     return -ENODEV;
591 }
592 
593 bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
594 {
595     g_autofree struct vfio_region_info *info = NULL;
596     bool ret = false;
597 
598     if (!vfio_get_region_info(vbasedev, region, &info)) {
599         if (vfio_get_region_info_cap(info, cap_type)) {
600             ret = true;
601         }
602     }
603 
604     return ret;
605 }
606 
607 bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp)
608 {
609     ERRP_GUARD();
610     struct stat st;
611 
612     if (vbasedev->fd < 0) {
613         if (stat(vbasedev->sysfsdev, &st) < 0) {
614             error_setg_errno(errp, errno, "no such host device");
615             error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev);
616             return false;
617         }
618         /* User may specify a name, e.g: VFIO platform device */
619         if (!vbasedev->name) {
620             vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
621         }
622     } else {
623         if (!vbasedev->iommufd) {
624             error_setg(errp, "Use FD passing only with iommufd backend");
625             return false;
626         }
627         /*
628          * Give a name with fd so any function printing out vbasedev->name
629          * will not break.
630          */
631         if (!vbasedev->name) {
632             vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd);
633         }
634     }
635 
636     return true;
637 }
638 
639 void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
640 {
641     ERRP_GUARD();
642     int fd = monitor_fd_param(monitor_cur(), str, errp);
643 
644     if (fd < 0) {
645         error_prepend(errp, "Could not parse remote object fd %s:", str);
646         return;
647     }
648     vbasedev->fd = fd;
649 }
650 
651 void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
652                       DeviceState *dev, bool ram_discard)
653 {
654     vbasedev->type = type;
655     vbasedev->ops = ops;
656     vbasedev->dev = dev;
657     vbasedev->fd = -1;
658 
659     vbasedev->ram_block_discard_allowed = ram_discard;
660 }
661 
662 int vfio_device_get_aw_bits(VFIODevice *vdev)
663 {
664     /*
665      * iova_ranges is a sorted list. For old kernels that support
666      * VFIO but not support query of iova ranges, iova_ranges is NULL,
667      * in this case HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX(64) is returned.
668      */
669     GList *l = g_list_last(vdev->bcontainer->iova_ranges);
670 
671     if (l) {
672         Range *range = l->data;
673         return range_get_last_bit(range) + 1;
674     }
675 
676     return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX;
677 }
678 
679 bool vfio_device_is_mdev(VFIODevice *vbasedev)
680 {
681     g_autofree char *subsys = NULL;
682     g_autofree char *tmp = NULL;
683 
684     if (!vbasedev->sysfsdev) {
685         return false;
686     }
687 
688     tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev);
689     subsys = realpath(tmp, NULL);
690     return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0);
691 }
692 
693 bool vfio_device_hiod_realize(VFIODevice *vbasedev, Error **errp)
694 {
695     HostIOMMUDevice *hiod = vbasedev->hiod;
696 
697     if (!hiod) {
698         return true;
699     }
700 
701     return HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp);
702 }
703