xref: /openbmc/qemu/hw/vfio/device.c (revision 072470dfae125f5622e2250ebd1daf626d4023b7)
1 /*
2  * VFIO device
3  *
4  * Copyright Red Hat, Inc. 2012
5  *
6  * Authors:
7  *  Alex Williamson <alex.williamson@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Based on qemu-kvm device-assignment:
13  *  Adapted for KVM by Qumranet.
14  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19  */
20 
21 #include "qemu/osdep.h"
22 #include <sys/ioctl.h>
23 
24 #include "hw/vfio/vfio-device.h"
25 #include "hw/vfio/pci.h"
26 #include "hw/hw.h"
27 #include "trace.h"
28 #include "qapi/error.h"
29 #include "qemu/error-report.h"
30 #include "qemu/units.h"
31 #include "migration/cpr.h"
32 #include "migration/blocker.h"
33 #include "monitor/monitor.h"
34 #include "vfio-helpers.h"
35 
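/*
 * Global list of every VFIODevice, linked through ->global_next and walked
 * by vfio_device_reset_handler() below.
 */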
36 VFIODeviceList vfio_device_list =
37     QLIST_HEAD_INITIALIZER(vfio_device_list);
38 
39 /*
40  * We want to differentiate hot reset of multiple in-use devices vs
41  * hot reset of a single in-use device. VFIO_DEVICE_RESET will already
42  * handle the case of doing hot resets when there is only a single
43  * device per bus. "In-use" here refers to how many VFIODevices are
44  * affected. A hot reset that affects multiple devices, but only a
45  * single in-use device, means that we can call it from our bus
46  * ->reset() callback since the extent is effectively a single
47  * device. This allows us to make use of it in the hotplug path. When
48  * there are multiple in-use devices, we can only trigger the hot
49  * reset during a system reset and thus from our reset handler. We
50  * separate _one vs _multi here so that we don't overlap and do a
51  * double reset on the system reset path where both our reset handler
52  * and ->reset() callback are used. Calling _one() will only do a hot
53  * reset for the single in-use device case; calling _multi() will do
54  * nothing if a _one() would have been sufficient.
55  */
56 void vfio_device_reset_handler(void *opaque)
57 {
58     VFIODevice *vbasedev;
59 
60     trace_vfio_device_reset_handler();
61     QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
62         if (vbasedev->dev->realized) {
63             vbasedev->ops->vfio_compute_needs_reset(vbasedev);
64         }
65     }
66 
67     QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
68         if (vbasedev->dev->realized && vbasedev->needs_reset) {
69             vbasedev->ops->vfio_hot_reset_multi(vbasedev);
70         }
71     }
72 }
73 
74 /*
75  * Common VFIO interrupt disable
76  */
77 void vfio_device_irq_disable(VFIODevice *vbasedev, int index)
78 {
79     struct vfio_irq_set irq_set = {
80         .argsz = sizeof(irq_set),
81         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
82         .index = index,
83         .start = 0,
84         .count = 0,
85     };
86 
87     vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
88 }
89 
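/* Unmask subindex 0 of the given interrupt index. */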
90 void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
91 {
92     struct vfio_irq_set irq_set = {
93         .argsz = sizeof(irq_set),
94         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
95         .index = index,
96         .start = 0,
97         .count = 1,
98     };
99 
100     vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
101 }
102 
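/* Mask subindex 0 of the given interrupt index. */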
103 void vfio_device_irq_mask(VFIODevice *vbasedev, int index)
104 {
105     struct vfio_irq_set irq_set = {
106         .argsz = sizeof(irq_set),
107         .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
108         .index = index,
109         .start = 0,
110         .count = 1,
111     };
112 
113     vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
114 }
115 
116 static inline const char *action_to_str(int action)
117 {
118     switch (action) {
119     case VFIO_IRQ_SET_ACTION_MASK:
120         return "MASK";
121     case VFIO_IRQ_SET_ACTION_UNMASK:
122         return "UNMASK";
123     case VFIO_IRQ_SET_ACTION_TRIGGER:
124         return "TRIGGER";
125     default:
126         return "UNKNOWN ACTION";
127     }
128 }
129 
130 static const char *index_to_str(VFIODevice *vbasedev, int index)
131 {
132     if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
133         return NULL;
134     }
135 
136     switch (index) {
137     case VFIO_PCI_INTX_IRQ_INDEX:
138         return "INTX";
139     case VFIO_PCI_MSI_IRQ_INDEX:
140         return "MSI";
141     case VFIO_PCI_MSIX_IRQ_INDEX:
142         return "MSIX";
143     case VFIO_PCI_ERR_IRQ_INDEX:
144         return "ERR";
145     case VFIO_PCI_REQ_IRQ_INDEX:
146         return "REQ";
147     default:
148         return NULL;
149     }
150 }
151 
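/*
 * Associate an eventfd with one interrupt (index/subindex) via
 * VFIO_DEVICE_SET_IRQS; a negative fd tears the association down.
 * Callers typically pass the result of event_notifier_get_fd() here.
 * Returns true on success, false with errp set.
 */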
152 bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex,
153                                    int action, int fd, Error **errp)
154 {
155     ERRP_GUARD();
156     g_autofree struct vfio_irq_set *irq_set = NULL;
157     int argsz;
158     const char *name;
159     int32_t *pfd;
160 
161     argsz = sizeof(*irq_set) + sizeof(*pfd);
162 
163     irq_set = g_malloc0(argsz);
164     irq_set->argsz = argsz;
165     irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
166     irq_set->index = index;
167     irq_set->start = subindex;
168     irq_set->count = 1;
169     pfd = (int32_t *)&irq_set->data;
170     *pfd = fd;
171 
172     if (!vbasedev->io_ops->set_irqs(vbasedev, irq_set)) {
173         return true;
174     }
175 
176     error_setg_errno(errp, errno, "VFIO_DEVICE_SET_IRQS failure");
177 
178     name = index_to_str(vbasedev, index);
179     if (name) {
180         error_prepend(errp, "%s-%d: ", name, subindex);
181     } else {
182         error_prepend(errp, "index %d-%d: ", index, subindex);
183     }
184     error_prepend(errp,
185                   "Failed to %s %s eventfd signaling for interrupt ",
186                   fd < 0 ? "tear down" : "set up", action_to_str(action));
187     return false;
188 }
189 
190 int vfio_device_get_irq_info(VFIODevice *vbasedev, int index,
191                              struct vfio_irq_info *info)
192 {
193     memset(info, 0, sizeof(*info));
194 
195     info->argsz = sizeof(*info);
196     info->index = index;
197 
198     return vbasedev->io_ops->get_irq_info(vbasedev, info);
199 }
200 
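/*
 * Return the vfio_region_info for region @index. The result is cached for
 * the initial regions; the query is retried with a larger buffer whenever
 * the returned argsz exceeds what was supplied (e.g. when a capability
 * chain is present).
 */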
201 int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
202                                 struct vfio_region_info **info)
203 {
204     size_t argsz = sizeof(struct vfio_region_info);
205     int fd = -1;
206     int ret;
207 
208     /*
209      * We only set up the region info cache for the initial number of regions.
210      *
211      * Since a VFIO device may later increase its number of regions and use
212      * such regions with an index past ->num_initial_regions, don't attempt to
213      * use the info cache in those cases.
214      */
215     if (index < vbasedev->num_initial_regions) {
216         /* check cache */
217         if (vbasedev->reginfo[index] != NULL) {
218             *info = vbasedev->reginfo[index];
219             return 0;
220         }
221     }
222 
223     *info = g_malloc0(argsz);
224 
225     (*info)->index = index;
226 retry:
227     (*info)->argsz = argsz;
228 
229     ret = vbasedev->io_ops->get_region_info(vbasedev, *info, &fd);
230     if (ret != 0) {
231         g_free(*info);
232         *info = NULL;
233         return ret;
234     }
235 
236     if ((*info)->argsz > argsz) {
237         argsz = (*info)->argsz;
238         *info = g_realloc(*info, argsz);
239 
240         if (fd != -1) {
241             close(fd);
242             fd = -1;
243         }
244 
245         goto retry;
246     }
247 
248     if (index < vbasedev->num_initial_regions) {
249         /* fill cache */
250         vbasedev->reginfo[index] = *info;
251         if (vbasedev->region_fds != NULL) {
252             vbasedev->region_fds[index] = fd;
253         }
254     }
255 
256     return 0;
257 }
258 
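/*
 * Return the fd to use for accessing region @index: the per-region fd when
 * region fds are in use, otherwise the device fd itself.
 */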
259 int vfio_device_get_region_fd(VFIODevice *vbasedev, int index)
260 {
261     return vbasedev->region_fds ?
262            vbasedev->region_fds[index] :
263            vbasedev->fd;
264 }
265 
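/*
 * Find the first region whose type capability matches @type/@subtype.
 * Returns 0 with *info set on a match, -ENODEV if no region matches.
 */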
266 int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
267                                      uint32_t subtype, struct vfio_region_info **info)
268 {
269     int i;
270 
271     for (i = 0; i < vbasedev->num_initial_regions; i++) {
272         struct vfio_info_cap_header *hdr;
273         struct vfio_region_info_cap_type *cap_type;
274 
275         if (vfio_device_get_region_info(vbasedev, i, info)) {
276             continue;
277         }
278 
279         hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
280         if (!hdr) {
281             continue;
282         }
283 
284         cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);
285 
286         trace_vfio_device_get_region_info_type(vbasedev->name, i,
287                                                cap_type->type, cap_type->subtype);
288 
289         if (cap_type->type == type && cap_type->subtype == subtype) {
290             return 0;
291         }
292     }
293 
294     *info = NULL;
295     return -ENODEV;
296 }
297 
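/* Return true if region @region exposes capability @cap_type. */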
298 bool vfio_device_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
299 {
300     struct vfio_region_info *info = NULL;
301     bool ret = false;
302 
303     if (!vfio_device_get_region_info(vbasedev, region, &info)) {
304         if (vfio_get_region_info_cap(info, cap_type)) {
305             ret = true;
306         }
307     }
308 
309     return ret;
310 }
311 
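/*
 * Resolve vbasedev->name. Without fd passing the name defaults to the
 * sysfsdev basename; with fd passing (iommufd only) the device id is used,
 * or, if the device has no id, a "VFIO_FD<n>" placeholder is synthesized
 * and CPR transfer is blocked, since the fd number is not stable across
 * processes.
 */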
312 bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp)
313 {
314     ERRP_GUARD();
315     struct stat st;
316 
317     if (vbasedev->fd < 0) {
318         if (stat(vbasedev->sysfsdev, &st) < 0) {
319             error_setg_errno(errp, errno, "no such host device");
320             error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev);
321             return false;
322         }
323         /* User may specify a name, e.g. a VFIO platform device */
324         if (!vbasedev->name) {
325             vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
326         }
327     } else {
328         if (!vbasedev->iommufd) {
329             error_setg(errp, "Use FD passing only with iommufd backend");
330             return false;
331         }
332         if (!vbasedev->name) {
334             if (vbasedev->dev->id) {
335                 vbasedev->name = g_strdup(vbasedev->dev->id);
336                 return true;
337             } else {
338                 /*
339                  * Assign a name so any function printing it will not break.
340                  * The fd number changes across processes, so this cannot be
341                  * used as an invariant name for CPR.
342                  */
343                 vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd);
344                 error_setg(&vbasedev->cpr.id_blocker,
345                            "vfio device with fd=%d needs an id property",
346                            vbasedev->fd);
347                 return migrate_add_blocker_modes(&vbasedev->cpr.id_blocker,
348                                                  errp, MIG_MODE_CPR_TRANSFER,
349                                                  -1) == 0;
350             }
351         }
352     }
353 
354     return true;
355 }
356 
357 void vfio_device_free_name(VFIODevice *vbasedev)
358 {
359     g_clear_pointer(&vbasedev->name, g_free);
360     migrate_del_blocker(&vbasedev->cpr.id_blocker);
361 }
362 
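/*
 * Parse the fd=... property. cpr_get_fd_param() is expected to return a
 * descriptor preserved across CPR for this device id when one exists,
 * otherwise the one named by @str.
 */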
363 void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
364 {
365     vbasedev->fd = cpr_get_fd_param(vbasedev->dev->id, str, 0, errp);
366 }
367 
368 static VFIODeviceIOOps vfio_device_io_ops_ioctl;
369 
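/*
 * Initialize the common VFIODevice fields; the ioctl-based io_ops defined
 * at the bottom of this file are used by default.
 */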
370 void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
371                       DeviceState *dev, bool ram_discard)
372 {
373     vbasedev->type = type;
374     vbasedev->ops = ops;
375     vbasedev->io_ops = &vfio_device_io_ops_ioctl;
376     vbasedev->dev = dev;
377     vbasedev->fd = -1;
378     vbasedev->use_region_fds = false;
379 
380     vbasedev->ram_block_discard_allowed = ram_discard;
381 }
382 
383 int vfio_device_get_aw_bits(VFIODevice *vdev)
384 {
385     /*
386      * iova_ranges is a sorted list. For old kernels that support
387  * VFIO but do not support querying iova ranges, iova_ranges is NULL;
388      * in this case HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX(64) is returned.
389      */
390     GList *l = g_list_last(vdev->bcontainer->iova_ranges);
391 
392     if (l) {
393         Range *range = l->data;
394         return range_get_last_bit(range) + 1;
395     }
396 
397     return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX;
398 }
399 
400 bool vfio_device_is_mdev(VFIODevice *vbasedev)
401 {
402     g_autofree char *subsys = NULL;
403     g_autofree char *tmp = NULL;
404 
405     if (!vbasedev->sysfsdev) {
406         return false;
407     }
408 
409     tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev);
410     subsys = realpath(tmp, NULL);
411     return subsys && (strcmp(subsys, "/sys/bus/mdev") == 0);
412 }
413 
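/*
 * Create and realize a HostIOMMUDevice of @typename for this device.
 * Skipped for mdev devices, which return success without a hiod.
 */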
414 bool vfio_device_hiod_create_and_realize(VFIODevice *vbasedev,
415                                          const char *typename, Error **errp)
416 {
417     HostIOMMUDevice *hiod;
418 
419     if (vbasedev->mdev) {
420         return true;
421     }
422 
423     hiod = HOST_IOMMU_DEVICE(object_new(typename));
424 
425     if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) {
426         object_unref(hiod);
427         return false;
428     }
429 
430     vbasedev->hiod = hiod;
431     return true;
432 }
433 
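/* Return the VFIODevice embedded in a vfio-pci object, NULL otherwise. */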
434 VFIODevice *vfio_get_vfio_device(Object *obj)
435 {
436     if (object_dynamic_cast(obj, TYPE_VFIO_PCI)) {
437         return &VFIO_PCI_BASE(obj)->vbasedev;
438     } else {
439         return NULL;
440     }
441 }
442 
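/*
 * Attach the device to @as through the VFIOIOMMUClass registered under
 * @iommu_type.
 */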
443 bool vfio_device_attach_by_iommu_type(const char *iommu_type, char *name,
444                                       VFIODevice *vbasedev, AddressSpace *as,
445                                       Error **errp)
446 {
447     const VFIOIOMMUClass *ops =
448         VFIO_IOMMU_CLASS(object_class_by_name(iommu_type));
449 
450     assert(ops);
451 
452     return ops->attach_device(name, vbasedev, as, errp);
453 }
454 
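/*
 * Attach using the iommufd backend when an iommufd object was supplied,
 * otherwise fall back to the legacy container backend.
 */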
455 bool vfio_device_attach(char *name, VFIODevice *vbasedev,
456                         AddressSpace *as, Error **errp)
457 {
458     const char *iommu_type = vbasedev->iommufd ?
459                              TYPE_VFIO_IOMMU_IOMMUFD :
460                              TYPE_VFIO_IOMMU_LEGACY;
461 
462     return vfio_device_attach_by_iommu_type(iommu_type, name, vbasedev,
463                                             as, errp);
464 }
465 
466 void vfio_device_detach(VFIODevice *vbasedev)
467 {
468     if (!vbasedev->bcontainer) {
469         return;
470     }
471     VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev);
472 }
473 
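/*
 * Record the device info, link the device into the container and global
 * lists, and allocate the region info (and, when region fds are in use,
 * region fd) caches sized by the initial region count.
 */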
474 void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
475                          struct vfio_device_info *info)
476 {
477     int i;
478 
479     vbasedev->num_irqs = info->num_irqs;
480     vbasedev->num_initial_regions = info->num_regions;
481     vbasedev->flags = info->flags;
482     vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);
483 
484     vbasedev->bcontainer = bcontainer;
485     QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
486 
487     QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
488 
489     vbasedev->reginfo = g_new0(struct vfio_region_info *,
490                                vbasedev->num_initial_regions);
491     if (vbasedev->use_region_fds) {
492         vbasedev->region_fds = g_new0(int, vbasedev->num_initial_regions);
493         for (i = 0; i < vbasedev->num_initial_regions; i++) {
494             vbasedev->region_fds[i] = -1;
495         }
496     }
497 }
498 
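/*
 * Undo vfio_device_prepare(): free cached region info, close any
 * per-region fds, and unlink the device from the container and global
 * lists.
 */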
499 void vfio_device_unprepare(VFIODevice *vbasedev)
500 {
501     int i;
502 
503     for (i = 0; i < vbasedev->num_initial_regions; i++) {
504         g_free(vbasedev->reginfo[i]);
505         if (vbasedev->region_fds != NULL && vbasedev->region_fds[i] != -1) {
506             close(vbasedev->region_fds[i]);
507         }
508     }
509 
510     g_clear_pointer(&vbasedev->reginfo, g_free);
511     g_clear_pointer(&vbasedev->region_fds, g_free);
512 
513     QLIST_REMOVE(vbasedev, container_next);
514     QLIST_REMOVE(vbasedev, global_next);
515     vbasedev->bcontainer = NULL;
516 }
517 
518 /*
519  * Traditional ioctl() based io: these default VFIODeviceIOOps issue the
 * vfio ioctls and pread()/pwrite() directly against the device fd.
520  */
521 
522 static int vfio_device_io_device_feature(VFIODevice *vbasedev,
523                                          struct vfio_device_feature *feature)
524 {
525     int ret;
526 
527     ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
528 
529     return ret < 0 ? -errno : ret;
530 }
531 
532 static int vfio_device_io_get_region_info(VFIODevice *vbasedev,
533                                           struct vfio_region_info *info,
534                                           int *fd)
535 {
536     int ret;
537 
538     *fd = -1;
539 
540     ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
541 
542     return ret < 0 ? -errno : ret;
543 }
544 
545 static int vfio_device_io_get_irq_info(VFIODevice *vbasedev,
546                                        struct vfio_irq_info *info)
547 {
548     int ret;
549 
550     ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info);
551 
552     return ret < 0 ? -errno : ret;
553 }
554 
555 static int vfio_device_io_set_irqs(VFIODevice *vbasedev,
556                                    struct vfio_irq_set *irqs)
557 {
558     int ret;
559 
560     ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irqs);
561 
562     return ret < 0 ? -errno : ret;
563 }
564 
565 static int vfio_device_io_region_read(VFIODevice *vbasedev, uint8_t index,
566                                       off_t off, uint32_t size, void *data)
567 {
568     struct vfio_region_info *info;
569     int ret;
570 
571     ret = vfio_device_get_region_info(vbasedev, index, &info);
572     if (ret != 0) {
573         return ret;
574     }
575 
576     ret = pread(vbasedev->fd, data, size, info->offset + off);
577 
578     return ret < 0 ? -errno : ret;
579 }
580 
581 static int vfio_device_io_region_write(VFIODevice *vbasedev, uint8_t index,
582                                        off_t off, uint32_t size, void *data,
583                                        bool post)
584 {
585     struct vfio_region_info *info;
586     int ret;
587 
588     ret = vfio_device_get_region_info(vbasedev, index, &info);
589     if (ret != 0) {
590         return ret;
591     }
592 
593     ret = pwrite(vbasedev->fd, data, size, info->offset + off);
594 
595     return ret < 0 ? -errno : ret;
596 }
597 
598 static VFIODeviceIOOps vfio_device_io_ops_ioctl = {
599     .device_feature = vfio_device_io_device_feature,
600     .get_region_info = vfio_device_io_get_region_info,
601     .get_irq_info = vfio_device_io_get_irq_info,
602     .set_irqs = vfio_device_io_set_irqs,
603     .region_read = vfio_device_io_region_read,
604     .region_write = vfio_device_io_region_write,
605 };
606