1 /*
2 * VFIO device
3 *
4 * Copyright Red Hat, Inc. 2012
5 *
6 * Authors:
7 * Alex Williamson <alex.williamson@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
11 *
12 * Based on qemu-kvm device-assignment:
13 * Adapted for KVM by Qumranet.
14 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19 */
20
21 #include "qemu/osdep.h"
22 #include <sys/ioctl.h>
23
24 #include "hw/vfio/vfio-device.h"
25 #include "hw/vfio/pci.h"
26 #include "hw/hw.h"
27 #include "trace.h"
28 #include "qapi/error.h"
29 #include "qemu/error-report.h"
30 #include "qemu/units.h"
31 #include "migration/cpr.h"
32 #include "migration/blocker.h"
33 #include "monitor/monitor.h"
34 #include "vfio-helpers.h"
35
/* Global list of all VFIO devices, linked through vbasedev->global_next. */
VFIODeviceList vfio_device_list =
    QLIST_HEAD_INITIALIZER(vfio_device_list);
38
39 /*
40 * We want to differentiate hot reset of multiple in-use devices vs
41 * hot reset of a single in-use device. VFIO_DEVICE_RESET will already
42 * handle the case of doing hot resets when there is only a single
43 * device per bus. The in-use here refers to how many VFIODevices are
44 * affected. A hot reset that affects multiple devices, but only a
45 * single in-use device, means that we can call it from our bus
46 * ->reset() callback since the extent is effectively a single
47 * device. This allows us to make use of it in the hotplug path. When
48 * there are multiple in-use devices, we can only trigger the hot
49 * reset during a system reset and thus from our reset handler. We
50 * separate _one vs _multi here so that we don't overlap and do a
51 * double reset on the system reset path where both our reset handler
52 * and ->reset() callback are used. Calling _one() will only do a hot
53 * reset for the one in-use devices case, calling _multi() will do
54 * nothing if a _one() would have been sufficient.
55 */
vfio_device_reset_handler(void * opaque)56 void vfio_device_reset_handler(void *opaque)
57 {
58 VFIODevice *vbasedev;
59
60 trace_vfio_device_reset_handler();
61 QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
62 if (vbasedev->dev->realized) {
63 vbasedev->ops->vfio_compute_needs_reset(vbasedev);
64 }
65 }
66
67 QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
68 if (vbasedev->dev->realized && vbasedev->needs_reset) {
69 vbasedev->ops->vfio_hot_reset_multi(vbasedev);
70 }
71 }
72 }
73
74 /*
75 * Common VFIO interrupt disable
76 */
vfio_device_irq_disable(VFIODevice * vbasedev,int index)77 void vfio_device_irq_disable(VFIODevice *vbasedev, int index)
78 {
79 struct vfio_irq_set irq_set = {
80 .argsz = sizeof(irq_set),
81 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
82 .index = index,
83 .start = 0,
84 .count = 0,
85 };
86
87 vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
88 }
89
/* Unmask a single sub-interrupt (start 0, count 1) of interrupt @index. */
void vfio_device_irq_unmask(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set;

    memset(&irq_set, 0, sizeof(irq_set));
    irq_set.argsz = sizeof(irq_set);
    irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
    irq_set.index = index;
    irq_set.count = 1;

    vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
}
102
/* Mask a single sub-interrupt (start 0, count 1) of interrupt @index. */
void vfio_device_irq_mask(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set;

    memset(&irq_set, 0, sizeof(irq_set));
    irq_set.argsz = sizeof(irq_set);
    irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
    irq_set.index = index;
    irq_set.count = 1;

    vbasedev->io_ops->set_irqs(vbasedev, &irq_set);
}
115
action_to_str(int action)116 static inline const char *action_to_str(int action)
117 {
118 switch (action) {
119 case VFIO_IRQ_SET_ACTION_MASK:
120 return "MASK";
121 case VFIO_IRQ_SET_ACTION_UNMASK:
122 return "UNMASK";
123 case VFIO_IRQ_SET_ACTION_TRIGGER:
124 return "TRIGGER";
125 default:
126 return "UNKNOWN ACTION";
127 }
128 }
129
/*
 * Human-readable name of a PCI IRQ index, or NULL for non-PCI devices
 * and unrecognized indices.
 */
static const char *index_to_str(VFIODevice *vbasedev, int index)
{
    /* Only PCI devices have named interrupt indices here. */
    if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
        return NULL;
    }

    if (index == VFIO_PCI_INTX_IRQ_INDEX) {
        return "INTX";
    }
    if (index == VFIO_PCI_MSI_IRQ_INDEX) {
        return "MSI";
    }
    if (index == VFIO_PCI_MSIX_IRQ_INDEX) {
        return "MSIX";
    }
    if (index == VFIO_PCI_ERR_IRQ_INDEX) {
        return "ERR";
    }
    if (index == VFIO_PCI_REQ_IRQ_INDEX) {
        return "REQ";
    }
    return NULL;
}
151
vfio_device_irq_set_signaling(VFIODevice * vbasedev,int index,int subindex,int action,int fd,Error ** errp)152 bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex,
153 int action, int fd, Error **errp)
154 {
155 ERRP_GUARD();
156 g_autofree struct vfio_irq_set *irq_set = NULL;
157 int argsz;
158 const char *name;
159 int32_t *pfd;
160
161 argsz = sizeof(*irq_set) + sizeof(*pfd);
162
163 irq_set = g_malloc0(argsz);
164 irq_set->argsz = argsz;
165 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
166 irq_set->index = index;
167 irq_set->start = subindex;
168 irq_set->count = 1;
169 pfd = (int32_t *)&irq_set->data;
170 *pfd = fd;
171
172 if (!vbasedev->io_ops->set_irqs(vbasedev, irq_set)) {
173 return true;
174 }
175
176 error_setg_errno(errp, errno, "VFIO_DEVICE_SET_IRQS failure");
177
178 name = index_to_str(vbasedev, index);
179 if (name) {
180 error_prepend(errp, "%s-%d: ", name, subindex);
181 } else {
182 error_prepend(errp, "index %d-%d: ", index, subindex);
183 }
184 error_prepend(errp,
185 "Failed to %s %s eventfd signaling for interrupt ",
186 fd < 0 ? "tear down" : "set up", action_to_str(action));
187 return false;
188 }
189
/* Zero-fill *@info, set argsz/index, and query the backend for IRQ info. */
int vfio_device_get_irq_info(VFIODevice *vbasedev, int index,
                             struct vfio_irq_info *info)
{
    *info = (struct vfio_irq_info) {
        .argsz = sizeof(*info),
        .index = index,
    };

    return vbasedev->io_ops->get_irq_info(vbasedev, info);
}
200
/*
 * Query region info for region @index of @vbasedev, returning it in *@info.
 *
 * Results for the initial regions are cached in vbasedev->reginfo (and the
 * region fd, if the backend supplies one, in vbasedev->region_fds); cached
 * entries are owned by the device and must not be freed by the caller.
 * Returns 0 on success or a negative error code from the io_ops backend,
 * in which case *@info is NULL.
 */
int vfio_device_get_region_info(VFIODevice *vbasedev, int index,
                                struct vfio_region_info **info)
{
    size_t argsz = sizeof(struct vfio_region_info);
    int fd = -1;
    int ret;

    /*
     * We only set up the region info cache for the initial number of regions.
     *
     * Since a VFIO device may later increase the number of regions then use
     * such regions with an index past ->num_initial_regions, don't attempt to
     * use the info cache in those cases.
     */
    if (index < vbasedev->num_initial_regions) {
        /* check cache */
        if (vbasedev->reginfo[index] != NULL) {
            *info = vbasedev->reginfo[index];
            return 0;
        }
    }

    *info = g_malloc0(argsz);

    (*info)->index = index;
retry:
    (*info)->argsz = argsz;

    ret = vbasedev->io_ops->get_region_info(vbasedev, *info, &fd);
    if (ret != 0) {
        g_free(*info);
        *info = NULL;
        return ret;
    }

    /* Backend reported a larger argsz: grow the buffer and query again. */
    if ((*info)->argsz > argsz) {
        argsz = (*info)->argsz;
        *info = g_realloc(*info, argsz);

        /* Drop any fd obtained by the truncated attempt before retrying. */
        if (fd != -1) {
            close(fd);
            fd = -1;
        }

        goto retry;
    }

    if (index < vbasedev->num_initial_regions) {
        /* fill cache */
        vbasedev->reginfo[index] = *info;
        /*
         * NOTE(review): a valid fd is only retained when region_fds is
         * allocated; for an uncached index (or region_fds == NULL) a valid
         * fd would be leaked here — confirm backends never return one in
         * those cases.
         */
        if (vbasedev->region_fds != NULL) {
            vbasedev->region_fds[index] = fd;
        }
    }

    return 0;
}
258
/*
 * Return the fd to use for region @index: the cached per-region fd when
 * the backend provides one, otherwise the main device fd.
 */
int vfio_device_get_region_fd(VFIODevice *vbasedev, int index)
{
    if (vbasedev->region_fds != NULL) {
        return vbasedev->region_fds[index];
    }
    return vbasedev->fd;
}
265
/*
 * Find the first initial region whose type capability matches
 * @type/@subtype.  On success returns 0 with *@info pointing at the cached
 * region info (do not free); otherwise returns -ENODEV with *@info NULL.
 */
int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type,
                                     uint32_t subtype, struct vfio_region_info **info)
{
    int idx;

    for (idx = 0; idx < vbasedev->num_initial_regions; idx++) {
        struct vfio_info_cap_header *hdr;
        struct vfio_region_info_cap_type *cap;

        /* Skip regions we cannot query. */
        if (vfio_device_get_region_info(vbasedev, idx, info) != 0) {
            continue;
        }

        /* Skip regions that carry no type capability. */
        hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
        if (hdr == NULL) {
            continue;
        }

        cap = container_of(hdr, struct vfio_region_info_cap_type, header);

        trace_vfio_device_get_region_info_type(vbasedev->name, idx,
                                               cap->type, cap->subtype);

        if (cap->type == type && cap->subtype == subtype) {
            return 0;
        }
    }

    *info = NULL;
    return -ENODEV;
}
297
/* True when region @region of @vbasedev advertises capability @cap_type. */
bool vfio_device_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
{
    struct vfio_region_info *info = NULL;

    if (vfio_device_get_region_info(vbasedev, region, &info) != 0) {
        return false;
    }

    /* info is owned by the region info cache; nothing to free here. */
    return vfio_get_region_info_cap(info, cap_type) != NULL;
}
311
/*
 * Establish vbasedev->name, either from the sysfs path (when opening by
 * path) or from the device "id" / fd number (when an fd was passed in).
 * Returns true on success; sets @errp and returns false on failure.
 */
bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp)
{
    ERRP_GUARD();
    struct stat st;

    if (vbasedev->fd < 0) {
        /* Opening by sysfs path: the path must exist on the host. */
        if (stat(vbasedev->sysfsdev, &st) < 0) {
            error_setg_errno(errp, errno, "no such host device");
            error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->sysfsdev);
            return false;
        }
        /* User may specify a name, e.g: VFIO platform device */
        if (!vbasedev->name) {
            vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
        }
    } else {
        /* A pre-opened fd was supplied; only iommufd supports fd passing. */
        if (!vbasedev->iommufd) {
            error_setg(errp, "Use FD passing only with iommufd backend");
            return false;
        }
        if (!vbasedev->name) {

            if (vbasedev->dev->id) {
                /* Reuse the stable device id as the name. */
                vbasedev->name = g_strdup(vbasedev->dev->id);
                return true;
            } else {
                /*
                 * Assign a name so any function printing it will not break.
                 * The fd number changes across processes, so this cannot be
                 * used as an invariant name for CPR.
                 */
                vbasedev->name = g_strdup_printf("VFIO_FD%d", vbasedev->fd);
                /* Block CPR transfer until the user supplies an id. */
                error_setg(&vbasedev->cpr.id_blocker,
                           "vfio device with fd=%d needs an id property",
                           vbasedev->fd);
                return migrate_add_blocker_modes(&vbasedev->cpr.id_blocker,
                                                 errp, MIG_MODE_CPR_TRANSFER,
                                                 -1) == 0;
            }
        }
    }

    return true;
}
356
/* Release the device name and the CPR id blocker that may reference it. */
void vfio_device_free_name(VFIODevice *vbasedev)
{
    migrate_del_blocker(&vbasedev->cpr.id_blocker);
    g_clear_pointer(&vbasedev->name, g_free);
}
362
/*
 * Resolve the "fd=" option string @str into vbasedev->fd; on failure
 * @errp is set.  NOTE(review): cpr_get_fd_param presumably also handles
 * restoring the fd by device id across CPR — confirm in migration/cpr.c.
 */
void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp)
{
    vbasedev->fd = cpr_get_fd_param(vbasedev->dev->id, str, 0, errp);
}
367
/* Default ioctl()-based transport; defined at the bottom of this file. */
static VFIODeviceIOOps vfio_device_io_ops_ioctl;
369
/* Initialize the common VFIODevice fields prior to realize/attach. */
void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
                      DeviceState *dev, bool ram_discard)
{
    vbasedev->type = type;
    vbasedev->ops = ops;
    /* Default to the traditional ioctl() transport. */
    vbasedev->io_ops = &vfio_device_io_ops_ioctl;
    vbasedev->dev = dev;
    vbasedev->ram_block_discard_allowed = ram_discard;

    /* Not yet opened; no per-region fds unless a backend opts in later. */
    vbasedev->fd = -1;
    vbasedev->use_region_fds = false;
}
382
/*
 * Address-width in bits supported by the device's container.
 *
 * iova_ranges is a sorted list; the last entry holds the highest usable
 * IOVA. Old kernels that support VFIO but cannot report IOVA ranges leave
 * the list NULL, in which case HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX (64)
 * is returned.
 */
int vfio_device_get_aw_bits(VFIODevice *vdev)
{
    GList *last = g_list_last(vdev->bcontainer->iova_ranges);

    if (!last) {
        return HOST_IOMMU_DEVICE_CAP_AW_BITS_MAX;
    }

    return range_get_last_bit((Range *)last->data) + 1;
}
399
/* True when the device's sysfs subsystem link resolves to the mdev bus. */
bool vfio_device_is_mdev(VFIODevice *vbasedev)
{
    g_autofree char *link = NULL;
    g_autofree char *resolved = NULL;

    if (!vbasedev->sysfsdev) {
        return false;
    }

    link = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev);
    resolved = realpath(link, NULL);

    return resolved && !strcmp(resolved, "/sys/bus/mdev");
}
413
vfio_device_hiod_create_and_realize(VFIODevice * vbasedev,const char * typename,Error ** errp)414 bool vfio_device_hiod_create_and_realize(VFIODevice *vbasedev,
415 const char *typename, Error **errp)
416 {
417 HostIOMMUDevice *hiod;
418
419 if (vbasedev->mdev) {
420 return true;
421 }
422
423 hiod = HOST_IOMMU_DEVICE(object_new(typename));
424
425 if (!HOST_IOMMU_DEVICE_GET_CLASS(hiod)->realize(hiod, vbasedev, errp)) {
426 object_unref(hiod);
427 return false;
428 }
429
430 vbasedev->hiod = hiod;
431 return true;
432 }
433
vfio_get_vfio_device(Object * obj)434 VFIODevice *vfio_get_vfio_device(Object *obj)
435 {
436 if (object_dynamic_cast(obj, TYPE_VFIO_PCI)) {
437 return &VFIO_PCI_BASE(obj)->vbasedev;
438 } else {
439 return NULL;
440 }
441 }
442
vfio_device_attach_by_iommu_type(const char * iommu_type,char * name,VFIODevice * vbasedev,AddressSpace * as,Error ** errp)443 bool vfio_device_attach_by_iommu_type(const char *iommu_type, char *name,
444 VFIODevice *vbasedev, AddressSpace *as,
445 Error **errp)
446 {
447 const VFIOIOMMUClass *ops =
448 VFIO_IOMMU_CLASS(object_class_by_name(iommu_type));
449
450 assert(ops);
451
452 return ops->attach_device(name, vbasedev, as, errp);
453 }
454
vfio_device_attach(char * name,VFIODevice * vbasedev,AddressSpace * as,Error ** errp)455 bool vfio_device_attach(char *name, VFIODevice *vbasedev,
456 AddressSpace *as, Error **errp)
457 {
458 const char *iommu_type = vbasedev->iommufd ?
459 TYPE_VFIO_IOMMU_IOMMUFD :
460 TYPE_VFIO_IOMMU_LEGACY;
461
462 return vfio_device_attach_by_iommu_type(iommu_type, name, vbasedev,
463 as, errp);
464 }
465
/* Detach @vbasedev from its container; a no-op if it was never attached. */
void vfio_device_detach(VFIODevice *vbasedev)
{
    VFIOContainerBase *bcontainer = vbasedev->bcontainer;

    if (bcontainer) {
        VFIO_IOMMU_GET_CLASS(bcontainer)->detach_device(vbasedev);
    }
}
473
/*
 * Record the kernel-reported device properties from @info, link @vbasedev
 * into @bcontainer and the global device list, and allocate the region
 * info/fd caches.
 */
void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer,
                         struct vfio_device_info *info)
{
    int idx;

    /* Snapshot the properties reported by the kernel. */
    vbasedev->num_irqs = info->num_irqs;
    vbasedev->num_initial_regions = info->num_regions;
    vbasedev->flags = info->flags;
    vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);

    /* Link into both the container and the global device lists. */
    vbasedev->bcontainer = bcontainer;
    QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
    QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);

    /* Region info cache; entries are filled lazily on first query. */
    vbasedev->reginfo = g_new0(struct vfio_region_info *,
                               vbasedev->num_initial_regions);
    if (vbasedev->use_region_fds) {
        vbasedev->region_fds = g_new0(int, vbasedev->num_initial_regions);
        for (idx = 0; idx < vbasedev->num_initial_regions; idx++) {
            vbasedev->region_fds[idx] = -1;
        }
    }
}
498
/* Undo vfio_device_prepare(): free caches, close fds, unlink from lists. */
void vfio_device_unprepare(VFIODevice *vbasedev)
{
    int idx;

    /* Drop cached region info and close any cached region fds. */
    for (idx = 0; idx < vbasedev->num_initial_regions; idx++) {
        g_free(vbasedev->reginfo[idx]);
        if (vbasedev->region_fds && vbasedev->region_fds[idx] != -1) {
            close(vbasedev->region_fds[idx]);
        }
    }
    g_clear_pointer(&vbasedev->reginfo, g_free);
    g_clear_pointer(&vbasedev->region_fds, g_free);

    /* Unlink from the container and global device lists. */
    QLIST_REMOVE(vbasedev, container_next);
    QLIST_REMOVE(vbasedev, global_next);
    vbasedev->bcontainer = NULL;
}
517
518 /*
519 * Traditional ioctl() based io
520 */
521
vfio_device_io_device_feature(VFIODevice * vbasedev,struct vfio_device_feature * feature)522 static int vfio_device_io_device_feature(VFIODevice *vbasedev,
523 struct vfio_device_feature *feature)
524 {
525 int ret;
526
527 ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
528
529 return ret < 0 ? -errno : ret;
530 }
531
/* VFIO_DEVICE_GET_REGION_INFO via ioctl; failures become -errno. */
static int vfio_device_io_get_region_info(VFIODevice *vbasedev,
                                          struct vfio_region_info *info,
                                          int *fd)
{
    int ret;

    /* The ioctl transport never hands out a separate region fd. */
    *fd = -1;

    ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info);
    return ret < 0 ? -errno : ret;
}
544
/* VFIO_DEVICE_GET_IRQ_INFO via ioctl; failures become -errno. */
static int vfio_device_io_get_irq_info(VFIODevice *vbasedev,
                                       struct vfio_irq_info *info)
{
    int ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info);

    return ret < 0 ? -errno : ret;
}
554
/* VFIO_DEVICE_SET_IRQS via ioctl; failures become -errno. */
static int vfio_device_io_set_irqs(VFIODevice *vbasedev,
                                   struct vfio_irq_set *irqs)
{
    int ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irqs);

    return ret < 0 ? -errno : ret;
}
564
/*
 * Read @size bytes at offset @off of region @index into @data using
 * pread() on the device fd.  Returns the byte count read, or a negative
 * error code.
 */
static int vfio_device_io_region_read(VFIODevice *vbasedev, uint8_t index,
                                      off_t off, uint32_t size, void *data)
{
    struct vfio_region_info *info;
    int ret;

    ret = vfio_device_get_region_info(vbasedev, index, &info);
    if (ret) {
        return ret;
    }

    /* Region offsets within the device fd come from the region info. */
    ret = pread(vbasedev->fd, data, size, info->offset + off);
    return ret < 0 ? -errno : ret;
}
580
/*
 * Write @size bytes from @data at offset @off of region @index using
 * pwrite() on the device fd.  The @post hint is ignored: the ioctl
 * transport has no notion of posted writes.  Returns the byte count
 * written, or a negative error code.
 */
static int vfio_device_io_region_write(VFIODevice *vbasedev, uint8_t index,
                                       off_t off, uint32_t size, void *data,
                                       bool post)
{
    struct vfio_region_info *info;
    int ret;

    ret = vfio_device_get_region_info(vbasedev, index, &info);
    if (ret) {
        return ret;
    }

    ret = pwrite(vbasedev->fd, data, size, info->offset + off);
    return ret < 0 ? -errno : ret;
}
597
/* Default VFIODeviceIOOps: direct ioctl()/pread()/pwrite() on the device fd. */
static VFIODeviceIOOps vfio_device_io_ops_ioctl = {
    .device_feature = vfio_device_io_device_feature,
    .get_region_info = vfio_device_io_get_region_info,
    .get_irq_info = vfio_device_io_get_irq_info,
    .set_irqs = vfio_device_io_set_irqs,
    .region_read = vfio_device_io_region_read,
    .region_write = vfio_device_io_region_write,
};
606