1 /*
2 * virtio-iommu device
3 *
4 * Copyright (c) 2020 Red Hat, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2 or later, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program. If not, see <http://www.gnu.org/licenses/>.
17 *
18 */
19
20 #include "qemu/osdep.h"
21 #include "qemu/log.h"
22 #include "qemu/iov.h"
23 #include "qemu/range.h"
24 #include "qemu/reserved-region.h"
25 #include "exec/target_page.h"
26 #include "hw/qdev-properties.h"
27 #include "hw/virtio/virtio.h"
28 #include "sysemu/kvm.h"
29 #include "sysemu/reset.h"
30 #include "sysemu/sysemu.h"
31 #include "qemu/reserved-region.h"
32 #include "qemu/units.h"
33 #include "qapi/error.h"
34 #include "qemu/error-report.h"
35 #include "trace.h"
36
37 #include "standard-headers/linux/virtio_ids.h"
38
39 #include "hw/virtio/virtio-bus.h"
40 #include "hw/virtio/virtio-iommu.h"
41 #include "hw/pci/pci_bus.h"
42 #include "hw/pci/pci.h"
43
44 /* Max size */
45 #define VIOMMU_DEFAULT_QUEUE_SIZE 256
46 #define VIOMMU_PROBE_SIZE 512
47
48 typedef struct VirtIOIOMMUDomain {
49 uint32_t id;
50 bool bypass;
51 GTree *mappings;
52 QLIST_HEAD(, VirtIOIOMMUEndpoint) endpoint_list;
53 } VirtIOIOMMUDomain;
54
55 typedef struct VirtIOIOMMUEndpoint {
56 uint32_t id;
57 VirtIOIOMMUDomain *domain;
58 IOMMUMemoryRegion *iommu_mr;
59 QLIST_ENTRY(VirtIOIOMMUEndpoint) next;
60 } VirtIOIOMMUEndpoint;
61
62 typedef struct VirtIOIOMMUInterval {
63 uint64_t low;
64 uint64_t high;
65 } VirtIOIOMMUInterval;
66
67 typedef struct VirtIOIOMMUMapping {
68 uint64_t phys_addr;
69 uint32_t flags;
70 } VirtIOIOMMUMapping;
71
72 struct hiod_key {
73 PCIBus *bus;
74 uint8_t devfn;
75 };
76
77 static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev)
78 {
79 return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn);
80 }
81
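/*
 * Return true if DMA from this endpoint currently bypasses IOMMU
 * translation: either global config.bypass applies because the endpoint
 * is not attached to any domain (or no endpoint tree exists yet), or the
 * endpoint's domain was attached with the bypass flag.
 */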
82 static bool virtio_iommu_device_bypassed(IOMMUDevice *sdev)
83 {
84 uint32_t sid;
85 bool bypassed;
86 VirtIOIOMMU *s = sdev->viommu;
87 VirtIOIOMMUEndpoint *ep;
88
89 sid = virtio_iommu_get_bdf(sdev);
90
91 qemu_rec_mutex_lock(&s->mutex);
92 /* need to check bypass before system reset */
93 if (!s->endpoints) {
94 bypassed = s->config.bypass;
95 goto unlock;
96 }
97
98 ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
99 if (!ep || !ep->domain) {
100 bypassed = s->config.bypass;
101 } else {
102 bypassed = ep->domain->bypass;
103 }
104
105 unlock:
106 qemu_rec_mutex_unlock(&s->mutex);
107 return bypassed;
108 }
109
110 /* Return whether the device is using IOMMU translation. */
111 static bool virtio_iommu_switch_address_space(IOMMUDevice *sdev)
112 {
113 bool use_remapping;
114
115 assert(sdev);
116
117 use_remapping = !virtio_iommu_device_bypassed(sdev);
118
119 trace_virtio_iommu_switch_address_space(pci_bus_num(sdev->bus),
120 PCI_SLOT(sdev->devfn),
121 PCI_FUNC(sdev->devfn),
122 use_remapping);
123
124 /* Turn off first then on the other */
125 if (use_remapping) {
126 memory_region_set_enabled(&sdev->bypass_mr, false);
127 memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), true);
128 } else {
129 memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), false);
130 memory_region_set_enabled(&sdev->bypass_mr, true);
131 }
132
133 return use_remapping;
134 }
135
136 static void virtio_iommu_switch_address_space_all(VirtIOIOMMU *s)
137 {
138 GHashTableIter iter;
139 IOMMUPciBus *iommu_pci_bus;
140 int i;
141
142 g_hash_table_iter_init(&iter, s->as_by_busptr);
143 while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus)) {
144 for (i = 0; i < PCI_DEVFN_MAX; i++) {
145 if (!iommu_pci_bus->pbdev[i]) {
146 continue;
147 }
148 virtio_iommu_switch_address_space(iommu_pci_bus->pbdev[i]);
149 }
150 }
151 }
152
153 /**
154 * The bus number is used for lookup when SID-based operations occur.
155 * In that case we lazily populate the IOMMUPciBus array from the bus hash
156 * table. At the time the IOMMUPciBus is created (iommu_find_add_as), the bus
157 * numbers may not always be initialized yet.
158 */
159 static IOMMUPciBus *iommu_find_iommu_pcibus(VirtIOIOMMU *s, uint8_t bus_num)
160 {
161 IOMMUPciBus *iommu_pci_bus = s->iommu_pcibus_by_bus_num[bus_num];
162
163 if (!iommu_pci_bus) {
164 GHashTableIter iter;
165
166 g_hash_table_iter_init(&iter, s->as_by_busptr);
167 while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus)) {
168 if (pci_bus_num(iommu_pci_bus->bus) == bus_num) {
169 s->iommu_pcibus_by_bus_num[bus_num] = iommu_pci_bus;
170 return iommu_pci_bus;
171 }
172 }
173 return NULL;
174 }
175 return iommu_pci_bus;
176 }
177
178 static IOMMUMemoryRegion *virtio_iommu_mr(VirtIOIOMMU *s, uint32_t sid)
179 {
180 uint8_t bus_n, devfn;
181 IOMMUPciBus *iommu_pci_bus;
182 IOMMUDevice *dev;
183
184 bus_n = PCI_BUS_NUM(sid);
185 iommu_pci_bus = iommu_find_iommu_pcibus(s, bus_n);
186 if (iommu_pci_bus) {
187 devfn = sid & (PCI_DEVFN_MAX - 1);
188 dev = iommu_pci_bus->pbdev[devfn];
189 if (dev) {
190 return &dev->iommu_mr;
191 }
192 }
193 return NULL;
194 }
195
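/*
 * GTree comparator for VirtIOIOMMUInterval keys: two intervals that
 * overlap compare as equal, so looking up any address range returns a
 * mapping that intersects it.
 */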
196 static gint interval_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
197 {
198 VirtIOIOMMUInterval *inta = (VirtIOIOMMUInterval *)a;
199 VirtIOIOMMUInterval *intb = (VirtIOIOMMUInterval *)b;
200
201 if (inta->high < intb->low) {
202 return -1;
203 } else if (intb->high < inta->low) {
204 return 1;
205 } else {
206 return 0;
207 }
208 }
209
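/*
 * Notify IOMMU notifiers for [virt_start, virt_end]. The range is split
 * into power-of-two aligned chunks (dma_aligned_pow2_mask) because each
 * IOMMUTLBEntry must describe a naturally aligned region.
 */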
210 static void virtio_iommu_notify_map_unmap(IOMMUMemoryRegion *mr,
211 IOMMUTLBEvent *event,
212 hwaddr virt_start, hwaddr virt_end)
213 {
214 uint64_t delta = virt_end - virt_start;
215
216 event->entry.iova = virt_start;
217 event->entry.addr_mask = delta;
218
219 if (delta == UINT64_MAX) {
220 memory_region_notify_iommu(mr, 0, *event);
221 }
222
223 while (virt_start != virt_end + 1) {
224 uint64_t mask = dma_aligned_pow2_mask(virt_start, virt_end, 64);
225
226 event->entry.addr_mask = mask;
227 event->entry.iova = virt_start;
228 memory_region_notify_iommu(mr, 0, *event);
229 virt_start += mask + 1;
230 if (event->entry.perm != IOMMU_NONE) {
231 event->entry.translated_addr += mask + 1;
232 }
233 }
234 }
235
236 static void virtio_iommu_notify_map(IOMMUMemoryRegion *mr, hwaddr virt_start,
237 hwaddr virt_end, hwaddr paddr,
238 uint32_t flags)
239 {
240 IOMMUTLBEvent event;
241 IOMMUAccessFlags perm = IOMMU_ACCESS_FLAG(flags & VIRTIO_IOMMU_MAP_F_READ,
242 flags & VIRTIO_IOMMU_MAP_F_WRITE);
243
244 if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_MAP) ||
245 (flags & VIRTIO_IOMMU_MAP_F_MMIO) || !perm) {
246 return;
247 }
248
249 trace_virtio_iommu_notify_map(mr->parent_obj.name, virt_start, virt_end,
250 paddr, perm);
251
252 event.type = IOMMU_NOTIFIER_MAP;
253 event.entry.target_as = &address_space_memory;
254 event.entry.perm = perm;
255 event.entry.translated_addr = paddr;
256
257 virtio_iommu_notify_map_unmap(mr, &event, virt_start, virt_end);
258 }
259
260 static void virtio_iommu_notify_unmap(IOMMUMemoryRegion *mr, hwaddr virt_start,
261 hwaddr virt_end)
262 {
263 IOMMUTLBEvent event;
264
265 if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_UNMAP)) {
266 return;
267 }
268
269 trace_virtio_iommu_notify_unmap(mr->parent_obj.name, virt_start, virt_end);
270
271 event.type = IOMMU_NOTIFIER_UNMAP;
272 event.entry.target_as = &address_space_memory;
273 event.entry.perm = IOMMU_NONE;
274 event.entry.translated_addr = 0;
275
276 virtio_iommu_notify_map_unmap(mr, &event, virt_start, virt_end);
277 }
278
279 static gboolean virtio_iommu_notify_unmap_cb(gpointer key, gpointer value,
280 gpointer data)
281 {
282 VirtIOIOMMUInterval *interval = (VirtIOIOMMUInterval *) key;
283 IOMMUMemoryRegion *mr = (IOMMUMemoryRegion *) data;
284
285 virtio_iommu_notify_unmap(mr, interval->low, interval->high);
286
287 return false;
288 }
289
290 static gboolean virtio_iommu_notify_map_cb(gpointer key, gpointer value,
291 gpointer data)
292 {
293 VirtIOIOMMUMapping *mapping = (VirtIOIOMMUMapping *) value;
294 VirtIOIOMMUInterval *interval = (VirtIOIOMMUInterval *) key;
295 IOMMUMemoryRegion *mr = (IOMMUMemoryRegion *) data;
296
297 virtio_iommu_notify_map(mr, interval->low, interval->high,
298 mapping->phys_addr, mapping->flags);
299
300 return false;
301 }
302
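/*
 * Detach the endpoint from its domain: send UNMAP notifications for all
 * of the domain's mappings on the endpoint's memory region, unlink the
 * endpoint from the domain's list and switch the device back to the
 * address space that now applies (bypass or translated).
 */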
303 static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep)
304 {
305 VirtIOIOMMUDomain *domain = ep->domain;
306 IOMMUDevice *sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr);
307
308 if (!ep->domain) {
309 return;
310 }
311 trace_virtio_iommu_detach_endpoint_from_domain(domain->id, ep->id);
312 g_tree_foreach(domain->mappings, virtio_iommu_notify_unmap_cb,
313 ep->iommu_mr);
314 QLIST_REMOVE(ep, next);
315 ep->domain = NULL;
316 virtio_iommu_switch_address_space(sdev);
317 }
318
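/*
 * Look up an endpoint by ID, lazily allocating it if the ID corresponds
 * to a device behind this IOMMU. Returns NULL for unknown endpoints.
 */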
319 static VirtIOIOMMUEndpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s,
320 uint32_t ep_id)
321 {
322 VirtIOIOMMUEndpoint *ep;
323 IOMMUMemoryRegion *mr;
324
325 ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id));
326 if (ep) {
327 return ep;
328 }
329 mr = virtio_iommu_mr(s, ep_id);
330 if (!mr) {
331 return NULL;
332 }
333 ep = g_malloc0(sizeof(*ep));
334 ep->id = ep_id;
335 ep->iommu_mr = mr;
336 trace_virtio_iommu_get_endpoint(ep_id);
337 g_tree_insert(s->endpoints, GUINT_TO_POINTER(ep_id), ep);
338 return ep;
339 }
340
341 static void virtio_iommu_put_endpoint(gpointer data)
342 {
343 VirtIOIOMMUEndpoint *ep = (VirtIOIOMMUEndpoint *)data;
344
345 if (ep->domain) {
346 virtio_iommu_detach_endpoint_from_domain(ep);
347 }
348
349 trace_virtio_iommu_put_endpoint(ep->id);
350 g_free(ep);
351 }
352
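/*
 * Look up a domain by ID, creating it if it does not exist yet. Returns
 * NULL if the domain exists but its bypass setting conflicts with the
 * requested one.
 */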
353 static VirtIOIOMMUDomain *virtio_iommu_get_domain(VirtIOIOMMU *s,
354 uint32_t domain_id,
355 bool bypass)
356 {
357 VirtIOIOMMUDomain *domain;
358
359 domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
360 if (domain) {
361 if (domain->bypass != bypass) {
362 return NULL;
363 }
364 return domain;
365 }
366 domain = g_malloc0(sizeof(*domain));
367 domain->id = domain_id;
368 domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp,
369 NULL, (GDestroyNotify)g_free,
370 (GDestroyNotify)g_free);
371 domain->bypass = bypass;
372 g_tree_insert(s->domains, GUINT_TO_POINTER(domain_id), domain);
373 QLIST_INIT(&domain->endpoint_list);
374 trace_virtio_iommu_get_domain(domain_id);
375 return domain;
376 }
377
378 static void virtio_iommu_put_domain(gpointer data)
379 {
380 VirtIOIOMMUDomain *domain = (VirtIOIOMMUDomain *)data;
381 VirtIOIOMMUEndpoint *iter, *tmp;
382
383 QLIST_FOREACH_SAFE(iter, &domain->endpoint_list, next, tmp) {
384 virtio_iommu_detach_endpoint_from_domain(iter);
385 }
386 g_tree_destroy(domain->mappings);
387 trace_virtio_iommu_put_domain(domain->id);
388 g_free(domain);
389 }
390
391 static void add_prop_resv_regions(IOMMUDevice *sdev)
392 {
393 VirtIOIOMMU *s = sdev->viommu;
394 int i;
395
396 for (i = 0; i < s->nr_prop_resv_regions; i++) {
397 ReservedRegion *reg = g_new0(ReservedRegion, 1);
398
399 *reg = s->prop_resv_regions[i];
400 sdev->resv_regions = resv_region_list_insert(sdev->resv_regions, reg);
401 }
402 }
403
404 static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque,
405 int devfn)
406 {
407 VirtIOIOMMU *s = opaque;
408 IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus);
409 static uint32_t mr_index;
410 IOMMUDevice *sdev;
411
412 if (!sbus) {
413 sbus = g_malloc0(sizeof(IOMMUPciBus) +
414 sizeof(IOMMUDevice *) * PCI_DEVFN_MAX);
415 sbus->bus = bus;
416 g_hash_table_insert(s->as_by_busptr, bus, sbus);
417 }
418
419 sdev = sbus->pbdev[devfn];
420 if (!sdev) {
421 char *name = g_strdup_printf("%s-%d-%d",
422 TYPE_VIRTIO_IOMMU_MEMORY_REGION,
423 mr_index++, devfn);
424 sdev = sbus->pbdev[devfn] = g_new0(IOMMUDevice, 1);
425
426 sdev->viommu = s;
427 sdev->bus = bus;
428 sdev->devfn = devfn;
429
430 trace_virtio_iommu_init_iommu_mr(name);
431
432 memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX);
433 address_space_init(&sdev->as, &sdev->root, TYPE_VIRTIO_IOMMU);
434 add_prop_resv_regions(sdev);
435
436 /*
437 * Build the IOMMU-disabled container with aliases to the
438 * shared MRs. Aliasing to a shared memory region helps the
439 * memory API detect identical FlatViews, so devices can share
440 * the same FlatView when in bypass mode (either because the
441 * virtio-iommu driver is not configured or because of
442 * "iommu=pt"). This greatly reduces the total number of
443 * FlatViews in the system and hence makes the VM run faster.
444 */
445 memory_region_init_alias(&sdev->bypass_mr, OBJECT(s),
446 "system", get_system_memory(), 0,
447 memory_region_size(get_system_memory()));
448
449 memory_region_init_iommu(&sdev->iommu_mr, sizeof(sdev->iommu_mr),
450 TYPE_VIRTIO_IOMMU_MEMORY_REGION,
451 OBJECT(s), name,
452 UINT64_MAX);
453
454 /*
455 * Hook both containers under the root container; we switch
456 * between the iommu and bypass MRs by enabling/disabling the
457 * corresponding sub-container
458 */
459 memory_region_add_subregion_overlap(&sdev->root, 0,
460 MEMORY_REGION(&sdev->iommu_mr),
461 0);
462 memory_region_add_subregion_overlap(&sdev->root, 0,
463 &sdev->bypass_mr, 0);
464
465 virtio_iommu_switch_address_space(sdev);
466 g_free(name);
467 }
468 return &sdev->as;
469 }
470
471 static gboolean hiod_equal(gconstpointer v1, gconstpointer v2)
472 {
473 const struct hiod_key *key1 = v1;
474 const struct hiod_key *key2 = v2;
475
476 return (key1->bus == key2->bus) && (key1->devfn == key2->devfn);
477 }
478
479 static guint hiod_hash(gconstpointer v)
480 {
481 const struct hiod_key *key = v;
482 guint value = (guint)(uintptr_t)key->bus;
483
484 return (guint)(value << 8 | key->devfn);
485 }
486
487 static void hiod_destroy(gpointer v)
488 {
489 object_unref(v);
490 }
491
492 static HostIOMMUDevice *
493 get_host_iommu_device(VirtIOIOMMU *viommu, PCIBus *bus, int devfn) {
494 struct hiod_key key = {
495 .bus = bus,
496 .devfn = devfn,
497 };
498
499 return g_hash_table_lookup(viommu->host_iommu_devices, &key);
500 }
501
502 /**
503 * rebuild_resv_regions: rebuild the reserved regions from both the
504 * host reserved ranges and the property-set reserved ranges
505 */
506 static int rebuild_resv_regions(IOMMUDevice *sdev)
507 {
508 GList *l;
509 int i = 0;
510
511 /* free the existing list and rebuild it from scratch */
512 g_list_free_full(sdev->resv_regions, g_free);
513 sdev->resv_regions = NULL;
514
515 /* First add host reserved regions if any, all tagged as RESERVED */
516 for (l = sdev->host_resv_ranges; l; l = l->next) {
517 ReservedRegion *reg = g_new0(ReservedRegion, 1);
518 Range *r = (Range *)l->data;
519
520 reg->type = VIRTIO_IOMMU_RESV_MEM_T_RESERVED;
521 range_set_bounds(®->range, range_lob(r), range_upb(r));
522 sdev->resv_regions = resv_region_list_insert(sdev->resv_regions, reg);
523 trace_virtio_iommu_host_resv_regions(sdev->iommu_mr.parent_obj.name, i,
524 range_lob(®->range),
525 range_upb(®->range));
526 i++;
527 }
528 /*
529 * then add higher priority reserved regions set by the machine
530 * through properties
531 */
532 add_prop_resv_regions(sdev);
533 return 0;
534 }
535
536 static int virtio_iommu_set_host_iova_ranges(VirtIOIOMMU *s, PCIBus *bus,
537 int devfn, GList *iova_ranges,
538 Error **errp)
539 {
540 IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus);
541 IOMMUDevice *sdev;
542 int ret = -EINVAL;
543
544 if (!sbus) {
545 error_setg(errp, "%s: no IOMMUPciBus found!", __func__);
546 return ret;
547 }
548
549 sdev = sbus->pbdev[devfn];
550 if (!sdev) {
551 error_setg(errp, "%s: no IOMMUDevice found!", __func__);
552 return ret;
553 }
554
555 if (sdev->host_resv_ranges) {
556 error_setg(errp, "%s virtio-iommu does not support aliased BDF",
557 __func__);
558 return ret;
559 }
560
561 range_inverse_array(iova_ranges,
562 &sdev->host_resv_ranges,
563 0, UINT64_MAX);
564 rebuild_resv_regions(sdev);
565
566 return 0;
567 }
568
569 static void virtio_iommu_unset_host_iova_ranges(VirtIOIOMMU *s, PCIBus *bus,
570 int devfn)
571 {
572 IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus);
573 IOMMUDevice *sdev;
574
575 if (!sbus) {
576 return;
577 }
578
579 sdev = sbus->pbdev[devfn];
580 if (!sdev) {
581 return;
582 }
583
584 g_list_free_full(g_steal_pointer(&sdev->host_resv_ranges), g_free);
585 g_list_free_full(sdev->resv_regions, g_free);
586 sdev->host_resv_ranges = NULL;
587 sdev->resv_regions = NULL;
588 add_prop_resv_regions(sdev);
589 }
590
591
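/*
 * Check whether a host device's page size mask is compatible with the
 * current config.page_size_mask. Once the granule is frozen, only the
 * frozen granule needs to be supported by the new mask.
 */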
592 static bool check_page_size_mask(VirtIOIOMMU *viommu, uint64_t new_mask,
593 Error **errp)
594 {
595 uint64_t cur_mask = viommu->config.page_size_mask;
596
597 if ((cur_mask & new_mask) == 0) {
598 error_setg(errp, "virtio-iommu reports a page size mask 0x%"PRIx64
599 " incompatible with currently supported mask 0x%"PRIx64,
600 new_mask, cur_mask);
601 return false;
602 }
603 /*
604 * Once the granule is frozen we can't change the mask anymore. If by
605 * chance the hotplugged device supports the same granule, we can still
606 * accept it.
607 */
608 if (viommu->granule_frozen) {
609 int cur_granule = ctz64(cur_mask);
610
611 if (!(BIT_ULL(cur_granule) & new_mask)) {
612 error_setg(errp,
613 "virtio-iommu does not support frozen granule 0x%llx",
614 BIT_ULL(cur_granule));
615 return false;
616 }
617 }
618 return true;
619 }
620
621 static bool virtio_iommu_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
622 HostIOMMUDevice *hiod, Error **errp)
623 {
624 ERRP_GUARD();
625 VirtIOIOMMU *viommu = opaque;
626 HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
627 struct hiod_key *new_key;
628 GList *host_iova_ranges = NULL;
629
630 assert(hiod);
631
632 if (get_host_iommu_device(viommu, bus, devfn)) {
633 error_setg(errp, "Host IOMMU device already exists");
634 return false;
635 }
636
637 if (hiodc->get_iova_ranges) {
638 int ret;
639 host_iova_ranges = hiodc->get_iova_ranges(hiod);
640 if (!host_iova_ranges) {
641 return true; /* some old kernels may not support that capability */
642 }
643 ret = virtio_iommu_set_host_iova_ranges(viommu, hiod->aliased_bus,
644 hiod->aliased_devfn,
645 host_iova_ranges, errp);
646 if (ret) {
647 goto error;
648 }
649 }
650 if (hiodc->get_page_size_mask) {
651 uint64_t new_mask = hiodc->get_page_size_mask(hiod);
652
653 if (check_page_size_mask(viommu, new_mask, errp)) {
654 /*
655 * The default mask depends on the "granule" property. For example,
656 * with 4k granule, it is -(4 * KiB). When an assigned device has
657 * page size restrictions due to the hardware IOMMU configuration,
658 * apply this restriction to the mask.
659 */
660 trace_virtio_iommu_update_page_size_mask(hiod->name,
661 viommu->config.page_size_mask,
662 new_mask);
663 if (!viommu->granule_frozen) {
664 viommu->config.page_size_mask &= new_mask;
665 }
666 } else {
667 error_prepend(errp, "%s: ", hiod->name);
668 goto error;
669 }
670 }
671
672 new_key = g_malloc(sizeof(*new_key));
673 new_key->bus = bus;
674 new_key->devfn = devfn;
675
676 object_ref(hiod);
677 g_hash_table_insert(viommu->host_iommu_devices, new_key, hiod);
678 g_list_free_full(host_iova_ranges, g_free);
679
680 return true;
681 error:
682 g_list_free_full(host_iova_ranges, g_free);
683 return false;
684 }
685
686 static void
687 virtio_iommu_unset_iommu_device(PCIBus *bus, void *opaque, int devfn)
688 {
689 VirtIOIOMMU *viommu = opaque;
690 HostIOMMUDevice *hiod;
691 struct hiod_key key = {
692 .bus = bus,
693 .devfn = devfn,
694 };
695
696 hiod = g_hash_table_lookup(viommu->host_iommu_devices, &key);
697 if (!hiod) {
698 return;
699 }
700 virtio_iommu_unset_host_iova_ranges(viommu, hiod->aliased_bus,
701 hiod->aliased_devfn);
702
703 g_hash_table_remove(viommu->host_iommu_devices, &key);
704 }
705
706 static const PCIIOMMUOps virtio_iommu_ops = {
707 .get_address_space = virtio_iommu_find_add_as,
708 .set_iommu_device = virtio_iommu_set_iommu_device,
709 .unset_iommu_device = virtio_iommu_unset_iommu_device,
710 };
711
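/*
 * VIRTIO_IOMMU_T_ATTACH: attach an endpoint to a domain. If the endpoint
 * is already attached to another domain it is detached first; the target
 * domain's existing mappings are then replayed on the endpoint's IOMMU
 * memory region.
 */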
712 static int virtio_iommu_attach(VirtIOIOMMU *s,
713 struct virtio_iommu_req_attach *req)
714 {
715 uint32_t domain_id = le32_to_cpu(req->domain);
716 uint32_t ep_id = le32_to_cpu(req->endpoint);
717 uint32_t flags = le32_to_cpu(req->flags);
718 VirtIOIOMMUDomain *domain;
719 VirtIOIOMMUEndpoint *ep;
720 IOMMUDevice *sdev;
721
722 trace_virtio_iommu_attach(domain_id, ep_id);
723
724 if (flags & ~VIRTIO_IOMMU_ATTACH_F_BYPASS) {
725 return VIRTIO_IOMMU_S_INVAL;
726 }
727
728 ep = virtio_iommu_get_endpoint(s, ep_id);
729 if (!ep) {
730 return VIRTIO_IOMMU_S_NOENT;
731 }
732
733 if (ep->domain) {
734 VirtIOIOMMUDomain *previous_domain = ep->domain;
735 /*
736 * the device is already attached to a domain,
737 * detach it first
738 */
739 virtio_iommu_detach_endpoint_from_domain(ep);
740 if (QLIST_EMPTY(&previous_domain->endpoint_list)) {
741 g_tree_remove(s->domains, GUINT_TO_POINTER(previous_domain->id));
742 }
743 }
744
745 domain = virtio_iommu_get_domain(s, domain_id,
746 flags & VIRTIO_IOMMU_ATTACH_F_BYPASS);
747 if (!domain) {
748 /* Incompatible bypass flag */
749 return VIRTIO_IOMMU_S_INVAL;
750 }
751 QLIST_INSERT_HEAD(&domain->endpoint_list, ep, next);
752
753 ep->domain = domain;
754 sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr);
755 virtio_iommu_switch_address_space(sdev);
756
757 /* Replay domain mappings on the associated memory region */
758 g_tree_foreach(domain->mappings, virtio_iommu_notify_map_cb,
759 ep->iommu_mr);
760
761 return VIRTIO_IOMMU_S_OK;
762 }
763
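/*
 * VIRTIO_IOMMU_T_DETACH: detach an endpoint from the given domain. The
 * endpoint is destroyed, and the domain as well once its endpoint list
 * becomes empty.
 */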
764 static int virtio_iommu_detach(VirtIOIOMMU *s,
765 struct virtio_iommu_req_detach *req)
766 {
767 uint32_t domain_id = le32_to_cpu(req->domain);
768 uint32_t ep_id = le32_to_cpu(req->endpoint);
769 VirtIOIOMMUDomain *domain;
770 VirtIOIOMMUEndpoint *ep;
771
772 trace_virtio_iommu_detach(domain_id, ep_id);
773
774 ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id));
775 if (!ep) {
776 return VIRTIO_IOMMU_S_NOENT;
777 }
778
779 domain = ep->domain;
780
781 if (!domain || domain->id != domain_id) {
782 return VIRTIO_IOMMU_S_INVAL;
783 }
784
785 virtio_iommu_detach_endpoint_from_domain(ep);
786
787 if (QLIST_EMPTY(&domain->endpoint_list)) {
788 g_tree_remove(s->domains, GUINT_TO_POINTER(domain->id));
789 }
790 g_tree_remove(s->endpoints, GUINT_TO_POINTER(ep_id));
791 return VIRTIO_IOMMU_S_OK;
792 }
793
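/*
 * VIRTIO_IOMMU_T_MAP: insert a new [virt_start, virt_end] -> phys_start
 * mapping into the domain's interval tree and send MAP notifications on
 * all attached endpoints. Overlapping an existing mapping is rejected.
 */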
794 static int virtio_iommu_map(VirtIOIOMMU *s,
795 struct virtio_iommu_req_map *req)
796 {
797 uint32_t domain_id = le32_to_cpu(req->domain);
798 uint64_t phys_start = le64_to_cpu(req->phys_start);
799 uint64_t virt_start = le64_to_cpu(req->virt_start);
800 uint64_t virt_end = le64_to_cpu(req->virt_end);
801 uint32_t flags = le32_to_cpu(req->flags);
802 VirtIOIOMMUDomain *domain;
803 VirtIOIOMMUInterval *interval;
804 VirtIOIOMMUMapping *mapping;
805 VirtIOIOMMUEndpoint *ep;
806
807 if (flags & ~VIRTIO_IOMMU_MAP_F_MASK) {
808 return VIRTIO_IOMMU_S_INVAL;
809 }
810
811 domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
812 if (!domain) {
813 return VIRTIO_IOMMU_S_NOENT;
814 }
815
816 if (domain->bypass) {
817 return VIRTIO_IOMMU_S_INVAL;
818 }
819
820 interval = g_malloc0(sizeof(*interval));
821
822 interval->low = virt_start;
823 interval->high = virt_end;
824
825 mapping = g_tree_lookup(domain->mappings, (gpointer)interval);
826 if (mapping) {
827 g_free(interval);
828 return VIRTIO_IOMMU_S_INVAL;
829 }
830
831 trace_virtio_iommu_map(domain_id, virt_start, virt_end, phys_start, flags);
832
833 mapping = g_malloc0(sizeof(*mapping));
834 mapping->phys_addr = phys_start;
835 mapping->flags = flags;
836
837 g_tree_insert(domain->mappings, interval, mapping);
838
839 QLIST_FOREACH(ep, &domain->endpoint_list, next) {
840 virtio_iommu_notify_map(ep->iommu_mr, virt_start, virt_end, phys_start,
841 flags);
842 }
843
844 return VIRTIO_IOMMU_S_OK;
845 }
846
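/*
 * VIRTIO_IOMMU_T_UNMAP: remove every mapping fully contained in
 * [virt_start, virt_end] and send UNMAP notifications. A mapping that
 * only partially overlaps the range yields VIRTIO_IOMMU_S_RANGE.
 */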
847 static int virtio_iommu_unmap(VirtIOIOMMU *s,
848 struct virtio_iommu_req_unmap *req)
849 {
850 uint32_t domain_id = le32_to_cpu(req->domain);
851 uint64_t virt_start = le64_to_cpu(req->virt_start);
852 uint64_t virt_end = le64_to_cpu(req->virt_end);
853 VirtIOIOMMUMapping *iter_val;
854 VirtIOIOMMUInterval interval, *iter_key;
855 VirtIOIOMMUDomain *domain;
856 VirtIOIOMMUEndpoint *ep;
857 int ret = VIRTIO_IOMMU_S_OK;
858
859 trace_virtio_iommu_unmap(domain_id, virt_start, virt_end);
860
861 domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
862 if (!domain) {
863 return VIRTIO_IOMMU_S_NOENT;
864 }
865
866 if (domain->bypass) {
867 return VIRTIO_IOMMU_S_INVAL;
868 }
869
870 interval.low = virt_start;
871 interval.high = virt_end;
872
873 while (g_tree_lookup_extended(domain->mappings, &interval,
874 (void **)&iter_key, (void**)&iter_val)) {
875 uint64_t current_low = iter_key->low;
876 uint64_t current_high = iter_key->high;
877
878 if (interval.low <= current_low && interval.high >= current_high) {
879 QLIST_FOREACH(ep, &domain->endpoint_list, next) {
880 virtio_iommu_notify_unmap(ep->iommu_mr, current_low,
881 current_high);
882 }
883 g_tree_remove(domain->mappings, iter_key);
884 trace_virtio_iommu_unmap_done(domain_id, current_low, current_high);
885 } else {
886 ret = VIRTIO_IOMMU_S_RANGE;
887 break;
888 }
889 }
890 return ret;
891 }
892
893 static ssize_t virtio_iommu_fill_resv_mem_prop(IOMMUDevice *sdev, uint32_t ep,
894 uint8_t *buf, size_t free)
895 {
896 struct virtio_iommu_probe_resv_mem prop = {};
897 size_t size = sizeof(prop), length = size - sizeof(prop.head), total;
898 GList *l;
899
900 total = size * g_list_length(sdev->resv_regions);
901 if (total > free) {
902 return -ENOSPC;
903 }
904
905 for (l = sdev->resv_regions; l; l = l->next) {
906 ReservedRegion *reg = l->data;
907 unsigned subtype = reg->type;
908 Range *range = ®->range;
909
910 assert(subtype == VIRTIO_IOMMU_RESV_MEM_T_RESERVED ||
911 subtype == VIRTIO_IOMMU_RESV_MEM_T_MSI);
912 prop.head.type = cpu_to_le16(VIRTIO_IOMMU_PROBE_T_RESV_MEM);
913 prop.head.length = cpu_to_le16(length);
914 prop.subtype = subtype;
915 prop.start = cpu_to_le64(range_lob(range));
916 prop.end = cpu_to_le64(range_upb(range));
917
918 memcpy(buf, &prop, size);
919
920 trace_virtio_iommu_fill_resv_property(ep, prop.subtype,
921 prop.start, prop.end);
922 buf += size;
923 }
924 return total;
925 }
926
927 /**
928 * virtio_iommu_probe - Fill the probe request buffer with
929 * the properties the device is able to return
930 */
931 static int virtio_iommu_probe(VirtIOIOMMU *s,
932 struct virtio_iommu_req_probe *req,
933 uint8_t *buf)
934 {
935 uint32_t ep_id = le32_to_cpu(req->endpoint);
936 IOMMUMemoryRegion *iommu_mr = virtio_iommu_mr(s, ep_id);
937 size_t free = VIOMMU_PROBE_SIZE;
938 IOMMUDevice *sdev;
939 ssize_t count;
940
941 if (!iommu_mr) {
942 return VIRTIO_IOMMU_S_NOENT;
943 }
944
945 sdev = container_of(iommu_mr, IOMMUDevice, iommu_mr);
946
947 count = virtio_iommu_fill_resv_mem_prop(sdev, ep_id, buf, free);
948 if (count < 0) {
949 return VIRTIO_IOMMU_S_INVAL;
950 }
951 buf += count;
952 free -= count;
953
954 return VIRTIO_IOMMU_S_OK;
955 }
956
957 static int virtio_iommu_iov_to_req(struct iovec *iov,
958 unsigned int iov_cnt,
959 void *req, size_t payload_sz)
960 {
961 size_t sz = iov_to_buf(iov, iov_cnt, 0, req, payload_sz);
962
963 if (unlikely(sz != payload_sz)) {
964 return VIRTIO_IOMMU_S_INVAL;
965 }
966 return 0;
967 }
968
969 #define virtio_iommu_handle_req(__req) \
970 static int virtio_iommu_handle_ ## __req(VirtIOIOMMU *s, \
971 struct iovec *iov, \
972 unsigned int iov_cnt) \
973 { \
974 struct virtio_iommu_req_ ## __req req; \
975 int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, \
976 sizeof(req) - sizeof(struct virtio_iommu_req_tail));\
977 \
978 return ret ? ret : virtio_iommu_ ## __req(s, &req); \
979 }
980
981 virtio_iommu_handle_req(attach)
982 virtio_iommu_handle_req(detach)
983 virtio_iommu_handle_req(map)
984 virtio_iommu_handle_req(unmap)
985
986 static int virtio_iommu_handle_probe(VirtIOIOMMU *s,
987 struct iovec *iov,
988 unsigned int iov_cnt,
989 uint8_t *buf)
990 {
991 struct virtio_iommu_req_probe req;
992 int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, sizeof(req));
993
994 return ret ? ret : virtio_iommu_probe(s, &req, buf);
995 }
996
997 static void virtio_iommu_handle_command(VirtIODevice *vdev, VirtQueue *vq)
998 {
999 VirtIOIOMMU *s = VIRTIO_IOMMU(vdev);
1000 struct virtio_iommu_req_head head;
1001 struct virtio_iommu_req_tail tail = {};
1002 VirtQueueElement *elem;
1003 unsigned int iov_cnt;
1004 struct iovec *iov;
1005 void *buf = NULL;
1006 size_t sz;
1007
1008 for (;;) {
1009 size_t output_size = sizeof(tail);
1010
1011 elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
1012 if (!elem) {
1013 return;
1014 }
1015
1016 if (iov_size(elem->in_sg, elem->in_num) < sizeof(tail) ||
1017 iov_size(elem->out_sg, elem->out_num) < sizeof(head)) {
1018 virtio_error(vdev, "virtio-iommu bad head/tail size");
1019 virtqueue_detach_element(vq, elem, 0);
1020 g_free(elem);
1021 break;
1022 }
1023
1024 iov_cnt = elem->out_num;
1025 iov = elem->out_sg;
1026 sz = iov_to_buf(iov, iov_cnt, 0, &head, sizeof(head));
1027 if (unlikely(sz != sizeof(head))) {
1028 qemu_log_mask(LOG_GUEST_ERROR,
1029 "%s: read %zu bytes from command head"
1030 "but expected %zu\n", __func__, sz, sizeof(head));
1031 tail.status = VIRTIO_IOMMU_S_DEVERR;
1032 goto out;
1033 }
1034 qemu_rec_mutex_lock(&s->mutex);
1035 switch (head.type) {
1036 case VIRTIO_IOMMU_T_ATTACH:
1037 tail.status = virtio_iommu_handle_attach(s, iov, iov_cnt);
1038 break;
1039 case VIRTIO_IOMMU_T_DETACH:
1040 tail.status = virtio_iommu_handle_detach(s, iov, iov_cnt);
1041 break;
1042 case VIRTIO_IOMMU_T_MAP:
1043 tail.status = virtio_iommu_handle_map(s, iov, iov_cnt);
1044 break;
1045 case VIRTIO_IOMMU_T_UNMAP:
1046 tail.status = virtio_iommu_handle_unmap(s, iov, iov_cnt);
1047 break;
1048 case VIRTIO_IOMMU_T_PROBE:
1049 {
1050 struct virtio_iommu_req_tail *ptail;
1051
1052 output_size = s->config.probe_size + sizeof(tail);
1053 buf = g_malloc0(output_size);
1054
1055 ptail = buf + s->config.probe_size;
1056 ptail->status = virtio_iommu_handle_probe(s, iov, iov_cnt, buf);
1057 break;
1058 }
1059 default:
1060 tail.status = VIRTIO_IOMMU_S_UNSUPP;
1061 }
1062 qemu_rec_mutex_unlock(&s->mutex);
1063
1064 out:
1065 sz = iov_from_buf(elem->in_sg, elem->in_num, 0,
1066 buf ? buf : &tail, output_size);
1067 if (unlikely(sz != output_size)) {
1068 qemu_log_mask(LOG_GUEST_ERROR,
1069 "%s: wrote %zu bytes to command response"
1070 "but response size is %zu\n",
1071 __func__, sz, output_size);
1072 tail.status = VIRTIO_IOMMU_S_DEVERR;
1073 /*
1074 * We checked at the beginning of the loop that sizeof(tail) fits
1075 * into elem->in_sg
1076 */
1077 output_size = sizeof(tail);
1078 g_free(buf);
1079 buf = NULL;
1080 sz = iov_from_buf(elem->in_sg,
1081 elem->in_num,
1082 0,
1083 &tail,
1084 output_size);
1085 }
1086 assert(sz == output_size);
1087
1088 virtqueue_push(vq, elem, sz);
1089 virtio_notify(vdev, vq);
1090 g_free(elem);
1091 g_free(buf);
1092 buf = NULL;
1093 }
1094 }
1095
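/*
 * Push a virtio_iommu_fault event on the event virtqueue to report a
 * translation fault to the guest, provided a buffer is available.
 */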
1096 static void virtio_iommu_report_fault(VirtIOIOMMU *viommu, uint8_t reason,
1097 int flags, uint32_t endpoint,
1098 uint64_t address)
1099 {
1100 VirtIODevice *vdev = &viommu->parent_obj;
1101 VirtQueue *vq = viommu->event_vq;
1102 struct virtio_iommu_fault fault;
1103 VirtQueueElement *elem;
1104 size_t sz;
1105
1106 memset(&fault, 0, sizeof(fault));
1107 fault.reason = reason;
1108 fault.flags = cpu_to_le32(flags);
1109 fault.endpoint = cpu_to_le32(endpoint);
1110 fault.address = cpu_to_le64(address);
1111
1112 elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
1113
1114 if (!elem) {
1115 error_report_once(
1116 "no buffer available in event queue to report event");
1117 return;
1118 }
1119
1120 if (iov_size(elem->in_sg, elem->in_num) < sizeof(fault)) {
1121 virtio_error(vdev, "error buffer of wrong size");
1122 virtqueue_detach_element(vq, elem, 0);
1123 g_free(elem);
1124 return;
1125 }
1126
1127 sz = iov_from_buf(elem->in_sg, elem->in_num, 0,
1128 &fault, sizeof(fault));
1129 assert(sz == sizeof(fault));
1130
1131 trace_virtio_iommu_report_fault(reason, flags, endpoint, address);
1132 virtqueue_push(vq, elem, sz);
1133 virtio_notify(vdev, vq);
1134 g_free(elem);
1135
1136 }
1137
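/*
 * Translate callback of the IOMMU memory region. Returns an identity,
 * full-permission entry when bypass applies (global, per-domain, or MSI
 * reserved region), the translated entry on success, and an IOMMU_NONE
 * entry after reporting a fault event otherwise.
 */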
1138 static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
1139 IOMMUAccessFlags flag,
1140 int iommu_idx)
1141 {
1142 IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
1143 VirtIOIOMMUInterval interval, *mapping_key;
1144 VirtIOIOMMUMapping *mapping_value;
1145 VirtIOIOMMU *s = sdev->viommu;
1146 bool read_fault, write_fault;
1147 VirtIOIOMMUEndpoint *ep;
1148 uint32_t sid, flags;
1149 bool bypass_allowed;
1150 int granule;
1151 bool found;
1152 GList *l;
1153
1154 interval.low = addr;
1155 interval.high = addr + 1;
1156 granule = ctz64(s->config.page_size_mask);
1157
1158 IOMMUTLBEntry entry = {
1159 .target_as = &address_space_memory,
1160 .iova = addr,
1161 .translated_addr = addr,
1162 .addr_mask = BIT_ULL(granule) - 1,
1163 .perm = IOMMU_NONE,
1164 };
1165
1166 bypass_allowed = s->config.bypass;
1167
1168 sid = virtio_iommu_get_bdf(sdev);
1169
1170 trace_virtio_iommu_translate(mr->parent_obj.name, sid, addr, flag);
1171 qemu_rec_mutex_lock(&s->mutex);
1172
1173 ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
1177
1178 if (!ep) {
1179 if (!bypass_allowed) {
1180 error_report_once("%s sid=%d is not known!!", __func__, sid);
1181 virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_UNKNOWN,
1182 VIRTIO_IOMMU_FAULT_F_ADDRESS,
1183 sid, addr);
1184 } else {
1185 entry.perm = flag;
1186 }
1187 goto unlock;
1188 }
1189
1190 for (l = sdev->resv_regions; l; l = l->next) {
1191 ReservedRegion *reg = l->data;
1192
1193 if (range_contains(®->range, addr)) {
1194 switch (reg->type) {
1195 case VIRTIO_IOMMU_RESV_MEM_T_MSI:
1196 entry.perm = flag;
1197 break;
1198 case VIRTIO_IOMMU_RESV_MEM_T_RESERVED:
1199 default:
1200 virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
1201 VIRTIO_IOMMU_FAULT_F_ADDRESS,
1202 sid, addr);
1203 break;
1204 }
1205 goto unlock;
1206 }
1207 }
1208
1209 if (!ep->domain) {
1210 if (!bypass_allowed) {
1211 error_report_once("%s %02x:%02x.%01x not attached to any domain",
1212 __func__, PCI_BUS_NUM(sid),
1213 PCI_SLOT(sid), PCI_FUNC(sid));
1214 virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_DOMAIN,
1215 VIRTIO_IOMMU_FAULT_F_ADDRESS,
1216 sid, addr);
1217 } else {
1218 entry.perm = flag;
1219 }
1220 goto unlock;
1221 } else if (ep->domain->bypass) {
1222 entry.perm = flag;
1223 goto unlock;
1224 }
1225
1226 found = g_tree_lookup_extended(ep->domain->mappings, (gpointer)(&interval),
1227 (void **)&mapping_key,
1228 (void **)&mapping_value);
1229 if (!found) {
1230 error_report_once("%s no mapping for 0x%"PRIx64" for sid=%d",
1231 __func__, addr, sid);
1232 virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
1233 VIRTIO_IOMMU_FAULT_F_ADDRESS,
1234 sid, addr);
1235 goto unlock;
1236 }
1237
1238 read_fault = (flag & IOMMU_RO) &&
1239 !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_READ);
1240 write_fault = (flag & IOMMU_WO) &&
1241 !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_WRITE);
1242
1243 flags = read_fault ? VIRTIO_IOMMU_FAULT_F_READ : 0;
1244 flags |= write_fault ? VIRTIO_IOMMU_FAULT_F_WRITE : 0;
1245 if (flags) {
1246 error_report_once("%s permission error on 0x%"PRIx64"(%d): allowed=%d",
1247 __func__, addr, flag, mapping_value->flags);
1248 flags |= VIRTIO_IOMMU_FAULT_F_ADDRESS;
1249 virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
1250 flags | VIRTIO_IOMMU_FAULT_F_ADDRESS,
1251 sid, addr);
1252 goto unlock;
1253 }
1254 entry.translated_addr = addr - mapping_key->low + mapping_value->phys_addr;
1255 entry.perm = flag;
1256 trace_virtio_iommu_translate_out(addr, entry.translated_addr, sid);
1257
1258 unlock:
1259 qemu_rec_mutex_unlock(&s->mutex);
1260 return entry;
1261 }
1262
1263 static void virtio_iommu_get_config(VirtIODevice *vdev, uint8_t *config_data)
1264 {
1265 VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev);
1266 struct virtio_iommu_config *dev_config = &dev->config;
1267 struct virtio_iommu_config *out_config = (void *)config_data;
1268
1269 out_config->page_size_mask = cpu_to_le64(dev_config->page_size_mask);
1270 out_config->input_range.start = cpu_to_le64(dev_config->input_range.start);
1271 out_config->input_range.end = cpu_to_le64(dev_config->input_range.end);
1272 out_config->domain_range.start = cpu_to_le32(dev_config->domain_range.start);
1273 out_config->domain_range.end = cpu_to_le32(dev_config->domain_range.end);
1274 out_config->probe_size = cpu_to_le32(dev_config->probe_size);
1275 out_config->bypass = dev_config->bypass;
1276
1277 trace_virtio_iommu_get_config(dev_config->page_size_mask,
1278 dev_config->input_range.start,
1279 dev_config->input_range.end,
1280 dev_config->domain_range.start,
1281 dev_config->domain_range.end,
1282 dev_config->probe_size,
1283 dev_config->bypass);
1284 }
1285
1286 static void virtio_iommu_set_config(VirtIODevice *vdev,
1287 const uint8_t *config_data)
1288 {
1289 VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev);
1290 struct virtio_iommu_config *dev_config = &dev->config;
1291 const struct virtio_iommu_config *in_config = (void *)config_data;
1292
1293 if (in_config->bypass != dev_config->bypass) {
1294 if (!virtio_vdev_has_feature(vdev, VIRTIO_IOMMU_F_BYPASS_CONFIG)) {
1295 virtio_error(vdev, "cannot set config.bypass");
1296 return;
1297 } else if (in_config->bypass != 0 && in_config->bypass != 1) {
1298 virtio_error(vdev, "invalid config.bypass value '%u'",
1299 in_config->bypass);
1300 return;
1301 }
1302 dev_config->bypass = in_config->bypass;
1303 virtio_iommu_switch_address_space_all(dev);
1304 }
1305
1306 trace_virtio_iommu_set_config(in_config->bypass);
1307 }
1308
1309 static uint64_t virtio_iommu_get_features(VirtIODevice *vdev, uint64_t f,
1310 Error **errp)
1311 {
1312 VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev);
1313
1314 f |= dev->features;
1315 trace_virtio_iommu_get_features(f);
1316 return f;
1317 }
1318
1319 static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
1320 {
1321 guint ua = GPOINTER_TO_UINT(a);
1322 guint ub = GPOINTER_TO_UINT(b);
1323 return (ua > ub) - (ua < ub);
1324 }
1325
1326 static gboolean virtio_iommu_remap(gpointer key, gpointer value, gpointer data)
1327 {
1328 VirtIOIOMMUMapping *mapping = (VirtIOIOMMUMapping *) value;
1329 VirtIOIOMMUInterval *interval = (VirtIOIOMMUInterval *) key;
1330 IOMMUMemoryRegion *mr = (IOMMUMemoryRegion *) data;
1331
1332 trace_virtio_iommu_remap(mr->parent_obj.name, interval->low, interval->high,
1333 mapping->phys_addr);
1334 virtio_iommu_notify_map(mr, interval->low, interval->high,
1335 mapping->phys_addr, mapping->flags);
1336 return false;
1337 }
1338
1339 static void virtio_iommu_replay(IOMMUMemoryRegion *mr, IOMMUNotifier *n)
1340 {
1341 IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
1342 VirtIOIOMMU *s = sdev->viommu;
1343 uint32_t sid;
1344 VirtIOIOMMUEndpoint *ep;
1345
1346 sid = virtio_iommu_get_bdf(sdev);
1347
1348 qemu_rec_mutex_lock(&s->mutex);
1349
1350 if (!s->endpoints) {
1351 goto unlock;
1352 }
1353
1354 ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
1355 if (!ep || !ep->domain) {
1356 goto unlock;
1357 }
1358
1359 g_tree_foreach(ep->domain->mappings, virtio_iommu_remap, mr);
1360
1361 unlock:
1362 qemu_rec_mutex_unlock(&s->mutex);
1363 }
1364
1365 static int virtio_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu_mr,
1366 IOMMUNotifierFlag old,
1367 IOMMUNotifierFlag new,
1368 Error **errp)
1369 {
1370 if (new & IOMMU_NOTIFIER_DEVIOTLB_UNMAP) {
1371 error_setg(errp, "Virtio-iommu does not support dev-iotlb yet");
1372 return -EINVAL;
1373 }
1374
1375 if (old == IOMMU_NOTIFIER_NONE) {
1376 trace_virtio_iommu_notify_flag_add(iommu_mr->parent_obj.name);
1377 } else if (new == IOMMU_NOTIFIER_NONE) {
1378 trace_virtio_iommu_notify_flag_del(iommu_mr->parent_obj.name);
1379 }
1380 return 0;
1381 }
1382
1383 static void virtio_iommu_system_reset(void *opaque)
1384 {
1385 VirtIOIOMMU *s = opaque;
1386
1387 trace_virtio_iommu_system_reset();
1388
1389 memset(s->iommu_pcibus_by_bus_num, 0, sizeof(s->iommu_pcibus_by_bus_num));
1390
1391 /*
1392 * config.bypass is sticky across device reset, but should be restored on
1393 * system reset
1394 */
1395 s->config.bypass = s->boot_bypass;
1396 virtio_iommu_switch_address_space_all(s);
1397
1398 }
1399
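/*
 * Machine-done notifier: from this point on config.page_size_mask is
 * frozen and cannot be further restricted by hotplugged devices.
 */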
1400 static void virtio_iommu_freeze_granule(Notifier *notifier, void *data)
1401 {
1402 VirtIOIOMMU *s = container_of(notifier, VirtIOIOMMU, machine_done);
1403 int granule;
1404
1405 s->granule_frozen = true;
1406 granule = ctz64(s->config.page_size_mask);
1407 trace_virtio_iommu_freeze_granule(BIT_ULL(granule));
1408 }
1409
1410 static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
1411 {
1412 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
1413 VirtIOIOMMU *s = VIRTIO_IOMMU(dev);
1414
1415 virtio_init(vdev, VIRTIO_ID_IOMMU, sizeof(struct virtio_iommu_config));
1416
1417 s->req_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE,
1418 virtio_iommu_handle_command);
1419 s->event_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, NULL);
1420
1421 /*
1422 * config.bypass is needed to get the initial address space early,
1423 * e.g. at vfio realize time
1424 */
1425 s->config.bypass = s->boot_bypass;
1426 if (s->aw_bits < 32 || s->aw_bits > 64) {
1427 error_setg(errp, "aw-bits must be within [32,64]");
1428 return;
1429 }
1430 s->config.input_range.end =
1431 s->aw_bits == 64 ? UINT64_MAX : BIT_ULL(s->aw_bits) - 1;
1432
1433 switch (s->granule_mode) {
1434 case GRANULE_MODE_4K:
1435 s->config.page_size_mask = -(4 * KiB);
1436 break;
1437 case GRANULE_MODE_8K:
1438 s->config.page_size_mask = -(8 * KiB);
1439 break;
1440 case GRANULE_MODE_16K:
1441 s->config.page_size_mask = -(16 * KiB);
1442 break;
1443 case GRANULE_MODE_64K:
1444 s->config.page_size_mask = -(64 * KiB);
1445 break;
1446 case GRANULE_MODE_HOST:
1447 s->config.page_size_mask = qemu_real_host_page_mask();
1448 break;
1449 default:
1450 error_setg(errp, "Unsupported granule mode");
1451 }
1452 s->config.domain_range.end = UINT32_MAX;
1453 s->config.probe_size = VIOMMU_PROBE_SIZE;
1454
1455 virtio_add_feature(&s->features, VIRTIO_RING_F_EVENT_IDX);
1456 virtio_add_feature(&s->features, VIRTIO_RING_F_INDIRECT_DESC);
1457 virtio_add_feature(&s->features, VIRTIO_F_VERSION_1);
1458 virtio_add_feature(&s->features, VIRTIO_IOMMU_F_INPUT_RANGE);
1459 virtio_add_feature(&s->features, VIRTIO_IOMMU_F_DOMAIN_RANGE);
1460 virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MAP_UNMAP);
1461 virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MMIO);
1462 virtio_add_feature(&s->features, VIRTIO_IOMMU_F_PROBE);
1463 virtio_add_feature(&s->features, VIRTIO_IOMMU_F_BYPASS_CONFIG);
1464
1465 qemu_rec_mutex_init(&s->mutex);
1466
1467 s->as_by_busptr = g_hash_table_new_full(NULL, NULL, NULL, g_free);
1468
1469 s->host_iommu_devices = g_hash_table_new_full(hiod_hash, hiod_equal,
1470 g_free, hiod_destroy);
1471
1472 if (s->primary_bus) {
1473 pci_setup_iommu(s->primary_bus, &virtio_iommu_ops, s);
1474 } else {
1475 error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!");
1476 }
1477
1478 s->machine_done.notify = virtio_iommu_freeze_granule;
1479 qemu_add_machine_init_done_notifier(&s->machine_done);
1480
1481 qemu_register_reset(virtio_iommu_system_reset, s);
1482 }
1483
1484 static void virtio_iommu_device_unrealize(DeviceState *dev)
1485 {
1486 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
1487 VirtIOIOMMU *s = VIRTIO_IOMMU(dev);
1488
1489 qemu_unregister_reset(virtio_iommu_system_reset, s);
1490 qemu_remove_machine_init_done_notifier(&s->machine_done);
1491
1492 g_hash_table_destroy(s->as_by_busptr);
1493 if (s->domains) {
1494 g_tree_destroy(s->domains);
1495 }
1496 if (s->endpoints) {
1497 g_tree_destroy(s->endpoints);
1498 }
1499
1500 qemu_rec_mutex_destroy(&s->mutex);
1501
1502 virtio_delete_queue(s->req_vq);
1503 virtio_delete_queue(s->event_vq);
1504 virtio_cleanup(vdev);
1505 }
1506
1507 static void virtio_iommu_device_reset(VirtIODevice *vdev)
1508 {
1509 VirtIOIOMMU *s = VIRTIO_IOMMU(vdev);
1510
1511 trace_virtio_iommu_device_reset();
1512
1513 if (s->domains) {
1514 g_tree_destroy(s->domains);
1515 }
1516 if (s->endpoints) {
1517 g_tree_destroy(s->endpoints);
1518 }
1519 s->domains = g_tree_new_full((GCompareDataFunc)int_cmp,
1520 NULL, NULL, virtio_iommu_put_domain);
1521 s->endpoints = g_tree_new_full((GCompareDataFunc)int_cmp,
1522 NULL, NULL, virtio_iommu_put_endpoint);
1523 }
1524
1525 static void virtio_iommu_set_status(VirtIODevice *vdev, uint8_t status)
1526 {
1527 trace_virtio_iommu_device_status(status);
1528 }
1529
1530 static void virtio_iommu_instance_init(Object *obj)
1531 {
1532 }
1533
1534 #define VMSTATE_INTERVAL \
1535 { \
1536 .name = "interval", \
1537 .version_id = 1, \
1538 .minimum_version_id = 1, \
1539 .fields = (const VMStateField[]) { \
1540 VMSTATE_UINT64(low, VirtIOIOMMUInterval), \
1541 VMSTATE_UINT64(high, VirtIOIOMMUInterval), \
1542 VMSTATE_END_OF_LIST() \
1543 } \
1544 }
1545
1546 #define VMSTATE_MAPPING \
1547 { \
1548 .name = "mapping", \
1549 .version_id = 1, \
1550 .minimum_version_id = 1, \
1551 .fields = (const VMStateField[]) { \
1552 VMSTATE_UINT64(phys_addr, VirtIOIOMMUMapping),\
1553 VMSTATE_UINT32(flags, VirtIOIOMMUMapping), \
1554 VMSTATE_END_OF_LIST() \
1555 }, \
1556 }
1557
1558 static const VMStateDescription vmstate_interval_mapping[2] = {
1559 VMSTATE_MAPPING, /* value */
1560 VMSTATE_INTERVAL /* key */
1561 };
1562
1563 static int domain_preload(void *opaque)
1564 {
1565 VirtIOIOMMUDomain *domain = opaque;
1566
1567 domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp,
1568 NULL, g_free, g_free);
1569 return 0;
1570 }
1571
1572 static const VMStateDescription vmstate_endpoint = {
1573 .name = "endpoint",
1574 .version_id = 1,
1575 .minimum_version_id = 1,
1576 .fields = (const VMStateField[]) {
1577 VMSTATE_UINT32(id, VirtIOIOMMUEndpoint),
1578 VMSTATE_END_OF_LIST()
1579 }
1580 };
1581
1582 static const VMStateDescription vmstate_domain = {
1583 .name = "domain",
1584 .version_id = 2,
1585 .minimum_version_id = 2,
1586 .pre_load = domain_preload,
1587 .fields = (const VMStateField[]) {
1588 VMSTATE_UINT32(id, VirtIOIOMMUDomain),
1589 VMSTATE_GTREE_V(mappings, VirtIOIOMMUDomain, 1,
1590 vmstate_interval_mapping,
1591 VirtIOIOMMUInterval, VirtIOIOMMUMapping),
1592 VMSTATE_QLIST_V(endpoint_list, VirtIOIOMMUDomain, 1,
1593 vmstate_endpoint, VirtIOIOMMUEndpoint, next),
1594 VMSTATE_BOOL_V(bypass, VirtIOIOMMUDomain, 2),
1595 VMSTATE_END_OF_LIST()
1596 }
1597 };
1598
1599 static gboolean reconstruct_endpoints(gpointer key, gpointer value,
1600 gpointer data)
1601 {
1602 VirtIOIOMMU *s = (VirtIOIOMMU *)data;
1603 VirtIOIOMMUDomain *d = (VirtIOIOMMUDomain *)value;
1604 VirtIOIOMMUEndpoint *iter;
1605 IOMMUMemoryRegion *mr;
1606
1607 QLIST_FOREACH(iter, &d->endpoint_list, next) {
1608 mr = virtio_iommu_mr(s, iter->id);
1609 assert(mr);
1610
1611 iter->domain = d;
1612 iter->iommu_mr = mr;
1613 g_tree_insert(s->endpoints, GUINT_TO_POINTER(iter->id), iter);
1614 }
1615 return false; /* continue the domain traversal */
1616 }
1617
1618 static int iommu_post_load(void *opaque, int version_id)
1619 {
1620 VirtIOIOMMU *s = opaque;
1621
1622 g_tree_foreach(s->domains, reconstruct_endpoints, s);
1623
1624 /*
1625 * Memory regions are dynamically turned on/off depending on
1626 * 'config.bypass' and on the type of the attached domain, if any.
1627 * After migration, we need to make sure the memory regions are
1628 * still correct.
1629 */
1630 virtio_iommu_switch_address_space_all(s);
1631 return 0;
1632 }
1633
1634 static const VMStateDescription vmstate_virtio_iommu_device = {
1635 .name = "virtio-iommu-device",
1636 .minimum_version_id = 2,
1637 .version_id = 2,
1638 .post_load = iommu_post_load,
1639 .fields = (const VMStateField[]) {
1640 VMSTATE_GTREE_DIRECT_KEY_V(domains, VirtIOIOMMU, 2,
1641 &vmstate_domain, VirtIOIOMMUDomain),
1642 VMSTATE_UINT8_V(config.bypass, VirtIOIOMMU, 2),
1643 VMSTATE_END_OF_LIST()
1644 },
1645 };
1646
1647 static const VMStateDescription vmstate_virtio_iommu = {
1648 .name = "virtio-iommu",
1649 .minimum_version_id = 2,
1650 .priority = MIG_PRI_IOMMU,
1651 .version_id = 2,
1652 .fields = (const VMStateField[]) {
1653 VMSTATE_VIRTIO_DEVICE,
1654 VMSTATE_END_OF_LIST()
1655 },
1656 };
1657
1658 static Property virtio_iommu_properties[] = {
1659 DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus,
1660 TYPE_PCI_BUS, PCIBus *),
1661 DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true),
1662 DEFINE_PROP_GRANULE_MODE("granule", VirtIOIOMMU, granule_mode,
1663 GRANULE_MODE_HOST),
1664 DEFINE_PROP_UINT8("aw-bits", VirtIOIOMMU, aw_bits, 64),
1665 DEFINE_PROP_END_OF_LIST(),
1666 };
1667
1668 static void virtio_iommu_class_init(ObjectClass *klass, void *data)
1669 {
1670 DeviceClass *dc = DEVICE_CLASS(klass);
1671 VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
1672
1673 device_class_set_props(dc, virtio_iommu_properties);
1674 dc->vmsd = &vmstate_virtio_iommu;
1675
1676 set_bit(DEVICE_CATEGORY_MISC, dc->categories);
1677 vdc->realize = virtio_iommu_device_realize;
1678 vdc->unrealize = virtio_iommu_device_unrealize;
1679 vdc->reset = virtio_iommu_device_reset;
1680 vdc->get_config = virtio_iommu_get_config;
1681 vdc->set_config = virtio_iommu_set_config;
1682 vdc->get_features = virtio_iommu_get_features;
1683 vdc->set_status = virtio_iommu_set_status;
1684 vdc->vmsd = &vmstate_virtio_iommu_device;
1685 }
1686
1687 static void virtio_iommu_memory_region_class_init(ObjectClass *klass,
1688 void *data)
1689 {
1690 IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
1691
1692 imrc->translate = virtio_iommu_translate;
1693 imrc->replay = virtio_iommu_replay;
1694 imrc->notify_flag_changed = virtio_iommu_notify_flag_changed;
1695 }
1696
1697 static const TypeInfo virtio_iommu_info = {
1698 .name = TYPE_VIRTIO_IOMMU,
1699 .parent = TYPE_VIRTIO_DEVICE,
1700 .instance_size = sizeof(VirtIOIOMMU),
1701 .instance_init = virtio_iommu_instance_init,
1702 .class_init = virtio_iommu_class_init,
1703 };
1704
1705 static const TypeInfo virtio_iommu_memory_region_info = {
1706 .parent = TYPE_IOMMU_MEMORY_REGION,
1707 .name = TYPE_VIRTIO_IOMMU_MEMORY_REGION,
1708 .class_init = virtio_iommu_memory_region_class_init,
1709 };
1710
1711 static void virtio_register_types(void)
1712 {
1713 type_register_static(&virtio_iommu_info);
1714 type_register_static(&virtio_iommu_memory_region_info);
1715 }
1716
1717 type_init(virtio_register_types)
1718