xref: /openbmc/qemu/hw/i386/amd_iommu.c (revision ee48fef0)
1 /*
2  * QEMU emulation of AMD IOMMU (AMD-Vi)
3  *
4  * Copyright (C) 2011 Eduard - Gabriel Munteanu
5  * Copyright (C) 2015, 2016 David Kiarie Kahurani
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11 
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16 
17  * You should have received a copy of the GNU General Public License along
18  * with this program; if not, see <http://www.gnu.org/licenses/>.
19  *
20  * Cache implementation inspired by hw/i386/intel_iommu.c
21  */
22 
23 #include "qemu/osdep.h"
24 #include "hw/i386/pc.h"
25 #include "hw/pci/msi.h"
26 #include "hw/pci/pci_bus.h"
27 #include "migration/vmstate.h"
28 #include "amd_iommu.h"
29 #include "qapi/error.h"
30 #include "qemu/error-report.h"
31 #include "hw/i386/apic_internal.h"
32 #include "trace.h"
33 #include "hw/i386/apic-msidef.h"
34 #include "hw/qdev-properties.h"
35 
36 /* AMD-Vi MMIO register names, used for tracing */
37 const char *amdvi_mmio_low[] = {
38     "AMDVI_MMIO_DEVTAB_BASE",
39     "AMDVI_MMIO_CMDBUF_BASE",
40     "AMDVI_MMIO_EVTLOG_BASE",
41     "AMDVI_MMIO_CONTROL",
42     "AMDVI_MMIO_EXCL_BASE",
43     "AMDVI_MMIO_EXCL_LIMIT",
44     "AMDVI_MMIO_EXT_FEATURES",
45     "AMDVI_MMIO_PPR_BASE",
46     "UNHANDLED"
47 };
48 const char *amdvi_mmio_high[] = {
49     "AMDVI_MMIO_COMMAND_HEAD",
50     "AMDVI_MMIO_COMMAND_TAIL",
51     "AMDVI_MMIO_EVTLOG_HEAD",
52     "AMDVI_MMIO_EVTLOG_TAIL",
53     "AMDVI_MMIO_STATUS",
54     "AMDVI_MMIO_PPR_HEAD",
55     "AMDVI_MMIO_PPR_TAIL",
56     "UNHANDLED"
57 };
58 
59 struct AMDVIAddressSpace {
60     uint8_t bus_num;            /* bus number                           */
61     uint8_t devfn;              /* device function                      */
62     AMDVIState *iommu_state;    /* AMDVI - one per machine              */
63     MemoryRegion root;          /* AMDVI Root memory map region */
64     IOMMUMemoryRegion iommu;    /* Device's address translation region  */
65     MemoryRegion iommu_ir;      /* Device's interrupt remapping region  */
66     AddressSpace as;            /* device's corresponding address space */
67 };
68 
69 /* AMDVI cache entry */
70 typedef struct AMDVIIOTLBEntry {
71     uint16_t domid;             /* assigned domain id  */
72     uint16_t devid;             /* device owning entry */
73     uint64_t perms;             /* access permissions  */
74     uint64_t translated_addr;   /* translated address  */
75     uint64_t page_mask;         /* physical page size  */
76 } AMDVIIOTLBEntry;
77 
78 uint64_t amdvi_extended_feature_register(AMDVIState *s)
79 {
80     uint64_t feature = AMDVI_DEFAULT_EXT_FEATURES;
81     if (s->xtsup) {
82         feature |= AMDVI_FEATURE_XT;
83     }
84 
85     return feature;
86 }
87 
88 /* configure MMIO registers at startup/reset */
89 static void amdvi_set_quad(AMDVIState *s, hwaddr addr, uint64_t val,
90                            uint64_t romask, uint64_t w1cmask)
91 {
92     stq_le_p(&s->mmior[addr], val);
93     stq_le_p(&s->romask[addr], romask);
94     stq_le_p(&s->w1cmask[addr], w1cmask);
95 }
96 
97 static uint16_t amdvi_readw(AMDVIState *s, hwaddr addr)
98 {
99     return lduw_le_p(&s->mmior[addr]);
100 }
101 
102 static uint32_t amdvi_readl(AMDVIState *s, hwaddr addr)
103 {
104     return ldl_le_p(&s->mmior[addr]);
105 }
106 
107 static uint64_t amdvi_readq(AMDVIState *s, hwaddr addr)
108 {
109     return ldq_le_p(&s->mmior[addr]);
110 }
111 
112 /* internal write */
113 static void amdvi_writeq_raw(AMDVIState *s, hwaddr addr, uint64_t val)
114 {
115     stq_le_p(&s->mmior[addr], val);
116 }
117 
118 /* external write */
119 static void amdvi_writew(AMDVIState *s, hwaddr addr, uint16_t val)
120 {
121     uint16_t romask = lduw_le_p(&s->romask[addr]);
122     uint16_t w1cmask = lduw_le_p(&s->w1cmask[addr]);
123     uint16_t oldval = lduw_le_p(&s->mmior[addr]);
124     stw_le_p(&s->mmior[addr],
125             ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
126 }
127 
128 static void amdvi_writel(AMDVIState *s, hwaddr addr, uint32_t val)
129 {
130     uint32_t romask = ldl_le_p(&s->romask[addr]);
131     uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]);
132     uint32_t oldval = ldl_le_p(&s->mmior[addr]);
133     stl_le_p(&s->mmior[addr],
134             ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
135 }
136 
137 static void amdvi_writeq(AMDVIState *s, hwaddr addr, uint64_t val)
138 {
139     uint64_t romask = ldq_le_p(&s->romask[addr]);
140     uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]);
141     uint64_t oldval = ldq_le_p(&s->mmior[addr]);
142     stq_le_p(&s->mmior[addr],
143             ((oldval & romask) | (val & ~romask)) & ~(val & w1cmask));
144 }
145 
146 /* test whether the given bits are set in a 64-bit register */
147 static bool amdvi_test_mask(AMDVIState *s, hwaddr addr, uint64_t val)
148 {
149     return amdvi_readq(s, addr) & val;
150 }
151 
152 /* OR a 64-bit register with a 64-bit value storing result in the register */
153 static void amdvi_assign_orq(AMDVIState *s, hwaddr addr, uint64_t val)
154 {
155     amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) | val);
156 }
157 
158 /* AND a 64-bit register with a 64-bit value storing result in the register */
159 static void amdvi_assign_andq(AMDVIState *s, hwaddr addr, uint64_t val)
160 {
161    amdvi_writeq_raw(s, addr, amdvi_readq(s, addr) & val);
162 }
163 
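/*
 * Generate the MSI configured on the IOMMU's own PCI function, if MSI is
 * enabled; used to signal event log and completion-wait interrupts.
 */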
164 static void amdvi_generate_msi_interrupt(AMDVIState *s)
165 {
166     MSIMessage msg = {};
167     MemTxAttrs attrs = {
168         .requester_id = pci_requester_id(&s->pci.dev)
169     };
170 
171     if (msi_enabled(&s->pci.dev)) {
172         msg = msi_get_message(&s->pci.dev, 0);
173         address_space_stl_le(&address_space_memory, msg.address, msg.data,
174                              attrs, NULL);
175     }
176 }
177 
178 static void amdvi_log_event(AMDVIState *s, uint64_t *evt)
179 {
180     /* event logging not enabled */
181     if (!s->evtlog_enabled || amdvi_test_mask(s, AMDVI_MMIO_STATUS,
182         AMDVI_MMIO_STATUS_EVT_OVF)) {
183         return;
184     }
185 
186     /* event log buffer full */
187     if (s->evtlog_tail >= s->evtlog_len) {
188         amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_OVF);
189         /* generate interrupt */
190         amdvi_generate_msi_interrupt(s);
191         return;
192     }
193 
194     if (dma_memory_write(&address_space_memory, s->evtlog + s->evtlog_tail,
195                          evt, AMDVI_EVENT_LEN, MEMTXATTRS_UNSPECIFIED)) {
196         trace_amdvi_evntlog_fail(s->evtlog, s->evtlog_tail);
197     }
198 
199     s->evtlog_tail += AMDVI_EVENT_LEN;
200     amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT);
201     amdvi_generate_msi_interrupt(s);
202 }
203 
204 static void amdvi_setevent_bits(uint64_t *buffer, uint64_t value, int start,
205                                 int length)
206 {
207     int index = start / 64, bitpos = start % 64;
208     uint64_t mask = MAKE_64BIT_MASK(bitpos, length);
209     buffer[index] &= ~mask;
210     buffer[index] |= (value << bitpos) & mask;
211 }
212 /*
213  * AMDVi event structure
214  *    0:15   -> DeviceID
215  *    48:63  -> event type + miscellaneous info
216  *    64:127 -> related address
217  */
218 static void amdvi_encode_event(uint64_t *evt, uint16_t devid, uint64_t addr,
219                                uint16_t info)
220 {
221     evt[0] = 0;
222     evt[1] = 0;
223 
224     amdvi_setevent_bits(evt, devid, 0, 16);
225     amdvi_setevent_bits(evt, info, 48, 16);
226     amdvi_setevent_bits(evt, addr, 64, 64);
227 }
228 /* log an error encountered during a page walk
229  *
230  * @addr: virtual address in translation request
231  */
232 static void amdvi_page_fault(AMDVIState *s, uint16_t devid,
233                              hwaddr addr, uint16_t info)
234 {
235     uint64_t evt[2];
236 
237     info |= AMDVI_EVENT_IOPF_I | AMDVI_EVENT_IOPF;
238     amdvi_encode_event(evt, devid, addr, info);
239     amdvi_log_event(s, evt);
240     pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
241             PCI_STATUS_SIG_TARGET_ABORT);
242 }
243 /*
244  * log a master abort accessing device table
245  *  @devtab : address of device table entry
246  *  @info : error flags
247  */
248 static void amdvi_log_devtab_error(AMDVIState *s, uint16_t devid,
249                                    hwaddr devtab, uint16_t info)
250 {
251     uint64_t evt[2];
252 
253     info |= AMDVI_EVENT_DEV_TAB_HW_ERROR;
254 
255     amdvi_encode_event(evt, devid, devtab, info);
256     amdvi_log_event(s, evt);
257     pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
258             PCI_STATUS_SIG_TARGET_ABORT);
259 }
260 /* log an event trying to access command buffer
261  *   @addr : address that couldn't be accessed
262  */
263 static void amdvi_log_command_error(AMDVIState *s, hwaddr addr)
264 {
265     uint64_t evt[2];
266     uint16_t info = AMDVI_EVENT_COMMAND_HW_ERROR;
267 
268     amdvi_encode_event(evt, 0, addr, info);
269     amdvi_log_event(s, evt);
270     pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
271             PCI_STATUS_SIG_TARGET_ABORT);
272 }
273 /* log an illegal command event
274  *   @addr : address of illegal command
275  */
276 static void amdvi_log_illegalcom_error(AMDVIState *s, uint16_t info,
277                                        hwaddr addr)
278 {
279     uint64_t evt[2];
280 
281     info |= AMDVI_EVENT_ILLEGAL_COMMAND_ERROR;
282     amdvi_encode_event(evt, 0, addr, info);
283     amdvi_log_event(s, evt);
284 }
285 /* log an error accessing device table
286  *
287  *  @devid : device owning the table entry
288  *  @devtab : address of device table entry
289  *  @info : error flags
290  */
291 static void amdvi_log_illegaldevtab_error(AMDVIState *s, uint16_t devid,
292                                           hwaddr addr, uint16_t info)
293 {
294     uint64_t evt[2];
295 
296     info |= AMDVI_EVENT_ILLEGAL_DEVTAB_ENTRY;
297     amdvi_encode_event(evt, devid, addr, info);
298     amdvi_log_event(s, evt);
299 }
300 /* log an error accessing a PTE entry
301  * @addr : address that couldn't be accessed
302  */
303 static void amdvi_log_pagetab_error(AMDVIState *s, uint16_t devid,
304                                     hwaddr addr, uint16_t info)
305 {
306     uint64_t evt[2];
307 
308     info |= AMDVI_EVENT_PAGE_TAB_HW_ERROR;
309     amdvi_encode_event(evt, devid, addr, info);
310     amdvi_log_event(s, evt);
311     pci_word_test_and_set_mask(s->pci.dev.config + PCI_STATUS,
312              PCI_STATUS_SIG_TARGET_ABORT);
313 }
314 
315 static gboolean amdvi_uint64_equal(gconstpointer v1, gconstpointer v2)
316 {
317     return *((const uint64_t *)v1) == *((const uint64_t *)v2);
318 }
319 
320 static guint amdvi_uint64_hash(gconstpointer v)
321 {
322     return (guint)*(const uint64_t *)v;
323 }
324 
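/* look up a cached translation, keyed by guest frame number and device id */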
325 static AMDVIIOTLBEntry *amdvi_iotlb_lookup(AMDVIState *s, hwaddr addr,
326                                            uint64_t devid)
327 {
328     uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) |
329                    ((uint64_t)(devid) << AMDVI_DEVID_SHIFT);
330     return g_hash_table_lookup(s->iotlb, &key);
331 }
332 
333 static void amdvi_iotlb_reset(AMDVIState *s)
334 {
335     assert(s->iotlb);
336     trace_amdvi_iotlb_reset();
337     g_hash_table_remove_all(s->iotlb);
338 }
339 
340 static gboolean amdvi_iotlb_remove_by_devid(gpointer key, gpointer value,
341                                             gpointer user_data)
342 {
343     AMDVIIOTLBEntry *entry = (AMDVIIOTLBEntry *)value;
344     uint16_t devid = *(uint16_t *)user_data;
345     return entry->devid == devid;
346 }
347 
348 static void amdvi_iotlb_remove_page(AMDVIState *s, hwaddr addr,
349                                     uint64_t devid)
350 {
351     uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) |
352                    ((uint64_t)(devid) << AMDVI_DEVID_SHIFT);
353     g_hash_table_remove(s->iotlb, &key);
354 }
355 
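/*
 * Cache a translation in the IOTLB; erroneous (IOMMU_NONE) translations are
 * not cached, and the whole cache is flushed once it holds
 * AMDVI_IOTLB_MAX_SIZE entries.
 */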
356 static void amdvi_update_iotlb(AMDVIState *s, uint16_t devid,
357                                uint64_t gpa, IOMMUTLBEntry to_cache,
358                                uint16_t domid)
359 {
360     AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1);
361     uint64_t *key = g_new(uint64_t, 1);
362     uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K;
363 
364     /* don't cache erroneous translations */
365     if (to_cache.perm != IOMMU_NONE) {
366         trace_amdvi_cache_update(domid, PCI_BUS_NUM(devid), PCI_SLOT(devid),
367                 PCI_FUNC(devid), gpa, to_cache.translated_addr);
368 
369         if (g_hash_table_size(s->iotlb) >= AMDVI_IOTLB_MAX_SIZE) {
370             amdvi_iotlb_reset(s);
371         }
372 
373         entry->domid = domid;
374         entry->perms = to_cache.perm;
375         entry->translated_addr = to_cache.translated_addr;
376         entry->page_mask = to_cache.addr_mask;
377         *key = gfn | ((uint64_t)(devid) << AMDVI_DEVID_SHIFT);
378         g_hash_table_replace(s->iotlb, key, entry);
379     }
380 }
381 
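/*
 * COMPLETION_WAIT command: optionally store the 64-bit completion value to
 * the given address and/or raise a completion interrupt.
 */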
382 static void amdvi_completion_wait(AMDVIState *s, uint64_t *cmd)
383 {
384     /* pad the last 3 bits */
385     hwaddr addr = cpu_to_le64(extract64(cmd[0], 3, 49)) << 3;
386     uint64_t data = cpu_to_le64(cmd[1]);
387 
388     if (extract64(cmd[0], 52, 8)) {
389         amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
390                                    s->cmdbuf + s->cmdbuf_head);
391     }
392     if (extract64(cmd[0], 0, 1)) {
393         if (dma_memory_write(&address_space_memory, addr, &data,
394                              AMDVI_COMPLETION_DATA_SIZE,
395                              MEMTXATTRS_UNSPECIFIED)) {
396             trace_amdvi_completion_wait_fail(addr);
397         }
398     }
399     /* set completion interrupt */
400     if (extract64(cmd[0], 1, 1)) {
401         amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_COMP_INT);
402         /* generate interrupt */
403         amdvi_generate_msi_interrupt(s);
404     }
405     trace_amdvi_completion_wait(addr, data);
406 }
407 
408 /* log error without aborting since linux seems to be using reserved bits */
409 static void amdvi_inval_devtab_entry(AMDVIState *s, uint64_t *cmd)
410 {
411     uint16_t devid = cpu_to_le16((uint16_t)extract64(cmd[0], 0, 16));
412 
413     /* This command should invalidate internal caches, of which there are none here */
414     if (extract64(cmd[0], 16, 44) || cmd[1]) {
415         amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
416                                    s->cmdbuf + s->cmdbuf_head);
417     }
418     trace_amdvi_devtab_inval(PCI_BUS_NUM(devid), PCI_SLOT(devid),
419                              PCI_FUNC(devid));
420 }
421 
422 static void amdvi_complete_ppr(AMDVIState *s, uint64_t *cmd)
423 {
424     if (extract64(cmd[0], 16, 16) ||  extract64(cmd[0], 52, 8) ||
425         extract64(cmd[1], 0, 2) || extract64(cmd[1], 3, 29)
426         || extract64(cmd[1], 48, 16)) {
427         amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
428                                    s->cmdbuf + s->cmdbuf_head);
429     }
430     trace_amdvi_ppr_exec();
431 }
432 
433 static void amdvi_inval_all(AMDVIState *s, uint64_t *cmd)
434 {
435     if (extract64(cmd[0], 0, 60) || cmd[1]) {
436         amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
437                                    s->cmdbuf + s->cmdbuf_head);
438     }
439 
440     amdvi_iotlb_reset(s);
441     trace_amdvi_all_inval();
442 }
443 
444 static gboolean amdvi_iotlb_remove_by_domid(gpointer key, gpointer value,
445                                             gpointer user_data)
446 {
447     AMDVIIOTLBEntry *entry = (AMDVIIOTLBEntry *)value;
448     uint16_t domid = *(uint16_t *)user_data;
449     return entry->domid == domid;
450 }
451 
452 /* we don't have devid - we can't remove pages by address */
453 static void amdvi_inval_pages(AMDVIState *s, uint64_t *cmd)
454 {
455     uint16_t domid = cpu_to_le16((uint16_t)extract64(cmd[0], 32, 16));
456 
457     if (extract64(cmd[0], 20, 12) || extract64(cmd[0], 48, 12) ||
458         extract64(cmd[1], 3, 9)) {
459         amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
460                                    s->cmdbuf + s->cmdbuf_head);
461     }
462 
463     g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_domid,
464                                 &domid);
465     trace_amdvi_pages_inval(domid);
466 }
467 
468 static void amdvi_prefetch_pages(AMDVIState *s, uint64_t *cmd)
469 {
470     if (extract64(cmd[0], 16, 8) || extract64(cmd[0], 52, 8) ||
471         extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 1) ||
472         extract64(cmd[1], 5, 7)) {
473         amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
474                                    s->cmdbuf + s->cmdbuf_head);
475     }
476 
477     trace_amdvi_prefetch_pages();
478 }
479 
480 static void amdvi_inval_inttable(AMDVIState *s, uint64_t *cmd)
481 {
482     if (extract64(cmd[0], 16, 44) || cmd[1]) {
483         amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
484                                    s->cmdbuf + s->cmdbuf_head);
485         return;
486     }
487 
488     trace_amdvi_intr_inval();
489 }
490 
491 /* FIXME: Try to work with the specified size instead of all the pages
492  * when the S bit is on
493  */
494 static void iommu_inval_iotlb(AMDVIState *s, uint64_t *cmd)
495 {
496 
497     uint16_t devid = extract64(cmd[0], 0, 16);
498     if (extract64(cmd[1], 1, 1) || extract64(cmd[1], 3, 1) ||
499         extract64(cmd[1], 6, 6)) {
500         amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
501                                    s->cmdbuf + s->cmdbuf_head);
502         return;
503     }
504 
505     if (extract64(cmd[1], 0, 1)) {
506         g_hash_table_foreach_remove(s->iotlb, amdvi_iotlb_remove_by_devid,
507                                     &devid);
508     } else {
509         amdvi_iotlb_remove_page(s, cpu_to_le64(extract64(cmd[1], 12, 52)) << 12,
510                                 cpu_to_le16(extract64(cmd[1], 0, 16)));
511     }
512     trace_amdvi_iotlb_inval();
513 }
514 
515 /* not honouring reserved bits is regarded as an illegal command */
516 static void amdvi_cmdbuf_exec(AMDVIState *s)
517 {
518     uint64_t cmd[2];
519 
520     if (dma_memory_read(&address_space_memory, s->cmdbuf + s->cmdbuf_head,
521                         cmd, AMDVI_COMMAND_SIZE, MEMTXATTRS_UNSPECIFIED)) {
522         trace_amdvi_command_read_fail(s->cmdbuf, s->cmdbuf_head);
523         amdvi_log_command_error(s, s->cmdbuf + s->cmdbuf_head);
524         return;
525     }
526 
527     switch (extract64(cmd[0], 60, 4)) {
528     case AMDVI_CMD_COMPLETION_WAIT:
529         amdvi_completion_wait(s, cmd);
530         break;
531     case AMDVI_CMD_INVAL_DEVTAB_ENTRY:
532         amdvi_inval_devtab_entry(s, cmd);
533         break;
534     case AMDVI_CMD_INVAL_AMDVI_PAGES:
535         amdvi_inval_pages(s, cmd);
536         break;
537     case AMDVI_CMD_INVAL_IOTLB_PAGES:
538         iommu_inval_iotlb(s, cmd);
539         break;
540     case AMDVI_CMD_INVAL_INTR_TABLE:
541         amdvi_inval_inttable(s, cmd);
542         break;
543     case AMDVI_CMD_PREFETCH_AMDVI_PAGES:
544         amdvi_prefetch_pages(s, cmd);
545         break;
546     case AMDVI_CMD_COMPLETE_PPR_REQUEST:
547         amdvi_complete_ppr(s, cmd);
548         break;
549     case AMDVI_CMD_INVAL_AMDVI_ALL:
550         amdvi_inval_all(s, cmd);
551         break;
552     default:
553         trace_amdvi_unhandled_command(extract64(cmd[0], 60, 4));
554         /* log illegal command */
555         amdvi_log_illegalcom_error(s, extract64(cmd[0], 60, 4),
556                                    s->cmdbuf + s->cmdbuf_head);
557     }
558 }
559 
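/*
 * Execute pending commands from head to tail, wrapping the head pointer at
 * the end of the command buffer ring.
 */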
560 static void amdvi_cmdbuf_run(AMDVIState *s)
561 {
562     if (!s->cmdbuf_enabled) {
563         trace_amdvi_command_error(amdvi_readq(s, AMDVI_MMIO_CONTROL));
564         return;
565     }
566 
567     /* check if there is work to do. */
568     while (s->cmdbuf_head != s->cmdbuf_tail) {
569         trace_amdvi_command_exec(s->cmdbuf_head, s->cmdbuf_tail, s->cmdbuf);
570         amdvi_cmdbuf_exec(s);
571         s->cmdbuf_head += AMDVI_COMMAND_SIZE;
572         amdvi_writeq_raw(s, AMDVI_MMIO_COMMAND_HEAD, s->cmdbuf_head);
573 
574         /* wrap head pointer */
575         if (s->cmdbuf_head >= s->cmdbuf_len * AMDVI_COMMAND_SIZE) {
576             s->cmdbuf_head = 0;
577         }
578     }
579 }
580 
581 static void amdvi_mmio_trace(hwaddr addr, unsigned size)
582 {
583     uint8_t index = (addr & ~0x2000) / 8;
584 
585     if ((addr & 0x2000)) {
586         /* high table */
587         index = index >= AMDVI_MMIO_REGS_HIGH ? AMDVI_MMIO_REGS_HIGH : index;
588         trace_amdvi_mmio_read(amdvi_mmio_high[index], addr, size, addr & ~0x07);
589     } else {
590         index = index >= AMDVI_MMIO_REGS_LOW ? AMDVI_MMIO_REGS_LOW : index;
591         trace_amdvi_mmio_read(amdvi_mmio_low[index], addr, size, addr & ~0x07);
592     }
593 }
594 
595 static uint64_t amdvi_mmio_read(void *opaque, hwaddr addr, unsigned size)
596 {
597     AMDVIState *s = opaque;
598 
599     uint64_t val = -1;
600     if (addr + size > AMDVI_MMIO_SIZE) {
601         trace_amdvi_mmio_read_invalid(AMDVI_MMIO_SIZE, addr, size);
602         return (uint64_t)-1;
603     }
604 
605     if (size == 2) {
606         val = amdvi_readw(s, addr);
607     } else if (size == 4) {
608         val = amdvi_readl(s, addr);
609     } else if (size == 8) {
610         val = amdvi_readq(s, addr);
611     }
612     amdvi_mmio_trace(addr, size);
613 
614     return val;
615 }
616 
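/*
 * Recompute the cached enable flags from the control register, mirror the
 * run bits into the status register and kick the command buffer.
 */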
617 static void amdvi_handle_control_write(AMDVIState *s)
618 {
619     unsigned long control = amdvi_readq(s, AMDVI_MMIO_CONTROL);
620     s->enabled = !!(control & AMDVI_MMIO_CONTROL_AMDVIEN);
621 
622     s->ats_enabled = !!(control & AMDVI_MMIO_CONTROL_HTTUNEN);
623     s->evtlog_enabled = s->enabled && !!(control &
624                         AMDVI_MMIO_CONTROL_EVENTLOGEN);
625 
626     s->evtlog_intr = !!(control & AMDVI_MMIO_CONTROL_EVENTINTEN);
627     s->completion_wait_intr = !!(control & AMDVI_MMIO_CONTROL_COMWAITINTEN);
628     s->cmdbuf_enabled = s->enabled && !!(control &
629                         AMDVI_MMIO_CONTROL_CMDBUFLEN);
630     s->ga_enabled = !!(control & AMDVI_MMIO_CONTROL_GAEN);
631 
632     /* update the flags depending on the control register */
633     if (s->cmdbuf_enabled) {
634         amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_CMDBUF_RUN);
635     } else {
636         amdvi_assign_andq(s, AMDVI_MMIO_STATUS, ~AMDVI_MMIO_STATUS_CMDBUF_RUN);
637     }
638     if (s->evtlog_enabled) {
639         amdvi_assign_orq(s, AMDVI_MMIO_STATUS, AMDVI_MMIO_STATUS_EVT_RUN);
640     } else {
641         amdvi_assign_andq(s, AMDVI_MMIO_STATUS, ~AMDVI_MMIO_STATUS_EVT_RUN);
642     }
643 
644     trace_amdvi_control_status(control);
645     amdvi_cmdbuf_run(s);
646 }
647 
648 static inline void amdvi_handle_devtab_write(AMDVIState *s)
649 
650 {
651     uint64_t val = amdvi_readq(s, AMDVI_MMIO_DEVICE_TABLE);
652     s->devtab = (val & AMDVI_MMIO_DEVTAB_BASE_MASK);
653 
654     /* set device table length, in entries: (size field + 1) pages of DTEs */
655     s->devtab_len = ((val & AMDVI_MMIO_DEVTAB_SIZE_MASK) + 1) *
656                     (AMDVI_MMIO_DEVTAB_SIZE_UNIT /
657                      AMDVI_MMIO_DEVTAB_ENTRY_SIZE);
658 }
659 
660 static inline void amdvi_handle_cmdhead_write(AMDVIState *s)
661 {
662     s->cmdbuf_head = amdvi_readq(s, AMDVI_MMIO_COMMAND_HEAD)
663                      & AMDVI_MMIO_CMDBUF_HEAD_MASK;
664     amdvi_cmdbuf_run(s);
665 }
666 
667 static inline void amdvi_handle_cmdbase_write(AMDVIState *s)
668 {
669     s->cmdbuf = amdvi_readq(s, AMDVI_MMIO_COMMAND_BASE)
670                 & AMDVI_MMIO_CMDBUF_BASE_MASK;
671     s->cmdbuf_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_CMDBUF_SIZE_BYTE)
672                     & AMDVI_MMIO_CMDBUF_SIZE_MASK);
673     s->cmdbuf_head = s->cmdbuf_tail = 0;
674 }
675 
676 static inline void amdvi_handle_cmdtail_write(AMDVIState *s)
677 {
678     s->cmdbuf_tail = amdvi_readq(s, AMDVI_MMIO_COMMAND_TAIL)
679                      & AMDVI_MMIO_CMDBUF_TAIL_MASK;
680     amdvi_cmdbuf_run(s);
681 }
682 
683 static inline void amdvi_handle_excllim_write(AMDVIState *s)
684 {
685     uint64_t val = amdvi_readq(s, AMDVI_MMIO_EXCL_LIMIT);
686     s->excl_limit = (val & AMDVI_MMIO_EXCL_LIMIT_MASK) |
687                     AMDVI_MMIO_EXCL_LIMIT_LOW;
688 }
689 
690 static inline void amdvi_handle_evtbase_write(AMDVIState *s)
691 {
692     uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_BASE);
693     s->evtlog = val & AMDVI_MMIO_EVTLOG_BASE_MASK;
694     s->evtlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_EVTLOG_SIZE_BYTE)
695                     & AMDVI_MMIO_EVTLOG_SIZE_MASK);
696 }
697 
698 static inline void amdvi_handle_evttail_write(AMDVIState *s)
699 {
700     uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_TAIL);
701     s->evtlog_tail = val & AMDVI_MMIO_EVTLOG_TAIL_MASK;
702 }
703 
704 static inline void amdvi_handle_evthead_write(AMDVIState *s)
705 {
706     uint64_t val = amdvi_readq(s, AMDVI_MMIO_EVENT_HEAD);
707     s->evtlog_head = val & AMDVI_MMIO_EVTLOG_HEAD_MASK;
708 }
709 
710 static inline void amdvi_handle_pprbase_write(AMDVIState *s)
711 {
712     uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_BASE);
713     s->ppr_log = val & AMDVI_MMIO_PPRLOG_BASE_MASK;
714     s->pprlog_len = 1UL << (amdvi_readq(s, AMDVI_MMIO_PPRLOG_SIZE_BYTE)
715                     & AMDVI_MMIO_PPRLOG_SIZE_MASK);
716 }
717 
718 static inline void amdvi_handle_pprhead_write(AMDVIState *s)
719 {
720     uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_HEAD);
721     s->pprlog_head = val & AMDVI_MMIO_PPRLOG_HEAD_MASK;
722 }
723 
724 static inline void amdvi_handle_pprtail_write(AMDVIState *s)
725 {
726     uint64_t val = amdvi_readq(s, AMDVI_MMIO_PPR_TAIL);
727     s->pprlog_tail = val & AMDVI_MMIO_PPRLOG_TAIL_MASK;
728 }
729 
730 /* FIXME: this may go wrong if system software writes the registers in
731  * chunks of one byte.  Linux writes in chunks of 4 bytes, so it currently
732  * works correctly with Linux, but it will definitely be broken if
733  * software reads/writes the registers in chunks of 8 bytes
734  */
735 static void amdvi_mmio_reg_write(AMDVIState *s, unsigned size, uint64_t val,
736                                  hwaddr addr)
737 {
738     if (size == 2) {
739         amdvi_writew(s, addr, val);
740     } else if (size == 4) {
741         amdvi_writel(s, addr, val);
742     } else if (size == 8) {
743         amdvi_writeq(s, addr, val);
744     }
745 }
746 
747 static void amdvi_mmio_write(void *opaque, hwaddr addr, uint64_t val,
748                              unsigned size)
749 {
750     AMDVIState *s = opaque;
751     unsigned long offset = addr & 0x07;
752 
753     if (addr + size > AMDVI_MMIO_SIZE) {
754         trace_amdvi_mmio_write("error: addr outside region: max ",
755                 (uint64_t)AMDVI_MMIO_SIZE, size, val, offset);
756         return;
757     }
758 
759     amdvi_mmio_trace(addr, size);
760     switch (addr & ~0x07) {
761     case AMDVI_MMIO_CONTROL:
762         amdvi_mmio_reg_write(s, size, val, addr);
763         amdvi_handle_control_write(s);
764         break;
765     case AMDVI_MMIO_DEVICE_TABLE:
766         amdvi_mmio_reg_write(s, size, val, addr);
767         /* set device table address
768          * This also suffers from the inability to tell whether software
769          * is done writing
770          */
771         if (offset || (size == 8)) {
772             amdvi_handle_devtab_write(s);
773         }
774         break;
775     case AMDVI_MMIO_COMMAND_HEAD:
776         amdvi_mmio_reg_write(s, size, val, addr);
777         amdvi_handle_cmdhead_write(s);
778         break;
779     case AMDVI_MMIO_COMMAND_BASE:
780         amdvi_mmio_reg_write(s, size, val, addr);
781         /* FIXME - make sure System Software has finished writing, in case
782          * it writes in chunks of less than 8 bytes, in a robust way. For
783          * now, this hack works for the Linux driver.
784          */
785         if (offset || (size == 8)) {
786             amdvi_handle_cmdbase_write(s);
787         }
788         break;
789     case AMDVI_MMIO_COMMAND_TAIL:
790         amdvi_mmio_reg_write(s, size, val, addr);
791         amdvi_handle_cmdtail_write(s);
792         break;
793     case AMDVI_MMIO_EVENT_BASE:
794         amdvi_mmio_reg_write(s, size, val, addr);
795         amdvi_handle_evtbase_write(s);
796         break;
797     case AMDVI_MMIO_EVENT_HEAD:
798         amdvi_mmio_reg_write(s, size, val, addr);
799         amdvi_handle_evthead_write(s);
800         break;
801     case AMDVI_MMIO_EVENT_TAIL:
802         amdvi_mmio_reg_write(s, size, val, addr);
803         amdvi_handle_evttail_write(s);
804         break;
805     case AMDVI_MMIO_EXCL_LIMIT:
806         amdvi_mmio_reg_write(s, size, val, addr);
807         amdvi_handle_excllim_write(s);
808         break;
809         /* PPR log base - unused for now */
810     case AMDVI_MMIO_PPR_BASE:
811         amdvi_mmio_reg_write(s, size, val, addr);
812         amdvi_handle_pprbase_write(s);
813         break;
814         /* PPR log head - also unused for now */
815     case AMDVI_MMIO_PPR_HEAD:
816         amdvi_mmio_reg_write(s, size, val, addr);
817         amdvi_handle_pprhead_write(s);
818         break;
819         /* PPR log tail - unused for now */
820     case AMDVI_MMIO_PPR_TAIL:
821         amdvi_mmio_reg_write(s, size, val, addr);
822         amdvi_handle_pprtail_write(s);
823         break;
824     }
825 }
826 
827 static inline uint64_t amdvi_get_perms(uint64_t entry)
828 {
829     return (entry & (AMDVI_DEV_PERM_READ | AMDVI_DEV_PERM_WRITE)) >>
830            AMDVI_DEV_PERM_SHIFT;
831 }
832 
833 /* validate that reserved bits are honoured */
834 static bool amdvi_validate_dte(AMDVIState *s, uint16_t devid,
835                                uint64_t *dte)
836 {
837     if ((dte[0] & AMDVI_DTE_LOWER_QUAD_RESERVED)
838         || (dte[1] & AMDVI_DTE_MIDDLE_QUAD_RESERVED)
839         || (dte[2] & AMDVI_DTE_UPPER_QUAD_RESERVED) || dte[3]) {
840         amdvi_log_illegaldevtab_error(s, devid,
841                                       s->devtab +
842                                       devid * AMDVI_DEVTAB_ENTRY_SIZE, 0);
843         return false;
844     }
845 
846     return true;
847 }
848 
849 /* get a device table entry given the devid */
850 static bool amdvi_get_dte(AMDVIState *s, int devid, uint64_t *entry)
851 {
852     uint32_t offset = devid * AMDVI_DEVTAB_ENTRY_SIZE;
853 
854     if (dma_memory_read(&address_space_memory, s->devtab + offset, entry,
855                         AMDVI_DEVTAB_ENTRY_SIZE, MEMTXATTRS_UNSPECIFIED)) {
856         trace_amdvi_dte_get_fail(s->devtab, offset);
857         /* log error accessing dte */
858         amdvi_log_devtab_error(s, devid, s->devtab + offset, 0);
859         return false;
860     }
861 
862     *entry = le64_to_cpu(*entry);
863     if (!amdvi_validate_dte(s, devid, entry)) {
864         trace_amdvi_invalid_dte(entry[0]);
865         return false;
866     }
867 
868     return true;
869 }
870 
871 /* get pte translation mode */
872 static inline uint8_t get_pte_translation_mode(uint64_t pte)
873 {
874     return (pte >> AMDVI_DEV_MODE_RSHIFT) & AMDVI_DEV_MODE_MASK;
875 }
876 
877 static inline uint64_t pte_override_page_mask(uint64_t pte)
878 {
879     uint8_t page_mask = 13;
880     uint64_t addr = (pte & AMDVI_DEV_PT_ROOT_MASK) >> 12;
881     /* find the first zero bit */
882     while (addr & 1) {
883         page_mask++;
884         addr = addr >> 1;
885     }
886 
887     return ~((1ULL << page_mask) - 1);
888 }
889 
890 static inline uint64_t pte_get_page_mask(uint64_t oldlevel)
891 {
892     return ~((1ULL << ((oldlevel * 9) + 3)) - 1);
893 }
894 
895 static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr,
896                                           uint16_t devid)
897 {
898     uint64_t pte;
899 
900     if (dma_memory_read(&address_space_memory, pte_addr,
901                         &pte, sizeof(pte), MEMTXATTRS_UNSPECIFIED)) {
902         trace_amdvi_get_pte_hwerror(pte_addr);
903         amdvi_log_pagetab_error(s, devid, pte_addr, 0);
904         pte = 0;
905         return pte;
906     }
907 
908     pte = le64_to_cpu(pte);
909     return pte;
910 }
911 
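/*
 * Walk the I/O page table rooted in the DTE. A lower-level PTE with mode 7
 * encodes a page whose size is derived from its address bits; TV = 0 or a
 * DTE mode of 0 leaves the address untranslated.
 */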
912 static void amdvi_page_walk(AMDVIAddressSpace *as, uint64_t *dte,
913                             IOMMUTLBEntry *ret, unsigned perms,
914                             hwaddr addr)
915 {
916     unsigned level, present, pte_perms, oldlevel;
917     uint64_t pte = dte[0], pte_addr, page_mask;
918 
919     /* make sure the DTE has TV = 1 */
920     if (pte & AMDVI_DEV_TRANSLATION_VALID) {
921         level = get_pte_translation_mode(pte);
922         if (level >= 7) {
923             trace_amdvi_mode_invalid(level, addr);
924             return;
925         }
926         if (level == 0) {
927             goto no_remap;
928         }
929 
930         /* walk down until we reach a leaf PTE or a PTE encoding a huge page */
931         do {
932             pte_perms = amdvi_get_perms(pte);
933             present = pte & 1;
934             if (!present || perms != (perms & pte_perms)) {
935                 amdvi_page_fault(as->iommu_state, as->devfn, addr, perms);
936                 trace_amdvi_page_fault(addr);
937                 return;
938             }
939 
940             /* go to the next lower level */
941             pte_addr = pte & AMDVI_DEV_PT_ROOT_MASK;
942             /* add offset and load pte */
943             pte_addr += ((addr >> (3 + 9 * level)) & 0x1FF) << 3;
944             pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn);
945             if (!pte) {
946                 return;
947             }
948             oldlevel = level;
949             level = get_pte_translation_mode(pte);
950         } while (level > 0 && level < 7);
951 
952         if (level == 0x7) {
953             page_mask = pte_override_page_mask(pte);
954         } else {
955             page_mask = pte_get_page_mask(oldlevel);
956         }
957 
958         /* get access permissions from pte */
959         ret->iova = addr & page_mask;
960         ret->translated_addr = (pte & AMDVI_DEV_PT_ROOT_MASK) & page_mask;
961         ret->addr_mask = ~page_mask;
962         ret->perm = amdvi_get_perms(pte);
963         return;
964     }
965 no_remap:
966     ret->iova = addr & AMDVI_PAGE_MASK_4K;
967     ret->translated_addr = addr & AMDVI_PAGE_MASK_4K;
968     ret->addr_mask = ~AMDVI_PAGE_MASK_4K;
969     ret->perm = amdvi_get_perms(pte);
970 }
971 
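/*
 * Translate one request: try the IOTLB first, otherwise fetch the DTE and
 * walk the page table; devices with V = 0 are passed through untranslated.
 */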
972 static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr,
973                                bool is_write, IOMMUTLBEntry *ret)
974 {
975     AMDVIState *s = as->iommu_state;
976     uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn);
977     AMDVIIOTLBEntry *iotlb_entry = amdvi_iotlb_lookup(s, addr, devid);
978     uint64_t entry[4];
979 
980     if (iotlb_entry) {
981         trace_amdvi_iotlb_hit(PCI_BUS_NUM(devid), PCI_SLOT(devid),
982                 PCI_FUNC(devid), addr, iotlb_entry->translated_addr);
983         ret->iova = addr & ~iotlb_entry->page_mask;
984         ret->translated_addr = iotlb_entry->translated_addr;
985         ret->addr_mask = iotlb_entry->page_mask;
986         ret->perm = iotlb_entry->perms;
987         return;
988     }
989 
990     if (!amdvi_get_dte(s, devid, entry)) {
991         return;
992     }
993 
994     /* devices with V = 0 are not translated */
995     if (!(entry[0] & AMDVI_DEV_VALID)) {
996         goto out;
997     }
998 
999     amdvi_page_walk(as, entry, ret,
1000                     is_write ? AMDVI_PERM_WRITE : AMDVI_PERM_READ, addr);
1001 
1002     amdvi_update_iotlb(s, devid, addr, *ret,
1003                        entry[1] & AMDVI_DEV_DOMID_ID_MASK);
1004     return;
1005 
1006 out:
1007     ret->iova = addr & AMDVI_PAGE_MASK_4K;
1008     ret->translated_addr = addr & AMDVI_PAGE_MASK_4K;
1009     ret->addr_mask = ~AMDVI_PAGE_MASK_4K;
1010     ret->perm = IOMMU_RW;
1011 }
1012 
1013 static inline bool amdvi_is_interrupt_addr(hwaddr addr)
1014 {
1015     return addr >= AMDVI_INT_ADDR_FIRST && addr <= AMDVI_INT_ADDR_LAST;
1016 }
1017 
1018 static IOMMUTLBEntry amdvi_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
1019                                      IOMMUAccessFlags flag, int iommu_idx)
1020 {
1021     AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);
1022     AMDVIState *s = as->iommu_state;
1023     IOMMUTLBEntry ret = {
1024         .target_as = &address_space_memory,
1025         .iova = addr,
1026         .translated_addr = 0,
1027         .addr_mask = ~(hwaddr)0,
1028         .perm = IOMMU_NONE
1029     };
1030 
1031     if (!s->enabled) {
1032         /* AMDVI disabled - corresponds to iommu=off, not to a
1033          * failure to provide any parameter
1034          */
1035         ret.iova = addr & AMDVI_PAGE_MASK_4K;
1036         ret.translated_addr = addr & AMDVI_PAGE_MASK_4K;
1037         ret.addr_mask = ~AMDVI_PAGE_MASK_4K;
1038         ret.perm = IOMMU_RW;
1039         return ret;
1040     } else if (amdvi_is_interrupt_addr(addr)) {
1041         ret.iova = addr & AMDVI_PAGE_MASK_4K;
1042         ret.translated_addr = addr & AMDVI_PAGE_MASK_4K;
1043         ret.addr_mask = ~AMDVI_PAGE_MASK_4K;
1044         ret.perm = IOMMU_WO;
1045         return ret;
1046     }
1047 
1048     amdvi_do_translate(as, addr, flag & IOMMU_WO, &ret);
1049     trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn),
1050             PCI_FUNC(as->devfn), addr, ret.translated_addr);
1051     return ret;
1052 }
1053 
1054 static int amdvi_get_irte(AMDVIState *s, MSIMessage *origin, uint64_t *dte,
1055                           union irte *irte, uint16_t devid)
1056 {
1057     uint64_t irte_root, offset;
1058 
1059     irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK;
1060     offset = (origin->data & AMDVI_IRTE_OFFSET) << 2;
1061 
1062     trace_amdvi_ir_irte(irte_root, offset);
1063 
1064     if (dma_memory_read(&address_space_memory, irte_root + offset,
1065                         irte, sizeof(*irte), MEMTXATTRS_UNSPECIFIED)) {
1066         trace_amdvi_ir_err("failed to get irte");
1067         return -AMDVI_IR_GET_IRTE;
1068     }
1069 
1070     trace_amdvi_ir_irte_val(irte->val);
1071 
1072     return 0;
1073 }
1074 
1075 static int amdvi_int_remap_legacy(AMDVIState *iommu,
1076                                   MSIMessage *origin,
1077                                   MSIMessage *translated,
1078                                   uint64_t *dte,
1079                                   X86IOMMUIrq *irq,
1080                                   uint16_t sid)
1081 {
1082     int ret;
1083     union irte irte;
1084 
1085     /* get interrupt remapping table */
1086     ret = amdvi_get_irte(iommu, origin, dte, &irte, sid);
1087     if (ret < 0) {
1088         return ret;
1089     }
1090 
1091     if (!irte.fields.valid) {
1092         trace_amdvi_ir_target_abort("RemapEn is disabled");
1093         return -AMDVI_IR_TARGET_ABORT;
1094     }
1095 
1096     if (irte.fields.guest_mode) {
1097         error_report_once("guest mode is not zero");
1098         return -AMDVI_IR_ERR;
1099     }
1100 
1101     if (irte.fields.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) {
1102         error_report_once("reserved int_type");
1103         return -AMDVI_IR_ERR;
1104     }
1105 
1106     irq->delivery_mode = irte.fields.int_type;
1107     irq->vector = irte.fields.vector;
1108     irq->dest_mode = irte.fields.dm;
1109     irq->redir_hint = irte.fields.rq_eoi;
1110     irq->dest = irte.fields.destination;
1111 
1112     return 0;
1113 }
1114 
1115 static int amdvi_get_irte_ga(AMDVIState *s, MSIMessage *origin, uint64_t *dte,
1116                              struct irte_ga *irte, uint16_t devid)
1117 {
1118     uint64_t irte_root, offset;
1119 
1120     irte_root = dte[2] & AMDVI_IR_PHYS_ADDR_MASK;
1121     offset = (origin->data & AMDVI_IRTE_OFFSET) << 4;
1122     trace_amdvi_ir_irte(irte_root, offset);
1123 
1124     if (dma_memory_read(&address_space_memory, irte_root + offset,
1125                         irte, sizeof(*irte), MEMTXATTRS_UNSPECIFIED)) {
1126         trace_amdvi_ir_err("failed to get irte_ga");
1127         return -AMDVI_IR_GET_IRTE;
1128     }
1129 
1130     trace_amdvi_ir_irte_ga_val(irte->hi.val, irte->lo.val);
1131     return 0;
1132 }
1133 
1134 static int amdvi_int_remap_ga(AMDVIState *iommu,
1135                               MSIMessage *origin,
1136                               MSIMessage *translated,
1137                               uint64_t *dte,
1138                               X86IOMMUIrq *irq,
1139                               uint16_t sid)
1140 {
1141     int ret;
1142     struct irte_ga irte;
1143 
1144     /* get interrupt remapping table */
1145     ret = amdvi_get_irte_ga(iommu, origin, dte, &irte, sid);
1146     if (ret < 0) {
1147         return ret;
1148     }
1149 
1150     if (!irte.lo.fields_remap.valid) {
1151         trace_amdvi_ir_target_abort("RemapEn is disabled");
1152         return -AMDVI_IR_TARGET_ABORT;
1153     }
1154 
1155     if (irte.lo.fields_remap.guest_mode) {
1156         error_report_once("guest mode is not zero");
1157         return -AMDVI_IR_ERR;
1158     }
1159 
1160     if (irte.lo.fields_remap.int_type > AMDVI_IOAPIC_INT_TYPE_ARBITRATED) {
1161         error_report_once("reserved int_type is set");
1162         return -AMDVI_IR_ERR;
1163     }
1164 
1165     irq->delivery_mode = irte.lo.fields_remap.int_type;
1166     irq->vector = irte.hi.fields.vector;
1167     irq->dest_mode = irte.lo.fields_remap.dm;
1168     irq->redir_hint = irte.lo.fields_remap.rq_eoi;
1169     if (iommu->xtsup) {
1170         irq->dest = irte.lo.fields_remap.destination |
1171                     (irte.hi.fields.destination_hi << 24);
1172     } else {
1173         irq->dest = irte.lo.fields_remap.destination & 0xff;
1174     }
1175 
1176     return 0;
1177 }
1178 
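/*
 * Remap one MSI according to the DTE IntCtl field: pass it through, remap it
 * via the interrupt remapping table (legacy or GA format), or target abort.
 */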
1179 static int __amdvi_int_remap_msi(AMDVIState *iommu,
1180                                  MSIMessage *origin,
1181                                  MSIMessage *translated,
1182                                  uint64_t *dte,
1183                                  X86IOMMUIrq *irq,
1184                                  uint16_t sid)
1185 {
1186     int ret;
1187     uint8_t int_ctl;
1188 
1189     int_ctl = (dte[2] >> AMDVI_IR_INTCTL_SHIFT) & 3;
1190     trace_amdvi_ir_intctl(int_ctl);
1191 
1192     switch (int_ctl) {
1193     case AMDVI_IR_INTCTL_PASS:
1194         memcpy(translated, origin, sizeof(*origin));
1195         return 0;
1196     case AMDVI_IR_INTCTL_REMAP:
1197         break;
1198     case AMDVI_IR_INTCTL_ABORT:
1199         trace_amdvi_ir_target_abort("int_ctl abort");
1200         return -AMDVI_IR_TARGET_ABORT;
1201     default:
1202         trace_amdvi_ir_err("int_ctl reserved");
1203         return -AMDVI_IR_ERR;
1204     }
1205 
1206     if (iommu->ga_enabled) {
1207         ret = amdvi_int_remap_ga(iommu, origin, translated, dte, irq, sid);
1208     } else {
1209         ret = amdvi_int_remap_legacy(iommu, origin, translated, dte, irq, sid);
1210     }
1211 
1212     return ret;
1213 }
1214 
1215 /* Interrupt remapping for MSI/MSI-X entry */
1216 static int amdvi_int_remap_msi(AMDVIState *iommu,
1217                                MSIMessage *origin,
1218                                MSIMessage *translated,
1219                                uint16_t sid)
1220 {
1221     int ret = 0;
1222     uint64_t pass = 0;
1223     uint64_t dte[4] = { 0 };
1224     X86IOMMUIrq irq = { 0 };
1225     uint8_t dest_mode, delivery_mode;
1226 
1227     assert(origin && translated);
1228 
1229     /*
1230      * When IOMMU is enabled, interrupt remap request will come either from
1231      * IO-APIC or PCI device. If interrupt is from PCI device then it will
1232      * have a valid requester id but if the interrupt is from IO-APIC
1233      * then requester id will be invalid.
1234      */
1235     if (sid == X86_IOMMU_SID_INVALID) {
1236         sid = AMDVI_IOAPIC_SB_DEVID;
1237     }
1238 
1239     trace_amdvi_ir_remap_msi_req(origin->address, origin->data, sid);
1240 
1241     /* check if device table entry is set before we go further. */
1242     if (!iommu || !iommu->devtab_len) {
1243         memcpy(translated, origin, sizeof(*origin));
1244         goto out;
1245     }
1246 
1247     if (!amdvi_get_dte(iommu, sid, dte)) {
1248         return -AMDVI_IR_ERR;
1249     }
1250 
1251     /* Check if IR is enabled in DTE */
1252     if (!(dte[2] & AMDVI_IR_REMAP_ENABLE)) {
1253         memcpy(translated, origin, sizeof(*origin));
1254         goto out;
1255     }
1256 
1257     /* validate that we are configured with intremap=on */
1258     if (!x86_iommu_ir_supported(X86_IOMMU_DEVICE(iommu))) {
1259         trace_amdvi_err("Interrupt remapping is enabled in the guest but "
1260                         "not in the host. Use intremap=on to enable interrupt "
1261                         "remapping in amd-iommu.");
1262         return -AMDVI_IR_ERR;
1263     }
1264 
1265     if (origin->address < AMDVI_INT_ADDR_FIRST ||
1266         origin->address + sizeof(origin->data) > AMDVI_INT_ADDR_LAST + 1) {
1267         trace_amdvi_err("MSI is not from IOAPIC.");
1268         return -AMDVI_IR_ERR;
1269     }
1270 
1271     /*
1272      * MSI data register bits [10:8] are used to get the upstream interrupt type.
1273      *
1274      * See MSI/MSI-X format:
1275      * https://pdfs.semanticscholar.org/presentation/9420/c279e942eca568157711ef5c92b800c40a79.pdf
1276      * (page 5)
1277      */
1278     delivery_mode = (origin->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 7;
1279 
1280     switch (delivery_mode) {
1281     case AMDVI_IOAPIC_INT_TYPE_FIXED:
1282     case AMDVI_IOAPIC_INT_TYPE_ARBITRATED:
1283         trace_amdvi_ir_delivery_mode("fixed/arbitrated");
1284         ret = __amdvi_int_remap_msi(iommu, origin, translated, dte, &irq, sid);
1285         if (ret < 0) {
1286             goto remap_fail;
1287         } else {
1288             /* Translate IRQ to MSI messages */
1289             x86_iommu_irq_to_msi_message(&irq, translated);
1290             goto out;
1291         }
1292         break;
1293     case AMDVI_IOAPIC_INT_TYPE_SMI:
1294         error_report("SMI is not supported!");
1295         ret = -AMDVI_IR_ERR;
1296         break;
1297     case AMDVI_IOAPIC_INT_TYPE_NMI:
1298         pass = dte[3] & AMDVI_DEV_NMI_PASS_MASK;
1299         trace_amdvi_ir_delivery_mode("nmi");
1300         break;
1301     case AMDVI_IOAPIC_INT_TYPE_INIT:
1302         pass = dte[3] & AMDVI_DEV_INT_PASS_MASK;
1303         trace_amdvi_ir_delivery_mode("init");
1304         break;
1305     case AMDVI_IOAPIC_INT_TYPE_EINT:
1306         pass = dte[3] & AMDVI_DEV_EINT_PASS_MASK;
1307         trace_amdvi_ir_delivery_mode("eint");
1308         break;
1309     default:
1310         trace_amdvi_ir_delivery_mode("unsupported delivery_mode");
1311         ret = -AMDVI_IR_ERR;
1312         break;
1313     }
1314 
1315     if (ret < 0) {
1316         goto remap_fail;
1317     }
1318 
1319     /*
1320      * The MSI address register bit[2] is used to get the destination
1321      * mode. The dest_mode 1 is valid for fixed and arbitrated interrupts
1322      * only.
1323      */
1324     dest_mode = (origin->address >> MSI_ADDR_DEST_MODE_SHIFT) & 1;
1325     if (dest_mode) {
1326         trace_amdvi_ir_err("invalid dest_mode");
1327         ret = -AMDVI_IR_ERR;
1328         goto remap_fail;
1329     }
1330 
1331     if (pass) {
1332         memcpy(translated, origin, sizeof(*origin));
1333     } else {
1334         trace_amdvi_ir_err("passthrough is not enabled");
1335         ret = -AMDVI_IR_ERR;
1336         goto remap_fail;
1337     }
1338 
1339 out:
1340     trace_amdvi_ir_remap_msi(origin->address, origin->data,
1341                              translated->address, translated->data);
1342     return 0;
1343 
1344 remap_fail:
1345     return ret;
1346 }
1347 
1348 static int amdvi_int_remap(X86IOMMUState *iommu,
1349                            MSIMessage *origin,
1350                            MSIMessage *translated,
1351                            uint16_t sid)
1352 {
1353     return amdvi_int_remap_msi(AMD_IOMMU_DEVICE(iommu), origin,
1354                                translated, sid);
1355 }
1356 
1357 static MemTxResult amdvi_mem_ir_write(void *opaque, hwaddr addr,
1358                                       uint64_t value, unsigned size,
1359                                       MemTxAttrs attrs)
1360 {
1361     int ret;
1362     MSIMessage from = { 0, 0 }, to = { 0, 0 };
1363     uint16_t sid = AMDVI_IOAPIC_SB_DEVID;
1364 
1365     from.address = (uint64_t) addr + AMDVI_INT_ADDR_FIRST;
1366     from.data = (uint32_t) value;
1367 
1368     trace_amdvi_mem_ir_write_req(addr, value, size);
1369 
1370     if (!attrs.unspecified) {
1371         /* We have explicit Source ID */
1372         sid = attrs.requester_id;
1373     }
1374 
1375     ret = amdvi_int_remap_msi(opaque, &from, &to, sid);
1376     if (ret < 0) {
1377         /* TODO: log the event using IOMMU log event interface */
1378         error_report_once("failed to remap interrupt from devid 0x%x", sid);
1379         return MEMTX_ERROR;
1380     }
1381 
1382     apic_get_class(NULL)->send_msi(&to);
1383 
1384     trace_amdvi_mem_ir_write(to.address, to.data);
1385     return MEMTX_OK;
1386 }
1387 
1388 static MemTxResult amdvi_mem_ir_read(void *opaque, hwaddr addr,
1389                                      uint64_t *data, unsigned size,
1390                                      MemTxAttrs attrs)
1391 {
1392     return MEMTX_OK;
1393 }
1394 
1395 static const MemoryRegionOps amdvi_ir_ops = {
1396     .read_with_attrs = amdvi_mem_ir_read,
1397     .write_with_attrs = amdvi_mem_ir_write,
1398     .endianness = DEVICE_LITTLE_ENDIAN,
1399     .impl = {
1400         .min_access_size = 4,
1401         .max_access_size = 4,
1402     },
1403     .valid = {
1404         .min_access_size = 4,
1405         .max_access_size = 4,
1406     }
1407 };
1408 
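/*
 * Return (creating it on first use) the per-device address space that layers
 * the interrupt remapping window over the translation region.
 */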
1409 static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
1410 {
1411     char name[128];
1412     AMDVIState *s = opaque;
1413     AMDVIAddressSpace **iommu_as, *amdvi_dev_as;
1414     int bus_num = pci_bus_num(bus);
1415 
1416     iommu_as = s->address_spaces[bus_num];
1417 
1418     /* allocate memory during the first run */
1419     if (!iommu_as) {
1420         iommu_as = g_new0(AMDVIAddressSpace *, PCI_DEVFN_MAX);
1421         s->address_spaces[bus_num] = iommu_as;
1422     }
1423 
1424     /* set up AMD-Vi region */
1425     if (!iommu_as[devfn]) {
1426         snprintf(name, sizeof(name), "amd_iommu_devfn_%d", devfn);
1427 
1428         iommu_as[devfn] = g_new0(AMDVIAddressSpace, 1);
1429         iommu_as[devfn]->bus_num = (uint8_t)bus_num;
1430         iommu_as[devfn]->devfn = (uint8_t)devfn;
1431         iommu_as[devfn]->iommu_state = s;
1432 
1433         amdvi_dev_as = iommu_as[devfn];
1434 
1435         /*
1436          * The memory region relationships look like this (address ranges
1437          * show only the lower 32 bits for brevity):
1438          *
1439          * |-----------------+-------------------+----------|
1440          * | Name            | Address range     | Priority |
1441          * |-----------------+-------------------+----------+
1442          * | amdvi_root      | 00000000-ffffffff |        0 |
1443          * |  amdvi_iommu    | 00000000-ffffffff |        1 |
1444          * |  amdvi_iommu_ir | fee00000-feefffff |       64 |
1445          * |-----------------+-------------------+----------|
1446          */
1447         memory_region_init_iommu(&amdvi_dev_as->iommu,
1448                                  sizeof(amdvi_dev_as->iommu),
1449                                  TYPE_AMD_IOMMU_MEMORY_REGION,
1450                                  OBJECT(s),
1451                                  "amd_iommu", UINT64_MAX);
1452         memory_region_init(&amdvi_dev_as->root, OBJECT(s),
1453                            "amdvi_root", UINT64_MAX);
1454         address_space_init(&amdvi_dev_as->as, &amdvi_dev_as->root, name);
1455         memory_region_init_io(&amdvi_dev_as->iommu_ir, OBJECT(s),
1456                               &amdvi_ir_ops, s, "amd_iommu_ir",
1457                               AMDVI_INT_ADDR_SIZE);
1458         memory_region_add_subregion_overlap(&amdvi_dev_as->root,
1459                                             AMDVI_INT_ADDR_FIRST,
1460                                             &amdvi_dev_as->iommu_ir,
1461                                             64);
1462         memory_region_add_subregion_overlap(&amdvi_dev_as->root, 0,
1463                                             MEMORY_REGION(&amdvi_dev_as->iommu),
1464                                             1);
1465     }
1466     return &iommu_as[devfn]->as;
1467 }
1468 
1469 static const PCIIOMMUOps amdvi_iommu_ops = {
1470     .get_address_space = amdvi_host_dma_iommu,
1471 };
1472 
1473 static const MemoryRegionOps mmio_mem_ops = {
1474     .read = amdvi_mmio_read,
1475     .write = amdvi_mmio_write,
1476     .endianness = DEVICE_LITTLE_ENDIAN,
1477     .impl = {
1478         .min_access_size = 1,
1479         .max_access_size = 8,
1480         .unaligned = false,
1481     },
1482     .valid = {
1483         .min_access_size = 1,
1484         .max_access_size = 8,
1485     }
1486 };
1487 
1488 static int amdvi_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
1489                                            IOMMUNotifierFlag old,
1490                                            IOMMUNotifierFlag new,
1491                                            Error **errp)
1492 {
1493     AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu);
1494 
1495     if (new & IOMMU_NOTIFIER_MAP) {
1496         error_setg(errp,
1497                    "device %02x.%02x.%x requires iommu notifier which is not "
1498                    "currently supported", as->bus_num, PCI_SLOT(as->devfn),
1499                    PCI_FUNC(as->devfn));
1500         return -EINVAL;
1501     }
1502     return 0;
1503 }
1504 
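/*
 * Bring the IOMMU to its power-on state: clear the cached software state and
 * reprogram the MMIO registers with their reset values and RO/W1C masks.
 */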
1505 static void amdvi_init(AMDVIState *s)
1506 {
1507     amdvi_iotlb_reset(s);
1508 
1509     s->devtab_len = 0;
1510     s->cmdbuf_len = 0;
1511     s->cmdbuf_head = 0;
1512     s->cmdbuf_tail = 0;
1513     s->evtlog_head = 0;
1514     s->evtlog_tail = 0;
1515     s->excl_enabled = false;
1516     s->excl_allow = false;
1517     s->mmio_enabled = false;
1518     s->enabled = false;
1519     s->ats_enabled = false;
1520     s->cmdbuf_enabled = false;
1521 
1522     /* reset MMIO */
1523     memset(s->mmior, 0, AMDVI_MMIO_SIZE);
1524     amdvi_set_quad(s, AMDVI_MMIO_EXT_FEATURES,
1525                    amdvi_extended_feature_register(s),
1526                    0xffffffffffffffef, 0);
1527     amdvi_set_quad(s, AMDVI_MMIO_STATUS, 0, 0x98, 0x67);
1528 }
1529 
1530 static void amdvi_pci_realize(PCIDevice *pdev, Error **errp)
1531 {
1532     AMDVIPCIState *s = AMD_IOMMU_PCI(pdev);
1533     int ret;
1534 
1535     ret = pci_add_capability(pdev, AMDVI_CAPAB_ID_SEC, 0,
1536                              AMDVI_CAPAB_SIZE, errp);
1537     if (ret < 0) {
1538         return;
1539     }
1540     s->capab_offset = ret;
1541 
1542     ret = pci_add_capability(pdev, PCI_CAP_ID_MSI, 0,
1543                              AMDVI_CAPAB_REG_SIZE, errp);
1544     if (ret < 0) {
1545         return;
1546     }
1547     ret = pci_add_capability(pdev, PCI_CAP_ID_HT, 0,
1548                              AMDVI_CAPAB_REG_SIZE, errp);
1549     if (ret < 0) {
1550         return;
1551     }
1552 
1553     if (msi_init(pdev, 0, 1, true, false, errp) < 0) {
1554         return;
1555     }
1556 
1557     /* reset device ident */
1558     pci_config_set_prog_interface(pdev->config, 0);
1559 
1560     /* reset AMDVI specific capabilities, all r/o */
1561     pci_set_long(pdev->config + s->capab_offset, AMDVI_CAPAB_FEATURES);
1562     pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_BAR_LOW,
1563                  AMDVI_BASE_ADDR & ~(0xffff0000));
1564     pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_BAR_HIGH,
1565                 (AMDVI_BASE_ADDR & ~(0xffff)) >> 16);
1566     pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_RANGE,
1567                  0xff000000);
1568     pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_MISC, 0);
1569     pci_set_long(pdev->config + s->capab_offset + AMDVI_CAPAB_MISC,
1570             AMDVI_MAX_PH_ADDR | AMDVI_MAX_GVA_ADDR | AMDVI_MAX_VA_ADDR);
1571 }
1572 
1573 static void amdvi_sysbus_reset(DeviceState *dev)
1574 {
1575     AMDVIState *s = AMD_IOMMU_DEVICE(dev);
1576 
1577     msi_reset(&s->pci.dev);
1578     amdvi_init(s);
1579 }
1580 
1581 static void amdvi_sysbus_realize(DeviceState *dev, Error **errp)
1582 {
1583     AMDVIState *s = AMD_IOMMU_DEVICE(dev);
1584     MachineState *ms = MACHINE(qdev_get_machine());
1585     PCMachineState *pcms = PC_MACHINE(ms);
1586     X86MachineState *x86ms = X86_MACHINE(ms);
1587     PCIBus *bus = pcms->pcibus;
1588 
1589     s->iotlb = g_hash_table_new_full(amdvi_uint64_hash,
1590                                      amdvi_uint64_equal, g_free, g_free);
1591 
1592     /* This device should take care of IOMMU PCI properties */
1593     if (!qdev_realize(DEVICE(&s->pci), &bus->qbus, errp)) {
1594         return;
1595     }
1596 
1597     /* Pseudo address space under root PCI bus. */
1598     x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID);
1599 
1600     /* set up MMIO */
1601     memory_region_init_io(&s->mmio, OBJECT(s), &mmio_mem_ops, s, "amdvi-mmio",
1602                           AMDVI_MMIO_SIZE);
1603     memory_region_add_subregion(get_system_memory(), AMDVI_BASE_ADDR,
1604                                 &s->mmio);
1605     pci_setup_iommu(bus, &amdvi_iommu_ops, s);
1606     amdvi_init(s);
1607 }
1608 
1609 static Property amdvi_properties[] = {
1610     DEFINE_PROP_BOOL("xtsup", AMDVIState, xtsup, false),
1611     DEFINE_PROP_END_OF_LIST(),
1612 };
1613 
1614 static const VMStateDescription vmstate_amdvi_sysbus = {
1615     .name = "amd-iommu",
1616     .unmigratable = 1
1617 };
1618 
1619 static void amdvi_sysbus_instance_init(Object *klass)
1620 {
1621     AMDVIState *s = AMD_IOMMU_DEVICE(klass);
1622 
1623     object_initialize(&s->pci, sizeof(s->pci), TYPE_AMD_IOMMU_PCI);
1624 }
1625 
1626 static void amdvi_sysbus_class_init(ObjectClass *klass, void *data)
1627 {
1628     DeviceClass *dc = DEVICE_CLASS(klass);
1629     X86IOMMUClass *dc_class = X86_IOMMU_DEVICE_CLASS(klass);
1630 
1631     dc->reset = amdvi_sysbus_reset;
1632     dc->vmsd = &vmstate_amdvi_sysbus;
1633     dc->hotpluggable = false;
1634     dc_class->realize = amdvi_sysbus_realize;
1635     dc_class->int_remap = amdvi_int_remap;
1636     /* Supported by the pc-q35-* machine types */
1637     dc->user_creatable = true;
1638     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
1639     dc->desc = "AMD IOMMU (AMD-Vi) DMA Remapping device";
1640     device_class_set_props(dc, amdvi_properties);
1641 }
1642 
1643 static const TypeInfo amdvi_sysbus = {
1644     .name = TYPE_AMD_IOMMU_DEVICE,
1645     .parent = TYPE_X86_IOMMU_DEVICE,
1646     .instance_size = sizeof(AMDVIState),
1647     .instance_init = amdvi_sysbus_instance_init,
1648     .class_init = amdvi_sysbus_class_init
1649 };
1650 
1651 static void amdvi_pci_class_init(ObjectClass *klass, void *data)
1652 {
1653     DeviceClass *dc = DEVICE_CLASS(klass);
1654     PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
1655 
1656     k->vendor_id = PCI_VENDOR_ID_AMD;
1657     k->class_id = 0x0806;
1658     k->realize = amdvi_pci_realize;
1659 
1660     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
1661     dc->desc = "AMD IOMMU (AMD-Vi) DMA Remapping device";
1662 }
1663 
1664 static const TypeInfo amdvi_pci = {
1665     .name = TYPE_AMD_IOMMU_PCI,
1666     .parent = TYPE_PCI_DEVICE,
1667     .instance_size = sizeof(AMDVIPCIState),
1668     .class_init = amdvi_pci_class_init,
1669     .interfaces = (InterfaceInfo[]) {
1670         { INTERFACE_CONVENTIONAL_PCI_DEVICE },
1671         { },
1672     },
1673 };
1674 
1675 static void amdvi_iommu_memory_region_class_init(ObjectClass *klass, void *data)
1676 {
1677     IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
1678 
1679     imrc->translate = amdvi_translate;
1680     imrc->notify_flag_changed = amdvi_iommu_notify_flag_changed;
1681 }
1682 
1683 static const TypeInfo amdvi_iommu_memory_region_info = {
1684     .parent = TYPE_IOMMU_MEMORY_REGION,
1685     .name = TYPE_AMD_IOMMU_MEMORY_REGION,
1686     .class_init = amdvi_iommu_memory_region_class_init,
1687 };
1688 
1689 static void amdvi_register_types(void)
1690 {
1691     type_register_static(&amdvi_pci);
1692     type_register_static(&amdvi_sysbus);
1693     type_register_static(&amdvi_iommu_memory_region_info);
1694 }
1695 
1696 type_init(amdvi_register_types);
1697