xref: /openbmc/qemu/hw/i386/intel_iommu.c (revision 64552b6b)
1 /*
2  * QEMU emulation of an Intel IOMMU (VT-d)
3  *   (DMA Remapping device)
4  *
5  * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com>
6  * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12 
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17 
18  * You should have received a copy of the GNU General Public License along
19  * with this program; if not, see <http://www.gnu.org/licenses/>.
20  */
21 
22 #include "qemu/osdep.h"
23 #include "qemu/error-report.h"
24 #include "qapi/error.h"
25 #include "hw/sysbus.h"
26 #include "exec/address-spaces.h"
27 #include "intel_iommu_internal.h"
28 #include "hw/pci/pci.h"
29 #include "hw/pci/pci_bus.h"
30 #include "hw/i386/pc.h"
31 #include "hw/i386/apic-msidef.h"
32 #include "hw/boards.h"
33 #include "hw/i386/x86-iommu.h"
34 #include "hw/pci-host/q35.h"
35 #include "sysemu/kvm.h"
36 #include "hw/i386/apic_internal.h"
37 #include "kvm_i386.h"
38 #include "trace.h"
39 
/*
 * Context entry operations.
 *
 * Both macros take a pointer to an entry exposing a val[] array:
 *   VTD_CE_GET_RID2PASID       - val[1] masked with
 *                                VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK
 *   VTD_CE_GET_PASID_DIR_TABLE - val[0] masked with
 *                                VTD_PASID_DIR_BASE_ADDR_MASK
 */
#define VTD_CE_GET_RID2PASID(ce) \
    ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK)
#define VTD_CE_GET_PASID_DIR_TABLE(ce) \
    ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK)
45 
/*
 * PASID entry (pe) operations.
 *
 * VTD_PE_GET_TYPE  - val[0] masked with VTD_SM_PASID_ENTRY_PGTT.
 * VTD_PE_GET_LEVEL - 2 plus the VTD_SM_PASID_ENTRY_AW-masked field found
 *                    at bit 2 of val[0].
 *
 * VTD_PE_GET_FPD_ERR - error-path helper.  When @ret_fr is non-zero it is
 * negated in place (so @ret_fr must be a modifiable lvalue), then either
 * only traced (when @is_fpd_set is true and vtd_is_qualified_fault()
 * accepts the negated value) or reported through vtd_report_dmar_fault().
 * In both cases control then jumps to a label named "error", which the
 * enclosing function must provide.
 *
 * NOTE(review): the expansion is a bare { } block rather than
 * do { } while (0), and the arguments are expanded more than once without
 * parentheses; use it only as a stand-alone statement with simple
 * arguments.
 */
#define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
#define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
#define VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write) {\
    if (ret_fr) {                                                             \
        ret_fr = -ret_fr;                                                     \
        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {                   \
            trace_vtd_fault_disabled();                                       \
        } else {                                                              \
            vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);      \
        }                                                                     \
        goto error;                                                           \
    }                                                                         \
}
60 
/*
 * Forward declarations of file-local helpers; presumably they are defined
 * further down in this file and referenced before that point.
 */
static void vtd_address_space_refresh_all(IntelIOMMUState *s);
static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
63 
64 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
65                             uint64_t wmask, uint64_t w1cmask)
66 {
67     stq_le_p(&s->csr[addr], val);
68     stq_le_p(&s->wmask[addr], wmask);
69     stq_le_p(&s->w1cmask[addr], w1cmask);
70 }
71 
72 static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask)
73 {
74     stq_le_p(&s->womask[addr], mask);
75 }
76 
77 static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val,
78                             uint32_t wmask, uint32_t w1cmask)
79 {
80     stl_le_p(&s->csr[addr], val);
81     stl_le_p(&s->wmask[addr], wmask);
82     stl_le_p(&s->w1cmask[addr], w1cmask);
83 }
84 
85 static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask)
86 {
87     stl_le_p(&s->womask[addr], mask);
88 }
89 
90 /* "External" get/set operations */
91 static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val)
92 {
93     uint64_t oldval = ldq_le_p(&s->csr[addr]);
94     uint64_t wmask = ldq_le_p(&s->wmask[addr]);
95     uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]);
96     stq_le_p(&s->csr[addr],
97              ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
98 }
99 
100 static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val)
101 {
102     uint32_t oldval = ldl_le_p(&s->csr[addr]);
103     uint32_t wmask = ldl_le_p(&s->wmask[addr]);
104     uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]);
105     stl_le_p(&s->csr[addr],
106              ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
107 }
108 
109 static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr)
110 {
111     uint64_t val = ldq_le_p(&s->csr[addr]);
112     uint64_t womask = ldq_le_p(&s->womask[addr]);
113     return val & ~womask;
114 }
115 
116 static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr)
117 {
118     uint32_t val = ldl_le_p(&s->csr[addr]);
119     uint32_t womask = ldl_le_p(&s->womask[addr]);
120     return val & ~womask;
121 }
122 
123 /* "Internal" get/set operations */
124 static uint64_t vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr)
125 {
126     return ldq_le_p(&s->csr[addr]);
127 }
128 
129 static uint32_t vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr)
130 {
131     return ldl_le_p(&s->csr[addr]);
132 }
133 
134 static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t val)
135 {
136     stq_le_p(&s->csr[addr], val);
137 }
138 
139 static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
140                                         uint32_t clear, uint32_t mask)
141 {
142     uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask;
143     stl_le_p(&s->csr[addr], new_val);
144     return new_val;
145 }
146 
147 static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
148                                         uint64_t clear, uint64_t mask)
149 {
150     uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask;
151     stq_le_p(&s->csr[addr], new_val);
152     return new_val;
153 }
154 
155 static inline void vtd_iommu_lock(IntelIOMMUState *s)
156 {
157     qemu_mutex_lock(&s->iommu_lock);
158 }
159 
160 static inline void vtd_iommu_unlock(IntelIOMMUState *s)
161 {
162     qemu_mutex_unlock(&s->iommu_lock);
163 }
164 
165 static void vtd_update_scalable_state(IntelIOMMUState *s)
166 {
167     uint64_t val = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
168 
169     if (s->scalable_mode) {
170         s->root_scalable = val & VTD_RTADDR_SMT;
171     }
172 }
173 
174 /* Whether the address space needs to notify new mappings */
175 static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as)
176 {
177     return as->notifier_flags & IOMMU_NOTIFIER_MAP;
178 }
179 
180 /* GHashTable functions */
181 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
182 {
183     return *((const uint64_t *)v1) == *((const uint64_t *)v2);
184 }
185 
186 static guint vtd_uint64_hash(gconstpointer v)
187 {
188     return (guint)*(const uint64_t *)v;
189 }
190 
191 static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value,
192                                           gpointer user_data)
193 {
194     VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
195     uint16_t domain_id = *(uint16_t *)user_data;
196     return entry->domain_id == domain_id;
197 }
198 
199 /* The shift of an addr for a certain level of paging structure */
200 static inline uint32_t vtd_slpt_level_shift(uint32_t level)
201 {
202     assert(level != 0);
203     return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS;
204 }
205 
/*
 * Mask that keeps the address bits at and above the shift of @level and
 * clears every bit below it.
 */
static inline uint64_t vtd_slpt_level_page_mask(uint32_t level)
{
    uint64_t span = 1ULL << vtd_slpt_level_shift(level);

    return ~(span - 1);
}
210 
211 static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value,
212                                         gpointer user_data)
213 {
214     VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
215     VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
216     uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask;
217     uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K;
218     return (entry->domain_id == info->domain_id) &&
219             (((entry->gfn & info->mask) == gfn) ||
220              (entry->gfn == gfn_tlb));
221 }
222 
223 /* Reset all the gen of VTDAddressSpace to zero and set the gen of
224  * IntelIOMMUState to 1.  Must be called with IOMMU lock held.
225  */
226 static void vtd_reset_context_cache_locked(IntelIOMMUState *s)
227 {
228     VTDAddressSpace *vtd_as;
229     VTDBus *vtd_bus;
230     GHashTableIter bus_it;
231     uint32_t devfn_it;
232 
233     trace_vtd_context_cache_reset();
234 
235     g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr);
236 
237     while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) {
238         for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
239             vtd_as = vtd_bus->dev_as[devfn_it];
240             if (!vtd_as) {
241                 continue;
242             }
243             vtd_as->context_cache_entry.context_cache_gen = 0;
244         }
245     }
246     s->context_cache_gen = 1;
247 }
248 
249 /* Must be called with IOMMU lock held. */
250 static void vtd_reset_iotlb_locked(IntelIOMMUState *s)
251 {
252     assert(s->iotlb);
253     g_hash_table_remove_all(s->iotlb);
254 }
255 
256 static void vtd_reset_iotlb(IntelIOMMUState *s)
257 {
258     vtd_iommu_lock(s);
259     vtd_reset_iotlb_locked(s);
260     vtd_iommu_unlock(s);
261 }
262 
263 static void vtd_reset_caches(IntelIOMMUState *s)
264 {
265     vtd_iommu_lock(s);
266     vtd_reset_iotlb_locked(s);
267     vtd_reset_context_cache_locked(s);
268     vtd_iommu_unlock(s);
269 }
270 
271 static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id,
272                                   uint32_t level)
273 {
274     return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) |
275            ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT);
276 }
277 
278 static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
279 {
280     return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
281 }
282 
283 /* Must be called with IOMMU lock held */
284 static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id,
285                                        hwaddr addr)
286 {
287     VTDIOTLBEntry *entry;
288     uint64_t key;
289     int level;
290 
291     for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) {
292         key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level),
293                                 source_id, level);
294         entry = g_hash_table_lookup(s->iotlb, &key);
295         if (entry) {
296             goto out;
297         }
298     }
299 
300 out:
301     return entry;
302 }
303 
304 /* Must be with IOMMU lock held */
305 static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
306                              uint16_t domain_id, hwaddr addr, uint64_t slpte,
307                              uint8_t access_flags, uint32_t level)
308 {
309     VTDIOTLBEntry *entry = g_malloc(sizeof(*entry));
310     uint64_t *key = g_malloc(sizeof(*key));
311     uint64_t gfn = vtd_get_iotlb_gfn(addr, level);
312 
313     trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id);
314     if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) {
315         trace_vtd_iotlb_reset("iotlb exceeds size limit");
316         vtd_reset_iotlb_locked(s);
317     }
318 
319     entry->gfn = gfn;
320     entry->domain_id = domain_id;
321     entry->slpte = slpte;
322     entry->access_flags = access_flags;
323     entry->mask = vtd_slpt_level_page_mask(level);
324     *key = vtd_get_iotlb_key(gfn, source_id, level);
325     g_hash_table_replace(s->iotlb, key, entry);
326 }
327 
328 /* Given the reg addr of both the message data and address, generate an
329  * interrupt via MSI.
330  */
331 static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg,
332                                    hwaddr mesg_data_reg)
333 {
334     MSIMessage msi;
335 
336     assert(mesg_data_reg < DMAR_REG_SIZE);
337     assert(mesg_addr_reg < DMAR_REG_SIZE);
338 
339     msi.address = vtd_get_long_raw(s, mesg_addr_reg);
340     msi.data = vtd_get_long_raw(s, mesg_data_reg);
341 
342     trace_vtd_irq_generate(msi.address, msi.data);
343 
344     apic_get_class()->send_msi(&msi);
345 }
346 
347 /* Generate a fault event to software via MSI if conditions are met.
348  * Notice that the value of FSTS_REG being passed to it should be the one
349  * before any update.
350  */
351 static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts)
352 {
353     if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO ||
354         pre_fsts & VTD_FSTS_IQE) {
355         error_report_once("There are previous interrupt conditions "
356                           "to be serviced by software, fault event "
357                           "is not generated");
358         return;
359     }
360     vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP);
361     if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) {
362         error_report_once("Interrupt Mask set, irq is not generated");
363     } else {
364         vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
365         vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
366     }
367 }
368 
369 /* Check if the Fault (F) field of the Fault Recording Register referenced by
370  * @index is Set.
371  */
372 static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index)
373 {
374     /* Each reg is 128-bit */
375     hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
376     addr += 8; /* Access the high 64-bit half */
377 
378     assert(index < DMAR_FRCD_REG_NR);
379 
380     return vtd_get_quad_raw(s, addr) & VTD_FRCD_F;
381 }
382 
383 /* Update the PPF field of Fault Status Register.
384  * Should be called whenever change the F field of any fault recording
385  * registers.
386  */
387 static void vtd_update_fsts_ppf(IntelIOMMUState *s)
388 {
389     uint32_t i;
390     uint32_t ppf_mask = 0;
391 
392     for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
393         if (vtd_is_frcd_set(s, i)) {
394             ppf_mask = VTD_FSTS_PPF;
395             break;
396         }
397     }
398     vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask);
399     trace_vtd_fsts_ppf(!!ppf_mask);
400 }
401 
402 static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index)
403 {
404     /* Each reg is 128-bit */
405     hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
406     addr += 8; /* Access the high 64-bit half */
407 
408     assert(index < DMAR_FRCD_REG_NR);
409 
410     vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F);
411     vtd_update_fsts_ppf(s);
412 }
413 
414 /* Must not update F field now, should be done later */
415 static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index,
416                             uint16_t source_id, hwaddr addr,
417                             VTDFaultReason fault, bool is_write)
418 {
419     uint64_t hi = 0, lo;
420     hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
421 
422     assert(index < DMAR_FRCD_REG_NR);
423 
424     lo = VTD_FRCD_FI(addr);
425     hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault);
426     if (!is_write) {
427         hi |= VTD_FRCD_T;
428     }
429     vtd_set_quad_raw(s, frcd_reg_addr, lo);
430     vtd_set_quad_raw(s, frcd_reg_addr + 8, hi);
431 
432     trace_vtd_frr_new(index, hi, lo);
433 }
434 
435 /* Try to collapse multiple pending faults from the same requester */
436 static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id)
437 {
438     uint32_t i;
439     uint64_t frcd_reg;
440     hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */
441 
442     for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
443         frcd_reg = vtd_get_quad_raw(s, addr);
444         if ((frcd_reg & VTD_FRCD_F) &&
445             ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) {
446             return true;
447         }
448         addr += 16; /* 128-bit for each */
449     }
450     return false;
451 }
452 
453 /* Log and report an DMAR (address translation) fault to software */
454 static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id,
455                                   hwaddr addr, VTDFaultReason fault,
456                                   bool is_write)
457 {
458     uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
459 
460     assert(fault < VTD_FR_MAX);
461 
462     if (fault == VTD_FR_RESERVED_ERR) {
463         /* This is not a normal fault reason case. Drop it. */
464         return;
465     }
466 
467     trace_vtd_dmar_fault(source_id, fault, addr, is_write);
468 
469     if (fsts_reg & VTD_FSTS_PFO) {
470         error_report_once("New fault is not recorded due to "
471                           "Primary Fault Overflow");
472         return;
473     }
474 
475     if (vtd_try_collapse_fault(s, source_id)) {
476         error_report_once("New fault is not recorded due to "
477                           "compression of faults");
478         return;
479     }
480 
481     if (vtd_is_frcd_set(s, s->next_frcd_reg)) {
482         error_report_once("Next Fault Recording Reg is used, "
483                           "new fault is not recorded, set PFO field");
484         vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO);
485         return;
486     }
487 
488     vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write);
489 
490     if (fsts_reg & VTD_FSTS_PPF) {
491         error_report_once("There are pending faults already, "
492                           "fault event is not generated");
493         vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg);
494         s->next_frcd_reg++;
495         if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
496             s->next_frcd_reg = 0;
497         }
498     } else {
499         vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK,
500                                 VTD_FSTS_FRI(s->next_frcd_reg));
501         vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */
502         s->next_frcd_reg++;
503         if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
504             s->next_frcd_reg = 0;
505         }
506         /* This case actually cause the PPF to be Set.
507          * So generate fault event (interrupt).
508          */
509          vtd_generate_fault_event(s, fsts_reg);
510     }
511 }
512 
513 /* Handle Invalidation Queue Errors of queued invalidation interface error
514  * conditions.
515  */
516 static void vtd_handle_inv_queue_error(IntelIOMMUState *s)
517 {
518     uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
519 
520     vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE);
521     vtd_generate_fault_event(s, fsts_reg);
522 }
523 
524 /* Set the IWC field and try to generate an invalidation completion interrupt */
525 static void vtd_generate_completion_event(IntelIOMMUState *s)
526 {
527     if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) {
528         trace_vtd_inv_desc_wait_irq("One pending, skip current");
529         return;
530     }
531     vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC);
532     vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP);
533     if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) {
534         trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, "
535                                     "new event not generated");
536         return;
537     } else {
538         /* Generate the interrupt event */
539         trace_vtd_inv_desc_wait_irq("Generating complete event");
540         vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
541         vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
542     }
543 }
544 
545 static inline bool vtd_root_entry_present(IntelIOMMUState *s,
546                                           VTDRootEntry *re,
547                                           uint8_t devfn)
548 {
549     if (s->root_scalable && devfn > UINT8_MAX / 2) {
550         return re->hi & VTD_ROOT_ENTRY_P;
551     }
552 
553     return re->lo & VTD_ROOT_ENTRY_P;
554 }
555 
556 static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index,
557                               VTDRootEntry *re)
558 {
559     dma_addr_t addr;
560 
561     addr = s->root + index * sizeof(*re);
562     if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) {
563         re->lo = 0;
564         return -VTD_FR_ROOT_TABLE_INV;
565     }
566     re->lo = le64_to_cpu(re->lo);
567     re->hi = le64_to_cpu(re->hi);
568     return 0;
569 }
570 
571 static inline bool vtd_ce_present(VTDContextEntry *context)
572 {
573     return context->lo & VTD_CONTEXT_ENTRY_P;
574 }
575 
576 static int vtd_get_context_entry_from_root(IntelIOMMUState *s,
577                                            VTDRootEntry *re,
578                                            uint8_t index,
579                                            VTDContextEntry *ce)
580 {
581     dma_addr_t addr, ce_size;
582 
583     /* we have checked that root entry is present */
584     ce_size = s->root_scalable ? VTD_CTX_ENTRY_SCALABLE_SIZE :
585               VTD_CTX_ENTRY_LEGACY_SIZE;
586 
587     if (s->root_scalable && index > UINT8_MAX / 2) {
588         index = index & (~VTD_DEVFN_CHECK_MASK);
589         addr = re->hi & VTD_ROOT_ENTRY_CTP;
590     } else {
591         addr = re->lo & VTD_ROOT_ENTRY_CTP;
592     }
593 
594     addr = addr + index * ce_size;
595     if (dma_memory_read(&address_space_memory, addr, ce, ce_size)) {
596         return -VTD_FR_CONTEXT_TABLE_INV;
597     }
598 
599     ce->lo = le64_to_cpu(ce->lo);
600     ce->hi = le64_to_cpu(ce->hi);
601     if (ce_size == VTD_CTX_ENTRY_SCALABLE_SIZE) {
602         ce->val[2] = le64_to_cpu(ce->val[2]);
603         ce->val[3] = le64_to_cpu(ce->val[3]);
604     }
605     return 0;
606 }
607 
608 static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce)
609 {
610     return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
611 }
612 
/*
 * Address field of @slpte, extracted with the base address mask for an
 * address width of @aw bits.
 */
static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw)
{
    uint64_t addr_bits = slpte & VTD_SL_PT_BASE_ADDR_MASK(aw);

    return addr_bits;
}
617 
618 /* Whether the pte indicates the address of the page frame */
619 static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level)
620 {
621     return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK);
622 }
623 
624 /* Get the content of a spte located in @base_addr[@index] */
625 static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index)
626 {
627     uint64_t slpte;
628 
629     assert(index < VTD_SL_PT_ENTRY_NR);
630 
631     if (dma_memory_read(&address_space_memory,
632                         base_addr + index * sizeof(slpte), &slpte,
633                         sizeof(slpte))) {
634         slpte = (uint64_t)-1;
635         return slpte;
636     }
637     slpte = le64_to_cpu(slpte);
638     return slpte;
639 }
640 
641 /* Given an iova and the level of paging structure, return the offset
642  * of current level.
643  */
644 static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level)
645 {
646     return (iova >> vtd_slpt_level_shift(level)) &
647             ((1ULL << VTD_SL_LEVEL_BITS) - 1);
648 }
649 
650 /* Check Capability Register to see if the @level of page-table is supported */
651 static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level)
652 {
653     return VTD_CAP_SAGAW_MASK & s->cap &
654            (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
655 }
656 
657 /* Return true if check passed, otherwise false */
658 static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu,
659                                      VTDPASIDEntry *pe)
660 {
661     switch (VTD_PE_GET_TYPE(pe)) {
662     case VTD_SM_PASID_ENTRY_FLT:
663     case VTD_SM_PASID_ENTRY_SLT:
664     case VTD_SM_PASID_ENTRY_NESTED:
665         break;
666     case VTD_SM_PASID_ENTRY_PT:
667         if (!x86_iommu->pt_supported) {
668             return false;
669         }
670         break;
671     default:
672         /* Unknwon type */
673         return false;
674     }
675     return true;
676 }
677 
678 static int vtd_get_pasid_dire(dma_addr_t pasid_dir_base,
679                               uint32_t pasid,
680                               VTDPASIDDirEntry *pdire)
681 {
682     uint32_t index;
683     dma_addr_t addr, entry_size;
684 
685     index = VTD_PASID_DIR_INDEX(pasid);
686     entry_size = VTD_PASID_DIR_ENTRY_SIZE;
687     addr = pasid_dir_base + index * entry_size;
688     if (dma_memory_read(&address_space_memory, addr, pdire, entry_size)) {
689         return -VTD_FR_PASID_TABLE_INV;
690     }
691 
692     return 0;
693 }
694 
695 static int vtd_get_pasid_entry(IntelIOMMUState *s,
696                                uint32_t pasid,
697                                VTDPASIDDirEntry *pdire,
698                                VTDPASIDEntry *pe)
699 {
700     uint32_t index;
701     dma_addr_t addr, entry_size;
702     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
703 
704     index = VTD_PASID_TABLE_INDEX(pasid);
705     entry_size = VTD_PASID_ENTRY_SIZE;
706     addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK;
707     addr = addr + index * entry_size;
708     if (dma_memory_read(&address_space_memory, addr, pe, entry_size)) {
709         return -VTD_FR_PASID_TABLE_INV;
710     }
711 
712     /* Do translation type check */
713     if (!vtd_pe_type_check(x86_iommu, pe)) {
714         return -VTD_FR_PASID_TABLE_INV;
715     }
716 
717     if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) {
718         return -VTD_FR_PASID_TABLE_INV;
719     }
720 
721     return 0;
722 }
723 
724 static int vtd_get_pasid_entry_from_pasid(IntelIOMMUState *s,
725                                           dma_addr_t pasid_dir_base,
726                                           uint32_t pasid,
727                                           VTDPASIDEntry *pe)
728 {
729     int ret;
730     VTDPASIDDirEntry pdire;
731 
732     ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire);
733     if (ret) {
734         return ret;
735     }
736 
737     ret = vtd_get_pasid_entry(s, pasid, &pdire, pe);
738     if (ret) {
739         return ret;
740     }
741 
742     return ret;
743 }
744 
745 static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s,
746                                       VTDContextEntry *ce,
747                                       VTDPASIDEntry *pe)
748 {
749     uint32_t pasid;
750     dma_addr_t pasid_dir_base;
751     int ret = 0;
752 
753     pasid = VTD_CE_GET_RID2PASID(ce);
754     pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
755     ret = vtd_get_pasid_entry_from_pasid(s, pasid_dir_base, pasid, pe);
756 
757     return ret;
758 }
759 
760 static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s,
761                                 VTDContextEntry *ce,
762                                 bool *pe_fpd_set)
763 {
764     int ret;
765     uint32_t pasid;
766     dma_addr_t pasid_dir_base;
767     VTDPASIDDirEntry pdire;
768     VTDPASIDEntry pe;
769 
770     pasid = VTD_CE_GET_RID2PASID(ce);
771     pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
772 
773     ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire);
774     if (ret) {
775         return ret;
776     }
777 
778     if (pdire.val & VTD_PASID_DIR_FPD) {
779         *pe_fpd_set = true;
780         return 0;
781     }
782 
783     ret = vtd_get_pasid_entry(s, pasid, &pdire, &pe);
784     if (ret) {
785         return ret;
786     }
787 
788     if (pe.val[0] & VTD_PASID_ENTRY_FPD) {
789         *pe_fpd_set = true;
790     }
791 
792     return 0;
793 }
794 
795 /* Get the page-table level that hardware should use for the second-level
796  * page-table walk from the Address Width field of context-entry.
797  */
798 static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce)
799 {
800     return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW);
801 }
802 
803 static uint32_t vtd_get_iova_level(IntelIOMMUState *s,
804                                    VTDContextEntry *ce)
805 {
806     VTDPASIDEntry pe;
807 
808     if (s->root_scalable) {
809         vtd_ce_get_rid2pasid_entry(s, ce, &pe);
810         return VTD_PE_GET_LEVEL(&pe);
811     }
812 
813     return vtd_ce_get_level(ce);
814 }
815 
816 static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce)
817 {
818     return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9;
819 }
820 
821 static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s,
822                                   VTDContextEntry *ce)
823 {
824     VTDPASIDEntry pe;
825 
826     if (s->root_scalable) {
827         vtd_ce_get_rid2pasid_entry(s, ce, &pe);
828         return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9;
829     }
830 
831     return vtd_ce_get_agaw(ce);
832 }
833 
834 static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce)
835 {
836     return ce->lo & VTD_CONTEXT_ENTRY_TT;
837 }
838 
839 /* Only for Legacy Mode. Return true if check passed, otherwise false */
840 static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu,
841                                      VTDContextEntry *ce)
842 {
843     switch (vtd_ce_get_type(ce)) {
844     case VTD_CONTEXT_TT_MULTI_LEVEL:
845         /* Always supported */
846         break;
847     case VTD_CONTEXT_TT_DEV_IOTLB:
848         if (!x86_iommu->dt_supported) {
849             error_report_once("%s: DT specified but not supported", __func__);
850             return false;
851         }
852         break;
853     case VTD_CONTEXT_TT_PASS_THROUGH:
854         if (!x86_iommu->pt_supported) {
855             error_report_once("%s: PT specified but not supported", __func__);
856             return false;
857         }
858         break;
859     default:
860         /* Unknown type */
861         error_report_once("%s: unknown ce type: %"PRIu32, __func__,
862                           vtd_ce_get_type(ce));
863         return false;
864     }
865     return true;
866 }
867 
868 static inline uint64_t vtd_iova_limit(IntelIOMMUState *s,
869                                       VTDContextEntry *ce, uint8_t aw)
870 {
871     uint32_t ce_agaw = vtd_get_iova_agaw(s, ce);
872     return 1ULL << MIN(ce_agaw, aw);
873 }
874 
875 /* Return true if IOVA passes range check, otherwise false. */
876 static inline bool vtd_iova_range_check(IntelIOMMUState *s,
877                                         uint64_t iova, VTDContextEntry *ce,
878                                         uint8_t aw)
879 {
880     /*
881      * Check if @iova is above 2^X-1, where X is the minimum of MGAW
882      * in CAP_REG and AW in context-entry.
883      */
884     return !(iova & ~(vtd_iova_limit(s, ce, aw) - 1));
885 }
886 
887 static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
888                                           VTDContextEntry *ce)
889 {
890     VTDPASIDEntry pe;
891 
892     if (s->root_scalable) {
893         vtd_ce_get_rid2pasid_entry(s, ce, &pe);
894         return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
895     }
896 
897     return vtd_ce_get_slpt_base(ce);
898 }
899 
/*
 * Reserved-field masks for second-level page-table entries (slpte),
 * indexed by paging level:
 *     Index [1] to [4] 4k pages
 *     Index [5] to [8] large pages (level + 4)
 * Index [0] is not used by vtd_slpte_nonzero_rsvd() for valid levels.
 * The contents are not set here; presumably they are filled in during
 * device initialisation elsewhere in this file.
 */
static uint64_t vtd_paging_entry_rsvd_field[9];
906 
907 static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
908 {
909     if (slpte & VTD_SL_PT_PAGE_SIZE_MASK) {
910         /* Maybe large page */
911         return slpte & vtd_paging_entry_rsvd_field[level + 4];
912     } else {
913         return slpte & vtd_paging_entry_rsvd_field[level];
914     }
915 }
916 
917 /* Find the VTD address space associated with a given bus number */
918 static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
919 {
920     VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num];
921     if (!vtd_bus) {
922         /*
923          * Iterate over the registered buses to find the one which
924          * currently hold this bus number, and update the bus_num
925          * lookup table:
926          */
927         GHashTableIter iter;
928 
929         g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
930         while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
931             if (pci_bus_num(vtd_bus->bus) == bus_num) {
932                 s->vtd_as_by_bus_num[bus_num] = vtd_bus;
933                 return vtd_bus;
934             }
935         }
936     }
937     return vtd_bus;
938 }
939 
940 /* Given the @iova, get relevant @slptep. @slpte_level will be the last level
941  * of the translation, can be used for deciding the size of large page.
942  */
943 static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce,
944                              uint64_t iova, bool is_write,
945                              uint64_t *slptep, uint32_t *slpte_level,
946                              bool *reads, bool *writes, uint8_t aw_bits)
947 {
948     dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce);
949     uint32_t level = vtd_get_iova_level(s, ce);
950     uint32_t offset;
951     uint64_t slpte;
952     uint64_t access_right_check;
953 
954     if (!vtd_iova_range_check(s, iova, ce, aw_bits)) {
955         error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")",
956                           __func__, iova);
957         return -VTD_FR_ADDR_BEYOND_MGAW;
958     }
959 
960     /* FIXME: what is the Atomics request here? */
961     access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
962 
963     while (true) {
964         offset = vtd_iova_level_offset(iova, level);
965         slpte = vtd_get_slpte(addr, offset);
966 
967         if (slpte == (uint64_t)-1) {
968             error_report_once("%s: detected read error on DMAR slpte "
969                               "(iova=0x%" PRIx64 ")", __func__, iova);
970             if (level == vtd_get_iova_level(s, ce)) {
971                 /* Invalid programming of context-entry */
972                 return -VTD_FR_CONTEXT_ENTRY_INV;
973             } else {
974                 return -VTD_FR_PAGING_ENTRY_INV;
975             }
976         }
977         *reads = (*reads) && (slpte & VTD_SL_R);
978         *writes = (*writes) && (slpte & VTD_SL_W);
979         if (!(slpte & access_right_check)) {
980             error_report_once("%s: detected slpte permission error "
981                               "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", "
982                               "slpte=0x%" PRIx64 ", write=%d)", __func__,
983                               iova, level, slpte, is_write);
984             return is_write ? -VTD_FR_WRITE : -VTD_FR_READ;
985         }
986         if (vtd_slpte_nonzero_rsvd(slpte, level)) {
987             error_report_once("%s: detected splte reserve non-zero "
988                               "iova=0x%" PRIx64 ", level=0x%" PRIx32
989                               "slpte=0x%" PRIx64 ")", __func__, iova,
990                               level, slpte);
991             return -VTD_FR_PAGING_ENTRY_RSVD;
992         }
993 
994         if (vtd_is_last_slpte(slpte, level)) {
995             *slptep = slpte;
996             *slpte_level = level;
997             return 0;
998         }
999         addr = vtd_get_slpte_addr(slpte, aw_bits);
1000         level--;
1001     }
1002 }
1003 
/*
 * Callback invoked by the page walk for each IOMMUTLBEntry it reports;
 * @private is the opaque pointer stored in vtd_page_walk_info.
 */
typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);
1005 
/**
 * Constant information used during page walking
 *
 * @as: VT-d address space of the device
 * @hook_fn: hook func to be called when detected page
 * @private: private data to be passed into hook func
 * @notify_unmap: whether we should notify invalid entries
 * @aw: maximum address width
 * @domain_id: domain ID of the page walk
 */
typedef struct {
    VTDAddressSpace *as;
    vtd_page_walk_hook hook_fn;
    void *private;
    bool notify_unmap;
    uint8_t aw;
    uint16_t domain_id;
} vtd_page_walk_info;
1024 
1025 static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info)
1026 {
1027     VTDAddressSpace *as = info->as;
1028     vtd_page_walk_hook hook_fn = info->hook_fn;
1029     void *private = info->private;
1030     DMAMap target = {
1031         .iova = entry->iova,
1032         .size = entry->addr_mask,
1033         .translated_addr = entry->translated_addr,
1034         .perm = entry->perm,
1035     };
1036     DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
1037 
1038     if (entry->perm == IOMMU_NONE && !info->notify_unmap) {
1039         trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
1040         return 0;
1041     }
1042 
1043     assert(hook_fn);
1044 
1045     /* Update local IOVA mapped ranges */
1046     if (entry->perm) {
1047         if (mapped) {
1048             /* If it's exactly the same translation, skip */
1049             if (!memcmp(mapped, &target, sizeof(target))) {
1050                 trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
1051                                                  entry->translated_addr);
1052                 return 0;
1053             } else {
1054                 /*
1055                  * Translation changed.  Normally this should not
1056                  * happen, but it can happen when with buggy guest
1057                  * OSes.  Note that there will be a small window that
1058                  * we don't have map at all.  But that's the best
1059                  * effort we can do.  The ideal way to emulate this is
1060                  * atomically modify the PTE to follow what has
1061                  * changed, but we can't.  One example is that vfio
1062                  * driver only has VFIO_IOMMU_[UN]MAP_DMA but no
1063                  * interface to modify a mapping (meanwhile it seems
1064                  * meaningless to even provide one).  Anyway, let's
1065                  * mark this as a TODO in case one day we'll have
1066                  * a better solution.
1067                  */
1068                 IOMMUAccessFlags cache_perm = entry->perm;
1069                 int ret;
1070 
1071                 /* Emulate an UNMAP */
1072                 entry->perm = IOMMU_NONE;
1073                 trace_vtd_page_walk_one(info->domain_id,
1074                                         entry->iova,
1075                                         entry->translated_addr,
1076                                         entry->addr_mask,
1077                                         entry->perm);
1078                 ret = hook_fn(entry, private);
1079                 if (ret) {
1080                     return ret;
1081                 }
1082                 /* Drop any existing mapping */
1083                 iova_tree_remove(as->iova_tree, &target);
1084                 /* Recover the correct permission */
1085                 entry->perm = cache_perm;
1086             }
1087         }
1088         iova_tree_insert(as->iova_tree, &target);
1089     } else {
1090         if (!mapped) {
1091             /* Skip since we didn't map this range at all */
1092             trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
1093             return 0;
1094         }
1095         iova_tree_remove(as->iova_tree, &target);
1096     }
1097 
1098     trace_vtd_page_walk_one(info->domain_id, entry->iova,
1099                             entry->translated_addr, entry->addr_mask,
1100                             entry->perm);
1101     return hook_fn(entry, private);
1102 }
1103 
1104 /**
1105  * vtd_page_walk_level - walk over specific level for IOVA range
1106  *
1107  * @addr: base GPA addr to start the walk
1108  * @start: IOVA range start address
1109  * @end: IOVA range end address (start <= addr < end)
1110  * @read: whether parent level has read permission
1111  * @write: whether parent level has write permission
1112  * @info: constant information for the page walk
1113  */
1114 static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
1115                                uint64_t end, uint32_t level, bool read,
1116                                bool write, vtd_page_walk_info *info)
1117 {
1118     bool read_cur, write_cur, entry_valid;
1119     uint32_t offset;
1120     uint64_t slpte;
1121     uint64_t subpage_size, subpage_mask;
1122     IOMMUTLBEntry entry;
1123     uint64_t iova = start;
1124     uint64_t iova_next;
1125     int ret = 0;
1126 
1127     trace_vtd_page_walk_level(addr, level, start, end);
1128 
1129     subpage_size = 1ULL << vtd_slpt_level_shift(level);
1130     subpage_mask = vtd_slpt_level_page_mask(level);
1131 
1132     while (iova < end) {
1133         iova_next = (iova & subpage_mask) + subpage_size;
1134 
1135         offset = vtd_iova_level_offset(iova, level);
1136         slpte = vtd_get_slpte(addr, offset);
1137 
1138         if (slpte == (uint64_t)-1) {
1139             trace_vtd_page_walk_skip_read(iova, iova_next);
1140             goto next;
1141         }
1142 
1143         if (vtd_slpte_nonzero_rsvd(slpte, level)) {
1144             trace_vtd_page_walk_skip_reserve(iova, iova_next);
1145             goto next;
1146         }
1147 
1148         /* Permissions are stacked with parents' */
1149         read_cur = read && (slpte & VTD_SL_R);
1150         write_cur = write && (slpte & VTD_SL_W);
1151 
1152         /*
1153          * As long as we have either read/write permission, this is a
1154          * valid entry. The rule works for both page entries and page
1155          * table entries.
1156          */
1157         entry_valid = read_cur | write_cur;
1158 
1159         if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
1160             /*
1161              * This is a valid PDE (or even bigger than PDE).  We need
1162              * to walk one further level.
1163              */
1164             ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
1165                                       iova, MIN(iova_next, end), level - 1,
1166                                       read_cur, write_cur, info);
1167         } else {
1168             /*
1169              * This means we are either:
1170              *
1171              * (1) the real page entry (either 4K page, or huge page)
1172              * (2) the whole range is invalid
1173              *
1174              * In either case, we send an IOTLB notification down.
1175              */
1176             entry.target_as = &address_space_memory;
1177             entry.iova = iova & subpage_mask;
1178             entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
1179             entry.addr_mask = ~subpage_mask;
1180             /* NOTE: this is only meaningful if entry_valid == true */
1181             entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
1182             ret = vtd_page_walk_one(&entry, info);
1183         }
1184 
1185         if (ret < 0) {
1186             return ret;
1187         }
1188 
1189 next:
1190         iova = iova_next;
1191     }
1192 
1193     return 0;
1194 }
1195 
1196 /**
1197  * vtd_page_walk - walk specific IOVA range, and call the hook
1198  *
1199  * @s: intel iommu state
1200  * @ce: context entry to walk upon
1201  * @start: IOVA address to start the walk
1202  * @end: IOVA range end address (start <= addr < end)
1203  * @info: page walking information struct
1204  */
1205 static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce,
1206                          uint64_t start, uint64_t end,
1207                          vtd_page_walk_info *info)
1208 {
1209     dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce);
1210     uint32_t level = vtd_get_iova_level(s, ce);
1211 
1212     if (!vtd_iova_range_check(s, start, ce, info->aw)) {
1213         return -VTD_FR_ADDR_BEYOND_MGAW;
1214     }
1215 
1216     if (!vtd_iova_range_check(s, end, ce, info->aw)) {
1217         /* Fix end so that it reaches the maximum */
1218         end = vtd_iova_limit(s, ce, info->aw);
1219     }
1220 
1221     return vtd_page_walk_level(addr, start, end, level, true, true, info);
1222 }
1223 
1224 static int vtd_root_entry_rsvd_bits_check(IntelIOMMUState *s,
1225                                           VTDRootEntry *re)
1226 {
1227     /* Legacy Mode reserved bits check */
1228     if (!s->root_scalable &&
1229         (re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits))))
1230         goto rsvd_err;
1231 
1232     /* Scalable Mode reserved bits check */
1233     if (s->root_scalable &&
1234         ((re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)) ||
1235          (re->hi & VTD_ROOT_ENTRY_RSVD(s->aw_bits))))
1236         goto rsvd_err;
1237 
1238     return 0;
1239 
1240 rsvd_err:
1241     error_report_once("%s: invalid root entry: hi=0x%"PRIx64
1242                       ", lo=0x%"PRIx64,
1243                       __func__, re->hi, re->lo);
1244     return -VTD_FR_ROOT_ENTRY_RSVD;
1245 }
1246 
1247 static inline int vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s,
1248                                                     VTDContextEntry *ce)
1249 {
1250     if (!s->root_scalable &&
1251         (ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI ||
1252          ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) {
1253         error_report_once("%s: invalid context entry: hi=%"PRIx64
1254                           ", lo=%"PRIx64" (reserved nonzero)",
1255                           __func__, ce->hi, ce->lo);
1256         return -VTD_FR_CONTEXT_ENTRY_RSVD;
1257     }
1258 
1259     if (s->root_scalable &&
1260         (ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(s->aw_bits) ||
1261          ce->val[1] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 ||
1262          ce->val[2] ||
1263          ce->val[3])) {
1264         error_report_once("%s: invalid context entry: val[3]=%"PRIx64
1265                           ", val[2]=%"PRIx64
1266                           ", val[1]=%"PRIx64
1267                           ", val[0]=%"PRIx64" (reserved nonzero)",
1268                           __func__, ce->val[3], ce->val[2],
1269                           ce->val[1], ce->val[0]);
1270         return -VTD_FR_CONTEXT_ENTRY_RSVD;
1271     }
1272 
1273     return 0;
1274 }
1275 
1276 static int vtd_ce_rid2pasid_check(IntelIOMMUState *s,
1277                                   VTDContextEntry *ce)
1278 {
1279     VTDPASIDEntry pe;
1280 
1281     /*
1282      * Make sure in Scalable Mode, a present context entry
1283      * has valid rid2pasid setting, which includes valid
1284      * rid2pasid field and corresponding pasid entry setting
1285      */
1286     return vtd_ce_get_rid2pasid_entry(s, ce, &pe);
1287 }
1288 
1289 /* Map a device to its corresponding domain (context-entry) */
1290 static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
1291                                     uint8_t devfn, VTDContextEntry *ce)
1292 {
1293     VTDRootEntry re;
1294     int ret_fr;
1295     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
1296 
1297     ret_fr = vtd_get_root_entry(s, bus_num, &re);
1298     if (ret_fr) {
1299         return ret_fr;
1300     }
1301 
1302     if (!vtd_root_entry_present(s, &re, devfn)) {
1303         /* Not error - it's okay we don't have root entry. */
1304         trace_vtd_re_not_present(bus_num);
1305         return -VTD_FR_ROOT_ENTRY_P;
1306     }
1307 
1308     ret_fr = vtd_root_entry_rsvd_bits_check(s, &re);
1309     if (ret_fr) {
1310         return ret_fr;
1311     }
1312 
1313     ret_fr = vtd_get_context_entry_from_root(s, &re, devfn, ce);
1314     if (ret_fr) {
1315         return ret_fr;
1316     }
1317 
1318     if (!vtd_ce_present(ce)) {
1319         /* Not error - it's okay we don't have context entry. */
1320         trace_vtd_ce_not_present(bus_num, devfn);
1321         return -VTD_FR_CONTEXT_ENTRY_P;
1322     }
1323 
1324     ret_fr = vtd_context_entry_rsvd_bits_check(s, ce);
1325     if (ret_fr) {
1326         return ret_fr;
1327     }
1328 
1329     /* Check if the programming of context-entry is valid */
1330     if (!s->root_scalable &&
1331         !vtd_is_level_supported(s, vtd_ce_get_level(ce))) {
1332         error_report_once("%s: invalid context entry: hi=%"PRIx64
1333                           ", lo=%"PRIx64" (level %d not supported)",
1334                           __func__, ce->hi, ce->lo,
1335                           vtd_ce_get_level(ce));
1336         return -VTD_FR_CONTEXT_ENTRY_INV;
1337     }
1338 
1339     if (!s->root_scalable) {
1340         /* Do translation type check */
1341         if (!vtd_ce_type_check(x86_iommu, ce)) {
1342             /* Errors dumped in vtd_ce_type_check() */
1343             return -VTD_FR_CONTEXT_ENTRY_INV;
1344         }
1345     } else {
1346         /*
1347          * Check if the programming of context-entry.rid2pasid
1348          * and corresponding pasid setting is valid, and thus
1349          * avoids to check pasid entry fetching result in future
1350          * helper function calling.
1351          */
1352         ret_fr = vtd_ce_rid2pasid_check(s, ce);
1353         if (ret_fr) {
1354             return ret_fr;
1355         }
1356     }
1357 
1358     return 0;
1359 }
1360 
1361 static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry,
1362                                      void *private)
1363 {
1364     memory_region_notify_iommu((IOMMUMemoryRegion *)private, 0, *entry);
1365     return 0;
1366 }
1367 
1368 static uint16_t vtd_get_domain_id(IntelIOMMUState *s,
1369                                   VTDContextEntry *ce)
1370 {
1371     VTDPASIDEntry pe;
1372 
1373     if (s->root_scalable) {
1374         vtd_ce_get_rid2pasid_entry(s, ce, &pe);
1375         return VTD_SM_PASID_ENTRY_DID(pe.val[1]);
1376     }
1377 
1378     return VTD_CONTEXT_ENTRY_DID(ce->hi);
1379 }
1380 
1381 static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
1382                                             VTDContextEntry *ce,
1383                                             hwaddr addr, hwaddr size)
1384 {
1385     IntelIOMMUState *s = vtd_as->iommu_state;
1386     vtd_page_walk_info info = {
1387         .hook_fn = vtd_sync_shadow_page_hook,
1388         .private = (void *)&vtd_as->iommu,
1389         .notify_unmap = true,
1390         .aw = s->aw_bits,
1391         .as = vtd_as,
1392         .domain_id = vtd_get_domain_id(s, ce),
1393     };
1394 
1395     return vtd_page_walk(s, ce, addr, addr + size, &info);
1396 }
1397 
1398 static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
1399 {
1400     int ret;
1401     VTDContextEntry ce;
1402     IOMMUNotifier *n;
1403 
1404     ret = vtd_dev_to_context_entry(vtd_as->iommu_state,
1405                                    pci_bus_num(vtd_as->bus),
1406                                    vtd_as->devfn, &ce);
1407     if (ret) {
1408         if (ret == -VTD_FR_CONTEXT_ENTRY_P) {
1409             /*
1410              * It's a valid scenario to have a context entry that is
1411              * not present.  For example, when a device is removed
1412              * from an existing domain then the context entry will be
1413              * zeroed by the guest before it was put into another
1414              * domain.  When this happens, instead of synchronizing
1415              * the shadow pages we should invalidate all existing
1416              * mappings and notify the backends.
1417              */
1418             IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
1419                 vtd_address_space_unmap(vtd_as, n);
1420             }
1421             ret = 0;
1422         }
1423         return ret;
1424     }
1425 
1426     return vtd_sync_shadow_page_table_range(vtd_as, &ce, 0, UINT64_MAX);
1427 }
1428 
1429 /*
1430  * Check if specific device is configed to bypass address
1431  * translation for DMA requests. In Scalable Mode, bypass
1432  * 1st-level translation or 2nd-level translation, it depends
1433  * on PGTT setting.
1434  */
1435 static bool vtd_dev_pt_enabled(VTDAddressSpace *as)
1436 {
1437     IntelIOMMUState *s;
1438     VTDContextEntry ce;
1439     VTDPASIDEntry pe;
1440     int ret;
1441 
1442     assert(as);
1443 
1444     s = as->iommu_state;
1445     ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
1446                                    as->devfn, &ce);
1447     if (ret) {
1448         /*
1449          * Possibly failed to parse the context entry for some reason
1450          * (e.g., during init, or any guest configuration errors on
1451          * context entries). We should assume PT not enabled for
1452          * safety.
1453          */
1454         return false;
1455     }
1456 
1457     if (s->root_scalable) {
1458         ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe);
1459         if (ret) {
1460             error_report_once("%s: vtd_ce_get_rid2pasid_entry error: %"PRId32,
1461                               __func__, ret);
1462             return false;
1463         }
1464         return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT);
1465     }
1466 
1467     return (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH);
1468 }
1469 
1470 /* Return whether the device is using IOMMU translation. */
1471 static bool vtd_switch_address_space(VTDAddressSpace *as)
1472 {
1473     bool use_iommu;
1474     /* Whether we need to take the BQL on our own */
1475     bool take_bql = !qemu_mutex_iothread_locked();
1476 
1477     assert(as);
1478 
1479     use_iommu = as->iommu_state->dmar_enabled && !vtd_dev_pt_enabled(as);
1480 
1481     trace_vtd_switch_address_space(pci_bus_num(as->bus),
1482                                    VTD_PCI_SLOT(as->devfn),
1483                                    VTD_PCI_FUNC(as->devfn),
1484                                    use_iommu);
1485 
1486     /*
1487      * It's possible that we reach here without BQL, e.g., when called
1488      * from vtd_pt_enable_fast_path(). However the memory APIs need
1489      * it. We'd better make sure we have had it already, or, take it.
1490      */
1491     if (take_bql) {
1492         qemu_mutex_lock_iothread();
1493     }
1494 
1495     /* Turn off first then on the other */
1496     if (use_iommu) {
1497         memory_region_set_enabled(&as->nodmar, false);
1498         memory_region_set_enabled(MEMORY_REGION(&as->iommu), true);
1499     } else {
1500         memory_region_set_enabled(MEMORY_REGION(&as->iommu), false);
1501         memory_region_set_enabled(&as->nodmar, true);
1502     }
1503 
1504     if (take_bql) {
1505         qemu_mutex_unlock_iothread();
1506     }
1507 
1508     return use_iommu;
1509 }
1510 
1511 static void vtd_switch_address_space_all(IntelIOMMUState *s)
1512 {
1513     GHashTableIter iter;
1514     VTDBus *vtd_bus;
1515     int i;
1516 
1517     g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
1518     while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
1519         for (i = 0; i < PCI_DEVFN_MAX; i++) {
1520             if (!vtd_bus->dev_as[i]) {
1521                 continue;
1522             }
1523             vtd_switch_address_space(vtd_bus->dev_as[i]);
1524         }
1525     }
1526 }
1527 
/*
 * Build a 16-bit source-id with @bus_num in the upper byte and @devfn
 * in the lower byte.
 */
static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn)
{
    uint16_t sid = (uint16_t)bus_num << 8;

    sid |= devfn;
    return sid;
}
1532 
1533 static const bool vtd_qualified_faults[] = {
1534     [VTD_FR_RESERVED] = false,
1535     [VTD_FR_ROOT_ENTRY_P] = false,
1536     [VTD_FR_CONTEXT_ENTRY_P] = true,
1537     [VTD_FR_CONTEXT_ENTRY_INV] = true,
1538     [VTD_FR_ADDR_BEYOND_MGAW] = true,
1539     [VTD_FR_WRITE] = true,
1540     [VTD_FR_READ] = true,
1541     [VTD_FR_PAGING_ENTRY_INV] = true,
1542     [VTD_FR_ROOT_TABLE_INV] = false,
1543     [VTD_FR_CONTEXT_TABLE_INV] = false,
1544     [VTD_FR_ROOT_ENTRY_RSVD] = false,
1545     [VTD_FR_PAGING_ENTRY_RSVD] = true,
1546     [VTD_FR_CONTEXT_ENTRY_TT] = true,
1547     [VTD_FR_PASID_TABLE_INV] = false,
1548     [VTD_FR_RESERVED_ERR] = false,
1549     [VTD_FR_MAX] = false,
1550 };
1551 
1552 /* To see if a fault condition is "qualified", which is reported to software
1553  * only if the FPD field in the context-entry used to process the faulting
1554  * request is 0.
1555  */
1556 static inline bool vtd_is_qualified_fault(VTDFaultReason fault)
1557 {
1558     return vtd_qualified_faults[fault];
1559 }
1560 
1561 static inline bool vtd_is_interrupt_addr(hwaddr addr)
1562 {
1563     return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST;
1564 }
1565 
1566 static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id)
1567 {
1568     VTDBus *vtd_bus;
1569     VTDAddressSpace *vtd_as;
1570     bool success = false;
1571 
1572     vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id));
1573     if (!vtd_bus) {
1574         goto out;
1575     }
1576 
1577     vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)];
1578     if (!vtd_as) {
1579         goto out;
1580     }
1581 
1582     if (vtd_switch_address_space(vtd_as) == false) {
1583         /* We switched off IOMMU region successfully. */
1584         success = true;
1585     }
1586 
1587 out:
1588     trace_vtd_pt_enable_fast_path(source_id, success);
1589 }
1590 
/* Map dev to context-entry then do a paging-structures walk to do a iommu
 * translation.
 *
 * Called from RCU critical section.
 *
 * @vtd_as: The VT-d address space of the device; provides the IOMMU state
 *          and the per-device context cache entry
 * @bus: The PCI bus of the device; its bus number is used for lookups
 * @devfn: The devfn, which is the combination of device and function number
 * @addr: The address to be translated
 * @is_write: The access is a write operation
 * @entry: IOMMUTLBEntry that receives the translation result; it is
 *         zeroed with IOMMU_NONE permission on failure
 *
 * Lookup order: IOTLB, then the context cache, then the in-memory
 * root/context tables.  The IOMMU lock is held for the lookups and
 * released on every exit path.
 *
 * Returns true if translation is successful, otherwise false.
 */
static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
                                   uint8_t devfn, hwaddr addr, bool is_write,
                                   IOMMUTLBEntry *entry)
{
    IntelIOMMUState *s = vtd_as->iommu_state;
    VTDContextEntry ce;
    uint8_t bus_num = pci_bus_num(bus);
    VTDContextCacheEntry *cc_entry;
    uint64_t slpte, page_mask;
    uint32_t level;
    uint16_t source_id = vtd_make_source_id(bus_num, devfn);
    int ret_fr;
    bool is_fpd_set = false;
    bool reads = true;
    bool writes = true;
    uint8_t access_flags;
    VTDIOTLBEntry *iotlb_entry;

    /*
     * We have standalone memory region for interrupt addresses, we
     * should never receive translation requests in this region.
     */
    assert(!vtd_is_interrupt_addr(addr));

    vtd_iommu_lock(s);

    cc_entry = &vtd_as->context_cache_entry;

    /* Try to fetch slpte from IOTLB */
    iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
    if (iotlb_entry) {
        trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte,
                                 iotlb_entry->domain_id);
        slpte = iotlb_entry->slpte;
        access_flags = iotlb_entry->access_flags;
        page_mask = iotlb_entry->mask;
        goto out;
    }

    /* Try to fetch context-entry from cache first */
    if (cc_entry->context_cache_gen == s->context_cache_gen) {
        trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi,
                               cc_entry->context_entry.lo,
                               cc_entry->context_cache_gen);
        ce = cc_entry->context_entry;
        is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
        if (!is_fpd_set && s->root_scalable) {
            ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set);
            /*
             * NOTE(review): the macro presumably reports the fault and
             * jumps to the "error" label when ret_fr is non-zero; its
             * full body is not visible here - confirm.
             */
            VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
        }
    } else {
        ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
        /*
         * NOTE(review): ce is read here before ret_fr is checked; if
         * vtd_dev_to_context_entry() fails before filling in ce, this
         * reads an uninitialized value - confirm whether that matters.
         */
        is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
        if (!ret_fr && !is_fpd_set && s->root_scalable) {
            ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set);
        }
        VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
        /* Update context-cache */
        trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo,
                                  cc_entry->context_cache_gen,
                                  s->context_cache_gen);
        cc_entry->context_entry = ce;
        cc_entry->context_cache_gen = s->context_cache_gen;
    }

    /*
     * We don't need to translate for pass-through context entries.
     * Also, let's ignore IOTLB caching as well for PT devices.
     */
    if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) {
        /* Identity-map the 4K page containing addr with full access */
        entry->iova = addr & VTD_PAGE_MASK_4K;
        entry->translated_addr = entry->iova;
        entry->addr_mask = ~VTD_PAGE_MASK_4K;
        entry->perm = IOMMU_RW;
        trace_vtd_translate_pt(source_id, entry->iova);

        /*
         * When this happens, it means firstly caching-mode is not
         * enabled, and this is the first passthrough translation for
         * the device. Let's enable the fast path for passthrough.
         *
         * When passthrough is disabled again for the device, we can
         * capture it via the context entry invalidation, then the
         * IOMMU region can be swapped back.
         */
        vtd_pt_enable_fast_path(s, source_id);
        vtd_iommu_unlock(s);
        return true;
    }

    /* Walk the page table; reads/writes collect the stacked permissions */
    ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level,
                               &reads, &writes, s->aw_bits);
    VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);

    /* Cache the result of the walk in the IOTLB */
    page_mask = vtd_slpt_level_page_mask(level);
    access_flags = IOMMU_ACCESS_FLAG(reads, writes);
    vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce), addr, slpte,
                     access_flags, level);
out:
    /* Success: slpte, page_mask and access_flags are all valid here */
    vtd_iommu_unlock(s);
    entry->iova = addr & page_mask;
    entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask;
    entry->addr_mask = ~page_mask;
    entry->perm = access_flags;
    return true;

error:
    /* Failure: hand back an empty entry without any permission */
    vtd_iommu_unlock(s);
    entry->iova = 0;
    entry->translated_addr = 0;
    entry->addr_mask = 0;
    entry->perm = IOMMU_NONE;
    return false;
}
1717 
1718 static void vtd_root_table_setup(IntelIOMMUState *s)
1719 {
1720     s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
1721     s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);
1722 
1723     vtd_update_scalable_state(s);
1724 
1725     trace_vtd_reg_dmar_root(s->root, s->root_scalable);
1726 }
1727 
1728 static void vtd_iec_notify_all(IntelIOMMUState *s, bool global,
1729                                uint32_t index, uint32_t mask)
1730 {
1731     x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask);
1732 }
1733 
1734 static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
1735 {
1736     uint64_t value = 0;
1737     value = vtd_get_quad_raw(s, DMAR_IRTA_REG);
1738     s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1);
1739     s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits);
1740     s->intr_eime = value & VTD_IRTA_EIME;
1741 
1742     /* Notify global invalidation */
1743     vtd_iec_notify_all(s, true, 0, 0);
1744 
1745     trace_vtd_reg_ir_root(s->intr_root, s->intr_size);
1746 }
1747 
1748 static void vtd_iommu_replay_all(IntelIOMMUState *s)
1749 {
1750     VTDAddressSpace *vtd_as;
1751 
1752     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
1753         vtd_sync_shadow_page_table(vtd_as);
1754     }
1755 }
1756 
1757 static void vtd_context_global_invalidate(IntelIOMMUState *s)
1758 {
1759     trace_vtd_inv_desc_cc_global();
1760     /* Protects context cache */
1761     vtd_iommu_lock(s);
1762     s->context_cache_gen++;
1763     if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) {
1764         vtd_reset_context_cache_locked(s);
1765     }
1766     vtd_iommu_unlock(s);
1767     vtd_address_space_refresh_all(s);
1768     /*
1769      * From VT-d spec 6.5.2.1, a global context entry invalidation
1770      * should be followed by a IOTLB global invalidation, so we should
1771      * be safe even without this. Hoewever, let's replay the region as
1772      * well to be safer, and go back here when we need finer tunes for
1773      * VT-d emulation codes.
1774      */
1775     vtd_iommu_replay_all(s);
1776 }
1777 
1778 /* Do a context-cache device-selective invalidation.
1779  * @func_mask: FM field after shifting
1780  */
1781 static void vtd_context_device_invalidate(IntelIOMMUState *s,
1782                                           uint16_t source_id,
1783                                           uint16_t func_mask)
1784 {
1785     uint16_t mask;
1786     VTDBus *vtd_bus;
1787     VTDAddressSpace *vtd_as;
1788     uint8_t bus_n, devfn;
1789     uint16_t devfn_it;
1790 
1791     trace_vtd_inv_desc_cc_devices(source_id, func_mask);
1792 
1793     switch (func_mask & 3) {
1794     case 0:
1795         mask = 0;   /* No bits in the SID field masked */
1796         break;
1797     case 1:
1798         mask = 4;   /* Mask bit 2 in the SID field */
1799         break;
1800     case 2:
1801         mask = 6;   /* Mask bit 2:1 in the SID field */
1802         break;
1803     case 3:
1804         mask = 7;   /* Mask bit 2:0 in the SID field */
1805         break;
1806     }
1807     mask = ~mask;
1808 
1809     bus_n = VTD_SID_TO_BUS(source_id);
1810     vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
1811     if (vtd_bus) {
1812         devfn = VTD_SID_TO_DEVFN(source_id);
1813         for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
1814             vtd_as = vtd_bus->dev_as[devfn_it];
1815             if (vtd_as && ((devfn_it & mask) == (devfn & mask))) {
1816                 trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it),
1817                                              VTD_PCI_FUNC(devfn_it));
1818                 vtd_iommu_lock(s);
1819                 vtd_as->context_cache_entry.context_cache_gen = 0;
1820                 vtd_iommu_unlock(s);
1821                 /*
1822                  * Do switch address space when needed, in case if the
1823                  * device passthrough bit is switched.
1824                  */
1825                 vtd_switch_address_space(vtd_as);
1826                 /*
1827                  * So a device is moving out of (or moving into) a
1828                  * domain, resync the shadow page table.
1829                  * This won't bring bad even if we have no such
1830                  * notifier registered - the IOMMU notification
1831                  * framework will skip MAP notifications if that
1832                  * happened.
1833                  */
1834                 vtd_sync_shadow_page_table(vtd_as);
1835             }
1836         }
1837     }
1838 }
1839 
1840 /* Context-cache invalidation
1841  * Returns the Context Actual Invalidation Granularity.
1842  * @val: the content of the CCMD_REG
1843  */
1844 static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val)
1845 {
1846     uint64_t caig;
1847     uint64_t type = val & VTD_CCMD_CIRG_MASK;
1848 
1849     switch (type) {
1850     case VTD_CCMD_DOMAIN_INVL:
1851         /* Fall through */
1852     case VTD_CCMD_GLOBAL_INVL:
1853         caig = VTD_CCMD_GLOBAL_INVL_A;
1854         vtd_context_global_invalidate(s);
1855         break;
1856 
1857     case VTD_CCMD_DEVICE_INVL:
1858         caig = VTD_CCMD_DEVICE_INVL_A;
1859         vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val));
1860         break;
1861 
1862     default:
1863         error_report_once("%s: invalid context: 0x%" PRIx64,
1864                           __func__, val);
1865         caig = 0;
1866     }
1867     return caig;
1868 }
1869 
1870 static void vtd_iotlb_global_invalidate(IntelIOMMUState *s)
1871 {
1872     trace_vtd_inv_desc_iotlb_global();
1873     vtd_reset_iotlb(s);
1874     vtd_iommu_replay_all(s);
1875 }
1876 
1877 static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
1878 {
1879     VTDContextEntry ce;
1880     VTDAddressSpace *vtd_as;
1881 
1882     trace_vtd_inv_desc_iotlb_domain(domain_id);
1883 
1884     vtd_iommu_lock(s);
1885     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain,
1886                                 &domain_id);
1887     vtd_iommu_unlock(s);
1888 
1889     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
1890         if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
1891                                       vtd_as->devfn, &ce) &&
1892             domain_id == vtd_get_domain_id(s, &ce)) {
1893             vtd_sync_shadow_page_table(vtd_as);
1894         }
1895     }
1896 }
1897 
1898 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
1899                                            uint16_t domain_id, hwaddr addr,
1900                                            uint8_t am)
1901 {
1902     VTDAddressSpace *vtd_as;
1903     VTDContextEntry ce;
1904     int ret;
1905     hwaddr size = (1 << am) * VTD_PAGE_SIZE;
1906 
1907     QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
1908         ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
1909                                        vtd_as->devfn, &ce);
1910         if (!ret && domain_id == vtd_get_domain_id(s, &ce)) {
1911             if (vtd_as_has_map_notifier(vtd_as)) {
1912                 /*
1913                  * As long as we have MAP notifications registered in
1914                  * any of our IOMMU notifiers, we need to sync the
1915                  * shadow page table.
1916                  */
1917                 vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
1918             } else {
1919                 /*
1920                  * For UNMAP-only notifiers, we don't need to walk the
1921                  * page tables.  We just deliver the PSI down to
1922                  * invalidate caches.
1923                  */
1924                 IOMMUTLBEntry entry = {
1925                     .target_as = &address_space_memory,
1926                     .iova = addr,
1927                     .translated_addr = 0,
1928                     .addr_mask = size - 1,
1929                     .perm = IOMMU_NONE,
1930                 };
1931                 memory_region_notify_iommu(&vtd_as->iommu, 0, entry);
1932             }
1933         }
1934     }
1935 }
1936 
1937 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
1938                                       hwaddr addr, uint8_t am)
1939 {
1940     VTDIOTLBPageInvInfo info;
1941 
1942     trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am);
1943 
1944     assert(am <= VTD_MAMV);
1945     info.domain_id = domain_id;
1946     info.addr = addr;
1947     info.mask = ~((1 << am) - 1);
1948     vtd_iommu_lock(s);
1949     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
1950     vtd_iommu_unlock(s);
1951     vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
1952 }
1953 
1954 /* Flush IOTLB
1955  * Returns the IOTLB Actual Invalidation Granularity.
1956  * @val: the content of the IOTLB_REG
1957  */
1958 static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val)
1959 {
1960     uint64_t iaig;
1961     uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK;
1962     uint16_t domain_id;
1963     hwaddr addr;
1964     uint8_t am;
1965 
1966     switch (type) {
1967     case VTD_TLB_GLOBAL_FLUSH:
1968         iaig = VTD_TLB_GLOBAL_FLUSH_A;
1969         vtd_iotlb_global_invalidate(s);
1970         break;
1971 
1972     case VTD_TLB_DSI_FLUSH:
1973         domain_id = VTD_TLB_DID(val);
1974         iaig = VTD_TLB_DSI_FLUSH_A;
1975         vtd_iotlb_domain_invalidate(s, domain_id);
1976         break;
1977 
1978     case VTD_TLB_PSI_FLUSH:
1979         domain_id = VTD_TLB_DID(val);
1980         addr = vtd_get_quad_raw(s, DMAR_IVA_REG);
1981         am = VTD_IVA_AM(addr);
1982         addr = VTD_IVA_ADDR(addr);
1983         if (am > VTD_MAMV) {
1984             error_report_once("%s: address mask overflow: 0x%" PRIx64,
1985                               __func__, vtd_get_quad_raw(s, DMAR_IVA_REG));
1986             iaig = 0;
1987             break;
1988         }
1989         iaig = VTD_TLB_PSI_FLUSH_A;
1990         vtd_iotlb_page_invalidate(s, domain_id, addr, am);
1991         break;
1992 
1993     default:
1994         error_report_once("%s: invalid granularity: 0x%" PRIx64,
1995                           __func__, val);
1996         iaig = 0;
1997     }
1998     return iaig;
1999 }
2000 
2001 static void vtd_fetch_inv_desc(IntelIOMMUState *s);
2002 
2003 static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s)
2004 {
2005     return s->qi_enabled && (s->iq_tail == s->iq_head) &&
2006            (s->iq_last_desc_type == VTD_INV_DESC_WAIT);
2007 }
2008 
2009 static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en)
2010 {
2011     uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG);
2012 
2013     trace_vtd_inv_qi_enable(en);
2014 
2015     if (en) {
2016         s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits);
2017         /* 2^(x+8) entries */
2018         s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0));
2019         s->qi_enabled = true;
2020         trace_vtd_inv_qi_setup(s->iq, s->iq_size);
2021         /* Ok - report back to driver */
2022         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES);
2023 
2024         if (s->iq_tail != 0) {
2025             /*
2026              * This is a spec violation but Windows guests are known to set up
2027              * Queued Invalidation this way so we allow the write and process
2028              * Invalidation Descriptors right away.
2029              */
2030             trace_vtd_warn_invalid_qi_tail(s->iq_tail);
2031             if (!(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
2032                 vtd_fetch_inv_desc(s);
2033             }
2034         }
2035     } else {
2036         if (vtd_queued_inv_disable_check(s)) {
2037             /* disable Queued Invalidation */
2038             vtd_set_quad_raw(s, DMAR_IQH_REG, 0);
2039             s->iq_head = 0;
2040             s->qi_enabled = false;
2041             /* Ok - report back to driver */
2042             vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0);
2043         } else {
2044             error_report_once("%s: detected improper state when disable QI "
2045                               "(head=0x%x, tail=0x%x, last_type=%d)",
2046                               __func__,
2047                               s->iq_head, s->iq_tail, s->iq_last_desc_type);
2048         }
2049     }
2050 }
2051 
2052 /* Set Root Table Pointer */
2053 static void vtd_handle_gcmd_srtp(IntelIOMMUState *s)
2054 {
2055     vtd_root_table_setup(s);
2056     /* Ok - report back to driver */
2057     vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS);
2058     vtd_reset_caches(s);
2059     vtd_address_space_refresh_all(s);
2060 }
2061 
2062 /* Set Interrupt Remap Table Pointer */
2063 static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
2064 {
2065     vtd_interrupt_remap_table_setup(s);
2066     /* Ok - report back to driver */
2067     vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
2068 }
2069 
2070 /* Handle Translation Enable/Disable */
2071 static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en)
2072 {
2073     if (s->dmar_enabled == en) {
2074         return;
2075     }
2076 
2077     trace_vtd_dmar_enable(en);
2078 
2079     if (en) {
2080         s->dmar_enabled = true;
2081         /* Ok - report back to driver */
2082         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES);
2083     } else {
2084         s->dmar_enabled = false;
2085 
2086         /* Clear the index of Fault Recording Register */
2087         s->next_frcd_reg = 0;
2088         /* Ok - report back to driver */
2089         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0);
2090     }
2091 
2092     vtd_reset_caches(s);
2093     vtd_address_space_refresh_all(s);
2094 }
2095 
2096 /* Handle Interrupt Remap Enable/Disable */
2097 static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en)
2098 {
2099     trace_vtd_ir_enable(en);
2100 
2101     if (en) {
2102         s->intr_enabled = true;
2103         /* Ok - report back to driver */
2104         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES);
2105     } else {
2106         s->intr_enabled = false;
2107         /* Ok - report back to driver */
2108         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0);
2109     }
2110 }
2111 
2112 /* Handle write to Global Command Register */
2113 static void vtd_handle_gcmd_write(IntelIOMMUState *s)
2114 {
2115     uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG);
2116     uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG);
2117     uint32_t changed = status ^ val;
2118 
2119     trace_vtd_reg_write_gcmd(status, val);
2120     if (changed & VTD_GCMD_TE) {
2121         /* Translation enable/disable */
2122         vtd_handle_gcmd_te(s, val & VTD_GCMD_TE);
2123     }
2124     if (val & VTD_GCMD_SRTP) {
2125         /* Set/update the root-table pointer */
2126         vtd_handle_gcmd_srtp(s);
2127     }
2128     if (changed & VTD_GCMD_QIE) {
2129         /* Queued Invalidation Enable */
2130         vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE);
2131     }
2132     if (val & VTD_GCMD_SIRTP) {
2133         /* Set/update the interrupt remapping root-table pointer */
2134         vtd_handle_gcmd_sirtp(s);
2135     }
2136     if (changed & VTD_GCMD_IRE) {
2137         /* Interrupt remap enable/disable */
2138         vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE);
2139     }
2140 }
2141 
2142 /* Handle write to Context Command Register */
2143 static void vtd_handle_ccmd_write(IntelIOMMUState *s)
2144 {
2145     uint64_t ret;
2146     uint64_t val = vtd_get_quad_raw(s, DMAR_CCMD_REG);
2147 
2148     /* Context-cache invalidation request */
2149     if (val & VTD_CCMD_ICC) {
2150         if (s->qi_enabled) {
2151             error_report_once("Queued Invalidation enabled, "
2152                               "should not use register-based invalidation");
2153             return;
2154         }
2155         ret = vtd_context_cache_invalidate(s, val);
2156         /* Invalidation completed. Change something to show */
2157         vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL);
2158         ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK,
2159                                       ret);
2160     }
2161 }
2162 
2163 /* Handle write to IOTLB Invalidation Register */
2164 static void vtd_handle_iotlb_write(IntelIOMMUState *s)
2165 {
2166     uint64_t ret;
2167     uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG);
2168 
2169     /* IOTLB invalidation request */
2170     if (val & VTD_TLB_IVT) {
2171         if (s->qi_enabled) {
2172             error_report_once("Queued Invalidation enabled, "
2173                               "should not use register-based invalidation");
2174             return;
2175         }
2176         ret = vtd_iotlb_flush(s, val);
2177         /* Invalidation completed. Change something to show */
2178         vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL);
2179         ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG,
2180                                       VTD_TLB_FLUSH_GRANU_MASK_A, ret);
2181     }
2182 }
2183 
2184 /* Fetch an Invalidation Descriptor from the Invalidation Queue */
2185 static bool vtd_get_inv_desc(IntelIOMMUState *s,
2186                              VTDInvDesc *inv_desc)
2187 {
2188     dma_addr_t base_addr = s->iq;
2189     uint32_t offset = s->iq_head;
2190     uint32_t dw = s->iq_dw ? 32 : 16;
2191     dma_addr_t addr = base_addr + offset * dw;
2192 
2193     if (dma_memory_read(&address_space_memory, addr, inv_desc, dw)) {
2194         error_report_once("Read INV DESC failed.");
2195         return false;
2196     }
2197     inv_desc->lo = le64_to_cpu(inv_desc->lo);
2198     inv_desc->hi = le64_to_cpu(inv_desc->hi);
2199     if (dw == 32) {
2200         inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]);
2201         inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]);
2202     }
2203     return true;
2204 }
2205 
2206 static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
2207 {
2208     if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) ||
2209         (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) {
2210         error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64
2211                           " (reserved nonzero)", __func__, inv_desc->hi,
2212                           inv_desc->lo);
2213         return false;
2214     }
2215     if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) {
2216         /* Status Write */
2217         uint32_t status_data = (uint32_t)(inv_desc->lo >>
2218                                VTD_INV_DESC_WAIT_DATA_SHIFT);
2219 
2220         assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF));
2221 
2222         /* FIXME: need to be masked with HAW? */
2223         dma_addr_t status_addr = inv_desc->hi;
2224         trace_vtd_inv_desc_wait_sw(status_addr, status_data);
2225         status_data = cpu_to_le32(status_data);
2226         if (dma_memory_write(&address_space_memory, status_addr, &status_data,
2227                              sizeof(status_data))) {
2228             trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo);
2229             return false;
2230         }
2231     } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) {
2232         /* Interrupt flag */
2233         vtd_generate_completion_event(s);
2234     } else {
2235         error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64
2236                           " (unknown type)", __func__, inv_desc->hi,
2237                           inv_desc->lo);
2238         return false;
2239     }
2240     return true;
2241 }
2242 
2243 static bool vtd_process_context_cache_desc(IntelIOMMUState *s,
2244                                            VTDInvDesc *inv_desc)
2245 {
2246     uint16_t sid, fmask;
2247 
2248     if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) {
2249         error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64
2250                           " (reserved nonzero)", __func__, inv_desc->hi,
2251                           inv_desc->lo);
2252         return false;
2253     }
2254     switch (inv_desc->lo & VTD_INV_DESC_CC_G) {
2255     case VTD_INV_DESC_CC_DOMAIN:
2256         trace_vtd_inv_desc_cc_domain(
2257             (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo));
2258         /* Fall through */
2259     case VTD_INV_DESC_CC_GLOBAL:
2260         vtd_context_global_invalidate(s);
2261         break;
2262 
2263     case VTD_INV_DESC_CC_DEVICE:
2264         sid = VTD_INV_DESC_CC_SID(inv_desc->lo);
2265         fmask = VTD_INV_DESC_CC_FM(inv_desc->lo);
2266         vtd_context_device_invalidate(s, sid, fmask);
2267         break;
2268 
2269     default:
2270         error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64
2271                           " (invalid type)", __func__, inv_desc->hi,
2272                           inv_desc->lo);
2273         return false;
2274     }
2275     return true;
2276 }
2277 
2278 static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
2279 {
2280     uint16_t domain_id;
2281     uint8_t am;
2282     hwaddr addr;
2283 
2284     if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) ||
2285         (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) {
2286         error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
2287                           ", lo=0x%"PRIx64" (reserved bits unzero)\n",
2288                           __func__, inv_desc->hi, inv_desc->lo);
2289         return false;
2290     }
2291 
2292     switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) {
2293     case VTD_INV_DESC_IOTLB_GLOBAL:
2294         vtd_iotlb_global_invalidate(s);
2295         break;
2296 
2297     case VTD_INV_DESC_IOTLB_DOMAIN:
2298         domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
2299         vtd_iotlb_domain_invalidate(s, domain_id);
2300         break;
2301 
2302     case VTD_INV_DESC_IOTLB_PAGE:
2303         domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
2304         addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi);
2305         am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi);
2306         if (am > VTD_MAMV) {
2307             error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
2308                               ", lo=0x%"PRIx64" (am=%u > VTD_MAMV=%u)\n",
2309                               __func__, inv_desc->hi, inv_desc->lo,
2310                               am, (unsigned)VTD_MAMV);
2311             return false;
2312         }
2313         vtd_iotlb_page_invalidate(s, domain_id, addr, am);
2314         break;
2315 
2316     default:
2317         error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
2318                           ", lo=0x%"PRIx64" (type mismatch: 0x%llx)\n",
2319                           __func__, inv_desc->hi, inv_desc->lo,
2320                           inv_desc->lo & VTD_INV_DESC_IOTLB_G);
2321         return false;
2322     }
2323     return true;
2324 }
2325 
2326 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
2327                                      VTDInvDesc *inv_desc)
2328 {
2329     trace_vtd_inv_desc_iec(inv_desc->iec.granularity,
2330                            inv_desc->iec.index,
2331                            inv_desc->iec.index_mask);
2332 
2333     vtd_iec_notify_all(s, !inv_desc->iec.granularity,
2334                        inv_desc->iec.index,
2335                        inv_desc->iec.index_mask);
2336     return true;
2337 }
2338 
2339 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
2340                                           VTDInvDesc *inv_desc)
2341 {
2342     VTDAddressSpace *vtd_dev_as;
2343     IOMMUTLBEntry entry;
2344     struct VTDBus *vtd_bus;
2345     hwaddr addr;
2346     uint64_t sz;
2347     uint16_t sid;
2348     uint8_t devfn;
2349     bool size;
2350     uint8_t bus_num;
2351 
2352     addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi);
2353     sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo);
2354     devfn = sid & 0xff;
2355     bus_num = sid >> 8;
2356     size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi);
2357 
2358     if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) ||
2359         (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) {
2360         error_report_once("%s: invalid dev-iotlb inv desc: hi=%"PRIx64
2361                           ", lo=%"PRIx64" (reserved nonzero)", __func__,
2362                           inv_desc->hi, inv_desc->lo);
2363         return false;
2364     }
2365 
2366     vtd_bus = vtd_find_as_from_bus_num(s, bus_num);
2367     if (!vtd_bus) {
2368         goto done;
2369     }
2370 
2371     vtd_dev_as = vtd_bus->dev_as[devfn];
2372     if (!vtd_dev_as) {
2373         goto done;
2374     }
2375 
2376     /* According to ATS spec table 2.4:
2377      * S = 0, bits 15:12 = xxxx     range size: 4K
2378      * S = 1, bits 15:12 = xxx0     range size: 8K
2379      * S = 1, bits 15:12 = xx01     range size: 16K
2380      * S = 1, bits 15:12 = x011     range size: 32K
2381      * S = 1, bits 15:12 = 0111     range size: 64K
2382      * ...
2383      */
2384     if (size) {
2385         sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT);
2386         addr &= ~(sz - 1);
2387     } else {
2388         sz = VTD_PAGE_SIZE;
2389     }
2390 
2391     entry.target_as = &vtd_dev_as->as;
2392     entry.addr_mask = sz - 1;
2393     entry.iova = addr;
2394     entry.perm = IOMMU_NONE;
2395     entry.translated_addr = 0;
2396     memory_region_notify_iommu(&vtd_dev_as->iommu, 0, entry);
2397 
2398 done:
2399     return true;
2400 }
2401 
2402 static bool vtd_process_inv_desc(IntelIOMMUState *s)
2403 {
2404     VTDInvDesc inv_desc;
2405     uint8_t desc_type;
2406 
2407     trace_vtd_inv_qi_head(s->iq_head);
2408     if (!vtd_get_inv_desc(s, &inv_desc)) {
2409         s->iq_last_desc_type = VTD_INV_DESC_NONE;
2410         return false;
2411     }
2412 
2413     desc_type = inv_desc.lo & VTD_INV_DESC_TYPE;
2414     /* FIXME: should update at first or at last? */
2415     s->iq_last_desc_type = desc_type;
2416 
2417     switch (desc_type) {
2418     case VTD_INV_DESC_CC:
2419         trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo);
2420         if (!vtd_process_context_cache_desc(s, &inv_desc)) {
2421             return false;
2422         }
2423         break;
2424 
2425     case VTD_INV_DESC_IOTLB:
2426         trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo);
2427         if (!vtd_process_iotlb_desc(s, &inv_desc)) {
2428             return false;
2429         }
2430         break;
2431 
2432     /*
2433      * TODO: the entity of below two cases will be implemented in future series.
2434      * To make guest (which integrates scalable mode support patch set in
2435      * iommu driver) work, just return true is enough so far.
2436      */
2437     case VTD_INV_DESC_PC:
2438         break;
2439 
2440     case VTD_INV_DESC_PIOTLB:
2441         break;
2442 
2443     case VTD_INV_DESC_WAIT:
2444         trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo);
2445         if (!vtd_process_wait_desc(s, &inv_desc)) {
2446             return false;
2447         }
2448         break;
2449 
2450     case VTD_INV_DESC_IEC:
2451         trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo);
2452         if (!vtd_process_inv_iec_desc(s, &inv_desc)) {
2453             return false;
2454         }
2455         break;
2456 
2457     case VTD_INV_DESC_DEVICE:
2458         trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo);
2459         if (!vtd_process_device_iotlb_desc(s, &inv_desc)) {
2460             return false;
2461         }
2462         break;
2463 
2464     default:
2465         error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64
2466                           " (unknown type)", __func__, inv_desc.hi,
2467                           inv_desc.lo);
2468         return false;
2469     }
2470     s->iq_head++;
2471     if (s->iq_head == s->iq_size) {
2472         s->iq_head = 0;
2473     }
2474     return true;
2475 }
2476 
2477 /* Try to fetch and process more Invalidation Descriptors */
2478 static void vtd_fetch_inv_desc(IntelIOMMUState *s)
2479 {
2480     trace_vtd_inv_qi_fetch();
2481 
2482     if (s->iq_tail >= s->iq_size) {
2483         /* Detects an invalid Tail pointer */
2484         error_report_once("%s: detected invalid QI tail "
2485                           "(tail=0x%x, size=0x%x)",
2486                           __func__, s->iq_tail, s->iq_size);
2487         vtd_handle_inv_queue_error(s);
2488         return;
2489     }
2490     while (s->iq_head != s->iq_tail) {
2491         if (!vtd_process_inv_desc(s)) {
2492             /* Invalidation Queue Errors */
2493             vtd_handle_inv_queue_error(s);
2494             break;
2495         }
2496         /* Must update the IQH_REG in time */
2497         vtd_set_quad_raw(s, DMAR_IQH_REG,
2498                          (((uint64_t)(s->iq_head)) << VTD_IQH_QH_SHIFT) &
2499                          VTD_IQH_QH_MASK);
2500     }
2501 }
2502 
2503 /* Handle write to Invalidation Queue Tail Register */
2504 static void vtd_handle_iqt_write(IntelIOMMUState *s)
2505 {
2506     uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG);
2507 
2508     if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) {
2509         error_report_once("%s: RSV bit is set: val=0x%"PRIx64,
2510                           __func__, val);
2511         return;
2512     }
2513     s->iq_tail = VTD_IQT_QT(s->iq_dw, val);
2514     trace_vtd_inv_qi_tail(s->iq_tail);
2515 
2516     if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
2517         /* Process Invalidation Queue here */
2518         vtd_fetch_inv_desc(s);
2519     }
2520 }
2521 
2522 static void vtd_handle_fsts_write(IntelIOMMUState *s)
2523 {
2524     uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
2525     uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG);
2526     uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE;
2527 
2528     if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) {
2529         vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
2530         trace_vtd_fsts_clear_ip();
2531     }
2532     /* FIXME: when IQE is Clear, should we try to fetch some Invalidation
2533      * Descriptors if there are any when Queued Invalidation is enabled?
2534      */
2535 }
2536 
2537 static void vtd_handle_fectl_write(IntelIOMMUState *s)
2538 {
2539     uint32_t fectl_reg;
2540     /* FIXME: when software clears the IM field, check the IP field. But do we
2541      * need to compare the old value and the new value to conclude that
2542      * software clears the IM field? Or just check if the IM field is zero?
2543      */
2544     fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG);
2545 
2546     trace_vtd_reg_write_fectl(fectl_reg);
2547 
2548     if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) {
2549         vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
2550         vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
2551     }
2552 }
2553 
2554 static void vtd_handle_ics_write(IntelIOMMUState *s)
2555 {
2556     uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG);
2557     uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG);
2558 
2559     if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) {
2560         trace_vtd_reg_ics_clear_ip();
2561         vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
2562     }
2563 }
2564 
2565 static void vtd_handle_iectl_write(IntelIOMMUState *s)
2566 {
2567     uint32_t iectl_reg;
2568     /* FIXME: when software clears the IM field, check the IP field. But do we
2569      * need to compare the old value and the new value to conclude that
2570      * software clears the IM field? Or just check if the IM field is zero?
2571      */
2572     iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG);
2573 
2574     trace_vtd_reg_write_iectl(iectl_reg);
2575 
2576     if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) {
2577         vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
2578         vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
2579     }
2580 }
2581 
2582 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
2583 {
2584     IntelIOMMUState *s = opaque;
2585     uint64_t val;
2586 
2587     trace_vtd_reg_read(addr, size);
2588 
2589     if (addr + size > DMAR_REG_SIZE) {
2590         error_report_once("%s: MMIO over range: addr=0x%" PRIx64
2591                           " size=0x%u", __func__, addr, size);
2592         return (uint64_t)-1;
2593     }
2594 
2595     switch (addr) {
2596     /* Root Table Address Register, 64-bit */
2597     case DMAR_RTADDR_REG:
2598         if (size == 4) {
2599             val = s->root & ((1ULL << 32) - 1);
2600         } else {
2601             val = s->root;
2602         }
2603         break;
2604 
2605     case DMAR_RTADDR_REG_HI:
2606         assert(size == 4);
2607         val = s->root >> 32;
2608         break;
2609 
2610     /* Invalidation Queue Address Register, 64-bit */
2611     case DMAR_IQA_REG:
2612         val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS);
2613         if (size == 4) {
2614             val = val & ((1ULL << 32) - 1);
2615         }
2616         break;
2617 
2618     case DMAR_IQA_REG_HI:
2619         assert(size == 4);
2620         val = s->iq >> 32;
2621         break;
2622 
2623     default:
2624         if (size == 4) {
2625             val = vtd_get_long(s, addr);
2626         } else {
2627             val = vtd_get_quad(s, addr);
2628         }
2629     }
2630 
2631     return val;
2632 }
2633 
2634 static void vtd_mem_write(void *opaque, hwaddr addr,
2635                           uint64_t val, unsigned size)
2636 {
2637     IntelIOMMUState *s = opaque;
2638 
2639     trace_vtd_reg_write(addr, size, val);
2640 
2641     if (addr + size > DMAR_REG_SIZE) {
2642         error_report_once("%s: MMIO over range: addr=0x%" PRIx64
2643                           " size=0x%u", __func__, addr, size);
2644         return;
2645     }
2646 
2647     switch (addr) {
2648     /* Global Command Register, 32-bit */
2649     case DMAR_GCMD_REG:
2650         vtd_set_long(s, addr, val);
2651         vtd_handle_gcmd_write(s);
2652         break;
2653 
2654     /* Context Command Register, 64-bit */
2655     case DMAR_CCMD_REG:
2656         if (size == 4) {
2657             vtd_set_long(s, addr, val);
2658         } else {
2659             vtd_set_quad(s, addr, val);
2660             vtd_handle_ccmd_write(s);
2661         }
2662         break;
2663 
2664     case DMAR_CCMD_REG_HI:
2665         assert(size == 4);
2666         vtd_set_long(s, addr, val);
2667         vtd_handle_ccmd_write(s);
2668         break;
2669 
2670     /* IOTLB Invalidation Register, 64-bit */
2671     case DMAR_IOTLB_REG:
2672         if (size == 4) {
2673             vtd_set_long(s, addr, val);
2674         } else {
2675             vtd_set_quad(s, addr, val);
2676             vtd_handle_iotlb_write(s);
2677         }
2678         break;
2679 
2680     case DMAR_IOTLB_REG_HI:
2681         assert(size == 4);
2682         vtd_set_long(s, addr, val);
2683         vtd_handle_iotlb_write(s);
2684         break;
2685 
2686     /* Invalidate Address Register, 64-bit */
2687     case DMAR_IVA_REG:
2688         if (size == 4) {
2689             vtd_set_long(s, addr, val);
2690         } else {
2691             vtd_set_quad(s, addr, val);
2692         }
2693         break;
2694 
2695     case DMAR_IVA_REG_HI:
2696         assert(size == 4);
2697         vtd_set_long(s, addr, val);
2698         break;
2699 
2700     /* Fault Status Register, 32-bit */
2701     case DMAR_FSTS_REG:
2702         assert(size == 4);
2703         vtd_set_long(s, addr, val);
2704         vtd_handle_fsts_write(s);
2705         break;
2706 
2707     /* Fault Event Control Register, 32-bit */
2708     case DMAR_FECTL_REG:
2709         assert(size == 4);
2710         vtd_set_long(s, addr, val);
2711         vtd_handle_fectl_write(s);
2712         break;
2713 
2714     /* Fault Event Data Register, 32-bit */
2715     case DMAR_FEDATA_REG:
2716         assert(size == 4);
2717         vtd_set_long(s, addr, val);
2718         break;
2719 
2720     /* Fault Event Address Register, 32-bit */
2721     case DMAR_FEADDR_REG:
2722         if (size == 4) {
2723             vtd_set_long(s, addr, val);
2724         } else {
2725             /*
2726              * While the register is 32-bit only, some guests (Xen...) write to
2727              * it with 64-bit.
2728              */
2729             vtd_set_quad(s, addr, val);
2730         }
2731         break;
2732 
2733     /* Fault Event Upper Address Register, 32-bit */
2734     case DMAR_FEUADDR_REG:
2735         assert(size == 4);
2736         vtd_set_long(s, addr, val);
2737         break;
2738 
2739     /* Protected Memory Enable Register, 32-bit */
2740     case DMAR_PMEN_REG:
2741         assert(size == 4);
2742         vtd_set_long(s, addr, val);
2743         break;
2744 
2745     /* Root Table Address Register, 64-bit */
2746     case DMAR_RTADDR_REG:
2747         if (size == 4) {
2748             vtd_set_long(s, addr, val);
2749         } else {
2750             vtd_set_quad(s, addr, val);
2751         }
2752         break;
2753 
2754     case DMAR_RTADDR_REG_HI:
2755         assert(size == 4);
2756         vtd_set_long(s, addr, val);
2757         break;
2758 
2759     /* Invalidation Queue Tail Register, 64-bit */
2760     case DMAR_IQT_REG:
2761         if (size == 4) {
2762             vtd_set_long(s, addr, val);
2763         } else {
2764             vtd_set_quad(s, addr, val);
2765         }
2766         vtd_handle_iqt_write(s);
2767         break;
2768 
2769     case DMAR_IQT_REG_HI:
2770         assert(size == 4);
2771         vtd_set_long(s, addr, val);
2772         /* 19:63 of IQT_REG is RsvdZ, do nothing here */
2773         break;
2774 
2775     /* Invalidation Queue Address Register, 64-bit */
2776     case DMAR_IQA_REG:
2777         if (size == 4) {
2778             vtd_set_long(s, addr, val);
2779         } else {
2780             vtd_set_quad(s, addr, val);
2781         }
2782         if (s->ecap & VTD_ECAP_SMTS &&
2783             val & VTD_IQA_DW_MASK) {
2784             s->iq_dw = true;
2785         } else {
2786             s->iq_dw = false;
2787         }
2788         break;
2789 
2790     case DMAR_IQA_REG_HI:
2791         assert(size == 4);
2792         vtd_set_long(s, addr, val);
2793         break;
2794 
2795     /* Invalidation Completion Status Register, 32-bit */
2796     case DMAR_ICS_REG:
2797         assert(size == 4);
2798         vtd_set_long(s, addr, val);
2799         vtd_handle_ics_write(s);
2800         break;
2801 
2802     /* Invalidation Event Control Register, 32-bit */
2803     case DMAR_IECTL_REG:
2804         assert(size == 4);
2805         vtd_set_long(s, addr, val);
2806         vtd_handle_iectl_write(s);
2807         break;
2808 
2809     /* Invalidation Event Data Register, 32-bit */
2810     case DMAR_IEDATA_REG:
2811         assert(size == 4);
2812         vtd_set_long(s, addr, val);
2813         break;
2814 
2815     /* Invalidation Event Address Register, 32-bit */
2816     case DMAR_IEADDR_REG:
2817         assert(size == 4);
2818         vtd_set_long(s, addr, val);
2819         break;
2820 
2821     /* Invalidation Event Upper Address Register, 32-bit */
2822     case DMAR_IEUADDR_REG:
2823         assert(size == 4);
2824         vtd_set_long(s, addr, val);
2825         break;
2826 
2827     /* Fault Recording Registers, 128-bit */
2828     case DMAR_FRCD_REG_0_0:
2829         if (size == 4) {
2830             vtd_set_long(s, addr, val);
2831         } else {
2832             vtd_set_quad(s, addr, val);
2833         }
2834         break;
2835 
2836     case DMAR_FRCD_REG_0_1:
2837         assert(size == 4);
2838         vtd_set_long(s, addr, val);
2839         break;
2840 
2841     case DMAR_FRCD_REG_0_2:
2842         if (size == 4) {
2843             vtd_set_long(s, addr, val);
2844         } else {
2845             vtd_set_quad(s, addr, val);
2846             /* May clear bit 127 (Fault), update PPF */
2847             vtd_update_fsts_ppf(s);
2848         }
2849         break;
2850 
2851     case DMAR_FRCD_REG_0_3:
2852         assert(size == 4);
2853         vtd_set_long(s, addr, val);
2854         /* May clear bit 127 (Fault), update PPF */
2855         vtd_update_fsts_ppf(s);
2856         break;
2857 
2858     case DMAR_IRTA_REG:
2859         if (size == 4) {
2860             vtd_set_long(s, addr, val);
2861         } else {
2862             vtd_set_quad(s, addr, val);
2863         }
2864         break;
2865 
2866     case DMAR_IRTA_REG_HI:
2867         assert(size == 4);
2868         vtd_set_long(s, addr, val);
2869         break;
2870 
2871     default:
2872         if (size == 4) {
2873             vtd_set_long(s, addr, val);
2874         } else {
2875             vtd_set_quad(s, addr, val);
2876         }
2877     }
2878 }
2879 
2880 static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
2881                                          IOMMUAccessFlags flag, int iommu_idx)
2882 {
2883     VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
2884     IntelIOMMUState *s = vtd_as->iommu_state;
2885     IOMMUTLBEntry iotlb = {
2886         /* We'll fill in the rest later. */
2887         .target_as = &address_space_memory,
2888     };
2889     bool success;
2890 
2891     if (likely(s->dmar_enabled)) {
2892         success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn,
2893                                          addr, flag & IOMMU_WO, &iotlb);
2894     } else {
2895         /* DMAR disabled, passthrough, use 4k-page*/
2896         iotlb.iova = addr & VTD_PAGE_MASK_4K;
2897         iotlb.translated_addr = addr & VTD_PAGE_MASK_4K;
2898         iotlb.addr_mask = ~VTD_PAGE_MASK_4K;
2899         iotlb.perm = IOMMU_RW;
2900         success = true;
2901     }
2902 
2903     if (likely(success)) {
2904         trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus),
2905                                  VTD_PCI_SLOT(vtd_as->devfn),
2906                                  VTD_PCI_FUNC(vtd_as->devfn),
2907                                  iotlb.iova, iotlb.translated_addr,
2908                                  iotlb.addr_mask);
2909     } else {
2910         error_report_once("%s: detected translation failure "
2911                           "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")",
2912                           __func__, pci_bus_num(vtd_as->bus),
2913                           VTD_PCI_SLOT(vtd_as->devfn),
2914                           VTD_PCI_FUNC(vtd_as->devfn),
2915                           addr);
2916     }
2917 
2918     return iotlb;
2919 }
2920 
2921 static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
2922                                           IOMMUNotifierFlag old,
2923                                           IOMMUNotifierFlag new)
2924 {
2925     VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
2926     IntelIOMMUState *s = vtd_as->iommu_state;
2927 
2928     if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) {
2929         error_report("We need to set caching-mode=on for intel-iommu to enable "
2930                      "device assignment with IOMMU protection.");
2931         exit(1);
2932     }
2933 
2934     /* Update per-address-space notifier flags */
2935     vtd_as->notifier_flags = new;
2936 
2937     if (old == IOMMU_NOTIFIER_NONE) {
2938         QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next);
2939     } else if (new == IOMMU_NOTIFIER_NONE) {
2940         QLIST_REMOVE(vtd_as, next);
2941     }
2942 }
2943 
2944 static int vtd_post_load(void *opaque, int version_id)
2945 {
2946     IntelIOMMUState *iommu = opaque;
2947 
2948     /*
2949      * Memory regions are dynamically turned on/off depending on
2950      * context entry configurations from the guest. After migration,
2951      * we need to make sure the memory regions are still correct.
2952      */
2953     vtd_switch_address_space_all(iommu);
2954 
2955     /*
2956      * We don't need to migrate the root_scalable because we can
2957      * simply do the calculation after the loading is complete.  We
2958      * can actually do similar things with root, dmar_enabled, etc.
2959      * however since we've had them already so we'd better keep them
2960      * for compatibility of migration.
2961      */
2962     vtd_update_scalable_state(iommu);
2963 
2964     return 0;
2965 }
2966 
2967 static const VMStateDescription vtd_vmstate = {
2968     .name = "iommu-intel",
2969     .version_id = 1,
2970     .minimum_version_id = 1,
2971     .priority = MIG_PRI_IOMMU,
2972     .post_load = vtd_post_load,
2973     .fields = (VMStateField[]) {
2974         VMSTATE_UINT64(root, IntelIOMMUState),
2975         VMSTATE_UINT64(intr_root, IntelIOMMUState),
2976         VMSTATE_UINT64(iq, IntelIOMMUState),
2977         VMSTATE_UINT32(intr_size, IntelIOMMUState),
2978         VMSTATE_UINT16(iq_head, IntelIOMMUState),
2979         VMSTATE_UINT16(iq_tail, IntelIOMMUState),
2980         VMSTATE_UINT16(iq_size, IntelIOMMUState),
2981         VMSTATE_UINT16(next_frcd_reg, IntelIOMMUState),
2982         VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE),
2983         VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState),
2984         VMSTATE_UNUSED(1),      /* bool root_extended is obsolete by VT-d */
2985         VMSTATE_BOOL(dmar_enabled, IntelIOMMUState),
2986         VMSTATE_BOOL(qi_enabled, IntelIOMMUState),
2987         VMSTATE_BOOL(intr_enabled, IntelIOMMUState),
2988         VMSTATE_BOOL(intr_eime, IntelIOMMUState),
2989         VMSTATE_END_OF_LIST()
2990     }
2991 };
2992 
2993 static const MemoryRegionOps vtd_mem_ops = {
2994     .read = vtd_mem_read,
2995     .write = vtd_mem_write,
2996     .endianness = DEVICE_LITTLE_ENDIAN,
2997     .impl = {
2998         .min_access_size = 4,
2999         .max_access_size = 8,
3000     },
3001     .valid = {
3002         .min_access_size = 4,
3003         .max_access_size = 8,
3004     },
3005 };
3006 
3007 static Property vtd_properties[] = {
3008     DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0),
3009     DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
3010                             ON_OFF_AUTO_AUTO),
3011     DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
3012     DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
3013                       VTD_HOST_ADDRESS_WIDTH),
3014     DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
3015     DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
3016     DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
3017     DEFINE_PROP_END_OF_LIST(),
3018 };
3019 
3020 /* Read IRTE entry with specific index */
3021 static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index,
3022                         VTD_IR_TableEntry *entry, uint16_t sid)
3023 {
3024     static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \
3025         {0xffff, 0xfffb, 0xfff9, 0xfff8};
3026     dma_addr_t addr = 0x00;
3027     uint16_t mask, source_id;
3028     uint8_t bus, bus_max, bus_min;
3029 
3030     addr = iommu->intr_root + index * sizeof(*entry);
3031     if (dma_memory_read(&address_space_memory, addr, entry,
3032                         sizeof(*entry))) {
3033         error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64,
3034                           __func__, index, addr);
3035         return -VTD_FR_IR_ROOT_INVAL;
3036     }
3037 
3038     trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]),
3039                           le64_to_cpu(entry->data[0]));
3040 
3041     if (!entry->irte.present) {
3042         error_report_once("%s: detected non-present IRTE "
3043                           "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")",
3044                           __func__, index, le64_to_cpu(entry->data[1]),
3045                           le64_to_cpu(entry->data[0]));
3046         return -VTD_FR_IR_ENTRY_P;
3047     }
3048 
3049     if (entry->irte.__reserved_0 || entry->irte.__reserved_1 ||
3050         entry->irte.__reserved_2) {
3051         error_report_once("%s: detected non-zero reserved IRTE "
3052                           "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")",
3053                           __func__, index, le64_to_cpu(entry->data[1]),
3054                           le64_to_cpu(entry->data[0]));
3055         return -VTD_FR_IR_IRTE_RSVD;
3056     }
3057 
3058     if (sid != X86_IOMMU_SID_INVALID) {
3059         /* Validate IRTE SID */
3060         source_id = le32_to_cpu(entry->irte.source_id);
3061         switch (entry->irte.sid_vtype) {
3062         case VTD_SVT_NONE:
3063             break;
3064 
3065         case VTD_SVT_ALL:
3066             mask = vtd_svt_mask[entry->irte.sid_q];
3067             if ((source_id & mask) != (sid & mask)) {
3068                 error_report_once("%s: invalid IRTE SID "
3069                                   "(index=%u, sid=%u, source_id=%u)",
3070                                   __func__, index, sid, source_id);
3071                 return -VTD_FR_IR_SID_ERR;
3072             }
3073             break;
3074 
3075         case VTD_SVT_BUS:
3076             bus_max = source_id >> 8;
3077             bus_min = source_id & 0xff;
3078             bus = sid >> 8;
3079             if (bus > bus_max || bus < bus_min) {
3080                 error_report_once("%s: invalid SVT_BUS "
3081                                   "(index=%u, bus=%u, min=%u, max=%u)",
3082                                   __func__, index, bus, bus_min, bus_max);
3083                 return -VTD_FR_IR_SID_ERR;
3084             }
3085             break;
3086 
3087         default:
3088             error_report_once("%s: detected invalid IRTE SVT "
3089                               "(index=%u, type=%d)", __func__,
3090                               index, entry->irte.sid_vtype);
3091             /* Take this as verification failure. */
3092             return -VTD_FR_IR_SID_ERR;
3093             break;
3094         }
3095     }
3096 
3097     return 0;
3098 }
3099 
3100 /* Fetch IRQ information of specific IR index */
3101 static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index,
3102                              X86IOMMUIrq *irq, uint16_t sid)
3103 {
3104     VTD_IR_TableEntry irte = {};
3105     int ret = 0;
3106 
3107     ret = vtd_irte_get(iommu, index, &irte, sid);
3108     if (ret) {
3109         return ret;
3110     }
3111 
3112     irq->trigger_mode = irte.irte.trigger_mode;
3113     irq->vector = irte.irte.vector;
3114     irq->delivery_mode = irte.irte.delivery_mode;
3115     irq->dest = le32_to_cpu(irte.irte.dest_id);
3116     if (!iommu->intr_eime) {
3117 #define  VTD_IR_APIC_DEST_MASK         (0xff00ULL)
3118 #define  VTD_IR_APIC_DEST_SHIFT        (8)
3119         irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >>
3120             VTD_IR_APIC_DEST_SHIFT;
3121     }
3122     irq->dest_mode = irte.irte.dest_mode;
3123     irq->redir_hint = irte.irte.redir_hint;
3124 
3125     trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector,
3126                        irq->delivery_mode, irq->dest, irq->dest_mode);
3127 
3128     return 0;
3129 }
3130 
3131 /* Interrupt remapping for MSI/MSI-X entry */
3132 static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu,
3133                                    MSIMessage *origin,
3134                                    MSIMessage *translated,
3135                                    uint16_t sid)
3136 {
3137     int ret = 0;
3138     VTD_IR_MSIAddress addr;
3139     uint16_t index;
3140     X86IOMMUIrq irq = {};
3141 
3142     assert(origin && translated);
3143 
3144     trace_vtd_ir_remap_msi_req(origin->address, origin->data);
3145 
3146     if (!iommu || !iommu->intr_enabled) {
3147         memcpy(translated, origin, sizeof(*origin));
3148         goto out;
3149     }
3150 
3151     if (origin->address & VTD_MSI_ADDR_HI_MASK) {
3152         error_report_once("%s: MSI address high 32 bits non-zero detected: "
3153                           "address=0x%" PRIx64, __func__, origin->address);
3154         return -VTD_FR_IR_REQ_RSVD;
3155     }
3156 
3157     addr.data = origin->address & VTD_MSI_ADDR_LO_MASK;
3158     if (addr.addr.__head != 0xfee) {
3159         error_report_once("%s: MSI address low 32 bit invalid: 0x%" PRIx32,
3160                           __func__, addr.data);
3161         return -VTD_FR_IR_REQ_RSVD;
3162     }
3163 
3164     /* This is compatible mode. */
3165     if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) {
3166         memcpy(translated, origin, sizeof(*origin));
3167         goto out;
3168     }
3169 
3170     index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l);
3171 
3172 #define  VTD_IR_MSI_DATA_SUBHANDLE       (0x0000ffff)
3173 #define  VTD_IR_MSI_DATA_RESERVED        (0xffff0000)
3174 
3175     if (addr.addr.sub_valid) {
3176         /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */
3177         index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE;
3178     }
3179 
3180     ret = vtd_remap_irq_get(iommu, index, &irq, sid);
3181     if (ret) {
3182         return ret;
3183     }
3184 
3185     if (addr.addr.sub_valid) {
3186         trace_vtd_ir_remap_type("MSI");
3187         if (origin->data & VTD_IR_MSI_DATA_RESERVED) {
3188             error_report_once("%s: invalid IR MSI "
3189                               "(sid=%u, address=0x%" PRIx64
3190                               ", data=0x%" PRIx32 ")",
3191                               __func__, sid, origin->address, origin->data);
3192             return -VTD_FR_IR_REQ_RSVD;
3193         }
3194     } else {
3195         uint8_t vector = origin->data & 0xff;
3196         uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
3197 
3198         trace_vtd_ir_remap_type("IOAPIC");
3199         /* IOAPIC entry vector should be aligned with IRTE vector
3200          * (see vt-d spec 5.1.5.1). */
3201         if (vector != irq.vector) {
3202             trace_vtd_warn_ir_vector(sid, index, vector, irq.vector);
3203         }
3204 
3205         /* The Trigger Mode field must match the Trigger Mode in the IRTE.
3206          * (see vt-d spec 5.1.5.1). */
3207         if (trigger_mode != irq.trigger_mode) {
3208             trace_vtd_warn_ir_trigger(sid, index, trigger_mode,
3209                                       irq.trigger_mode);
3210         }
3211     }
3212 
3213     /*
3214      * We'd better keep the last two bits, assuming that guest OS
3215      * might modify it. Keep it does not hurt after all.
3216      */
3217     irq.msi_addr_last_bits = addr.addr.__not_care;
3218 
3219     /* Translate X86IOMMUIrq to MSI message */
3220     x86_iommu_irq_to_msi_message(&irq, translated);
3221 
3222 out:
3223     trace_vtd_ir_remap_msi(origin->address, origin->data,
3224                            translated->address, translated->data);
3225     return 0;
3226 }
3227 
3228 static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src,
3229                          MSIMessage *dst, uint16_t sid)
3230 {
3231     return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu),
3232                                    src, dst, sid);
3233 }
3234 
/*
 * Read callback for the interrupt remapping region.  Reads always
 * succeed; this function does not write anything to @data.
 */
static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr,
                                   uint64_t *data, unsigned size,
                                   MemTxAttrs attrs)
{
    return MEMTX_OK;
}
3241 
3242 static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr,
3243                                     uint64_t value, unsigned size,
3244                                     MemTxAttrs attrs)
3245 {
3246     int ret = 0;
3247     MSIMessage from = {}, to = {};
3248     uint16_t sid = X86_IOMMU_SID_INVALID;
3249 
3250     from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST;
3251     from.data = (uint32_t) value;
3252 
3253     if (!attrs.unspecified) {
3254         /* We have explicit Source ID */
3255         sid = attrs.requester_id;
3256     }
3257 
3258     ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid);
3259     if (ret) {
3260         /* TODO: report error */
3261         /* Drop this interrupt */
3262         return MEMTX_ERROR;
3263     }
3264 
3265     apic_get_class()->send_msi(&to);
3266 
3267     return MEMTX_OK;
3268 }
3269 
3270 static const MemoryRegionOps vtd_mem_ir_ops = {
3271     .read_with_attrs = vtd_mem_ir_read,
3272     .write_with_attrs = vtd_mem_ir_write,
3273     .endianness = DEVICE_LITTLE_ENDIAN,
3274     .impl = {
3275         .min_access_size = 4,
3276         .max_access_size = 4,
3277     },
3278     .valid = {
3279         .min_access_size = 4,
3280         .max_access_size = 4,
3281     },
3282 };
3283 
3284 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
3285 {
3286     uintptr_t key = (uintptr_t)bus;
3287     VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key);
3288     VTDAddressSpace *vtd_dev_as;
3289     char name[128];
3290 
3291     if (!vtd_bus) {
3292         uintptr_t *new_key = g_malloc(sizeof(*new_key));
3293         *new_key = (uintptr_t)bus;
3294         /* No corresponding free() */
3295         vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \
3296                             PCI_DEVFN_MAX);
3297         vtd_bus->bus = bus;
3298         g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus);
3299     }
3300 
3301     vtd_dev_as = vtd_bus->dev_as[devfn];
3302 
3303     if (!vtd_dev_as) {
3304         snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn),
3305                  PCI_FUNC(devfn));
3306         vtd_bus->dev_as[devfn] = vtd_dev_as = g_malloc0(sizeof(VTDAddressSpace));
3307 
3308         vtd_dev_as->bus = bus;
3309         vtd_dev_as->devfn = (uint8_t)devfn;
3310         vtd_dev_as->iommu_state = s;
3311         vtd_dev_as->context_cache_entry.context_cache_gen = 0;
3312         vtd_dev_as->iova_tree = iova_tree_new();
3313 
3314         memory_region_init(&vtd_dev_as->root, OBJECT(s), name, UINT64_MAX);
3315         address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, "vtd-root");
3316 
3317         /*
3318          * Build the DMAR-disabled container with aliases to the
3319          * shared MRs.  Note that aliasing to a shared memory region
3320          * could help the memory API to detect same FlatViews so we
3321          * can have devices to share the same FlatView when DMAR is
3322          * disabled (either by not providing "intel_iommu=on" or with
3323          * "iommu=pt").  It will greatly reduce the total number of
3324          * FlatViews of the system hence VM runs faster.
3325          */
3326         memory_region_init_alias(&vtd_dev_as->nodmar, OBJECT(s),
3327                                  "vtd-nodmar", &s->mr_nodmar, 0,
3328                                  memory_region_size(&s->mr_nodmar));
3329 
3330         /*
3331          * Build the per-device DMAR-enabled container.
3332          *
3333          * TODO: currently we have per-device IOMMU memory region only
3334          * because we have per-device IOMMU notifiers for devices.  If
3335          * one day we can abstract the IOMMU notifiers out of the
3336          * memory regions then we can also share the same memory
3337          * region here just like what we've done above with the nodmar
3338          * region.
3339          */
3340         strcat(name, "-dmar");
3341         memory_region_init_iommu(&vtd_dev_as->iommu, sizeof(vtd_dev_as->iommu),
3342                                  TYPE_INTEL_IOMMU_MEMORY_REGION, OBJECT(s),
3343                                  name, UINT64_MAX);
3344         memory_region_init_alias(&vtd_dev_as->iommu_ir, OBJECT(s), "vtd-ir",
3345                                  &s->mr_ir, 0, memory_region_size(&s->mr_ir));
3346         memory_region_add_subregion_overlap(MEMORY_REGION(&vtd_dev_as->iommu),
3347                                             VTD_INTERRUPT_ADDR_FIRST,
3348                                             &vtd_dev_as->iommu_ir, 1);
3349 
3350         /*
3351          * Hook both the containers under the root container, we
3352          * switch between DMAR & noDMAR by enable/disable
3353          * corresponding sub-containers
3354          */
3355         memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
3356                                             MEMORY_REGION(&vtd_dev_as->iommu),
3357                                             0);
3358         memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
3359                                             &vtd_dev_as->nodmar, 0);
3360 
3361         vtd_switch_address_space(vtd_dev_as);
3362     }
3363     return vtd_dev_as;
3364 }
3365 
/*
 * Return the size of the largest naturally aligned power-of-two chunk
 * that can begin at @start without exceeding @size bytes or the
 * 2^@gaw address-width limit.
 *
 * @size must be non-zero whenever it is smaller than the alignment of
 * @start.
 */
static uint64_t get_naturally_aligned_size(uint64_t start,
                                           uint64_t size, int gaw)
{
    const uint64_t limit = 1ULL << gaw;
    uint64_t align = limit;

    if (start) {
        /* Lowest set bit of start is its natural alignment */
        align = MIN(start & -start, limit);
    }
    size = MIN(size, limit);

    if (size < align) {
        /* Find the largest page mask from size */
        return 1ULL << (63 - clz64(size));
    }

    /* Increase the alignment of start */
    return align;
}
3383 
3384 /* Unmap the whole range in the notifier's scope. */
3385 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
3386 {
3387     hwaddr size, remain;
3388     hwaddr start = n->start;
3389     hwaddr end = n->end;
3390     IntelIOMMUState *s = as->iommu_state;
3391     DMAMap map;
3392 
3393     /*
3394      * Note: all the codes in this function has a assumption that IOVA
3395      * bits are no more than VTD_MGAW bits (which is restricted by
3396      * VT-d spec), otherwise we need to consider overflow of 64 bits.
3397      */
3398 
3399     if (end > VTD_ADDRESS_SIZE(s->aw_bits) - 1) {
3400         /*
3401          * Don't need to unmap regions that is bigger than the whole
3402          * VT-d supported address space size
3403          */
3404         end = VTD_ADDRESS_SIZE(s->aw_bits) - 1;
3405     }
3406 
3407     assert(start <= end);
3408     size = remain = end - start + 1;
3409 
3410     while (remain >= VTD_PAGE_SIZE) {
3411         IOMMUTLBEntry entry;
3412         uint64_t mask = get_naturally_aligned_size(start, remain, s->aw_bits);
3413 
3414         assert(mask);
3415 
3416         entry.iova = start;
3417         entry.addr_mask = mask - 1;
3418         entry.target_as = &address_space_memory;
3419         entry.perm = IOMMU_NONE;
3420         /* This field is meaningless for unmap */
3421         entry.translated_addr = 0;
3422 
3423         memory_region_notify_one(n, &entry);
3424 
3425         start += mask;
3426         remain -= mask;
3427     }
3428 
3429     assert(!remain);
3430 
3431     trace_vtd_as_unmap_whole(pci_bus_num(as->bus),
3432                              VTD_PCI_SLOT(as->devfn),
3433                              VTD_PCI_FUNC(as->devfn),
3434                              n->start, size);
3435 
3436     map.iova = n->start;
3437     map.size = size;
3438     iova_tree_remove(as->iova_tree, &map);
3439 }
3440 
3441 static void vtd_address_space_unmap_all(IntelIOMMUState *s)
3442 {
3443     VTDAddressSpace *vtd_as;
3444     IOMMUNotifier *n;
3445 
3446     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
3447         IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
3448             vtd_address_space_unmap(vtd_as, n);
3449         }
3450     }
3451 }
3452 
/*
 * Unmap everything known to the registered notifiers, then re-evaluate
 * the address space selection of all devices.
 */
static void vtd_address_space_refresh_all(IntelIOMMUState *s)
{
    vtd_address_space_unmap_all(s);
    vtd_switch_address_space_all(s);
}
3458 
3459 static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private)
3460 {
3461     memory_region_notify_one((IOMMUNotifier *)private, entry);
3462     return 0;
3463 }
3464 
3465 static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
3466 {
3467     VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu);
3468     IntelIOMMUState *s = vtd_as->iommu_state;
3469     uint8_t bus_n = pci_bus_num(vtd_as->bus);
3470     VTDContextEntry ce;
3471 
3472     /*
3473      * The replay can be triggered by either a invalidation or a newly
3474      * created entry. No matter what, we release existing mappings
3475      * (it means flushing caches for UNMAP-only registers).
3476      */
3477     vtd_address_space_unmap(vtd_as, n);
3478 
3479     if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
3480         trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" :
3481                                   "legacy mode",
3482                                   bus_n, PCI_SLOT(vtd_as->devfn),
3483                                   PCI_FUNC(vtd_as->devfn),
3484                                   vtd_get_domain_id(s, &ce),
3485                                   ce.hi, ce.lo);
3486         if (vtd_as_has_map_notifier(vtd_as)) {
3487             /* This is required only for MAP typed notifiers */
3488             vtd_page_walk_info info = {
3489                 .hook_fn = vtd_replay_hook,
3490                 .private = (void *)n,
3491                 .notify_unmap = false,
3492                 .aw = s->aw_bits,
3493                 .as = vtd_as,
3494                 .domain_id = vtd_get_domain_id(s, &ce),
3495             };
3496 
3497             vtd_page_walk(s, &ce, 0, ~0ULL, &info);
3498         }
3499     } else {
3500         trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
3501                                     PCI_FUNC(vtd_as->devfn));
3502     }
3503 
3504     return;
3505 }
3506 
3507 /* Do the initialization. It will also be called when reset, so pay
3508  * attention when adding new initialization stuff.
3509  */
3510 static void vtd_init(IntelIOMMUState *s)
3511 {
3512     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
3513 
3514     memset(s->csr, 0, DMAR_REG_SIZE);
3515     memset(s->wmask, 0, DMAR_REG_SIZE);
3516     memset(s->w1cmask, 0, DMAR_REG_SIZE);
3517     memset(s->womask, 0, DMAR_REG_SIZE);
3518 
3519     s->root = 0;
3520     s->root_scalable = false;
3521     s->dmar_enabled = false;
3522     s->intr_enabled = false;
3523     s->iq_head = 0;
3524     s->iq_tail = 0;
3525     s->iq = 0;
3526     s->iq_size = 0;
3527     s->qi_enabled = false;
3528     s->iq_last_desc_type = VTD_INV_DESC_NONE;
3529     s->iq_dw = false;
3530     s->next_frcd_reg = 0;
3531     s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
3532              VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
3533              VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits);
3534     if (s->dma_drain) {
3535         s->cap |= VTD_CAP_DRAIN;
3536     }
3537     if (s->aw_bits == VTD_HOST_AW_48BIT) {
3538         s->cap |= VTD_CAP_SAGAW_48bit;
3539     }
3540     s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
3541 
3542     /*
3543      * Rsvd field masks for spte
3544      */
3545     vtd_paging_entry_rsvd_field[0] = ~0ULL;
3546     vtd_paging_entry_rsvd_field[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits);
3547     vtd_paging_entry_rsvd_field[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
3548     vtd_paging_entry_rsvd_field[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
3549     vtd_paging_entry_rsvd_field[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
3550     vtd_paging_entry_rsvd_field[5] = VTD_SPTE_LPAGE_L1_RSVD_MASK(s->aw_bits);
3551     vtd_paging_entry_rsvd_field[6] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits);
3552     vtd_paging_entry_rsvd_field[7] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits);
3553     vtd_paging_entry_rsvd_field[8] = VTD_SPTE_LPAGE_L4_RSVD_MASK(s->aw_bits);
3554 
3555     if (x86_iommu_ir_supported(x86_iommu)) {
3556         s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
3557         if (s->intr_eim == ON_OFF_AUTO_ON) {
3558             s->ecap |= VTD_ECAP_EIM;
3559         }
3560         assert(s->intr_eim != ON_OFF_AUTO_AUTO);
3561     }
3562 
3563     if (x86_iommu->dt_supported) {
3564         s->ecap |= VTD_ECAP_DT;
3565     }
3566 
3567     if (x86_iommu->pt_supported) {
3568         s->ecap |= VTD_ECAP_PT;
3569     }
3570 
3571     if (s->caching_mode) {
3572         s->cap |= VTD_CAP_CM;
3573     }
3574 
3575     /* TODO: read cap/ecap from host to decide which cap to be exposed. */
3576     if (s->scalable_mode) {
3577         s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
3578     }
3579 
3580     vtd_reset_caches(s);
3581 
3582     /* Define registers with default values and bit semantics */
3583     vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
3584     vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0);
3585     vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0);
3586     vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0);
3587     vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL);
3588     vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0);
3589     vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffffc00ULL, 0);
3590     vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0);
3591     vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL);
3592 
3593     /* Advanced Fault Logging not supported */
3594     vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL);
3595     vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0);
3596     vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0);
3597     vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0);
3598 
3599     /* Treated as RsvdZ when EIM in ECAP_REG is not supported
3600      * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0);
3601      */
3602     vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0);
3603 
3604     /* Treated as RO for implementations that PLMR and PHMR fields reported
3605      * as Clear in the CAP_REG.
3606      * vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0);
3607      */
3608     vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0);
3609 
3610     vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0);
3611     vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0);
3612     vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff807ULL, 0);
3613     vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL);
3614     vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0);
3615     vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0);
3616     vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0);
3617     /* Treadted as RsvdZ when EIM in ECAP_REG is not supported */
3618     vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0);
3619 
3620     /* IOTLB registers */
3621     vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0Xb003ffff00000000ULL, 0);
3622     vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0);
3623     vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL);
3624 
3625     /* Fault Recording Registers, 128-bit */
3626     vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0);
3627     vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL);
3628 
3629     /*
3630      * Interrupt remapping registers.
3631      */
3632     vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
3633 }
3634 
3635 /* Should not reset address_spaces when reset because devices will still use
3636  * the address space they got at first (won't ask the bus again).
3637  */
3638 static void vtd_reset(DeviceState *dev)
3639 {
3640     IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
3641 
3642     vtd_init(s);
3643     vtd_address_space_refresh_all(s);
3644 }
3645 
3646 static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
3647 {
3648     IntelIOMMUState *s = opaque;
3649     VTDAddressSpace *vtd_as;
3650 
3651     assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
3652 
3653     vtd_as = vtd_find_add_as(s, bus, devfn);
3654     return &vtd_as->as;
3655 }
3656 
3657 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
3658 {
3659     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
3660 
3661     if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu_ir_supported(x86_iommu)) {
3662         error_setg(errp, "eim=on cannot be selected without intremap=on");
3663         return false;
3664     }
3665 
3666     if (s->intr_eim == ON_OFF_AUTO_AUTO) {
3667         s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim)
3668                       && x86_iommu_ir_supported(x86_iommu) ?
3669                                               ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
3670     }
3671     if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) {
3672         if (!kvm_irqchip_in_kernel()) {
3673             error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split");
3674             return false;
3675         }
3676         if (!kvm_enable_x2apic()) {
3677             error_setg(errp, "eim=on requires support on the KVM side"
3678                              "(X2APIC_API, first shipped in v4.7)");
3679             return false;
3680         }
3681     }
3682 
3683     /* Currently only address widths supported are 39 and 48 bits */
3684     if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
3685         (s->aw_bits != VTD_HOST_AW_48BIT)) {
3686         error_setg(errp, "Supported values for x-aw-bits are: %d, %d",
3687                    VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT);
3688         return false;
3689     }
3690 
3691     if (s->scalable_mode && !s->dma_drain) {
3692         error_setg(errp, "Need to set dma_drain for scalable mode");
3693         return false;
3694     }
3695 
3696     return true;
3697 }
3698 
3699 static void vtd_realize(DeviceState *dev, Error **errp)
3700 {
3701     MachineState *ms = MACHINE(qdev_get_machine());
3702     PCMachineState *pcms = PC_MACHINE(ms);
3703     PCIBus *bus = pcms->bus;
3704     IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
3705     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev);
3706 
3707     x86_iommu->type = TYPE_INTEL;
3708 
3709     if (!vtd_decide_config(s, errp)) {
3710         return;
3711     }
3712 
3713     QLIST_INIT(&s->vtd_as_with_notifiers);
3714     qemu_mutex_init(&s->iommu_lock);
3715     memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
3716     memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
3717                           "intel_iommu", DMAR_REG_SIZE);
3718 
3719     /* Create the shared memory regions by all devices */
3720     memory_region_init(&s->mr_nodmar, OBJECT(s), "vtd-nodmar",
3721                        UINT64_MAX);
3722     memory_region_init_io(&s->mr_ir, OBJECT(s), &vtd_mem_ir_ops,
3723                           s, "vtd-ir", VTD_INTERRUPT_ADDR_SIZE);
3724     memory_region_init_alias(&s->mr_sys_alias, OBJECT(s),
3725                              "vtd-sys-alias", get_system_memory(), 0,
3726                              memory_region_size(get_system_memory()));
3727     memory_region_add_subregion_overlap(&s->mr_nodmar, 0,
3728                                         &s->mr_sys_alias, 0);
3729     memory_region_add_subregion_overlap(&s->mr_nodmar,
3730                                         VTD_INTERRUPT_ADDR_FIRST,
3731                                         &s->mr_ir, 1);
3732 
3733     sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem);
3734     /* No corresponding destroy */
3735     s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal,
3736                                      g_free, g_free);
3737     s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal,
3738                                               g_free, g_free);
3739     vtd_init(s);
3740     sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR);
3741     pci_setup_iommu(bus, vtd_host_dma_iommu, dev);
3742     /* Pseudo address space under root PCI bus. */
3743     pcms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC);
3744 }
3745 
3746 static void vtd_class_init(ObjectClass *klass, void *data)
3747 {
3748     DeviceClass *dc = DEVICE_CLASS(klass);
3749     X86IOMMUClass *x86_class = X86_IOMMU_CLASS(klass);
3750 
3751     dc->reset = vtd_reset;
3752     dc->vmsd = &vtd_vmstate;
3753     dc->props = vtd_properties;
3754     dc->hotpluggable = false;
3755     x86_class->realize = vtd_realize;
3756     x86_class->int_remap = vtd_int_remap;
3757     /* Supported by the pc-q35-* machine types */
3758     dc->user_creatable = true;
3759     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3760     dc->desc = "Intel IOMMU (VT-d) DMA Remapping device";
3761 }
3762 
3763 static const TypeInfo vtd_info = {
3764     .name          = TYPE_INTEL_IOMMU_DEVICE,
3765     .parent        = TYPE_X86_IOMMU_DEVICE,
3766     .instance_size = sizeof(IntelIOMMUState),
3767     .class_init    = vtd_class_init,
3768 };
3769 
3770 static void vtd_iommu_memory_region_class_init(ObjectClass *klass,
3771                                                      void *data)
3772 {
3773     IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
3774 
3775     imrc->translate = vtd_iommu_translate;
3776     imrc->notify_flag_changed = vtd_iommu_notify_flag_changed;
3777     imrc->replay = vtd_iommu_replay;
3778 }
3779 
3780 static const TypeInfo vtd_iommu_memory_region_info = {
3781     .parent = TYPE_IOMMU_MEMORY_REGION,
3782     .name = TYPE_INTEL_IOMMU_MEMORY_REGION,
3783     .class_init = vtd_iommu_memory_region_class_init,
3784 };
3785 
3786 static void vtd_register_types(void)
3787 {
3788     type_register_static(&vtd_info);
3789     type_register_static(&vtd_iommu_memory_region_info);
3790 }
3791 
3792 type_init(vtd_register_types)
3793