xref: /openbmc/qemu/hw/i386/intel_iommu.c (revision 650d103d3ea959212f826acb9d3fe80cf30e347b)
1 /*
2  * QEMU emulation of an Intel IOMMU (VT-d)
3  *   (DMA Remapping device)
4  *
5  * Copyright (C) 2013 Knut Omang, Oracle <knut.omang@oracle.com>
6  * Copyright (C) 2014 Le Tan, <tamlokveer@gmail.com>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12 
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17 
18  * You should have received a copy of the GNU General Public License along
19  * with this program; if not, see <http://www.gnu.org/licenses/>.
20  */
21 
22 #include "qemu/osdep.h"
23 #include "qemu/error-report.h"
24 #include "qapi/error.h"
25 #include "hw/sysbus.h"
26 #include "exec/address-spaces.h"
27 #include "intel_iommu_internal.h"
28 #include "hw/pci/pci.h"
29 #include "hw/pci/pci_bus.h"
30 #include "hw/i386/pc.h"
31 #include "hw/i386/apic-msidef.h"
32 #include "hw/boards.h"
33 #include "hw/i386/x86-iommu.h"
34 #include "hw/pci-host/q35.h"
35 #include "sysemu/kvm.h"
36 #include "hw/i386/apic_internal.h"
37 #include "kvm_i386.h"
38 #include "migration/vmstate.h"
39 #include "trace.h"
40 
41 /* context entry operations */
42 #define VTD_CE_GET_RID2PASID(ce) \
43     ((ce)->val[1] & VTD_SM_CONTEXT_ENTRY_RID2PASID_MASK)
44 #define VTD_CE_GET_PASID_DIR_TABLE(ce) \
45     ((ce)->val[0] & VTD_PASID_DIR_BASE_ADDR_MASK)
46 
47 /* pe operations */
48 #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT)
49 #define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW))
50 #define VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write) {\
51     if (ret_fr) {                                                             \
52         ret_fr = -ret_fr;                                                     \
53         if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {                   \
54             trace_vtd_fault_disabled();                                       \
55         } else {                                                              \
56             vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write);      \
57         }                                                                     \
58         goto error;                                                           \
59     }                                                                         \
60 }
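/*
 * A sketch of how VTD_PE_GET_FPD_ERR() is intended to be used (the real
 * call sites are in vtd_do_iommu_translate() further below): once a helper
 * has returned a negative -VTD_FR_* code in ret_fr, the macro flips it
 * positive, skips the fault report when fault processing is disabled
 * (is_fpd_set) and the fault is "qualified", otherwise records it via
 * vtd_report_dmar_fault(), and in both cases jumps to the caller's local
 * "error" label.
 */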
61 
62 static void vtd_address_space_refresh_all(IntelIOMMUState *s);
63 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
64 
65 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
66                             uint64_t wmask, uint64_t w1cmask)
67 {
68     stq_le_p(&s->csr[addr], val);
69     stq_le_p(&s->wmask[addr], wmask);
70     stq_le_p(&s->w1cmask[addr], w1cmask);
71 }
72 
73 static void vtd_define_quad_wo(IntelIOMMUState *s, hwaddr addr, uint64_t mask)
74 {
75     stq_le_p(&s->womask[addr], mask);
76 }
77 
78 static void vtd_define_long(IntelIOMMUState *s, hwaddr addr, uint32_t val,
79                             uint32_t wmask, uint32_t w1cmask)
80 {
81     stl_le_p(&s->csr[addr], val);
82     stl_le_p(&s->wmask[addr], wmask);
83     stl_le_p(&s->w1cmask[addr], w1cmask);
84 }
85 
86 static void vtd_define_long_wo(IntelIOMMUState *s, hwaddr addr, uint32_t mask)
87 {
88     stl_le_p(&s->womask[addr], mask);
89 }
90 
91 /* "External" get/set operations */
92 static void vtd_set_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val)
93 {
94     uint64_t oldval = ldq_le_p(&s->csr[addr]);
95     uint64_t wmask = ldq_le_p(&s->wmask[addr]);
96     uint64_t w1cmask = ldq_le_p(&s->w1cmask[addr]);
97     stq_le_p(&s->csr[addr],
98              ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
99 }
100 
101 static void vtd_set_long(IntelIOMMUState *s, hwaddr addr, uint32_t val)
102 {
103     uint32_t oldval = ldl_le_p(&s->csr[addr]);
104     uint32_t wmask = ldl_le_p(&s->wmask[addr]);
105     uint32_t w1cmask = ldl_le_p(&s->w1cmask[addr]);
106     stl_le_p(&s->csr[addr],
107              ((oldval & ~wmask) | (val & wmask)) & ~(w1cmask & val));
108 }
109 
110 static uint64_t vtd_get_quad(IntelIOMMUState *s, hwaddr addr)
111 {
112     uint64_t val = ldq_le_p(&s->csr[addr]);
113     uint64_t womask = ldq_le_p(&s->womask[addr]);
114     return val & ~womask;
115 }
116 
117 static uint32_t vtd_get_long(IntelIOMMUState *s, hwaddr addr)
118 {
119     uint32_t val = ldl_le_p(&s->csr[addr]);
120     uint32_t womask = ldl_le_p(&s->womask[addr]);
121     return val & ~womask;
122 }
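/*
 * Worked example of the masking above, with masks assumed purely for
 * illustration: take wmask = 0x0f (bits 0-3 read/write), w1cmask = 0x30
 * (bits 4-5 write-1-to-clear), oldval = 0x35 and a guest write of
 * val = 0x1a.  vtd_set_long() computes
 *     ((0x35 & ~0x0f) | (0x1a & 0x0f)) & ~(0x30 & 0x1a) = 0x3a & ~0x10 = 0x2a
 * so the RW bits take the new value, bit 4 is cleared because 1 was written
 * to it, and bit 5 keeps its old value.  On reads, vtd_get_quad()/
 * vtd_get_long() additionally hide any write-only bits via womask.
 */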
123 
124 /* "Internal" get/set operations */
125 static uint64_t vtd_get_quad_raw(IntelIOMMUState *s, hwaddr addr)
126 {
127     return ldq_le_p(&s->csr[addr]);
128 }
129 
130 static uint32_t vtd_get_long_raw(IntelIOMMUState *s, hwaddr addr)
131 {
132     return ldl_le_p(&s->csr[addr]);
133 }
134 
135 static void vtd_set_quad_raw(IntelIOMMUState *s, hwaddr addr, uint64_t val)
136 {
137     stq_le_p(&s->csr[addr], val);
138 }
139 
140 static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
141                                         uint32_t clear, uint32_t mask)
142 {
143     uint32_t new_val = (ldl_le_p(&s->csr[addr]) & ~clear) | mask;
144     stl_le_p(&s->csr[addr], new_val);
145     return new_val;
146 }
147 
148 static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
149                                         uint64_t clear, uint64_t mask)
150 {
151     uint64_t new_val = (ldq_le_p(&s->csr[addr]) & ~clear) | mask;
152     stq_le_p(&s->csr[addr], new_val);
153     return new_val;
154 }
155 
156 static inline void vtd_iommu_lock(IntelIOMMUState *s)
157 {
158     qemu_mutex_lock(&s->iommu_lock);
159 }
160 
161 static inline void vtd_iommu_unlock(IntelIOMMUState *s)
162 {
163     qemu_mutex_unlock(&s->iommu_lock);
164 }
165 
166 static void vtd_update_scalable_state(IntelIOMMUState *s)
167 {
168     uint64_t val = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
169 
170     if (s->scalable_mode) {
171         s->root_scalable = val & VTD_RTADDR_SMT;
172     }
173 }
174 
175 /* Whether the address space needs to notify new mappings */
176 static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as)
177 {
178     return as->notifier_flags & IOMMU_NOTIFIER_MAP;
179 }
180 
181 /* GHashTable functions */
182 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
183 {
184     return *((const uint64_t *)v1) == *((const uint64_t *)v2);
185 }
186 
187 static guint vtd_uint64_hash(gconstpointer v)
188 {
189     return (guint)*(const uint64_t *)v;
190 }
191 
192 static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value,
193                                           gpointer user_data)
194 {
195     VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
196     uint16_t domain_id = *(uint16_t *)user_data;
197     return entry->domain_id == domain_id;
198 }
199 
200 /* The shift of an addr for a certain level of paging structure */
201 static inline uint32_t vtd_slpt_level_shift(uint32_t level)
202 {
203     assert(level != 0);
204     return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS;
205 }
206 
207 static inline uint64_t vtd_slpt_level_page_mask(uint32_t level)
208 {
209     return ~((1ULL << vtd_slpt_level_shift(level)) - 1);
210 }
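/*
 * For illustration, assuming the usual VT-d geometry of 4K base pages and
 * 9 bits per second-level table (VTD_SL_LEVEL_BITS == 9): level 1 gives a
 * shift of 12 (4K page), level 2 gives 21 (2M superpage), level 3 gives 30
 * (1G superpage) and level 4 gives 39.  vtd_slpt_level_page_mask() is then
 * the mask clearing the offset bits below that shift, e.g. ~0x1fffffULL
 * for level 2.
 */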
211 
212 static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value,
213                                         gpointer user_data)
214 {
215     VTDIOTLBEntry *entry = (VTDIOTLBEntry *)value;
216     VTDIOTLBPageInvInfo *info = (VTDIOTLBPageInvInfo *)user_data;
217     uint64_t gfn = (info->addr >> VTD_PAGE_SHIFT_4K) & info->mask;
218     uint64_t gfn_tlb = (info->addr & entry->mask) >> VTD_PAGE_SHIFT_4K;
219     return (entry->domain_id == info->domain_id) &&
220             (((entry->gfn & info->mask) == gfn) ||
221              (entry->gfn == gfn_tlb));
222 }
223 
224 /* Reset the context cache generation of all VTDAddressSpaces to zero and set
225  * the generation of IntelIOMMUState to 1.  Must be called with IOMMU lock held.
226  */
227 static void vtd_reset_context_cache_locked(IntelIOMMUState *s)
228 {
229     VTDAddressSpace *vtd_as;
230     VTDBus *vtd_bus;
231     GHashTableIter bus_it;
232     uint32_t devfn_it;
233 
234     trace_vtd_context_cache_reset();
235 
236     g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr);
237 
238     while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) {
239         for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
240             vtd_as = vtd_bus->dev_as[devfn_it];
241             if (!vtd_as) {
242                 continue;
243             }
244             vtd_as->context_cache_entry.context_cache_gen = 0;
245         }
246     }
247     s->context_cache_gen = 1;
248 }
249 
250 /* Must be called with IOMMU lock held. */
251 static void vtd_reset_iotlb_locked(IntelIOMMUState *s)
252 {
253     assert(s->iotlb);
254     g_hash_table_remove_all(s->iotlb);
255 }
256 
257 static void vtd_reset_iotlb(IntelIOMMUState *s)
258 {
259     vtd_iommu_lock(s);
260     vtd_reset_iotlb_locked(s);
261     vtd_iommu_unlock(s);
262 }
263 
264 static void vtd_reset_caches(IntelIOMMUState *s)
265 {
266     vtd_iommu_lock(s);
267     vtd_reset_iotlb_locked(s);
268     vtd_reset_context_cache_locked(s);
269     vtd_iommu_unlock(s);
270 }
271 
272 static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id,
273                                   uint32_t level)
274 {
275     return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) |
276            ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT);
277 }
278 
279 static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
280 {
281     return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
282 }
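/*
 * The IOTLB key packs the page frame number into the low bits and places
 * the source-id and page-table level above it (at VTD_IOTLB_SID_SHIFT and
 * VTD_IOTLB_LVL_SHIFT), so that the same gfn cached for different devices,
 * or at different superpage levels, maps to distinct hash keys (assuming
 * the shifts leave enough room for the gfn).
 */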
283 
284 /* Must be called with IOMMU lock held */
285 static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id,
286                                        hwaddr addr)
287 {
288     VTDIOTLBEntry *entry;
289     uint64_t key;
290     int level;
291 
292     for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) {
293         key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level),
294                                 source_id, level);
295         entry = g_hash_table_lookup(s->iotlb, &key);
296         if (entry) {
297             goto out;
298         }
299     }
300 
301 out:
302     return entry;
303 }
304 
305 /* Must be called with IOMMU lock held */
306 static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
307                              uint16_t domain_id, hwaddr addr, uint64_t slpte,
308                              uint8_t access_flags, uint32_t level)
309 {
310     VTDIOTLBEntry *entry = g_malloc(sizeof(*entry));
311     uint64_t *key = g_malloc(sizeof(*key));
312     uint64_t gfn = vtd_get_iotlb_gfn(addr, level);
313 
314     trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id);
315     if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) {
316         trace_vtd_iotlb_reset("iotlb exceeds size limit");
317         vtd_reset_iotlb_locked(s);
318     }
319 
320     entry->gfn = gfn;
321     entry->domain_id = domain_id;
322     entry->slpte = slpte;
323     entry->access_flags = access_flags;
324     entry->mask = vtd_slpt_level_page_mask(level);
325     *key = vtd_get_iotlb_key(gfn, source_id, level);
326     g_hash_table_replace(s->iotlb, key, entry);
327 }
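/*
 * Note on capacity: there is no per-entry eviction here.  Once the hash
 * table holds VTD_IOTLB_MAX_SIZE cached translations, vtd_update_iotlb()
 * simply flushes the whole IOTLB and starts over, trading hit rate for
 * simplicity.
 */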
328 
329 /* Given the register addresses of both the message data and the message
330  * address, generate an interrupt via MSI.
331  */
332 static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg,
333                                    hwaddr mesg_data_reg)
334 {
335     MSIMessage msi;
336 
337     assert(mesg_data_reg < DMAR_REG_SIZE);
338     assert(mesg_addr_reg < DMAR_REG_SIZE);
339 
340     msi.address = vtd_get_long_raw(s, mesg_addr_reg);
341     msi.data = vtd_get_long_raw(s, mesg_data_reg);
342 
343     trace_vtd_irq_generate(msi.address, msi.data);
344 
345     apic_get_class()->send_msi(&msi);
346 }
347 
348 /* Generate a fault event to software via MSI if conditions are met.
349  * Note that the value of FSTS_REG passed in should be the one before
350  * any update.
351  */
352 static void vtd_generate_fault_event(IntelIOMMUState *s, uint32_t pre_fsts)
353 {
354     if (pre_fsts & VTD_FSTS_PPF || pre_fsts & VTD_FSTS_PFO ||
355         pre_fsts & VTD_FSTS_IQE) {
356         error_report_once("There are previous interrupt conditions "
357                           "to be serviced by software, fault event "
358                           "is not generated");
359         return;
360     }
361     vtd_set_clear_mask_long(s, DMAR_FECTL_REG, 0, VTD_FECTL_IP);
362     if (vtd_get_long_raw(s, DMAR_FECTL_REG) & VTD_FECTL_IM) {
363         error_report_once("Interrupt Mask set, irq is not generated");
364     } else {
365         vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
366         vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
367     }
368 }
369 
370 /* Check if the Fault (F) field of the Fault Recording Register referenced by
371  * @index is Set.
372  */
373 static bool vtd_is_frcd_set(IntelIOMMUState *s, uint16_t index)
374 {
375     /* Each reg is 128-bit */
376     hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
377     addr += 8; /* Access the high 64-bit half */
378 
379     assert(index < DMAR_FRCD_REG_NR);
380 
381     return vtd_get_quad_raw(s, addr) & VTD_FRCD_F;
382 }
383 
384 /* Update the PPF field of the Fault Status Register.
385  * Should be called whenever the F field of any fault recording
386  * register is changed.
387  */
388 static void vtd_update_fsts_ppf(IntelIOMMUState *s)
389 {
390     uint32_t i;
391     uint32_t ppf_mask = 0;
392 
393     for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
394         if (vtd_is_frcd_set(s, i)) {
395             ppf_mask = VTD_FSTS_PPF;
396             break;
397         }
398     }
399     vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_PPF, ppf_mask);
400     trace_vtd_fsts_ppf(!!ppf_mask);
401 }
402 
403 static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index)
404 {
405     /* Each reg is 128-bit */
406     hwaddr addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
407     addr += 8; /* Access the high 64-bit half */
408 
409     assert(index < DMAR_FRCD_REG_NR);
410 
411     vtd_set_clear_mask_quad(s, addr, 0, VTD_FRCD_F);
412     vtd_update_fsts_ppf(s);
413 }
414 
415 /* Must not update the F field now; that should be done later */
416 static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index,
417                             uint16_t source_id, hwaddr addr,
418                             VTDFaultReason fault, bool is_write)
419 {
420     uint64_t hi = 0, lo;
421     hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4);
422 
423     assert(index < DMAR_FRCD_REG_NR);
424 
425     lo = VTD_FRCD_FI(addr);
426     hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault);
427     if (!is_write) {
428         hi |= VTD_FRCD_T;
429     }
430     vtd_set_quad_raw(s, frcd_reg_addr, lo);
431     vtd_set_quad_raw(s, frcd_reg_addr + 8, hi);
432 
433     trace_vtd_frr_new(index, hi, lo);
434 }
435 
436 /* Try to collapse multiple pending faults from the same requester */
437 static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id)
438 {
439     uint32_t i;
440     uint64_t frcd_reg;
441     hwaddr addr = DMAR_FRCD_REG_OFFSET + 8; /* The high 64-bit half */
442 
443     for (i = 0; i < DMAR_FRCD_REG_NR; i++) {
444         frcd_reg = vtd_get_quad_raw(s, addr);
445         if ((frcd_reg & VTD_FRCD_F) &&
446             ((frcd_reg & VTD_FRCD_SID_MASK) == source_id)) {
447             return true;
448         }
449         addr += 16; /* 128-bit for each */
450     }
451     return false;
452 }
453 
454 /* Log and report a DMAR (address translation) fault to software */
455 static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id,
456                                   hwaddr addr, VTDFaultReason fault,
457                                   bool is_write)
458 {
459     uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
460 
461     assert(fault < VTD_FR_MAX);
462 
463     if (fault == VTD_FR_RESERVED_ERR) {
464         /* This is not a normal fault reason case. Drop it. */
465         return;
466     }
467 
468     trace_vtd_dmar_fault(source_id, fault, addr, is_write);
469 
470     if (fsts_reg & VTD_FSTS_PFO) {
471         error_report_once("New fault is not recorded due to "
472                           "Primary Fault Overflow");
473         return;
474     }
475 
476     if (vtd_try_collapse_fault(s, source_id)) {
477         error_report_once("New fault is not recorded due to "
478                           "compression of faults");
479         return;
480     }
481 
482     if (vtd_is_frcd_set(s, s->next_frcd_reg)) {
483         error_report_once("Next Fault Recording Reg is used, "
484                           "new fault is not recorded, set PFO field");
485         vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_PFO);
486         return;
487     }
488 
489     vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write);
490 
491     if (fsts_reg & VTD_FSTS_PPF) {
492         error_report_once("There are pending faults already, "
493                           "fault event is not generated");
494         vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg);
495         s->next_frcd_reg++;
496         if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
497             s->next_frcd_reg = 0;
498         }
499     } else {
500         vtd_set_clear_mask_long(s, DMAR_FSTS_REG, VTD_FSTS_FRI_MASK,
501                                 VTD_FSTS_FRI(s->next_frcd_reg));
502         vtd_set_frcd_and_update_ppf(s, s->next_frcd_reg); /* Will set PPF */
503         s->next_frcd_reg++;
504         if (s->next_frcd_reg == DMAR_FRCD_REG_NR) {
505             s->next_frcd_reg = 0;
506         }
507         /* This case actually causes the PPF to be Set,
508          * so generate a fault event (interrupt).
509          */
510          vtd_generate_fault_event(s, fsts_reg);
511     }
512 }
513 
514 /* Handle Invalidation Queue Error (IQE) conditions of the queued
515  * invalidation interface.
516  */
517 static void vtd_handle_inv_queue_error(IntelIOMMUState *s)
518 {
519     uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
520 
521     vtd_set_clear_mask_long(s, DMAR_FSTS_REG, 0, VTD_FSTS_IQE);
522     vtd_generate_fault_event(s, fsts_reg);
523 }
524 
525 /* Set the IWC field and try to generate an invalidation completion interrupt */
526 static void vtd_generate_completion_event(IntelIOMMUState *s)
527 {
528     if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) {
529         trace_vtd_inv_desc_wait_irq("One pending, skip current");
530         return;
531     }
532     vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC);
533     vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP);
534     if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) {
535         trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, "
536                                     "new event not generated");
537         return;
538     } else {
539         /* Generate the interrupt event */
540         trace_vtd_inv_desc_wait_irq("Generating complete event");
541         vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
542         vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
543     }
544 }
545 
546 static inline bool vtd_root_entry_present(IntelIOMMUState *s,
547                                           VTDRootEntry *re,
548                                           uint8_t devfn)
549 {
550     if (s->root_scalable && devfn > UINT8_MAX / 2) {
551         return re->hi & VTD_ROOT_ENTRY_P;
552     }
553 
554     return re->lo & VTD_ROOT_ENTRY_P;
555 }
556 
557 static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index,
558                               VTDRootEntry *re)
559 {
560     dma_addr_t addr;
561 
562     addr = s->root + index * sizeof(*re);
563     if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) {
564         re->lo = 0;
565         return -VTD_FR_ROOT_TABLE_INV;
566     }
567     re->lo = le64_to_cpu(re->lo);
568     re->hi = le64_to_cpu(re->hi);
569     return 0;
570 }
571 
572 static inline bool vtd_ce_present(VTDContextEntry *context)
573 {
574     return context->lo & VTD_CONTEXT_ENTRY_P;
575 }
576 
577 static int vtd_get_context_entry_from_root(IntelIOMMUState *s,
578                                            VTDRootEntry *re,
579                                            uint8_t index,
580                                            VTDContextEntry *ce)
581 {
582     dma_addr_t addr, ce_size;
583 
584     /* We have already checked that the root entry is present */
585     ce_size = s->root_scalable ? VTD_CTX_ENTRY_SCALABLE_SIZE :
586               VTD_CTX_ENTRY_LEGACY_SIZE;
587 
588     if (s->root_scalable && index > UINT8_MAX / 2) {
589         index = index & (~VTD_DEVFN_CHECK_MASK);
590         addr = re->hi & VTD_ROOT_ENTRY_CTP;
591     } else {
592         addr = re->lo & VTD_ROOT_ENTRY_CTP;
593     }
594 
595     addr = addr + index * ce_size;
596     if (dma_memory_read(&address_space_memory, addr, ce, ce_size)) {
597         return -VTD_FR_CONTEXT_TABLE_INV;
598     }
599 
600     ce->lo = le64_to_cpu(ce->lo);
601     ce->hi = le64_to_cpu(ce->hi);
602     if (ce_size == VTD_CTX_ENTRY_SCALABLE_SIZE) {
603         ce->val[2] = le64_to_cpu(ce->val[2]);
604         ce->val[3] = le64_to_cpu(ce->val[3]);
605     }
606     return 0;
607 }
608 
609 static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce)
610 {
611     return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
612 }
613 
614 static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw)
615 {
616     return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw);
617 }
618 
619 /* Whether the pte indicates the address of the page frame */
620 static inline bool vtd_is_last_slpte(uint64_t slpte, uint32_t level)
621 {
622     return level == VTD_SL_PT_LEVEL || (slpte & VTD_SL_PT_PAGE_SIZE_MASK);
623 }
624 
625 /* Get the content of an slpte located at @base_addr[@index] */
626 static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index)
627 {
628     uint64_t slpte;
629 
630     assert(index < VTD_SL_PT_ENTRY_NR);
631 
632     if (dma_memory_read(&address_space_memory,
633                         base_addr + index * sizeof(slpte), &slpte,
634                         sizeof(slpte))) {
635         slpte = (uint64_t)-1;
636         return slpte;
637     }
638     slpte = le64_to_cpu(slpte);
639     return slpte;
640 }
641 
642 /* Given an iova and the level of the paging structure, return the offset
643  * within the current level.
644  */
645 static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level)
646 {
647     return (iova >> vtd_slpt_level_shift(level)) &
648             ((1ULL << VTD_SL_LEVEL_BITS) - 1);
649 }
650 
651 /* Check Capability Register to see if the @level of page-table is supported */
652 static inline bool vtd_is_level_supported(IntelIOMMUState *s, uint32_t level)
653 {
654     return VTD_CAP_SAGAW_MASK & s->cap &
655            (1ULL << (level - 2 + VTD_CAP_SAGAW_SHIFT));
656 }
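/*
 * For example, following the SAGAW encoding assumed by the formula above:
 * a 3-level table (39-bit AGAW, level == 3) is supported when SAGAW bit 1
 * of the capability register is set, and a 4-level table (48-bit AGAW,
 * level == 4) when SAGAW bit 2 is set.
 */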
657 
658 /* Return true if check passed, otherwise false */
659 static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu,
660                                      VTDPASIDEntry *pe)
661 {
662     switch (VTD_PE_GET_TYPE(pe)) {
663     case VTD_SM_PASID_ENTRY_FLT:
664     case VTD_SM_PASID_ENTRY_SLT:
665     case VTD_SM_PASID_ENTRY_NESTED:
666         break;
667     case VTD_SM_PASID_ENTRY_PT:
668         if (!x86_iommu->pt_supported) {
669             return false;
670         }
671         break;
672     default:
673         /* Unknown type */
674         return false;
675     }
676     return true;
677 }
678 
679 static int vtd_get_pasid_dire(dma_addr_t pasid_dir_base,
680                               uint32_t pasid,
681                               VTDPASIDDirEntry *pdire)
682 {
683     uint32_t index;
684     dma_addr_t addr, entry_size;
685 
686     index = VTD_PASID_DIR_INDEX(pasid);
687     entry_size = VTD_PASID_DIR_ENTRY_SIZE;
688     addr = pasid_dir_base + index * entry_size;
689     if (dma_memory_read(&address_space_memory, addr, pdire, entry_size)) {
690         return -VTD_FR_PASID_TABLE_INV;
691     }
692 
693     return 0;
694 }
695 
696 static int vtd_get_pasid_entry(IntelIOMMUState *s,
697                                uint32_t pasid,
698                                VTDPASIDDirEntry *pdire,
699                                VTDPASIDEntry *pe)
700 {
701     uint32_t index;
702     dma_addr_t addr, entry_size;
703     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
704 
705     index = VTD_PASID_TABLE_INDEX(pasid);
706     entry_size = VTD_PASID_ENTRY_SIZE;
707     addr = pdire->val & VTD_PASID_TABLE_BASE_ADDR_MASK;
708     addr = addr + index * entry_size;
709     if (dma_memory_read(&address_space_memory, addr, pe, entry_size)) {
710         return -VTD_FR_PASID_TABLE_INV;
711     }
712 
713     /* Do translation type check */
714     if (!vtd_pe_type_check(x86_iommu, pe)) {
715         return -VTD_FR_PASID_TABLE_INV;
716     }
717 
718     if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) {
719         return -VTD_FR_PASID_TABLE_INV;
720     }
721 
722     return 0;
723 }
724 
725 static int vtd_get_pasid_entry_from_pasid(IntelIOMMUState *s,
726                                           dma_addr_t pasid_dir_base,
727                                           uint32_t pasid,
728                                           VTDPASIDEntry *pe)
729 {
730     int ret;
731     VTDPASIDDirEntry pdire;
732 
733     ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire);
734     if (ret) {
735         return ret;
736     }
737 
738     ret = vtd_get_pasid_entry(s, pasid, &pdire, pe);
739     if (ret) {
740         return ret;
741     }
742 
743     return ret;
744 }
745 
746 static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s,
747                                       VTDContextEntry *ce,
748                                       VTDPASIDEntry *pe)
749 {
750     uint32_t pasid;
751     dma_addr_t pasid_dir_base;
752     int ret = 0;
753 
754     pasid = VTD_CE_GET_RID2PASID(ce);
755     pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
756     ret = vtd_get_pasid_entry_from_pasid(s, pasid_dir_base, pasid, pe);
757 
758     return ret;
759 }
760 
761 static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s,
762                                 VTDContextEntry *ce,
763                                 bool *pe_fpd_set)
764 {
765     int ret;
766     uint32_t pasid;
767     dma_addr_t pasid_dir_base;
768     VTDPASIDDirEntry pdire;
769     VTDPASIDEntry pe;
770 
771     pasid = VTD_CE_GET_RID2PASID(ce);
772     pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce);
773 
774     ret = vtd_get_pasid_dire(pasid_dir_base, pasid, &pdire);
775     if (ret) {
776         return ret;
777     }
778 
779     if (pdire.val & VTD_PASID_DIR_FPD) {
780         *pe_fpd_set = true;
781         return 0;
782     }
783 
784     ret = vtd_get_pasid_entry(s, pasid, &pdire, &pe);
785     if (ret) {
786         return ret;
787     }
788 
789     if (pe.val[0] & VTD_PASID_ENTRY_FPD) {
790         *pe_fpd_set = true;
791     }
792 
793     return 0;
794 }
795 
796 /* Get the page-table level that hardware should use for the second-level
797  * page-table walk from the Address Width field of context-entry.
798  */
799 static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce)
800 {
801     return 2 + (ce->hi & VTD_CONTEXT_ENTRY_AW);
802 }
803 
804 static uint32_t vtd_get_iova_level(IntelIOMMUState *s,
805                                    VTDContextEntry *ce)
806 {
807     VTDPASIDEntry pe;
808 
809     if (s->root_scalable) {
810         vtd_ce_get_rid2pasid_entry(s, ce, &pe);
811         return VTD_PE_GET_LEVEL(&pe);
812     }
813 
814     return vtd_ce_get_level(ce);
815 }
816 
817 static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce)
818 {
819     return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9;
820 }
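/*
 * Both helpers above decode the same AW field of the context entry:
 * AW == 1 means a 3-level table with a 39-bit AGAW, AW == 2 a 4-level
 * table with a 48-bit AGAW (level = 2 + AW, agaw = 30 + AW * 9).
 */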
821 
822 static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s,
823                                   VTDContextEntry *ce)
824 {
825     VTDPASIDEntry pe;
826 
827     if (s->root_scalable) {
828         vtd_ce_get_rid2pasid_entry(s, ce, &pe);
829         return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9;
830     }
831 
832     return vtd_ce_get_agaw(ce);
833 }
834 
835 static inline uint32_t vtd_ce_get_type(VTDContextEntry *ce)
836 {
837     return ce->lo & VTD_CONTEXT_ENTRY_TT;
838 }
839 
840 /* Only for Legacy Mode. Return true if check passed, otherwise false */
841 static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu,
842                                      VTDContextEntry *ce)
843 {
844     switch (vtd_ce_get_type(ce)) {
845     case VTD_CONTEXT_TT_MULTI_LEVEL:
846         /* Always supported */
847         break;
848     case VTD_CONTEXT_TT_DEV_IOTLB:
849         if (!x86_iommu->dt_supported) {
850             error_report_once("%s: DT specified but not supported", __func__);
851             return false;
852         }
853         break;
854     case VTD_CONTEXT_TT_PASS_THROUGH:
855         if (!x86_iommu->pt_supported) {
856             error_report_once("%s: PT specified but not supported", __func__);
857             return false;
858         }
859         break;
860     default:
861         /* Unknown type */
862         error_report_once("%s: unknown ce type: %"PRIu32, __func__,
863                           vtd_ce_get_type(ce));
864         return false;
865     }
866     return true;
867 }
868 
869 static inline uint64_t vtd_iova_limit(IntelIOMMUState *s,
870                                       VTDContextEntry *ce, uint8_t aw)
871 {
872     uint32_t ce_agaw = vtd_get_iova_agaw(s, ce);
873     return 1ULL << MIN(ce_agaw, aw);
874 }
875 
876 /* Return true if IOVA passes range check, otherwise false. */
877 static inline bool vtd_iova_range_check(IntelIOMMUState *s,
878                                         uint64_t iova, VTDContextEntry *ce,
879                                         uint8_t aw)
880 {
881     /*
882      * Check if @iova is above 2^X-1, where X is the minimum of MGAW
883      * in CAP_REG and AW in context-entry.
884      */
885     return !(iova & ~(vtd_iova_limit(s, ce, aw) - 1));
886 }
887 
888 static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
889                                           VTDContextEntry *ce)
890 {
891     VTDPASIDEntry pe;
892 
893     if (s->root_scalable) {
894         vtd_ce_get_rid2pasid_entry(s, ce, &pe);
895         return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR;
896     }
897 
898     return vtd_ce_get_slpt_base(ce);
899 }
900 
901 /*
902  * Reserved field masks for slpte:
903  *     Index [1] to [4]: 4K pages
904  *     Index [5] to [8]: large pages
905  */
906 static uint64_t vtd_paging_entry_rsvd_field[9];
907 
908 static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
909 {
910     if (slpte & VTD_SL_PT_PAGE_SIZE_MASK) {
911         /* Maybe large page */
912         return slpte & vtd_paging_entry_rsvd_field[level + 4];
913     } else {
914         return slpte & vtd_paging_entry_rsvd_field[level];
915     }
916 }
917 
918 /* Find the VTDBus associated with a given bus number */
919 static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
920 {
921     VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num];
922     if (!vtd_bus) {
923         /*
924          * Iterate over the registered buses to find the one which
925          * currently holds this bus number, and update the bus_num
926          * lookup table:
927          */
928         GHashTableIter iter;
929 
930         g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
931         while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
932             if (pci_bus_num(vtd_bus->bus) == bus_num) {
933                 s->vtd_as_by_bus_num[bus_num] = vtd_bus;
934                 return vtd_bus;
935             }
936         }
937     }
938     return vtd_bus;
939 }
940 
941 /* Given the @iova, get the relevant @slptep.  @slpte_level will be the last
942  * level of the translation, which can be used to decide the size of a large page.
943  */
944 static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce,
945                              uint64_t iova, bool is_write,
946                              uint64_t *slptep, uint32_t *slpte_level,
947                              bool *reads, bool *writes, uint8_t aw_bits)
948 {
949     dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce);
950     uint32_t level = vtd_get_iova_level(s, ce);
951     uint32_t offset;
952     uint64_t slpte;
953     uint64_t access_right_check;
954 
955     if (!vtd_iova_range_check(s, iova, ce, aw_bits)) {
956         error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")",
957                           __func__, iova);
958         return -VTD_FR_ADDR_BEYOND_MGAW;
959     }
960 
961     /* FIXME: what is the Atomics request here? */
962     access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
963 
964     while (true) {
965         offset = vtd_iova_level_offset(iova, level);
966         slpte = vtd_get_slpte(addr, offset);
967 
968         if (slpte == (uint64_t)-1) {
969             error_report_once("%s: detected read error on DMAR slpte "
970                               "(iova=0x%" PRIx64 ")", __func__, iova);
971             if (level == vtd_get_iova_level(s, ce)) {
972                 /* Invalid programming of context-entry */
973                 return -VTD_FR_CONTEXT_ENTRY_INV;
974             } else {
975                 return -VTD_FR_PAGING_ENTRY_INV;
976             }
977         }
978         *reads = (*reads) && (slpte & VTD_SL_R);
979         *writes = (*writes) && (slpte & VTD_SL_W);
980         if (!(slpte & access_right_check)) {
981             error_report_once("%s: detected slpte permission error "
982                               "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", "
983                               "slpte=0x%" PRIx64 ", write=%d)", __func__,
984                               iova, level, slpte, is_write);
985             return is_write ? -VTD_FR_WRITE : -VTD_FR_READ;
986         }
987         if (vtd_slpte_nonzero_rsvd(slpte, level)) {
988             error_report_once("%s: detected slpte reserved bits non-zero "
989                               "(iova=0x%" PRIx64 ", level=0x%" PRIx32
990                               ", slpte=0x%" PRIx64 ")", __func__, iova,
991                               level, slpte);
992             return -VTD_FR_PAGING_ENTRY_RSVD;
993         }
994 
995         if (vtd_is_last_slpte(slpte, level)) {
996             *slptep = slpte;
997             *slpte_level = level;
998             return 0;
999         }
1000         addr = vtd_get_slpte_addr(slpte, aw_bits);
1001         level--;
1002     }
1003 }
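/*
 * On success, vtd_iova_to_slpte() leaves the leaf entry in *slptep and its
 * level in *slpte_level: level 1 for a 4K mapping, level 2 or 3 when the
 * leaf is a 2M or 1G superpage.  Callers use that level to derive the page
 * mask of the translation.
 */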
1004 
1005 typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);
1006 
1007 /**
1008  * Constant information used during page walking
1009  *
1010  * @hook_fn: hook function to be called on each detected page
1011  * @private: private data to be passed into the hook function
1012  * @notify_unmap: whether we should notify invalid entries
1013  * @as: VT-d address space of the device
1014  * @aw: maximum address width
1015  * @domain_id: domain ID of the page walk
1016  */
1017 typedef struct {
1018     VTDAddressSpace *as;
1019     vtd_page_walk_hook hook_fn;
1020     void *private;
1021     bool notify_unmap;
1022     uint8_t aw;
1023     uint16_t domain_id;
1024 } vtd_page_walk_info;
1025 
1026 static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info)
1027 {
1028     VTDAddressSpace *as = info->as;
1029     vtd_page_walk_hook hook_fn = info->hook_fn;
1030     void *private = info->private;
1031     DMAMap target = {
1032         .iova = entry->iova,
1033         .size = entry->addr_mask,
1034         .translated_addr = entry->translated_addr,
1035         .perm = entry->perm,
1036     };
1037     DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
1038 
1039     if (entry->perm == IOMMU_NONE && !info->notify_unmap) {
1040         trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
1041         return 0;
1042     }
1043 
1044     assert(hook_fn);
1045 
1046     /* Update local IOVA mapped ranges */
1047     if (entry->perm) {
1048         if (mapped) {
1049             /* If it's exactly the same translation, skip */
1050             if (!memcmp(mapped, &target, sizeof(target))) {
1051                 trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
1052                                                  entry->translated_addr);
1053                 return 0;
1054             } else {
1055                 /*
1056                  * Translation changed.  Normally this should not
1057                  * happen, but it can happen with buggy guest
1058                  * OSes.  Note that there will be a small window in
1059                  * which we have no mapping at all, but that is the
1060                  * best effort we can make.  The ideal way to emulate
1061                  * this is to atomically modify the PTE to follow
1062                  * what has changed, but we can't.  One example is
1063                  * that the vfio driver only has VFIO_IOMMU_[UN]MAP_DMA
1064                  * and no interface to modify a mapping (meanwhile it
1065                  * seems meaningless to even provide one).  Anyway,
1066                  * let's mark this as a TODO in case one day we have
1067                  * a better solution.
1068                  */
1069                 IOMMUAccessFlags cache_perm = entry->perm;
1070                 int ret;
1071 
1072                 /* Emulate an UNMAP */
1073                 entry->perm = IOMMU_NONE;
1074                 trace_vtd_page_walk_one(info->domain_id,
1075                                         entry->iova,
1076                                         entry->translated_addr,
1077                                         entry->addr_mask,
1078                                         entry->perm);
1079                 ret = hook_fn(entry, private);
1080                 if (ret) {
1081                     return ret;
1082                 }
1083                 /* Drop any existing mapping */
1084                 iova_tree_remove(as->iova_tree, &target);
1085                 /* Recover the correct permission */
1086                 entry->perm = cache_perm;
1087             }
1088         }
1089         iova_tree_insert(as->iova_tree, &target);
1090     } else {
1091         if (!mapped) {
1092             /* Skip since we didn't map this range at all */
1093             trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
1094             return 0;
1095         }
1096         iova_tree_remove(as->iova_tree, &target);
1097     }
1098 
1099     trace_vtd_page_walk_one(info->domain_id, entry->iova,
1100                             entry->translated_addr, entry->addr_mask,
1101                             entry->perm);
1102     return hook_fn(entry, private);
1103 }
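/*
 * Besides calling the hook, vtd_page_walk_one() above keeps as->iova_tree
 * consistent with what has been notified: ranges are inserted on MAP and
 * removed on UNMAP, a notification identical to the cached DMAMap is
 * suppressed, and a changed translation is emulated as an UNMAP of the old
 * range followed by a MAP of the new one.
 */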
1104 
1105 /**
1106  * vtd_page_walk_level - walk over specific level for IOVA range
1107  *
1108  * @addr: base GPA addr to start the walk
1109  * @start: IOVA range start address
1110  * @end: IOVA range end address (start <= addr < end)
1111  * @read: whether parent level has read permission
1112  * @write: whether parent level has write permission
1113  * @info: constant information for the page walk
1114  */
1115 static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
1116                                uint64_t end, uint32_t level, bool read,
1117                                bool write, vtd_page_walk_info *info)
1118 {
1119     bool read_cur, write_cur, entry_valid;
1120     uint32_t offset;
1121     uint64_t slpte;
1122     uint64_t subpage_size, subpage_mask;
1123     IOMMUTLBEntry entry;
1124     uint64_t iova = start;
1125     uint64_t iova_next;
1126     int ret = 0;
1127 
1128     trace_vtd_page_walk_level(addr, level, start, end);
1129 
1130     subpage_size = 1ULL << vtd_slpt_level_shift(level);
1131     subpage_mask = vtd_slpt_level_page_mask(level);
1132 
1133     while (iova < end) {
1134         iova_next = (iova & subpage_mask) + subpage_size;
1135 
1136         offset = vtd_iova_level_offset(iova, level);
1137         slpte = vtd_get_slpte(addr, offset);
1138 
1139         if (slpte == (uint64_t)-1) {
1140             trace_vtd_page_walk_skip_read(iova, iova_next);
1141             goto next;
1142         }
1143 
1144         if (vtd_slpte_nonzero_rsvd(slpte, level)) {
1145             trace_vtd_page_walk_skip_reserve(iova, iova_next);
1146             goto next;
1147         }
1148 
1149         /* Permissions are stacked with parents' */
1150         read_cur = read && (slpte & VTD_SL_R);
1151         write_cur = write && (slpte & VTD_SL_W);
1152 
1153         /*
1154          * As long as we have either read/write permission, this is a
1155          * valid entry. The rule works for both page entries and page
1156          * table entries.
1157          */
1158         entry_valid = read_cur | write_cur;
1159 
1160         if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
1161             /*
1162              * This is a valid PDE (or even bigger than PDE).  We need
1163              * to walk one further level.
1164              */
1165             ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
1166                                       iova, MIN(iova_next, end), level - 1,
1167                                       read_cur, write_cur, info);
1168         } else {
1169             /*
1170              * This means we are either:
1171              *
1172              * (1) the real page entry (either 4K page, or huge page)
1173              * (2) the whole range is invalid
1174              *
1175              * In either case, we send an IOTLB notification down.
1176              */
1177             entry.target_as = &address_space_memory;
1178             entry.iova = iova & subpage_mask;
1179             entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
1180             entry.addr_mask = ~subpage_mask;
1181             /* NOTE: this is only meaningful if entry_valid == true */
1182             entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
1183             ret = vtd_page_walk_one(&entry, info);
1184         }
1185 
1186         if (ret < 0) {
1187             return ret;
1188         }
1189 
1190 next:
1191         iova = iova_next;
1192     }
1193 
1194     return 0;
1195 }
1196 
1197 /**
1198  * vtd_page_walk - walk specific IOVA range, and call the hook
1199  *
1200  * @s: intel iommu state
1201  * @ce: context entry to walk upon
1202  * @start: IOVA address to start the walk
1203  * @end: IOVA range end address (start <= addr < end)
1204  * @info: page walking information struct
1205  */
1206 static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce,
1207                          uint64_t start, uint64_t end,
1208                          vtd_page_walk_info *info)
1209 {
1210     dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce);
1211     uint32_t level = vtd_get_iova_level(s, ce);
1212 
1213     if (!vtd_iova_range_check(s, start, ce, info->aw)) {
1214         return -VTD_FR_ADDR_BEYOND_MGAW;
1215     }
1216 
1217     if (!vtd_iova_range_check(s, end, ce, info->aw)) {
1218         /* Fix end so that it reaches the maximum */
1219         end = vtd_iova_limit(s, ce, info->aw);
1220     }
1221 
1222     return vtd_page_walk_level(addr, start, end, level, true, true, info);
1223 }
1224 
1225 static int vtd_root_entry_rsvd_bits_check(IntelIOMMUState *s,
1226                                           VTDRootEntry *re)
1227 {
1228     /* Legacy Mode reserved bits check */
1229     if (!s->root_scalable &&
1230         (re->hi || (re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits))))
1231         goto rsvd_err;
1232 
1233     /* Scalable Mode reserved bits check */
1234     if (s->root_scalable &&
1235         ((re->lo & VTD_ROOT_ENTRY_RSVD(s->aw_bits)) ||
1236          (re->hi & VTD_ROOT_ENTRY_RSVD(s->aw_bits))))
1237         goto rsvd_err;
1238 
1239     return 0;
1240 
1241 rsvd_err:
1242     error_report_once("%s: invalid root entry: hi=0x%"PRIx64
1243                       ", lo=0x%"PRIx64,
1244                       __func__, re->hi, re->lo);
1245     return -VTD_FR_ROOT_ENTRY_RSVD;
1246 }
1247 
1248 static inline int vtd_context_entry_rsvd_bits_check(IntelIOMMUState *s,
1249                                                     VTDContextEntry *ce)
1250 {
1251     if (!s->root_scalable &&
1252         (ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI ||
1253          ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) {
1254         error_report_once("%s: invalid context entry: hi=%"PRIx64
1255                           ", lo=%"PRIx64" (reserved nonzero)",
1256                           __func__, ce->hi, ce->lo);
1257         return -VTD_FR_CONTEXT_ENTRY_RSVD;
1258     }
1259 
1260     if (s->root_scalable &&
1261         (ce->val[0] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL0(s->aw_bits) ||
1262          ce->val[1] & VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 ||
1263          ce->val[2] ||
1264          ce->val[3])) {
1265         error_report_once("%s: invalid context entry: val[3]=%"PRIx64
1266                           ", val[2]=%"PRIx64
1267                           ", val[1]=%"PRIx64
1268                           ", val[0]=%"PRIx64" (reserved nonzero)",
1269                           __func__, ce->val[3], ce->val[2],
1270                           ce->val[1], ce->val[0]);
1271         return -VTD_FR_CONTEXT_ENTRY_RSVD;
1272     }
1273 
1274     return 0;
1275 }
1276 
1277 static int vtd_ce_rid2pasid_check(IntelIOMMUState *s,
1278                                   VTDContextEntry *ce)
1279 {
1280     VTDPASIDEntry pe;
1281 
1282     /*
1283      * Make sure that, in Scalable Mode, a present context entry
1284      * has a valid rid2pasid setting, which includes a valid
1285      * rid2pasid field and a corresponding valid pasid entry.
1286      */
1287     return vtd_ce_get_rid2pasid_entry(s, ce, &pe);
1288 }
1289 
1290 /* Map a device to its corresponding domain (context-entry) */
1291 static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
1292                                     uint8_t devfn, VTDContextEntry *ce)
1293 {
1294     VTDRootEntry re;
1295     int ret_fr;
1296     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
1297 
1298     ret_fr = vtd_get_root_entry(s, bus_num, &re);
1299     if (ret_fr) {
1300         return ret_fr;
1301     }
1302 
1303     if (!vtd_root_entry_present(s, &re, devfn)) {
1304         /* Not an error - it's okay if we don't have a root entry. */
1305         trace_vtd_re_not_present(bus_num);
1306         return -VTD_FR_ROOT_ENTRY_P;
1307     }
1308 
1309     ret_fr = vtd_root_entry_rsvd_bits_check(s, &re);
1310     if (ret_fr) {
1311         return ret_fr;
1312     }
1313 
1314     ret_fr = vtd_get_context_entry_from_root(s, &re, devfn, ce);
1315     if (ret_fr) {
1316         return ret_fr;
1317     }
1318 
1319     if (!vtd_ce_present(ce)) {
1320         /* Not an error - it's okay if we don't have a context entry. */
1321         trace_vtd_ce_not_present(bus_num, devfn);
1322         return -VTD_FR_CONTEXT_ENTRY_P;
1323     }
1324 
1325     ret_fr = vtd_context_entry_rsvd_bits_check(s, ce);
1326     if (ret_fr) {
1327         return ret_fr;
1328     }
1329 
1330     /* Check if the programming of context-entry is valid */
1331     if (!s->root_scalable &&
1332         !vtd_is_level_supported(s, vtd_ce_get_level(ce))) {
1333         error_report_once("%s: invalid context entry: hi=%"PRIx64
1334                           ", lo=%"PRIx64" (level %d not supported)",
1335                           __func__, ce->hi, ce->lo,
1336                           vtd_ce_get_level(ce));
1337         return -VTD_FR_CONTEXT_ENTRY_INV;
1338     }
1339 
1340     if (!s->root_scalable) {
1341         /* Do translation type check */
1342         if (!vtd_ce_type_check(x86_iommu, ce)) {
1343             /* Errors dumped in vtd_ce_type_check() */
1344             return -VTD_FR_CONTEXT_ENTRY_INV;
1345         }
1346     } else {
1347         /*
1348          * Check that the programming of context-entry.rid2pasid
1349          * and the corresponding pasid setting is valid, so that
1350          * future helper calls do not need to re-check the result
1351          * of fetching the pasid entry.
1352          */
1353         ret_fr = vtd_ce_rid2pasid_check(s, ce);
1354         if (ret_fr) {
1355             return ret_fr;
1356         }
1357     }
1358 
1359     return 0;
1360 }
1361 
1362 static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry,
1363                                      void *private)
1364 {
1365     memory_region_notify_iommu((IOMMUMemoryRegion *)private, 0, *entry);
1366     return 0;
1367 }
1368 
1369 static uint16_t vtd_get_domain_id(IntelIOMMUState *s,
1370                                   VTDContextEntry *ce)
1371 {
1372     VTDPASIDEntry pe;
1373 
1374     if (s->root_scalable) {
1375         vtd_ce_get_rid2pasid_entry(s, ce, &pe);
1376         return VTD_SM_PASID_ENTRY_DID(pe.val[1]);
1377     }
1378 
1379     return VTD_CONTEXT_ENTRY_DID(ce->hi);
1380 }
1381 
1382 static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
1383                                             VTDContextEntry *ce,
1384                                             hwaddr addr, hwaddr size)
1385 {
1386     IntelIOMMUState *s = vtd_as->iommu_state;
1387     vtd_page_walk_info info = {
1388         .hook_fn = vtd_sync_shadow_page_hook,
1389         .private = (void *)&vtd_as->iommu,
1390         .notify_unmap = true,
1391         .aw = s->aw_bits,
1392         .as = vtd_as,
1393         .domain_id = vtd_get_domain_id(s, ce),
1394     };
1395 
1396     return vtd_page_walk(s, ce, addr, addr + size, &info);
1397 }
1398 
1399 static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
1400 {
1401     int ret;
1402     VTDContextEntry ce;
1403     IOMMUNotifier *n;
1404 
1405     ret = vtd_dev_to_context_entry(vtd_as->iommu_state,
1406                                    pci_bus_num(vtd_as->bus),
1407                                    vtd_as->devfn, &ce);
1408     if (ret) {
1409         if (ret == -VTD_FR_CONTEXT_ENTRY_P) {
1410             /*
1411              * It's a valid scenario to have a context entry that is
1412              * not present.  For example, when a device is removed
1413              * from an existing domain, the context entry will be
1414              * zeroed by the guest before it is put into another
1415              * domain.  When this happens, instead of synchronizing
1416              * the shadow pages we should invalidate all existing
1417              * mappings and notify the backends.
1418              */
1419             IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
1420                 vtd_address_space_unmap(vtd_as, n);
1421             }
1422             ret = 0;
1423         }
1424         return ret;
1425     }
1426 
1427     return vtd_sync_shadow_page_table_range(vtd_as, &ce, 0, UINT64_MAX);
1428 }
1429 
1430 /*
1431  * Check if a specific device is configured to bypass address
1432  * translation for DMA requests. In Scalable Mode, whether the
1433  * 1st-level or the 2nd-level translation is bypassed depends
1434  * on the PGTT setting.
1435  */
1436 static bool vtd_dev_pt_enabled(VTDAddressSpace *as)
1437 {
1438     IntelIOMMUState *s;
1439     VTDContextEntry ce;
1440     VTDPASIDEntry pe;
1441     int ret;
1442 
1443     assert(as);
1444 
1445     s = as->iommu_state;
1446     ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
1447                                    as->devfn, &ce);
1448     if (ret) {
1449         /*
1450          * Possibly failed to parse the context entry for some reason
1451          * (e.g., during init, or any guest configuration errors on
1452          * context entries). We should assume PT not enabled for
1453          * safety.
1454          */
1455         return false;
1456     }
1457 
1458     if (s->root_scalable) {
1459         ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe);
1460         if (ret) {
1461             error_report_once("%s: vtd_ce_get_rid2pasid_entry error: %"PRId32,
1462                               __func__, ret);
1463             return false;
1464         }
1465         return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT);
1466     }
1467 
1468     return (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH);
1469 }
1470 
1471 /* Return whether the device is using IOMMU translation. */
1472 static bool vtd_switch_address_space(VTDAddressSpace *as)
1473 {
1474     bool use_iommu;
1475     /* Whether we need to take the BQL on our own */
1476     bool take_bql = !qemu_mutex_iothread_locked();
1477 
1478     assert(as);
1479 
1480     use_iommu = as->iommu_state->dmar_enabled && !vtd_dev_pt_enabled(as);
1481 
1482     trace_vtd_switch_address_space(pci_bus_num(as->bus),
1483                                    VTD_PCI_SLOT(as->devfn),
1484                                    VTD_PCI_FUNC(as->devfn),
1485                                    use_iommu);
1486 
1487     /*
1488      * It's possible that we reach here without BQL, e.g., when called
1489      * from vtd_pt_enable_fast_path(). However the memory APIs need
1490      * it. We'd better make sure we already hold it, or take it here.
1491      */
1492     if (take_bql) {
1493         qemu_mutex_lock_iothread();
1494     }
1495 
1496     /* Turn off first then on the other */
1497     if (use_iommu) {
1498         memory_region_set_enabled(&as->nodmar, false);
1499         memory_region_set_enabled(MEMORY_REGION(&as->iommu), true);
1500     } else {
1501         memory_region_set_enabled(MEMORY_REGION(&as->iommu), false);
1502         memory_region_set_enabled(&as->nodmar, true);
1503     }
1504 
1505     if (take_bql) {
1506         qemu_mutex_unlock_iothread();
1507     }
1508 
1509     return use_iommu;
1510 }
1511 
1512 static void vtd_switch_address_space_all(IntelIOMMUState *s)
1513 {
1514     GHashTableIter iter;
1515     VTDBus *vtd_bus;
1516     int i;
1517 
1518     g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
1519     while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
1520         for (i = 0; i < PCI_DEVFN_MAX; i++) {
1521             if (!vtd_bus->dev_as[i]) {
1522                 continue;
1523             }
1524             vtd_switch_address_space(vtd_bus->dev_as[i]);
1525         }
1526     }
1527 }
1528 
1529 static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn)
1530 {
1531     return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL);
1532 }
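/*
 * For illustration: a device at bus 0x01, slot 0x02, function 0x03 has
 * devfn == (0x02 << 3) | 0x03 == 0x13, so vtd_make_source_id() yields
 * source-id 0x0113, matching the PCI requester-id layout with the bus
 * number in the high byte and devfn in the low byte.
 */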
1533 
1534 static const bool vtd_qualified_faults[] = {
1535     [VTD_FR_RESERVED] = false,
1536     [VTD_FR_ROOT_ENTRY_P] = false,
1537     [VTD_FR_CONTEXT_ENTRY_P] = true,
1538     [VTD_FR_CONTEXT_ENTRY_INV] = true,
1539     [VTD_FR_ADDR_BEYOND_MGAW] = true,
1540     [VTD_FR_WRITE] = true,
1541     [VTD_FR_READ] = true,
1542     [VTD_FR_PAGING_ENTRY_INV] = true,
1543     [VTD_FR_ROOT_TABLE_INV] = false,
1544     [VTD_FR_CONTEXT_TABLE_INV] = false,
1545     [VTD_FR_ROOT_ENTRY_RSVD] = false,
1546     [VTD_FR_PAGING_ENTRY_RSVD] = true,
1547     [VTD_FR_CONTEXT_ENTRY_TT] = true,
1548     [VTD_FR_PASID_TABLE_INV] = false,
1549     [VTD_FR_RESERVED_ERR] = false,
1550     [VTD_FR_MAX] = false,
1551 };
1552 
1553 /* Check whether a fault condition is "qualified".  A qualified fault is
1554  * reported to software only if the FPD field in the context-entry used to
1555  * process the faulting request is 0.
1556  */
1557 static inline bool vtd_is_qualified_fault(VTDFaultReason fault)
1558 {
1559     return vtd_qualified_faults[fault];
1560 }
1561 
1562 static inline bool vtd_is_interrupt_addr(hwaddr addr)
1563 {
1564     return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST;
1565 }
1566 
1567 static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id)
1568 {
1569     VTDBus *vtd_bus;
1570     VTDAddressSpace *vtd_as;
1571     bool success = false;
1572 
1573     vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id));
1574     if (!vtd_bus) {
1575         goto out;
1576     }
1577 
1578     vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)];
1579     if (!vtd_as) {
1580         goto out;
1581     }
1582 
1583     if (vtd_switch_address_space(vtd_as) == false) {
1584         /* We switched off IOMMU region successfully. */
1585         success = true;
1586     }
1587 
1588 out:
1589     trace_vtd_pt_enable_fast_path(source_id, success);
1590 }
1591 
1592 /* Map the device to its context-entry, then do a paging-structures walk to
1593  * perform an IOMMU translation.
1594  *
1595  * Called from an RCU critical section.
1596  *
1597  * @bus_num: The bus number
1598  * @devfn: The combined device and function number
1599  * @is_write: Whether the access is a write operation
1600  * @entry: IOMMUTLBEntry that contains the address to be translated and the result
1601  *
1602  * Returns true if translation is successful, otherwise false.
1603  */
1604 static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
1605                                    uint8_t devfn, hwaddr addr, bool is_write,
1606                                    IOMMUTLBEntry *entry)
1607 {
1608     IntelIOMMUState *s = vtd_as->iommu_state;
1609     VTDContextEntry ce;
1610     uint8_t bus_num = pci_bus_num(bus);
1611     VTDContextCacheEntry *cc_entry;
1612     uint64_t slpte, page_mask;
1613     uint32_t level;
1614     uint16_t source_id = vtd_make_source_id(bus_num, devfn);
1615     int ret_fr;
1616     bool is_fpd_set = false;
1617     bool reads = true;
1618     bool writes = true;
1619     uint8_t access_flags;
1620     VTDIOTLBEntry *iotlb_entry;
1621 
1622     /*
1623      * We have a standalone memory region for interrupt addresses; we
1624      * should never receive translation requests in this region.
1625      */
1626     assert(!vtd_is_interrupt_addr(addr));
1627 
1628     vtd_iommu_lock(s);
1629 
1630     cc_entry = &vtd_as->context_cache_entry;
1631 
1632     /* Try to fetch the slpte from the IOTLB */
1633     iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
1634     if (iotlb_entry) {
1635         trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte,
1636                                  iotlb_entry->domain_id);
1637         slpte = iotlb_entry->slpte;
1638         access_flags = iotlb_entry->access_flags;
1639         page_mask = iotlb_entry->mask;
1640         goto out;
1641     }
1642 
1643     /* Try to fetch context-entry from cache first */
1644     if (cc_entry->context_cache_gen == s->context_cache_gen) {
1645         trace_vtd_iotlb_cc_hit(bus_num, devfn, cc_entry->context_entry.hi,
1646                                cc_entry->context_entry.lo,
1647                                cc_entry->context_cache_gen);
1648         ce = cc_entry->context_entry;
1649         is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
1650         if (!is_fpd_set && s->root_scalable) {
1651             ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set);
1652             VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
1653         }
1654     } else {
1655         ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
1656         is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD;
1657         if (!ret_fr && !is_fpd_set && s->root_scalable) {
1658             ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set);
1659         }
1660         VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
1661         /* Update context-cache */
1662         trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo,
1663                                   cc_entry->context_cache_gen,
1664                                   s->context_cache_gen);
1665         cc_entry->context_entry = ce;
1666         cc_entry->context_cache_gen = s->context_cache_gen;
1667     }
1668 
1669     /*
1670      * We don't need to translate for pass-through context entries.
1671      * Also, skip IOTLB caching for PT devices.
1672      */
1673     if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) {
1674         entry->iova = addr & VTD_PAGE_MASK_4K;
1675         entry->translated_addr = entry->iova;
1676         entry->addr_mask = ~VTD_PAGE_MASK_4K;
1677         entry->perm = IOMMU_RW;
1678         trace_vtd_translate_pt(source_id, entry->iova);
1679 
1680         /*
1681          * Reaching here means that caching-mode is not enabled and this
1682          * is the first passthrough translation for the device, so let's
1683          * enable the fast path for passthrough.
1684          *
1685          * When passthrough is disabled again for the device, we catch
1686          * that via the context entry invalidation, and the IOMMU region
1687          * can be swapped back in.
1688          */
1689         vtd_pt_enable_fast_path(s, source_id);
1690         vtd_iommu_unlock(s);
1691         return true;
1692     }
1693 
1694     ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level,
1695                                &reads, &writes, s->aw_bits);
1696     VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write);
1697 
1698     page_mask = vtd_slpt_level_page_mask(level);
1699     access_flags = IOMMU_ACCESS_FLAG(reads, writes);
1700     vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce), addr, slpte,
1701                      access_flags, level);
1702 out:
1703     vtd_iommu_unlock(s);
1704     entry->iova = addr & page_mask;
1705     entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask;
1706     entry->addr_mask = ~page_mask;
1707     entry->perm = access_flags;
1708     return true;
1709 
1710 error:
1711     vtd_iommu_unlock(s);
1712     entry->iova = 0;
1713     entry->translated_addr = 0;
1714     entry->addr_mask = 0;
1715     entry->perm = IOMMU_NONE;
1716     return false;
1717 }
1718 
1719 static void vtd_root_table_setup(IntelIOMMUState *s)
1720 {
1721     s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
1722     s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);
1723 
1724     vtd_update_scalable_state(s);
1725 
1726     trace_vtd_reg_dmar_root(s->root, s->root_scalable);
1727 }
1728 
1729 static void vtd_iec_notify_all(IntelIOMMUState *s, bool global,
1730                                uint32_t index, uint32_t mask)
1731 {
1732     x86_iommu_iec_notify_all(X86_IOMMU_DEVICE(s), global, index, mask);
1733 }
1734 
1735 static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
1736 {
1737     uint64_t value = 0;
1738     value = vtd_get_quad_raw(s, DMAR_IRTA_REG);
1739     s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1);
1740     s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits);
1741     s->intr_eime = value & VTD_IRTA_EIME;
1742 
1743     /* Notify global invalidation */
1744     vtd_iec_notify_all(s, true, 0, 0);
1745 
1746     trace_vtd_reg_ir_root(s->intr_root, s->intr_size);
1747 }
1748 
1749 static void vtd_iommu_replay_all(IntelIOMMUState *s)
1750 {
1751     VTDAddressSpace *vtd_as;
1752 
1753     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
1754         vtd_sync_shadow_page_table(vtd_as);
1755     }
1756 }
1757 
1758 static void vtd_context_global_invalidate(IntelIOMMUState *s)
1759 {
1760     trace_vtd_inv_desc_cc_global();
1761     /* Protects context cache */
1762     vtd_iommu_lock(s);
1763     s->context_cache_gen++;
1764     if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) {
1765         vtd_reset_context_cache_locked(s);
1766     }
1767     vtd_iommu_unlock(s);
1768     vtd_address_space_refresh_all(s);
1769     /*
1770      * From VT-d spec 6.5.2.1, a global context entry invalidation
1771      * should be followed by an IOTLB global invalidation, so we should
1772      * be safe even without this. However, let's replay the region as
1773      * well to be safer, and revisit this when the VT-d emulation code
1774      * needs finer tuning.
1775      */
1776     vtd_iommu_replay_all(s);
1777 }
1778 
1779 /* Do a context-cache device-selective invalidation.
1780  * @func_mask: FM field after shifting
1781  */
1782 static void vtd_context_device_invalidate(IntelIOMMUState *s,
1783                                           uint16_t source_id,
1784                                           uint16_t func_mask)
1785 {
1786     uint16_t mask;
1787     VTDBus *vtd_bus;
1788     VTDAddressSpace *vtd_as;
1789     uint8_t bus_n, devfn;
1790     uint16_t devfn_it;
1791 
1792     trace_vtd_inv_desc_cc_devices(source_id, func_mask);
1793 
1794     switch (func_mask & 3) {
1795     case 0:
1796         mask = 0;   /* No bits in the SID field masked */
1797         break;
1798     case 1:
1799         mask = 4;   /* Mask bit 2 in the SID field */
1800         break;
1801     case 2:
1802         mask = 6;   /* Mask bits 2:1 in the SID field */
1803         break;
1804     case 3:
1805         mask = 7;   /* Mask bits 2:0 in the SID field */
1806         break;
1807     }
1808     mask = ~mask;
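    /*
     * Worked example (illustrative only): with func_mask == 2, mask ends up
     * as ~6 == 0xfff9, so function bits 2:1 are don't-care.  A device-
     * selective invalidation for devfn 0x2a (device 5, function 2) then
     * also matches devfn 0x28, 0x2c and 0x2e, i.e. functions 0, 2, 4 and 6
     * of device 5.
     */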
1809 
1810     bus_n = VTD_SID_TO_BUS(source_id);
1811     vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
1812     if (vtd_bus) {
1813         devfn = VTD_SID_TO_DEVFN(source_id);
1814         for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) {
1815             vtd_as = vtd_bus->dev_as[devfn_it];
1816             if (vtd_as && ((devfn_it & mask) == (devfn & mask))) {
1817                 trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it),
1818                                              VTD_PCI_FUNC(devfn_it));
1819                 vtd_iommu_lock(s);
1820                 vtd_as->context_cache_entry.context_cache_gen = 0;
1821                 vtd_iommu_unlock(s);
1822                 /*
1823                  * Switch the address space when needed, in case the
1824                  * device's passthrough bit has been toggled.
1825                  */
1826                 vtd_switch_address_space(vtd_as);
1827                 /*
1828                  * A device is moving out of (or into) a domain, so
1829                  * resync the shadow page table.
1830                  * This does no harm even if we have no such
1831                  * notifier registered - the IOMMU notification
1832                  * framework will skip MAP notifications in that
1833                  * case.
1834                  */
1835                 vtd_sync_shadow_page_table(vtd_as);
1836             }
1837         }
1838     }
1839 }
1840 
1841 /* Context-cache invalidation
1842  * Returns the Context Actual Invalidation Granularity.
1843  * @val: the content of the CCMD_REG
1844  */
1845 static uint64_t vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val)
1846 {
1847     uint64_t caig;
1848     uint64_t type = val & VTD_CCMD_CIRG_MASK;
1849 
1850     switch (type) {
1851     case VTD_CCMD_DOMAIN_INVL:
1852         /* Fall through */
1853     case VTD_CCMD_GLOBAL_INVL:
1854         caig = VTD_CCMD_GLOBAL_INVL_A;
1855         vtd_context_global_invalidate(s);
1856         break;
1857 
1858     case VTD_CCMD_DEVICE_INVL:
1859         caig = VTD_CCMD_DEVICE_INVL_A;
1860         vtd_context_device_invalidate(s, VTD_CCMD_SID(val), VTD_CCMD_FM(val));
1861         break;
1862 
1863     default:
1864         error_report_once("%s: invalid context: 0x%" PRIx64,
1865                           __func__, val);
1866         caig = 0;
1867     }
1868     return caig;
1869 }
1870 
1871 static void vtd_iotlb_global_invalidate(IntelIOMMUState *s)
1872 {
1873     trace_vtd_inv_desc_iotlb_global();
1874     vtd_reset_iotlb(s);
1875     vtd_iommu_replay_all(s);
1876 }
1877 
1878 static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
1879 {
1880     VTDContextEntry ce;
1881     VTDAddressSpace *vtd_as;
1882 
1883     trace_vtd_inv_desc_iotlb_domain(domain_id);
1884 
1885     vtd_iommu_lock(s);
1886     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain,
1887                                 &domain_id);
1888     vtd_iommu_unlock(s);
1889 
1890     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
1891         if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
1892                                       vtd_as->devfn, &ce) &&
1893             domain_id == vtd_get_domain_id(s, &ce)) {
1894             vtd_sync_shadow_page_table(vtd_as);
1895         }
1896     }
1897 }
1898 
1899 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
1900                                            uint16_t domain_id, hwaddr addr,
1901                                            uint8_t am)
1902 {
1903     VTDAddressSpace *vtd_as;
1904     VTDContextEntry ce;
1905     int ret;
1906     hwaddr size = (1 << am) * VTD_PAGE_SIZE;
1907 
1908     QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
1909         ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
1910                                        vtd_as->devfn, &ce);
1911         if (!ret && domain_id == vtd_get_domain_id(s, &ce)) {
1912             if (vtd_as_has_map_notifier(vtd_as)) {
1913                 /*
1914                  * As long as we have MAP notifications registered in
1915                  * any of our IOMMU notifiers, we need to sync the
1916                  * shadow page table.
1917                  */
1918                 vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
1919             } else {
1920                 /*
1921                  * For UNMAP-only notifiers, we don't need to walk the
1922                  * page tables.  We just deliver the PSI down to
1923                  * invalidate caches.
1924                  */
1925                 IOMMUTLBEntry entry = {
1926                     .target_as = &address_space_memory,
1927                     .iova = addr,
1928                     .translated_addr = 0,
1929                     .addr_mask = size - 1,
1930                     .perm = IOMMU_NONE,
1931                 };
1932                 memory_region_notify_iommu(&vtd_as->iommu, 0, entry);
1933             }
1934         }
1935     }
1936 }
1937 
1938 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
1939                                       hwaddr addr, uint8_t am)
1940 {
1941     VTDIOTLBPageInvInfo info;
1942 
1943     trace_vtd_inv_desc_iotlb_pages(domain_id, addr, am);
1944 
1945     assert(am <= VTD_MAMV);
1946     info.domain_id = domain_id;
1947     info.addr = addr;
1948     info.mask = ~((1 << am) - 1);
1949     vtd_iommu_lock(s);
1950     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
1951     vtd_iommu_unlock(s);
1952     vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
1953 }
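/*
 * Worked example (illustrative only): a PSI with am == 2 covers 1 << 2 = 4
 * contiguous 4K pages.  info.mask above becomes ~3, so IOTLB entries whose
 * GFN matches addr in all but the low two bits are dropped, and
 * vtd_iotlb_page_invalidate_notify() reports a 16K (addr_mask == 0x3fff)
 * UNMAP range to UNMAP-only notifiers.
 */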
1954 
1955 /* Flush IOTLB
1956  * Returns the IOTLB Actual Invalidation Granularity.
1957  * @val: the content of the IOTLB_REG
1958  */
1959 static uint64_t vtd_iotlb_flush(IntelIOMMUState *s, uint64_t val)
1960 {
1961     uint64_t iaig;
1962     uint64_t type = val & VTD_TLB_FLUSH_GRANU_MASK;
1963     uint16_t domain_id;
1964     hwaddr addr;
1965     uint8_t am;
1966 
1967     switch (type) {
1968     case VTD_TLB_GLOBAL_FLUSH:
1969         iaig = VTD_TLB_GLOBAL_FLUSH_A;
1970         vtd_iotlb_global_invalidate(s);
1971         break;
1972 
1973     case VTD_TLB_DSI_FLUSH:
1974         domain_id = VTD_TLB_DID(val);
1975         iaig = VTD_TLB_DSI_FLUSH_A;
1976         vtd_iotlb_domain_invalidate(s, domain_id);
1977         break;
1978 
1979     case VTD_TLB_PSI_FLUSH:
1980         domain_id = VTD_TLB_DID(val);
1981         addr = vtd_get_quad_raw(s, DMAR_IVA_REG);
1982         am = VTD_IVA_AM(addr);
1983         addr = VTD_IVA_ADDR(addr);
1984         if (am > VTD_MAMV) {
1985             error_report_once("%s: address mask overflow: 0x%" PRIx64,
1986                               __func__, vtd_get_quad_raw(s, DMAR_IVA_REG));
1987             iaig = 0;
1988             break;
1989         }
1990         iaig = VTD_TLB_PSI_FLUSH_A;
1991         vtd_iotlb_page_invalidate(s, domain_id, addr, am);
1992         break;
1993 
1994     default:
1995         error_report_once("%s: invalid granularity: 0x%" PRIx64,
1996                           __func__, val);
1997         iaig = 0;
1998     }
1999     return iaig;
2000 }
2001 
2002 static void vtd_fetch_inv_desc(IntelIOMMUState *s);
2003 
2004 static inline bool vtd_queued_inv_disable_check(IntelIOMMUState *s)
2005 {
2006     return s->qi_enabled && (s->iq_tail == s->iq_head) &&
2007            (s->iq_last_desc_type == VTD_INV_DESC_WAIT);
2008 }
2009 
2010 static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en)
2011 {
2012     uint64_t iqa_val = vtd_get_quad_raw(s, DMAR_IQA_REG);
2013 
2014     trace_vtd_inv_qi_enable(en);
2015 
2016     if (en) {
2017         s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits);
2018         /* 2^(x+8) entries */
2019         s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8 - (s->iq_dw ? 1 : 0));
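        /*
         * Worked example (illustrative only): with QS == 1 the queue holds
         * 2^(1+8) = 512 128-bit descriptors, or 256 descriptors when the
         * guest selected 256-bit (scalable mode) descriptors via iq_dw.
         */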
2020         s->qi_enabled = true;
2021         trace_vtd_inv_qi_setup(s->iq, s->iq_size);
2022         /* Ok - report back to driver */
2023         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_QIES);
2024 
2025         if (s->iq_tail != 0) {
2026             /*
2027              * This is a spec violation, but Windows guests are known to set up
2028              * Queued Invalidation this way, so we allow the write and process
2029              * Invalidation Descriptors right away.
2030              */
2031             trace_vtd_warn_invalid_qi_tail(s->iq_tail);
2032             if (!(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
2033                 vtd_fetch_inv_desc(s);
2034             }
2035         }
2036     } else {
2037         if (vtd_queued_inv_disable_check(s)) {
2038             /* disable Queued Invalidation */
2039             vtd_set_quad_raw(s, DMAR_IQH_REG, 0);
2040             s->iq_head = 0;
2041             s->qi_enabled = false;
2042             /* Ok - report back to driver */
2043             vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_QIES, 0);
2044         } else {
2045             error_report_once("%s: detected improper state when disable QI "
2046                               "(head=0x%x, tail=0x%x, last_type=%d)",
2047                               __func__,
2048                               s->iq_head, s->iq_tail, s->iq_last_desc_type);
2049         }
2050     }
2051 }
2052 
2053 /* Set Root Table Pointer */
2054 static void vtd_handle_gcmd_srtp(IntelIOMMUState *s)
2055 {
2056     vtd_root_table_setup(s);
2057     /* Ok - report back to driver */
2058     vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_RTPS);
2059     vtd_reset_caches(s);
2060     vtd_address_space_refresh_all(s);
2061 }
2062 
2063 /* Set Interrupt Remap Table Pointer */
2064 static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
2065 {
2066     vtd_interrupt_remap_table_setup(s);
2067     /* Ok - report back to driver */
2068     vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
2069 }
2070 
2071 /* Handle Translation Enable/Disable */
2072 static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en)
2073 {
2074     if (s->dmar_enabled == en) {
2075         return;
2076     }
2077 
2078     trace_vtd_dmar_enable(en);
2079 
2080     if (en) {
2081         s->dmar_enabled = true;
2082         /* Ok - report back to driver */
2083         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_TES);
2084     } else {
2085         s->dmar_enabled = false;
2086 
2087         /* Clear the index of Fault Recording Register */
2088         s->next_frcd_reg = 0;
2089         /* Ok - report back to driver */
2090         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0);
2091     }
2092 
2093     vtd_reset_caches(s);
2094     vtd_address_space_refresh_all(s);
2095 }
2096 
2097 /* Handle Interrupt Remap Enable/Disable */
2098 static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en)
2099 {
2100     trace_vtd_ir_enable(en);
2101 
2102     if (en) {
2103         s->intr_enabled = true;
2104         /* Ok - report back to driver */
2105         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRES);
2106     } else {
2107         s->intr_enabled = false;
2108         /* Ok - report back to driver */
2109         vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_IRES, 0);
2110     }
2111 }
2112 
2113 /* Handle write to Global Command Register */
2114 static void vtd_handle_gcmd_write(IntelIOMMUState *s)
2115 {
2116     uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG);
2117     uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG);
2118     uint32_t changed = status ^ val;
2119 
2120     trace_vtd_reg_write_gcmd(status, val);
2121     if (changed & VTD_GCMD_TE) {
2122         /* Translation enable/disable */
2123         vtd_handle_gcmd_te(s, val & VTD_GCMD_TE);
2124     }
2125     if (val & VTD_GCMD_SRTP) {
2126         /* Set/update the root-table pointer */
2127         vtd_handle_gcmd_srtp(s);
2128     }
2129     if (changed & VTD_GCMD_QIE) {
2130         /* Queued Invalidation Enable */
2131         vtd_handle_gcmd_qie(s, val & VTD_GCMD_QIE);
2132     }
2133     if (val & VTD_GCMD_SIRTP) {
2134         /* Set/update the interrupt remapping root-table pointer */
2135         vtd_handle_gcmd_sirtp(s);
2136     }
2137     if (changed & VTD_GCMD_IRE) {
2138         /* Interrupt remap enable/disable */
2139         vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE);
2140     }
2141 }
2142 
2143 /* Handle write to Context Command Register */
2144 static void vtd_handle_ccmd_write(IntelIOMMUState *s)
2145 {
2146     uint64_t ret;
2147     uint64_t val = vtd_get_quad_raw(s, DMAR_CCMD_REG);
2148 
2149     /* Context-cache invalidation request */
2150     if (val & VTD_CCMD_ICC) {
2151         if (s->qi_enabled) {
2152             error_report_once("Queued Invalidation enabled, "
2153                               "should not use register-based invalidation");
2154             return;
2155         }
2156         ret = vtd_context_cache_invalidate(s, val);
2157         /* Invalidation completed. Clear ICC and report the actual granularity */
2158         vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_ICC, 0ULL);
2159         ret = vtd_set_clear_mask_quad(s, DMAR_CCMD_REG, VTD_CCMD_CAIG_MASK,
2160                                       ret);
2161     }
2162 }
2163 
2164 /* Handle write to IOTLB Invalidation Register */
2165 static void vtd_handle_iotlb_write(IntelIOMMUState *s)
2166 {
2167     uint64_t ret;
2168     uint64_t val = vtd_get_quad_raw(s, DMAR_IOTLB_REG);
2169 
2170     /* IOTLB invalidation request */
2171     if (val & VTD_TLB_IVT) {
2172         if (s->qi_enabled) {
2173             error_report_once("Queued Invalidation enabled, "
2174                               "should not use register-based invalidation");
2175             return;
2176         }
2177         ret = vtd_iotlb_flush(s, val);
2178         /* Invalidation completed. Change something to show */
2179         vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG, VTD_TLB_IVT, 0ULL);
2180         ret = vtd_set_clear_mask_quad(s, DMAR_IOTLB_REG,
2181                                       VTD_TLB_FLUSH_GRANU_MASK_A, ret);
2182     }
2183 }
2184 
2185 /* Fetch an Invalidation Descriptor from the Invalidation Queue */
2186 static bool vtd_get_inv_desc(IntelIOMMUState *s,
2187                              VTDInvDesc *inv_desc)
2188 {
2189     dma_addr_t base_addr = s->iq;
2190     uint32_t offset = s->iq_head;
2191     uint32_t dw = s->iq_dw ? 32 : 16;
2192     dma_addr_t addr = base_addr + offset * dw;
2193 
2194     if (dma_memory_read(&address_space_memory, addr, inv_desc, dw)) {
2195         error_report_once("Read INV DESC failed.");
2196         return false;
2197     }
2198     inv_desc->lo = le64_to_cpu(inv_desc->lo);
2199     inv_desc->hi = le64_to_cpu(inv_desc->hi);
2200     if (dw == 32) {
2201         inv_desc->val[2] = le64_to_cpu(inv_desc->val[2]);
2202         inv_desc->val[3] = le64_to_cpu(inv_desc->val[3]);
2203     }
2204     return true;
2205 }
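/*
 * Worked example (illustrative only): with legacy 128-bit descriptors
 * (iq_dw clear) each slot is 16 bytes, so iq_head == 3 reads from
 * s->iq + 48; with 256-bit descriptors the same head index reads from
 * s->iq + 96 and the two extra quadwords are byte-swapped as well.
 */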
2206 
2207 static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
2208 {
2209     if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) ||
2210         (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) {
2211         error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64
2212                           " (reserved nonzero)", __func__, inv_desc->hi,
2213                           inv_desc->lo);
2214         return false;
2215     }
2216     if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) {
2217         /* Status Write */
2218         uint32_t status_data = (uint32_t)(inv_desc->lo >>
2219                                VTD_INV_DESC_WAIT_DATA_SHIFT);
2220 
2221         assert(!(inv_desc->lo & VTD_INV_DESC_WAIT_IF));
2222 
2223         /* FIXME: need to be masked with HAW? */
2224         dma_addr_t status_addr = inv_desc->hi;
2225         trace_vtd_inv_desc_wait_sw(status_addr, status_data);
2226         status_data = cpu_to_le32(status_data);
2227         if (dma_memory_write(&address_space_memory, status_addr, &status_data,
2228                              sizeof(status_data))) {
2229             trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo);
2230             return false;
2231         }
2232     } else if (inv_desc->lo & VTD_INV_DESC_WAIT_IF) {
2233         /* Interrupt flag */
2234         vtd_generate_completion_event(s);
2235     } else {
2236         error_report_once("%s: invalid wait desc: hi=%"PRIx64", lo=%"PRIx64
2237                           " (unknown type)", __func__, inv_desc->hi,
2238                           inv_desc->lo);
2239         return false;
2240     }
2241     return true;
2242 }
2243 
2244 static bool vtd_process_context_cache_desc(IntelIOMMUState *s,
2245                                            VTDInvDesc *inv_desc)
2246 {
2247     uint16_t sid, fmask;
2248 
2249     if ((inv_desc->lo & VTD_INV_DESC_CC_RSVD) || inv_desc->hi) {
2250         error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64
2251                           " (reserved nonzero)", __func__, inv_desc->hi,
2252                           inv_desc->lo);
2253         return false;
2254     }
2255     switch (inv_desc->lo & VTD_INV_DESC_CC_G) {
2256     case VTD_INV_DESC_CC_DOMAIN:
2257         trace_vtd_inv_desc_cc_domain(
2258             (uint16_t)VTD_INV_DESC_CC_DID(inv_desc->lo));
2259         /* Fall through */
2260     case VTD_INV_DESC_CC_GLOBAL:
2261         vtd_context_global_invalidate(s);
2262         break;
2263 
2264     case VTD_INV_DESC_CC_DEVICE:
2265         sid = VTD_INV_DESC_CC_SID(inv_desc->lo);
2266         fmask = VTD_INV_DESC_CC_FM(inv_desc->lo);
2267         vtd_context_device_invalidate(s, sid, fmask);
2268         break;
2269 
2270     default:
2271         error_report_once("%s: invalid cc inv desc: hi=%"PRIx64", lo=%"PRIx64
2272                           " (invalid type)", __func__, inv_desc->hi,
2273                           inv_desc->lo);
2274         return false;
2275     }
2276     return true;
2277 }
2278 
2279 static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
2280 {
2281     uint16_t domain_id;
2282     uint8_t am;
2283     hwaddr addr;
2284 
2285     if ((inv_desc->lo & VTD_INV_DESC_IOTLB_RSVD_LO) ||
2286         (inv_desc->hi & VTD_INV_DESC_IOTLB_RSVD_HI)) {
2287         error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
2288                           ", lo=0x%"PRIx64" (reserved bits nonzero)",
2289                           __func__, inv_desc->hi, inv_desc->lo);
2290         return false;
2291     }
2292 
2293     switch (inv_desc->lo & VTD_INV_DESC_IOTLB_G) {
2294     case VTD_INV_DESC_IOTLB_GLOBAL:
2295         vtd_iotlb_global_invalidate(s);
2296         break;
2297 
2298     case VTD_INV_DESC_IOTLB_DOMAIN:
2299         domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
2300         vtd_iotlb_domain_invalidate(s, domain_id);
2301         break;
2302 
2303     case VTD_INV_DESC_IOTLB_PAGE:
2304         domain_id = VTD_INV_DESC_IOTLB_DID(inv_desc->lo);
2305         addr = VTD_INV_DESC_IOTLB_ADDR(inv_desc->hi);
2306         am = VTD_INV_DESC_IOTLB_AM(inv_desc->hi);
2307         if (am > VTD_MAMV) {
2308             error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
2309                               ", lo=0x%"PRIx64" (am=%u > VTD_MAMV=%u)",
2310                               __func__, inv_desc->hi, inv_desc->lo,
2311                               am, (unsigned)VTD_MAMV);
2312             return false;
2313         }
2314         vtd_iotlb_page_invalidate(s, domain_id, addr, am);
2315         break;
2316 
2317     default:
2318         error_report_once("%s: invalid iotlb inv desc: hi=0x%"PRIx64
2319                           ", lo=0x%"PRIx64" (type mismatch: 0x%llx)",
2320                           __func__, inv_desc->hi, inv_desc->lo,
2321                           inv_desc->lo & VTD_INV_DESC_IOTLB_G);
2322         return false;
2323     }
2324     return true;
2325 }
2326 
2327 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
2328                                      VTDInvDesc *inv_desc)
2329 {
2330     trace_vtd_inv_desc_iec(inv_desc->iec.granularity,
2331                            inv_desc->iec.index,
2332                            inv_desc->iec.index_mask);
2333 
2334     vtd_iec_notify_all(s, !inv_desc->iec.granularity,
2335                        inv_desc->iec.index,
2336                        inv_desc->iec.index_mask);
2337     return true;
2338 }
2339 
2340 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
2341                                           VTDInvDesc *inv_desc)
2342 {
2343     VTDAddressSpace *vtd_dev_as;
2344     IOMMUTLBEntry entry;
2345     struct VTDBus *vtd_bus;
2346     hwaddr addr;
2347     uint64_t sz;
2348     uint16_t sid;
2349     uint8_t devfn;
2350     bool size;
2351     uint8_t bus_num;
2352 
2353     addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi);
2354     sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo);
2355     devfn = sid & 0xff;
2356     bus_num = sid >> 8;
2357     size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi);
2358 
2359     if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) ||
2360         (inv_desc->hi & VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI)) {
2361         error_report_once("%s: invalid dev-iotlb inv desc: hi=%"PRIx64
2362                           ", lo=%"PRIx64" (reserved nonzero)", __func__,
2363                           inv_desc->hi, inv_desc->lo);
2364         return false;
2365     }
2366 
2367     vtd_bus = vtd_find_as_from_bus_num(s, bus_num);
2368     if (!vtd_bus) {
2369         goto done;
2370     }
2371 
2372     vtd_dev_as = vtd_bus->dev_as[devfn];
2373     if (!vtd_dev_as) {
2374         goto done;
2375     }
2376 
2377     /* According to ATS spec table 2.4:
2378      * S = 0, bits 15:12 = xxxx     range size: 4K
2379      * S = 1, bits 15:12 = xxx0     range size: 8K
2380      * S = 1, bits 15:12 = xx01     range size: 16K
2381      * S = 1, bits 15:12 = x011     range size: 32K
2382      * S = 1, bits 15:12 = 0111     range size: 64K
2383      * ...
2384      */
2385     if (size) {
2386         sz = (VTD_PAGE_SIZE * 2) << cto64(addr >> VTD_PAGE_SHIFT);
2387         addr &= ~(sz - 1);
2388     } else {
2389         sz = VTD_PAGE_SIZE;
2390     }
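    /*
     * Worked example (illustrative only): with S == 1 and untranslated
     * address bits 15:12 == 0b0111, cto64(addr >> VTD_PAGE_SHIFT) counts
     * the 3 trailing ones, so sz = 8K << 3 = 64K and addr is aligned down
     * to a 64K boundary - matching the ATS table quoted above.
     */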
2391 
2392     entry.target_as = &vtd_dev_as->as;
2393     entry.addr_mask = sz - 1;
2394     entry.iova = addr;
2395     entry.perm = IOMMU_NONE;
2396     entry.translated_addr = 0;
2397     memory_region_notify_iommu(&vtd_dev_as->iommu, 0, entry);
2398 
2399 done:
2400     return true;
2401 }
2402 
2403 static bool vtd_process_inv_desc(IntelIOMMUState *s)
2404 {
2405     VTDInvDesc inv_desc;
2406     uint8_t desc_type;
2407 
2408     trace_vtd_inv_qi_head(s->iq_head);
2409     if (!vtd_get_inv_desc(s, &inv_desc)) {
2410         s->iq_last_desc_type = VTD_INV_DESC_NONE;
2411         return false;
2412     }
2413 
2414     desc_type = inv_desc.lo & VTD_INV_DESC_TYPE;
2415     /* FIXME: should update at first or at last? */
2416     s->iq_last_desc_type = desc_type;
2417 
2418     switch (desc_type) {
2419     case VTD_INV_DESC_CC:
2420         trace_vtd_inv_desc("context-cache", inv_desc.hi, inv_desc.lo);
2421         if (!vtd_process_context_cache_desc(s, &inv_desc)) {
2422             return false;
2423         }
2424         break;
2425 
2426     case VTD_INV_DESC_IOTLB:
2427         trace_vtd_inv_desc("iotlb", inv_desc.hi, inv_desc.lo);
2428         if (!vtd_process_iotlb_desc(s, &inv_desc)) {
2429             return false;
2430         }
2431         break;
2432 
2433     /*
2434      * TODO: the bodies of the two cases below will be implemented in a future
2435      * series. To keep guests whose iommu driver integrates the scalable mode
2436      * support patch set working, returning true is enough for now.
2437      */
2438     case VTD_INV_DESC_PC:
2439         break;
2440 
2441     case VTD_INV_DESC_PIOTLB:
2442         break;
2443 
2444     case VTD_INV_DESC_WAIT:
2445         trace_vtd_inv_desc("wait", inv_desc.hi, inv_desc.lo);
2446         if (!vtd_process_wait_desc(s, &inv_desc)) {
2447             return false;
2448         }
2449         break;
2450 
2451     case VTD_INV_DESC_IEC:
2452         trace_vtd_inv_desc("iec", inv_desc.hi, inv_desc.lo);
2453         if (!vtd_process_inv_iec_desc(s, &inv_desc)) {
2454             return false;
2455         }
2456         break;
2457 
2458     case VTD_INV_DESC_DEVICE:
2459         trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo);
2460         if (!vtd_process_device_iotlb_desc(s, &inv_desc)) {
2461             return false;
2462         }
2463         break;
2464 
2465     default:
2466         error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64
2467                           " (unknown type)", __func__, inv_desc.hi,
2468                           inv_desc.lo);
2469         return false;
2470     }
2471     s->iq_head++;
2472     if (s->iq_head == s->iq_size) {
2473         s->iq_head = 0;
2474     }
2475     return true;
2476 }
2477 
2478 /* Try to fetch and process more Invalidation Descriptors */
2479 static void vtd_fetch_inv_desc(IntelIOMMUState *s)
2480 {
2481     trace_vtd_inv_qi_fetch();
2482 
2483     if (s->iq_tail >= s->iq_size) {
2484         /* Detected an invalid tail pointer */
2485         error_report_once("%s: detected invalid QI tail "
2486                           "(tail=0x%x, size=0x%x)",
2487                           __func__, s->iq_tail, s->iq_size);
2488         vtd_handle_inv_queue_error(s);
2489         return;
2490     }
2491     while (s->iq_head != s->iq_tail) {
2492         if (!vtd_process_inv_desc(s)) {
2493             /* Invalidation Queue Errors */
2494             vtd_handle_inv_queue_error(s);
2495             break;
2496         }
2497         /* Must update the IQH_REG in time */
2498         vtd_set_quad_raw(s, DMAR_IQH_REG,
2499                          (((uint64_t)(s->iq_head)) << VTD_IQH_QH_SHIFT) &
2500                          VTD_IQH_QH_MASK);
2501     }
2502 }
2503 
2504 /* Handle write to Invalidation Queue Tail Register */
2505 static void vtd_handle_iqt_write(IntelIOMMUState *s)
2506 {
2507     uint64_t val = vtd_get_quad_raw(s, DMAR_IQT_REG);
2508 
2509     if (s->iq_dw && (val & VTD_IQT_QT_256_RSV_BIT)) {
2510         error_report_once("%s: RSV bit is set: val=0x%"PRIx64,
2511                           __func__, val);
2512         return;
2513     }
2514     s->iq_tail = VTD_IQT_QT(s->iq_dw, val);
2515     trace_vtd_inv_qi_tail(s->iq_tail);
2516 
2517     if (s->qi_enabled && !(vtd_get_long_raw(s, DMAR_FSTS_REG) & VTD_FSTS_IQE)) {
2518         /* Process Invalidation Queue here */
2519         vtd_fetch_inv_desc(s);
2520     }
2521 }
2522 
2523 static void vtd_handle_fsts_write(IntelIOMMUState *s)
2524 {
2525     uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG);
2526     uint32_t fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG);
2527     uint32_t status_fields = VTD_FSTS_PFO | VTD_FSTS_PPF | VTD_FSTS_IQE;
2528 
2529     if ((fectl_reg & VTD_FECTL_IP) && !(fsts_reg & status_fields)) {
2530         vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
2531         trace_vtd_fsts_clear_ip();
2532     }
2533     /* FIXME: when IQE is Clear, should we try to fetch some Invalidation
2534      * Descriptors if there are any when Queued Invalidation is enabled?
2535      */
2536 }
2537 
2538 static void vtd_handle_fectl_write(IntelIOMMUState *s)
2539 {
2540     uint32_t fectl_reg;
2541     /* FIXME: when software clears the IM field, check the IP field. But do we
2542      * need to compare the old value and the new value to conclude that
2543      * software clears the IM field? Or just check if the IM field is zero?
2544      */
2545     fectl_reg = vtd_get_long_raw(s, DMAR_FECTL_REG);
2546 
2547     trace_vtd_reg_write_fectl(fectl_reg);
2548 
2549     if ((fectl_reg & VTD_FECTL_IP) && !(fectl_reg & VTD_FECTL_IM)) {
2550         vtd_generate_interrupt(s, DMAR_FEADDR_REG, DMAR_FEDATA_REG);
2551         vtd_set_clear_mask_long(s, DMAR_FECTL_REG, VTD_FECTL_IP, 0);
2552     }
2553 }
2554 
2555 static void vtd_handle_ics_write(IntelIOMMUState *s)
2556 {
2557     uint32_t ics_reg = vtd_get_long_raw(s, DMAR_ICS_REG);
2558     uint32_t iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG);
2559 
2560     if ((iectl_reg & VTD_IECTL_IP) && !(ics_reg & VTD_ICS_IWC)) {
2561         trace_vtd_reg_ics_clear_ip();
2562         vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
2563     }
2564 }
2565 
2566 static void vtd_handle_iectl_write(IntelIOMMUState *s)
2567 {
2568     uint32_t iectl_reg;
2569     /* FIXME: when software clears the IM field, check the IP field. But do we
2570      * need to compare the old value and the new value to conclude that
2571      * software clears the IM field? Or just check if the IM field is zero?
2572      */
2573     iectl_reg = vtd_get_long_raw(s, DMAR_IECTL_REG);
2574 
2575     trace_vtd_reg_write_iectl(iectl_reg);
2576 
2577     if ((iectl_reg & VTD_IECTL_IP) && !(iectl_reg & VTD_IECTL_IM)) {
2578         vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
2579         vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
2580     }
2581 }
2582 
2583 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
2584 {
2585     IntelIOMMUState *s = opaque;
2586     uint64_t val;
2587 
2588     trace_vtd_reg_read(addr, size);
2589 
2590     if (addr + size > DMAR_REG_SIZE) {
2591         error_report_once("%s: MMIO over range: addr=0x%" PRIx64
2592                           " size=0x%u", __func__, addr, size);
2593         return (uint64_t)-1;
2594     }
2595 
2596     switch (addr) {
2597     /* Root Table Address Register, 64-bit */
2598     case DMAR_RTADDR_REG:
2599         if (size == 4) {
2600             val = s->root & ((1ULL << 32) - 1);
2601         } else {
2602             val = s->root;
2603         }
2604         break;
2605 
2606     case DMAR_RTADDR_REG_HI:
2607         assert(size == 4);
2608         val = s->root >> 32;
2609         break;
2610 
2611     /* Invalidation Queue Address Register, 64-bit */
2612     case DMAR_IQA_REG:
2613         val = s->iq | (vtd_get_quad(s, DMAR_IQA_REG) & VTD_IQA_QS);
2614         if (size == 4) {
2615             val = val & ((1ULL << 32) - 1);
2616         }
2617         break;
2618 
2619     case DMAR_IQA_REG_HI:
2620         assert(size == 4);
2621         val = s->iq >> 32;
2622         break;
2623 
2624     default:
2625         if (size == 4) {
2626             val = vtd_get_long(s, addr);
2627         } else {
2628             val = vtd_get_quad(s, addr);
2629         }
2630     }
2631 
2632     return val;
2633 }
2634 
2635 static void vtd_mem_write(void *opaque, hwaddr addr,
2636                           uint64_t val, unsigned size)
2637 {
2638     IntelIOMMUState *s = opaque;
2639 
2640     trace_vtd_reg_write(addr, size, val);
2641 
2642     if (addr + size > DMAR_REG_SIZE) {
2643         error_report_once("%s: MMIO over range: addr=0x%" PRIx64
2644                           " size=0x%u", __func__, addr, size);
2645         return;
2646     }
2647 
2648     switch (addr) {
2649     /* Global Command Register, 32-bit */
2650     case DMAR_GCMD_REG:
2651         vtd_set_long(s, addr, val);
2652         vtd_handle_gcmd_write(s);
2653         break;
2654 
2655     /* Context Command Register, 64-bit */
2656     case DMAR_CCMD_REG:
2657         if (size == 4) {
2658             vtd_set_long(s, addr, val);
2659         } else {
2660             vtd_set_quad(s, addr, val);
2661             vtd_handle_ccmd_write(s);
2662         }
2663         break;
2664 
2665     case DMAR_CCMD_REG_HI:
2666         assert(size == 4);
2667         vtd_set_long(s, addr, val);
2668         vtd_handle_ccmd_write(s);
2669         break;
2670 
2671     /* IOTLB Invalidation Register, 64-bit */
2672     case DMAR_IOTLB_REG:
2673         if (size == 4) {
2674             vtd_set_long(s, addr, val);
2675         } else {
2676             vtd_set_quad(s, addr, val);
2677             vtd_handle_iotlb_write(s);
2678         }
2679         break;
2680 
2681     case DMAR_IOTLB_REG_HI:
2682         assert(size == 4);
2683         vtd_set_long(s, addr, val);
2684         vtd_handle_iotlb_write(s);
2685         break;
2686 
2687     /* Invalidate Address Register, 64-bit */
2688     case DMAR_IVA_REG:
2689         if (size == 4) {
2690             vtd_set_long(s, addr, val);
2691         } else {
2692             vtd_set_quad(s, addr, val);
2693         }
2694         break;
2695 
2696     case DMAR_IVA_REG_HI:
2697         assert(size == 4);
2698         vtd_set_long(s, addr, val);
2699         break;
2700 
2701     /* Fault Status Register, 32-bit */
2702     case DMAR_FSTS_REG:
2703         assert(size == 4);
2704         vtd_set_long(s, addr, val);
2705         vtd_handle_fsts_write(s);
2706         break;
2707 
2708     /* Fault Event Control Register, 32-bit */
2709     case DMAR_FECTL_REG:
2710         assert(size == 4);
2711         vtd_set_long(s, addr, val);
2712         vtd_handle_fectl_write(s);
2713         break;
2714 
2715     /* Fault Event Data Register, 32-bit */
2716     case DMAR_FEDATA_REG:
2717         assert(size == 4);
2718         vtd_set_long(s, addr, val);
2719         break;
2720 
2721     /* Fault Event Address Register, 32-bit */
2722     case DMAR_FEADDR_REG:
2723         if (size == 4) {
2724             vtd_set_long(s, addr, val);
2725         } else {
2726             /*
2727              * While the register is 32-bit only, some guests (Xen...) write to
2728              * it with 64-bit.
2729              */
2730             vtd_set_quad(s, addr, val);
2731         }
2732         break;
2733 
2734     /* Fault Event Upper Address Register, 32-bit */
2735     case DMAR_FEUADDR_REG:
2736         assert(size == 4);
2737         vtd_set_long(s, addr, val);
2738         break;
2739 
2740     /* Protected Memory Enable Register, 32-bit */
2741     case DMAR_PMEN_REG:
2742         assert(size == 4);
2743         vtd_set_long(s, addr, val);
2744         break;
2745 
2746     /* Root Table Address Register, 64-bit */
2747     case DMAR_RTADDR_REG:
2748         if (size == 4) {
2749             vtd_set_long(s, addr, val);
2750         } else {
2751             vtd_set_quad(s, addr, val);
2752         }
2753         break;
2754 
2755     case DMAR_RTADDR_REG_HI:
2756         assert(size == 4);
2757         vtd_set_long(s, addr, val);
2758         break;
2759 
2760     /* Invalidation Queue Tail Register, 64-bit */
2761     case DMAR_IQT_REG:
2762         if (size == 4) {
2763             vtd_set_long(s, addr, val);
2764         } else {
2765             vtd_set_quad(s, addr, val);
2766         }
2767         vtd_handle_iqt_write(s);
2768         break;
2769 
2770     case DMAR_IQT_REG_HI:
2771         assert(size == 4);
2772         vtd_set_long(s, addr, val);
2773         /* Bits 19:63 of IQT_REG are RsvdZ, do nothing here */
2774         break;
2775 
2776     /* Invalidation Queue Address Register, 64-bit */
2777     case DMAR_IQA_REG:
2778         if (size == 4) {
2779             vtd_set_long(s, addr, val);
2780         } else {
2781             vtd_set_quad(s, addr, val);
2782         }
2783         if (s->ecap & VTD_ECAP_SMTS &&
2784             val & VTD_IQA_DW_MASK) {
2785             s->iq_dw = true;
2786         } else {
2787             s->iq_dw = false;
2788         }
2789         break;
2790 
2791     case DMAR_IQA_REG_HI:
2792         assert(size == 4);
2793         vtd_set_long(s, addr, val);
2794         break;
2795 
2796     /* Invalidation Completion Status Register, 32-bit */
2797     case DMAR_ICS_REG:
2798         assert(size == 4);
2799         vtd_set_long(s, addr, val);
2800         vtd_handle_ics_write(s);
2801         break;
2802 
2803     /* Invalidation Event Control Register, 32-bit */
2804     case DMAR_IECTL_REG:
2805         assert(size == 4);
2806         vtd_set_long(s, addr, val);
2807         vtd_handle_iectl_write(s);
2808         break;
2809 
2810     /* Invalidation Event Data Register, 32-bit */
2811     case DMAR_IEDATA_REG:
2812         assert(size == 4);
2813         vtd_set_long(s, addr, val);
2814         break;
2815 
2816     /* Invalidation Event Address Register, 32-bit */
2817     case DMAR_IEADDR_REG:
2818         assert(size == 4);
2819         vtd_set_long(s, addr, val);
2820         break;
2821 
2822     /* Invalidation Event Upper Address Register, 32-bit */
2823     case DMAR_IEUADDR_REG:
2824         assert(size == 4);
2825         vtd_set_long(s, addr, val);
2826         break;
2827 
2828     /* Fault Recording Registers, 128-bit */
2829     case DMAR_FRCD_REG_0_0:
2830         if (size == 4) {
2831             vtd_set_long(s, addr, val);
2832         } else {
2833             vtd_set_quad(s, addr, val);
2834         }
2835         break;
2836 
2837     case DMAR_FRCD_REG_0_1:
2838         assert(size == 4);
2839         vtd_set_long(s, addr, val);
2840         break;
2841 
2842     case DMAR_FRCD_REG_0_2:
2843         if (size == 4) {
2844             vtd_set_long(s, addr, val);
2845         } else {
2846             vtd_set_quad(s, addr, val);
2847             /* May clear bit 127 (Fault), update PPF */
2848             vtd_update_fsts_ppf(s);
2849         }
2850         break;
2851 
2852     case DMAR_FRCD_REG_0_3:
2853         assert(size == 4);
2854         vtd_set_long(s, addr, val);
2855         /* May clear bit 127 (Fault), update PPF */
2856         vtd_update_fsts_ppf(s);
2857         break;
2858 
2859     case DMAR_IRTA_REG:
2860         if (size == 4) {
2861             vtd_set_long(s, addr, val);
2862         } else {
2863             vtd_set_quad(s, addr, val);
2864         }
2865         break;
2866 
2867     case DMAR_IRTA_REG_HI:
2868         assert(size == 4);
2869         vtd_set_long(s, addr, val);
2870         break;
2871 
2872     default:
2873         if (size == 4) {
2874             vtd_set_long(s, addr, val);
2875         } else {
2876             vtd_set_quad(s, addr, val);
2877         }
2878     }
2879 }
2880 
2881 static IOMMUTLBEntry vtd_iommu_translate(IOMMUMemoryRegion *iommu, hwaddr addr,
2882                                          IOMMUAccessFlags flag, int iommu_idx)
2883 {
2884     VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
2885     IntelIOMMUState *s = vtd_as->iommu_state;
2886     IOMMUTLBEntry iotlb = {
2887         /* We'll fill in the rest later. */
2888         .target_as = &address_space_memory,
2889     };
2890     bool success;
2891 
2892     if (likely(s->dmar_enabled)) {
2893         success = vtd_do_iommu_translate(vtd_as, vtd_as->bus, vtd_as->devfn,
2894                                          addr, flag & IOMMU_WO, &iotlb);
2895     } else {
2896         /* DMAR disabled, passthrough, use 4K pages */
2897         iotlb.iova = addr & VTD_PAGE_MASK_4K;
2898         iotlb.translated_addr = addr & VTD_PAGE_MASK_4K;
2899         iotlb.addr_mask = ~VTD_PAGE_MASK_4K;
2900         iotlb.perm = IOMMU_RW;
2901         success = true;
2902     }
2903 
2904     if (likely(success)) {
2905         trace_vtd_dmar_translate(pci_bus_num(vtd_as->bus),
2906                                  VTD_PCI_SLOT(vtd_as->devfn),
2907                                  VTD_PCI_FUNC(vtd_as->devfn),
2908                                  iotlb.iova, iotlb.translated_addr,
2909                                  iotlb.addr_mask);
2910     } else {
2911         error_report_once("%s: detected translation failure "
2912                           "(dev=%02x:%02x:%02x, iova=0x%" PRIx64 ")",
2913                           __func__, pci_bus_num(vtd_as->bus),
2914                           VTD_PCI_SLOT(vtd_as->devfn),
2915                           VTD_PCI_FUNC(vtd_as->devfn),
2916                           addr);
2917     }
2918 
2919     return iotlb;
2920 }
2921 
2922 static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
2923                                           IOMMUNotifierFlag old,
2924                                           IOMMUNotifierFlag new)
2925 {
2926     VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
2927     IntelIOMMUState *s = vtd_as->iommu_state;
2928 
2929     if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) {
2930         error_report("We need to set caching-mode=on for intel-iommu to enable "
2931                      "device assignment with IOMMU protection.");
2932         exit(1);
2933     }
2934 
2935     /* Update per-address-space notifier flags */
2936     vtd_as->notifier_flags = new;
2937 
2938     if (old == IOMMU_NOTIFIER_NONE) {
2939         QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next);
2940     } else if (new == IOMMU_NOTIFIER_NONE) {
2941         QLIST_REMOVE(vtd_as, next);
2942     }
2943 }
2944 
2945 static int vtd_post_load(void *opaque, int version_id)
2946 {
2947     IntelIOMMUState *iommu = opaque;
2948 
2949     /*
2950      * Memory regions are dynamically turned on/off depending on
2951      * context entry configurations from the guest. After migration,
2952      * we need to make sure the memory regions are still correct.
2953      */
2954     vtd_switch_address_space_all(iommu);
2955 
2956     /*
2957      * We don't need to migrate root_scalable because we can simply
2958      * recalculate it after loading is complete.  We could do the
2959      * same for root, dmar_enabled, etc.; however, since those are
2960      * already part of the migration stream, keep them for migration
2961      * compatibility.
2962      */
2963     vtd_update_scalable_state(iommu);
2964 
2965     return 0;
2966 }
2967 
2968 static const VMStateDescription vtd_vmstate = {
2969     .name = "iommu-intel",
2970     .version_id = 1,
2971     .minimum_version_id = 1,
2972     .priority = MIG_PRI_IOMMU,
2973     .post_load = vtd_post_load,
2974     .fields = (VMStateField[]) {
2975         VMSTATE_UINT64(root, IntelIOMMUState),
2976         VMSTATE_UINT64(intr_root, IntelIOMMUState),
2977         VMSTATE_UINT64(iq, IntelIOMMUState),
2978         VMSTATE_UINT32(intr_size, IntelIOMMUState),
2979         VMSTATE_UINT16(iq_head, IntelIOMMUState),
2980         VMSTATE_UINT16(iq_tail, IntelIOMMUState),
2981         VMSTATE_UINT16(iq_size, IntelIOMMUState),
2982         VMSTATE_UINT16(next_frcd_reg, IntelIOMMUState),
2983         VMSTATE_UINT8_ARRAY(csr, IntelIOMMUState, DMAR_REG_SIZE),
2984         VMSTATE_UINT8(iq_last_desc_type, IntelIOMMUState),
2985         VMSTATE_UNUSED(1),      /* bool root_extended is obsolete by VT-d */
2986         VMSTATE_BOOL(dmar_enabled, IntelIOMMUState),
2987         VMSTATE_BOOL(qi_enabled, IntelIOMMUState),
2988         VMSTATE_BOOL(intr_enabled, IntelIOMMUState),
2989         VMSTATE_BOOL(intr_eime, IntelIOMMUState),
2990         VMSTATE_END_OF_LIST()
2991     }
2992 };
2993 
2994 static const MemoryRegionOps vtd_mem_ops = {
2995     .read = vtd_mem_read,
2996     .write = vtd_mem_write,
2997     .endianness = DEVICE_LITTLE_ENDIAN,
2998     .impl = {
2999         .min_access_size = 4,
3000         .max_access_size = 8,
3001     },
3002     .valid = {
3003         .min_access_size = 4,
3004         .max_access_size = 8,
3005     },
3006 };
3007 
3008 static Property vtd_properties[] = {
3009     DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0),
3010     DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
3011                             ON_OFF_AUTO_AUTO),
3012     DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
3013     DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
3014                       VTD_HOST_ADDRESS_WIDTH),
3015     DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
3016     DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
3017     DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
3018     DEFINE_PROP_END_OF_LIST(),
3019 };
3020 
3021 /* Read IRTE entry with specific index */
3022 static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index,
3023                         VTD_IR_TableEntry *entry, uint16_t sid)
3024 {
3025     static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \
3026         {0xffff, 0xfffb, 0xfff9, 0xfff8};
3027     dma_addr_t addr = 0x00;
3028     uint16_t mask, source_id;
3029     uint8_t bus, bus_max, bus_min;
3030 
3031     addr = iommu->intr_root + index * sizeof(*entry);
3032     if (dma_memory_read(&address_space_memory, addr, entry,
3033                         sizeof(*entry))) {
3034         error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64,
3035                           __func__, index, addr);
3036         return -VTD_FR_IR_ROOT_INVAL;
3037     }
3038 
3039     trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]),
3040                           le64_to_cpu(entry->data[0]));
3041 
3042     if (!entry->irte.present) {
3043         error_report_once("%s: detected non-present IRTE "
3044                           "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")",
3045                           __func__, index, le64_to_cpu(entry->data[1]),
3046                           le64_to_cpu(entry->data[0]));
3047         return -VTD_FR_IR_ENTRY_P;
3048     }
3049 
3050     if (entry->irte.__reserved_0 || entry->irte.__reserved_1 ||
3051         entry->irte.__reserved_2) {
3052         error_report_once("%s: detected non-zero reserved IRTE "
3053                           "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")",
3054                           __func__, index, le64_to_cpu(entry->data[1]),
3055                           le64_to_cpu(entry->data[0]));
3056         return -VTD_FR_IR_IRTE_RSVD;
3057     }
3058 
3059     if (sid != X86_IOMMU_SID_INVALID) {
3060         /* Validate IRTE SID */
3061         source_id = le32_to_cpu(entry->irte.source_id);
3062         switch (entry->irte.sid_vtype) {
3063         case VTD_SVT_NONE:
3064             break;
3065 
3066         case VTD_SVT_ALL:
3067             mask = vtd_svt_mask[entry->irte.sid_q];
3068             if ((source_id & mask) != (sid & mask)) {
3069                 error_report_once("%s: invalid IRTE SID "
3070                                   "(index=%u, sid=%u, source_id=%u)",
3071                                   __func__, index, sid, source_id);
3072                 return -VTD_FR_IR_SID_ERR;
3073             }
3074             break;
3075 
3076         case VTD_SVT_BUS:
3077             bus_max = source_id >> 8;
3078             bus_min = source_id & 0xff;
3079             bus = sid >> 8;
3080             if (bus > bus_max || bus < bus_min) {
3081                 error_report_once("%s: invalid SVT_BUS "
3082                                   "(index=%u, bus=%u, min=%u, max=%u)",
3083                                   __func__, index, bus, bus_min, bus_max);
3084                 return -VTD_FR_IR_SID_ERR;
3085             }
3086             break;
3087 
3088         default:
3089             error_report_once("%s: detected invalid IRTE SVT "
3090                               "(index=%u, type=%d)", __func__,
3091                               index, entry->irte.sid_vtype);
3092             /* Take this as a verification failure. */
3093             return -VTD_FR_IR_SID_ERR;
3094             break;
3095         }
3096     }
3097 
3098     return 0;
3099 }
3100 
3101 /* Fetch IRQ information of specific IR index */
3102 static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index,
3103                              X86IOMMUIrq *irq, uint16_t sid)
3104 {
3105     VTD_IR_TableEntry irte = {};
3106     int ret = 0;
3107 
3108     ret = vtd_irte_get(iommu, index, &irte, sid);
3109     if (ret) {
3110         return ret;
3111     }
3112 
3113     irq->trigger_mode = irte.irte.trigger_mode;
3114     irq->vector = irte.irte.vector;
3115     irq->delivery_mode = irte.irte.delivery_mode;
3116     irq->dest = le32_to_cpu(irte.irte.dest_id);
3117     if (!iommu->intr_eime) {
3118 #define  VTD_IR_APIC_DEST_MASK         (0xff00ULL)
3119 #define  VTD_IR_APIC_DEST_SHIFT        (8)
3120         irq->dest = (irq->dest & VTD_IR_APIC_DEST_MASK) >>
3121             VTD_IR_APIC_DEST_SHIFT;
3122     }
3123     irq->dest_mode = irte.irte.dest_mode;
3124     irq->redir_hint = irte.irte.redir_hint;
3125 
3126     trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector,
3127                        irq->delivery_mode, irq->dest, irq->dest_mode);
3128 
3129     return 0;
3130 }
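/*
 * Worked example for the xAPIC (EIM disabled) path above: if the IRTE
 * dest_id field is 0x0000ab00, the 8-bit APIC destination is taken from
 * bits 15:8, i.e. (0x0000ab00 & VTD_IR_APIC_DEST_MASK) >>
 * VTD_IR_APIC_DEST_SHIFT == 0xab.  With EIM enabled the full 32-bit
 * dest_id is used as-is.
 */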
3131 
3132 /* Interrupt remapping for MSI/MSI-X entry */
3133 static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu,
3134                                    MSIMessage *origin,
3135                                    MSIMessage *translated,
3136                                    uint16_t sid)
3137 {
3138     int ret = 0;
3139     VTD_IR_MSIAddress addr;
3140     uint16_t index;
3141     X86IOMMUIrq irq = {};
3142 
3143     assert(origin && translated);
3144 
3145     trace_vtd_ir_remap_msi_req(origin->address, origin->data);
3146 
3147     if (!iommu || !iommu->intr_enabled) {
3148         memcpy(translated, origin, sizeof(*origin));
3149         goto out;
3150     }
3151 
3152     if (origin->address & VTD_MSI_ADDR_HI_MASK) {
3153         error_report_once("%s: MSI address high 32 bits non-zero detected: "
3154                           "address=0x%" PRIx64, __func__, origin->address);
3155         return -VTD_FR_IR_REQ_RSVD;
3156     }
3157 
3158     addr.data = origin->address & VTD_MSI_ADDR_LO_MASK;
3159     if (addr.addr.__head != 0xfee) {
3160         error_report_once("%s: MSI address low 32 bits invalid: 0x%" PRIx32,
3161                           __func__, addr.data);
3162         return -VTD_FR_IR_REQ_RSVD;
3163     }
3164 
3165     /* This is a compatibility-format request; pass it through unchanged. */
3166     if (addr.addr.int_mode != VTD_IR_INT_FORMAT_REMAP) {
3167         memcpy(translated, origin, sizeof(*origin));
3168         goto out;
3169     }
3170 
3171     index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l);
3172 
3173 #define  VTD_IR_MSI_DATA_SUBHANDLE       (0x0000ffff)
3174 #define  VTD_IR_MSI_DATA_RESERVED        (0xffff0000)
3175 
3176     if (addr.addr.sub_valid) {
3177         /* See VT-d spec 5.1.2.2 and 5.1.3 on subhandle */
3178         index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE;
3179     }
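    /*
     * Worked example (hypothetical values): with index_l == 0x0034 and
     * index_h == 1, the base index is (1 << 15) | 0x0034 == 0x8034.
     * If SHV (sub_valid) is set and the low 16 bits of the MSI data
     * carry subhandle 2, the effective index becomes 0x8036.
     */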
3180 
3181     ret = vtd_remap_irq_get(iommu, index, &irq, sid);
3182     if (ret) {
3183         return ret;
3184     }
3185 
3186     if (addr.addr.sub_valid) {
3187         trace_vtd_ir_remap_type("MSI");
3188         if (origin->data & VTD_IR_MSI_DATA_RESERVED) {
3189             error_report_once("%s: invalid IR MSI "
3190                               "(sid=%u, address=0x%" PRIx64
3191                               ", data=0x%" PRIx32 ")",
3192                               __func__, sid, origin->address, origin->data);
3193             return -VTD_FR_IR_REQ_RSVD;
3194         }
3195     } else {
3196         uint8_t vector = origin->data & 0xff;
3197         uint8_t trigger_mode = (origin->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
3198 
3199         trace_vtd_ir_remap_type("IOAPIC");
3200         /* IOAPIC entry vector should be aligned with IRTE vector
3201          * (see vt-d spec 5.1.5.1). */
3202         if (vector != irq.vector) {
3203             trace_vtd_warn_ir_vector(sid, index, vector, irq.vector);
3204         }
3205 
3206         /* The Trigger Mode field must match the Trigger Mode in the IRTE.
3207          * (see vt-d spec 5.1.5.1). */
3208         if (trigger_mode != irq.trigger_mode) {
3209             trace_vtd_warn_ir_trigger(sid, index, trigger_mode,
3210                                       irq.trigger_mode);
3211         }
3212     }
3213 
3214     /*
3215      * We'd better keep the last two bits, assuming that the guest OS
3216      * might modify them; keeping them does no harm after all.
3217      */
3218     irq.msi_addr_last_bits = addr.addr.__not_care;
3219 
3220     /* Translate X86IOMMUIrq to MSI message */
3221     x86_iommu_irq_to_msi_message(&irq, translated);
3222 
3223 out:
3224     trace_vtd_ir_remap_msi(origin->address, origin->data,
3225                            translated->address, translated->data);
3226     return 0;
3227 }
3228 
3229 static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src,
3230                          MSIMessage *dst, uint16_t sid)
3231 {
3232     return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu),
3233                                    src, dst, sid);
3234 }
3235 
3236 static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr,
3237                                    uint64_t *data, unsigned size,
3238                                    MemTxAttrs attrs)
3239 {
3240     return MEMTX_OK;
3241 }
3242 
3243 static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr,
3244                                     uint64_t value, unsigned size,
3245                                     MemTxAttrs attrs)
3246 {
3247     int ret = 0;
3248     MSIMessage from = {}, to = {};
3249     uint16_t sid = X86_IOMMU_SID_INVALID;
3250 
3251     from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST;
3252     from.data = (uint32_t) value;
3253 
3254     if (!attrs.unspecified) {
3255         /* We have explicit Source ID */
3256         sid = attrs.requester_id;
3257     }
3258 
3259     ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid);
3260     if (ret) {
3261         /* TODO: report error */
3262         /* Drop this interrupt */
3263         return MEMTX_ERROR;
3264     }
3265 
3266     apic_get_class()->send_msi(&to);
3267 
3268     return MEMTX_OK;
3269 }
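/*
 * Example flow through vtd_mem_ir_write() (values are illustrative): a
 * 4-byte DMA write to 0xfee01004 with data 0x000000d1 arrives here with
 * addr == 0x1004, so from.address is rebuilt as
 * VTD_INTERRUPT_ADDR_FIRST + 0x1004 == 0xfee01004 and from.data == 0xd1.
 * After remapping (or compatibility-format passthrough) the resulting
 * message is injected via apic_get_class()->send_msi().
 */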
3270 
3271 static const MemoryRegionOps vtd_mem_ir_ops = {
3272     .read_with_attrs = vtd_mem_ir_read,
3273     .write_with_attrs = vtd_mem_ir_write,
3274     .endianness = DEVICE_LITTLE_ENDIAN,
3275     .impl = {
3276         .min_access_size = 4,
3277         .max_access_size = 4,
3278     },
3279     .valid = {
3280         .min_access_size = 4,
3281         .max_access_size = 4,
3282     },
3283 };
3284 
3285 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
3286 {
3287     uintptr_t key = (uintptr_t)bus;
3288     VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key);
3289     VTDAddressSpace *vtd_dev_as;
3290     char name[128];
3291 
3292     if (!vtd_bus) {
3293         uintptr_t *new_key = g_malloc(sizeof(*new_key));
3294         *new_key = (uintptr_t)bus;
3295         /* No corresponding free() */
3296         vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \
3297                             PCI_DEVFN_MAX);
3298         vtd_bus->bus = bus;
3299         g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus);
3300     }
3301 
3302     vtd_dev_as = vtd_bus->dev_as[devfn];
3303 
3304     if (!vtd_dev_as) {
3305         snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn),
3306                  PCI_FUNC(devfn));
3307         vtd_bus->dev_as[devfn] = vtd_dev_as = g_malloc0(sizeof(VTDAddressSpace));
3308 
3309         vtd_dev_as->bus = bus;
3310         vtd_dev_as->devfn = (uint8_t)devfn;
3311         vtd_dev_as->iommu_state = s;
3312         vtd_dev_as->context_cache_entry.context_cache_gen = 0;
3313         vtd_dev_as->iova_tree = iova_tree_new();
3314 
3315         memory_region_init(&vtd_dev_as->root, OBJECT(s), name, UINT64_MAX);
3316         address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, "vtd-root");
3317 
3318         /*
3319          * Build the DMAR-disabled container with aliases to the
3320          * shared MRs.  Note that aliasing to a shared memory region
3321          * can help the memory API detect identical FlatViews, so
3322          * devices can share the same FlatView when DMAR is disabled
3323          * (either by not providing "intel_iommu=on" or with
3324          * "iommu=pt").  This greatly reduces the total number of
3325          * FlatViews in the system, and hence the VM runs faster.
3326          */
3327         memory_region_init_alias(&vtd_dev_as->nodmar, OBJECT(s),
3328                                  "vtd-nodmar", &s->mr_nodmar, 0,
3329                                  memory_region_size(&s->mr_nodmar));
3330 
3331         /*
3332          * Build the per-device DMAR-enabled container.
3333          *
3334          * TODO: currently we have a per-device IOMMU memory region
3335          * only because we have per-device IOMMU notifiers for
3336          * devices.  If one day we can abstract the IOMMU notifiers
3337          * out of the memory regions, then we can share the same
3338          * memory region here as well, just like we did above with
3339          * the nodmar region.
3340          */
3341         strcat(name, "-dmar");
3342         memory_region_init_iommu(&vtd_dev_as->iommu, sizeof(vtd_dev_as->iommu),
3343                                  TYPE_INTEL_IOMMU_MEMORY_REGION, OBJECT(s),
3344                                  name, UINT64_MAX);
3345         memory_region_init_alias(&vtd_dev_as->iommu_ir, OBJECT(s), "vtd-ir",
3346                                  &s->mr_ir, 0, memory_region_size(&s->mr_ir));
3347         memory_region_add_subregion_overlap(MEMORY_REGION(&vtd_dev_as->iommu),
3348                                             VTD_INTERRUPT_ADDR_FIRST,
3349                                             &vtd_dev_as->iommu_ir, 1);
3350 
3351         /*
3352          * Hook both containers under the root container; we switch
3353          * between DMAR & noDMAR by enabling/disabling the
3354          * corresponding sub-containers.
3355          */
3356         memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
3357                                             MEMORY_REGION(&vtd_dev_as->iommu),
3358                                             0);
3359         memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
3360                                             &vtd_dev_as->nodmar, 0);
3361 
3362         vtd_switch_address_space(vtd_dev_as);
3363     }
3364     return vtd_dev_as;
3365 }
3366 
3367 static uint64_t get_naturally_aligned_size(uint64_t start,
3368                                            uint64_t size, int gaw)
3369 {
3370     uint64_t max_mask = 1ULL << gaw;
3371     uint64_t alignment = start ? start & -start : max_mask;
3372 
3373     alignment = MIN(alignment, max_mask);
3374     size = MIN(size, max_mask);
3375 
3376     if (alignment <= size) {
3377         /* Increase the alignment of start */
3378         return alignment;
3379     } else {
3380         /* Find the largest page mask from size */
3381         return 1ULL << (63 - clz64(size));
3382     }
3383 }
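/*
 * Worked example (assuming gaw is large enough not to clamp anything):
 * for start == 0x1000 and size == 0x5000 this returns 0x1000 (limited
 * by the alignment of start); the unmap loop below then calls it again
 * with start == 0x2000/size == 0x4000 (returning 0x2000) and with
 * start == 0x4000/size == 0x2000 (returning 0x2000, limited by the
 * remaining size), so every chunk stays naturally aligned at its start.
 */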
3384 
3385 /* Unmap the whole range in the notifier's scope. */
3386 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
3387 {
3388     hwaddr size, remain;
3389     hwaddr start = n->start;
3390     hwaddr end = n->end;
3391     IntelIOMMUState *s = as->iommu_state;
3392     DMAMap map;
3393 
3394     /*
3395      * Note: all the code in this function assumes that the IOVA bits
3396      * are no more than VTD_MGAW bits (which is restricted by the
3397      * VT-d spec); otherwise we would need to consider 64-bit overflow.
3398      */
3399 
3400     if (end > VTD_ADDRESS_SIZE(s->aw_bits) - 1) {
3401         /*
3402          * No need to unmap regions bigger than the whole address
3403          * space supported by VT-d.
3404          */
3405         end = VTD_ADDRESS_SIZE(s->aw_bits) - 1;
3406     }
3407 
3408     assert(start <= end);
3409     size = remain = end - start + 1;
3410 
3411     while (remain >= VTD_PAGE_SIZE) {
3412         IOMMUTLBEntry entry;
3413         uint64_t mask = get_naturally_aligned_size(start, remain, s->aw_bits);
3414 
3415         assert(mask);
3416 
3417         entry.iova = start;
3418         entry.addr_mask = mask - 1;
3419         entry.target_as = &address_space_memory;
3420         entry.perm = IOMMU_NONE;
3421         /* This field is meaningless for unmap */
3422         entry.translated_addr = 0;
3423 
3424         memory_region_notify_one(n, &entry);
3425 
3426         start += mask;
3427         remain -= mask;
3428     }
3429 
3430     assert(!remain);
3431 
3432     trace_vtd_as_unmap_whole(pci_bus_num(as->bus),
3433                              VTD_PCI_SLOT(as->devfn),
3434                              VTD_PCI_FUNC(as->devfn),
3435                              n->start, size);
3436 
3437     map.iova = n->start;
3438     map.size = size;
3439     iova_tree_remove(as->iova_tree, &map);
3440 }
3441 
3442 static void vtd_address_space_unmap_all(IntelIOMMUState *s)
3443 {
3444     VTDAddressSpace *vtd_as;
3445     IOMMUNotifier *n;
3446 
3447     QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
3448         IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
3449             vtd_address_space_unmap(vtd_as, n);
3450         }
3451     }
3452 }
3453 
3454 static void vtd_address_space_refresh_all(IntelIOMMUState *s)
3455 {
3456     vtd_address_space_unmap_all(s);
3457     vtd_switch_address_space_all(s);
3458 }
3459 
3460 static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private)
3461 {
3462     memory_region_notify_one((IOMMUNotifier *)private, entry);
3463     return 0;
3464 }
3465 
3466 static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
3467 {
3468     VTDAddressSpace *vtd_as = container_of(iommu_mr, VTDAddressSpace, iommu);
3469     IntelIOMMUState *s = vtd_as->iommu_state;
3470     uint8_t bus_n = pci_bus_num(vtd_as->bus);
3471     VTDContextEntry ce;
3472 
3473     /*
3474      * The replay can be triggered either by an invalidation or by a
3475      * newly created entry. Either way, we release the existing
3476      * mappings (which means flushing the caches for UNMAP-only notifiers).
3477      */
3478     vtd_address_space_unmap(vtd_as, n);
3479 
3480     if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
3481         trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" :
3482                                   "legacy mode",
3483                                   bus_n, PCI_SLOT(vtd_as->devfn),
3484                                   PCI_FUNC(vtd_as->devfn),
3485                                   vtd_get_domain_id(s, &ce),
3486                                   ce.hi, ce.lo);
3487         if (vtd_as_has_map_notifier(vtd_as)) {
3488             /* This is required only for MAP-typed notifiers */
3489             vtd_page_walk_info info = {
3490                 .hook_fn = vtd_replay_hook,
3491                 .private = (void *)n,
3492                 .notify_unmap = false,
3493                 .aw = s->aw_bits,
3494                 .as = vtd_as,
3495                 .domain_id = vtd_get_domain_id(s, &ce),
3496             };
3497 
3498             vtd_page_walk(s, &ce, 0, ~0ULL, &info);
3499         }
3500     } else {
3501         trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
3502                                     PCI_FUNC(vtd_as->devfn));
3503     }
3504 
3505     return;
3506 }
3507 
3508 /* Do the initialization. It will also be called on reset, so pay
3509  * attention when adding new initialization code.
3510  */
3511 static void vtd_init(IntelIOMMUState *s)
3512 {
3513     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
3514 
3515     memset(s->csr, 0, DMAR_REG_SIZE);
3516     memset(s->wmask, 0, DMAR_REG_SIZE);
3517     memset(s->w1cmask, 0, DMAR_REG_SIZE);
3518     memset(s->womask, 0, DMAR_REG_SIZE);
3519 
3520     s->root = 0;
3521     s->root_scalable = false;
3522     s->dmar_enabled = false;
3523     s->intr_enabled = false;
3524     s->iq_head = 0;
3525     s->iq_tail = 0;
3526     s->iq = 0;
3527     s->iq_size = 0;
3528     s->qi_enabled = false;
3529     s->iq_last_desc_type = VTD_INV_DESC_NONE;
3530     s->iq_dw = false;
3531     s->next_frcd_reg = 0;
3532     s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
3533              VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
3534              VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits);
3535     if (s->dma_drain) {
3536         s->cap |= VTD_CAP_DRAIN;
3537     }
3538     if (s->aw_bits == VTD_HOST_AW_48BIT) {
3539         s->cap |= VTD_CAP_SAGAW_48bit;
3540     }
3541     s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
3542 
3543     /*
3544      * Rsvd field masks for spte
3545      */
3546     vtd_paging_entry_rsvd_field[0] = ~0ULL;
3547     vtd_paging_entry_rsvd_field[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits);
3548     vtd_paging_entry_rsvd_field[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
3549     vtd_paging_entry_rsvd_field[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
3550     vtd_paging_entry_rsvd_field[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
3551     vtd_paging_entry_rsvd_field[5] = VTD_SPTE_LPAGE_L1_RSVD_MASK(s->aw_bits);
3552     vtd_paging_entry_rsvd_field[6] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits);
3553     vtd_paging_entry_rsvd_field[7] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits);
3554     vtd_paging_entry_rsvd_field[8] = VTD_SPTE_LPAGE_L4_RSVD_MASK(s->aw_bits);
3555 
3556     if (x86_iommu_ir_supported(x86_iommu)) {
3557         s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
3558         if (s->intr_eim == ON_OFF_AUTO_ON) {
3559             s->ecap |= VTD_ECAP_EIM;
3560         }
3561         assert(s->intr_eim != ON_OFF_AUTO_AUTO);
3562     }
3563 
3564     if (x86_iommu->dt_supported) {
3565         s->ecap |= VTD_ECAP_DT;
3566     }
3567 
3568     if (x86_iommu->pt_supported) {
3569         s->ecap |= VTD_ECAP_PT;
3570     }
3571 
3572     if (s->caching_mode) {
3573         s->cap |= VTD_CAP_CM;
3574     }
3575 
3576     /* TODO: read cap/ecap from host to decide which cap to be exposed. */
3577     if (s->scalable_mode) {
3578         s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
3579     }
3580 
3581     vtd_reset_caches(s);
3582 
3583     /* Define registers with default values and bit semantics */
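    /*
     * For example, DMAR_FSTS_REG below gets a write-1-to-clear mask of
     * 0x11 (bits 0 and 4 are cleared by writing 1 to them), while
     * DMAR_FECTL_REG keeps only bit 31 (IM) writable via its write mask.
     */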
3584     vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
3585     vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0);
3586     vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0);
3587     vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0);
3588     vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL);
3589     vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0);
3590     vtd_define_quad(s, DMAR_RTADDR_REG, 0, 0xfffffffffffffc00ULL, 0);
3591     vtd_define_quad(s, DMAR_CCMD_REG, 0, 0xe0000003ffffffffULL, 0);
3592     vtd_define_quad_wo(s, DMAR_CCMD_REG, 0x3ffff0000ULL);
3593 
3594     /* Advanced Fault Logging not supported */
3595     vtd_define_long(s, DMAR_FSTS_REG, 0, 0, 0x11UL);
3596     vtd_define_long(s, DMAR_FECTL_REG, 0x80000000UL, 0x80000000UL, 0);
3597     vtd_define_long(s, DMAR_FEDATA_REG, 0, 0x0000ffffUL, 0);
3598     vtd_define_long(s, DMAR_FEADDR_REG, 0, 0xfffffffcUL, 0);
3599 
3600     /* Treated as RsvdZ when EIM in ECAP_REG is not supported
3601      * vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0xffffffffUL, 0);
3602      */
3603     vtd_define_long(s, DMAR_FEUADDR_REG, 0, 0, 0);
3604 
3605     /* Treated as RO for implementations that report the PLMR and PHMR
3606      * fields as Clear in the CAP_REG.
3607      * vtd_define_long(s, DMAR_PMEN_REG, 0, 0x80000000UL, 0);
3608      */
3609     vtd_define_long(s, DMAR_PMEN_REG, 0, 0, 0);
3610 
3611     vtd_define_quad(s, DMAR_IQH_REG, 0, 0, 0);
3612     vtd_define_quad(s, DMAR_IQT_REG, 0, 0x7fff0ULL, 0);
3613     vtd_define_quad(s, DMAR_IQA_REG, 0, 0xfffffffffffff807ULL, 0);
3614     vtd_define_long(s, DMAR_ICS_REG, 0, 0, 0x1UL);
3615     vtd_define_long(s, DMAR_IECTL_REG, 0x80000000UL, 0x80000000UL, 0);
3616     vtd_define_long(s, DMAR_IEDATA_REG, 0, 0xffffffffUL, 0);
3617     vtd_define_long(s, DMAR_IEADDR_REG, 0, 0xfffffffcUL, 0);
3618     /* Treated as RsvdZ when EIM in ECAP_REG is not supported */
3619     vtd_define_long(s, DMAR_IEUADDR_REG, 0, 0, 0);
3620 
3621     /* IOTLB registers */
3622     vtd_define_quad(s, DMAR_IOTLB_REG, 0, 0xb003ffff00000000ULL, 0);
3623     vtd_define_quad(s, DMAR_IVA_REG, 0, 0xfffffffffffff07fULL, 0);
3624     vtd_define_quad_wo(s, DMAR_IVA_REG, 0xfffffffffffff07fULL);
3625 
3626     /* Fault Recording Registers, 128-bit */
3627     vtd_define_quad(s, DMAR_FRCD_REG_0_0, 0, 0, 0);
3628     vtd_define_quad(s, DMAR_FRCD_REG_0_2, 0, 0, 0x8000000000000000ULL);
3629 
3630     /*
3631      * Interrupt remapping registers.
3632      */
3633     vtd_define_quad(s, DMAR_IRTA_REG, 0, 0xfffffffffffff80fULL, 0);
3634 }
3635 
3636 /* Should not reset address_spaces on reset, because devices will still use
3637  * the address space they got at first (they won't ask the bus again).
3638  */
3639 static void vtd_reset(DeviceState *dev)
3640 {
3641     IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
3642 
3643     vtd_init(s);
3644     vtd_address_space_refresh_all(s);
3645 }
3646 
3647 static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
3648 {
3649     IntelIOMMUState *s = opaque;
3650     VTDAddressSpace *vtd_as;
3651 
3652     assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
3653 
3654     vtd_as = vtd_find_add_as(s, bus, devfn);
3655     return &vtd_as->as;
3656 }
3657 
3658 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
3659 {
3660     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
3661 
3662     if (s->intr_eim == ON_OFF_AUTO_ON && !x86_iommu_ir_supported(x86_iommu)) {
3663         error_setg(errp, "eim=on cannot be selected without intremap=on");
3664         return false;
3665     }
3666 
3667     if (s->intr_eim == ON_OFF_AUTO_AUTO) {
3668         s->intr_eim = (kvm_irqchip_in_kernel() || s->buggy_eim)
3669                       && x86_iommu_ir_supported(x86_iommu) ?
3670                                               ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
3671     }
3672     if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) {
3673         if (!kvm_irqchip_in_kernel()) {
3674             error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split");
3675             return false;
3676         }
3677         if (!kvm_enable_x2apic()) {
3678             error_setg(errp, "eim=on requires support on the KVM side "
3679                              "(X2APIC_API, first shipped in v4.7)");
3680             return false;
3681         }
3682     }
3683 
3684     /* Currently only address widths supported are 39 and 48 bits */
3685     if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
3686         (s->aw_bits != VTD_HOST_AW_48BIT)) {
3687         error_setg(errp, "Supported values for x-aw-bits are: %d, %d",
3688                    VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT);
3689         return false;
3690     }
3691 
3692     if (s->scalable_mode && !s->dma_drain) {
3693         error_setg(errp, "Need to set dma_drain for scalable mode");
3694         return false;
3695     }
3696 
3697     return true;
3698 }
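/*
 * Example configuration that satisfies the checks above (a sketch; the
 * exact option spelling may vary across QEMU versions):
 *
 *   qemu-system-x86_64 -machine q35,accel=kvm,kernel-irqchip=split \
 *       -device intel-iommu,intremap=on,eim=on
 *
 * eim=on additionally requires X2APIC_API support on the KVM side, as
 * probed via kvm_enable_x2apic() above.
 */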
3699 
3700 static void vtd_realize(DeviceState *dev, Error **errp)
3701 {
3702     MachineState *ms = MACHINE(qdev_get_machine());
3703     PCMachineState *pcms = PC_MACHINE(ms);
3704     PCIBus *bus = pcms->bus;
3705     IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
3706     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev);
3707 
3708     x86_iommu->type = TYPE_INTEL;
3709 
3710     if (!vtd_decide_config(s, errp)) {
3711         return;
3712     }
3713 
3714     QLIST_INIT(&s->vtd_as_with_notifiers);
3715     qemu_mutex_init(&s->iommu_lock);
3716     memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
3717     memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
3718                           "intel_iommu", DMAR_REG_SIZE);
3719 
3720     /* Create the memory regions shared by all devices */
3721     memory_region_init(&s->mr_nodmar, OBJECT(s), "vtd-nodmar",
3722                        UINT64_MAX);
3723     memory_region_init_io(&s->mr_ir, OBJECT(s), &vtd_mem_ir_ops,
3724                           s, "vtd-ir", VTD_INTERRUPT_ADDR_SIZE);
3725     memory_region_init_alias(&s->mr_sys_alias, OBJECT(s),
3726                              "vtd-sys-alias", get_system_memory(), 0,
3727                              memory_region_size(get_system_memory()));
3728     memory_region_add_subregion_overlap(&s->mr_nodmar, 0,
3729                                         &s->mr_sys_alias, 0);
3730     memory_region_add_subregion_overlap(&s->mr_nodmar,
3731                                         VTD_INTERRUPT_ADDR_FIRST,
3732                                         &s->mr_ir, 1);
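    /*
     * Layout of the shared nodmar container built above: the system
     * memory alias sits at priority 0 and the "vtd-ir" region overlaps
     * it at priority 1 starting at VTD_INTERRUPT_ADDR_FIRST, so
     * interrupt-range writes are steered to vtd_mem_ir_write() even
     * when DMAR translation is disabled.
     */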
3733 
3734     sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem);
3735     /* No corresponding destroy */
3736     s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal,
3737                                      g_free, g_free);
3738     s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal,
3739                                               g_free, g_free);
3740     vtd_init(s);
3741     sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR);
3742     pci_setup_iommu(bus, vtd_host_dma_iommu, dev);
3743     /* Pseudo address space under root PCI bus. */
3744     pcms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC);
3745 }
3746 
3747 static void vtd_class_init(ObjectClass *klass, void *data)
3748 {
3749     DeviceClass *dc = DEVICE_CLASS(klass);
3750     X86IOMMUClass *x86_class = X86_IOMMU_CLASS(klass);
3751 
3752     dc->reset = vtd_reset;
3753     dc->vmsd = &vtd_vmstate;
3754     dc->props = vtd_properties;
3755     dc->hotpluggable = false;
3756     x86_class->realize = vtd_realize;
3757     x86_class->int_remap = vtd_int_remap;
3758     /* Supported by the pc-q35-* machine types */
3759     dc->user_creatable = true;
3760     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
3761     dc->desc = "Intel IOMMU (VT-d) DMA Remapping device";
3762 }
3763 
3764 static const TypeInfo vtd_info = {
3765     .name          = TYPE_INTEL_IOMMU_DEVICE,
3766     .parent        = TYPE_X86_IOMMU_DEVICE,
3767     .instance_size = sizeof(IntelIOMMUState),
3768     .class_init    = vtd_class_init,
3769 };
3770 
3771 static void vtd_iommu_memory_region_class_init(ObjectClass *klass,
3772                                                      void *data)
3773 {
3774     IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
3775 
3776     imrc->translate = vtd_iommu_translate;
3777     imrc->notify_flag_changed = vtd_iommu_notify_flag_changed;
3778     imrc->replay = vtd_iommu_replay;
3779 }
3780 
3781 static const TypeInfo vtd_iommu_memory_region_info = {
3782     .parent = TYPE_IOMMU_MEMORY_REGION,
3783     .name = TYPE_INTEL_IOMMU_MEMORY_REGION,
3784     .class_init = vtd_iommu_memory_region_class_init,
3785 };
3786 
3787 static void vtd_register_types(void)
3788 {
3789     type_register_static(&vtd_info);
3790     type_register_static(&vtd_iommu_memory_region_info);
3791 }
3792 
3793 type_init(vtd_register_types)
3794